Fix many linter issues
stefan-grafberger committed Feb 24, 2024
1 parent 63d0991 commit 4aebf99
Showing 26 changed files with 104 additions and 103 deletions.
2 changes: 1 addition & 1 deletion demo/feature_overview/no_missing_embeddings.py
@@ -43,7 +43,7 @@ def required_inspections(self) -> Iterable[Inspection]:

def evaluate(self, inspection_result: InspectionResult) -> CheckResult:
"""Evaluate the check"""
- dag_node_to_missing_embeddings = dict()
+ dag_node_to_missing_embeddings = {}
for dag_node, dag_node_inspection_result in inspection_result.dag_node_to_inspection_results.items():
if MissingEmbeddings(self.example_threshold) in dag_node_inspection_result:
missing_embedding_info = dag_node_inspection_result[MissingEmbeddings(self.example_threshold)]
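Note: the dict() → {} change is the fix pylint suggests with its use-dict-literal check; the literal builds the same empty dict without a lookup of the dict builtin. A minimal sketch of the pattern, with an illustrative name:

    # Before: calls the dict builtin by name
    node_to_result = dict()
    # After: literal syntax builds the same empty dict and satisfies the linter
    node_to_result = {}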
2 changes: 1 addition & 1 deletion example_pipelines/healthcare/healthcare.py
@@ -48,4 +48,4 @@

train_data, test_data = train_test_split(data)
model = pipeline.fit(train_data, train_data['label'])
print("Mean accuracy: {}".format(model.score(test_data, test_data['label'])))
print(f"Mean accuracy: {model.score(test_data, test_data['label'])}")
71 changes: 35 additions & 36 deletions experiments/performance/_benchmark_utils.py
@@ -76,32 +76,32 @@ def get_code_for_op_benchmark(data_frame_rows, operator_type):
if operator_type == OperatorBenchmarkType.PROJECTION:
benchmark_setup = get_single_df_creation_str(data_frame_rows)
benchmark_exec = get_test_projection_str()
- benchmark_setup_func_str = "get_single_df_creation_str({})".format(data_frame_rows)
+ benchmark_setup_func_str = f"get_single_df_creation_str({data_frame_rows})"
benchmark_exec_func_str = "get_test_projection_str()"
elif operator_type == OperatorBenchmarkType.SELECTION:
benchmark_setup = get_single_df_creation_str(data_frame_rows)
benchmark_exec = get_test_selection_str()
- benchmark_setup_func_str = "get_single_df_creation_str({})".format(data_frame_rows)
+ benchmark_setup_func_str = f"get_single_df_creation_str({data_frame_rows})"
benchmark_exec_func_str = "get_test_selection_str()"
elif operator_type == OperatorBenchmarkType.JOIN:
benchmark_setup = get_multiple_dfs_creation_str(data_frame_rows)
benchmark_exec = get_test_join_str()
- benchmark_setup_func_str = "get_multiple_dfs_creation_str({})".format(data_frame_rows)
+ benchmark_setup_func_str = f"get_multiple_dfs_creation_str({data_frame_rows})"
benchmark_exec_func_str = "get_test_join_str()"
elif operator_type == OperatorBenchmarkType.ONE_HOT_ENCODER:
benchmark_setup = get_np_cat_array_str(data_frame_rows)
benchmark_exec = get_test_one_hot_encoder_str()
- benchmark_setup_func_str = "get_np_cat_array_str({})".format(data_frame_rows)
+ benchmark_setup_func_str = f"get_np_cat_array_str({data_frame_rows})"
benchmark_exec_func_str = "get_test_one_hot_encoder_str()"
elif operator_type == OperatorBenchmarkType.STANDARD_SCALER:
benchmark_setup = get_np_num_array_str(data_frame_rows)
benchmark_exec = get_test_standard_scaler_str()
- benchmark_setup_func_str = "get_np_num_array_str({})".format(data_frame_rows)
+ benchmark_setup_func_str = f"get_np_num_array_str({data_frame_rows})"
benchmark_exec_func_str = "get_test_standard_scaler_str()"
elif operator_type == OperatorBenchmarkType.DECISION_TREE:
benchmark_setup = get_estimator_train_data_str(data_frame_rows)
benchmark_exec = get_decision_tree_str()
- benchmark_setup_func_str = "get_estimator_train_data_str({})".format(data_frame_rows)
+ benchmark_setup_func_str = f"get_estimator_train_data_str({data_frame_rows})"
benchmark_exec_func_str = "get_decision_tree_str()"
else:
assert False
@@ -230,7 +230,7 @@ def prepare_benchmark_exec(benchmark_str, setup_str, inspections):
"""
Get the setup str for timeit
"""
- setup = cleandoc("""
+ setup = cleandoc(f"""
from experiments.performance._empty_inspection import EmptyInspection
from mlinspect.instrumentation._pipeline_executor import singleton
from mlinspect.inspections import HistogramForColumns, RowLineage, MaterializeFirstOutputRows
@@ -239,57 +239,57 @@ def prepare_benchmark_exec(benchmark_str, setup_str, inspections):
get_test_one_hot_encoder_str, get_np_num_array_str, get_test_standard_scaler_str, \
get_estimator_train_data_str, get_decision_tree_str
- test_code_setup = {}
- inspector_result = singleton.run(python_code=test_code_setup, inspections={})
- test_code_benchmark = {}
- """.format(setup_str, inspections, benchmark_str))
+ test_code_setup = {setup_str}
+ inspector_result = singleton.run(python_code=test_code_setup, inspections={inspections})
+ test_code_benchmark = {benchmark_str}
+ """)
return setup
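Note on the cleandoc conversions in this file: moving the f prefix onto the triple-quoted template replaces positional .format() arguments with named interpolations; one caveat (ours, not from the commit) is that any braces meant to appear literally in the generated code must then be doubled. A minimal sketch with hypothetical stand-in values, not mlinspect's real setup strings:

    from inspect import cleandoc

    setup_snippet = "df = make_df(100)"      # hypothetical stand-ins
    inspections = "[EmptyInspection(0)]"

    setup = cleandoc(f"""
        {setup_snippet}
        result = run(python_code=test_code, inspections={inspections})
        empty = {{}}  # doubled braces emit a literal {} in the generated code
    """)
    print(setup)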


def trigger_benchmark_exec(inspections_str):
"""
Get the benchmark str for timeit
"""
- benchmark = cleandoc("""
- inspector_result_two = singleton.run(python_code=test_code_benchmark, inspections={}, reset_state=False)
- """.format(inspections_str))
+ benchmark = cleandoc(f"""
+ inspector_result_two = singleton.run(python_code=test_code_benchmark, inspections={inspections_str}, reset_state=False)
+ """)
return benchmark


def prepare_pipeline_benchmark_exec(test_code):
"""
Get the benchmark str for timeit
"""
- benchmark = cleandoc("""
+ benchmark = cleandoc(f"""
from experiments.performance._benchmark_utils import get_adult_simple_py_str, get_adult_complex_py_str, \
get_healthcare_py_str, get_compas_py_str
- code = {}
- """.format(test_code))
+ code = {test_code}
+ """)
return benchmark


def trigger_pipeline_benchmark_exec(inspections_str):
"""
Get the benchmark str for timeit
"""
- benchmark = cleandoc("""
+ benchmark = cleandoc(f"""
from experiments.performance._empty_inspection import EmptyInspection
from mlinspect import PipelineInspector
PipelineInspector\
.on_pipeline_from_string(code)\
- .add_required_inspections({}) \
+ .add_required_inspections({inspections_str}) \
.execute()
""".format(inspections_str))
""")
return benchmark


def get_single_df_creation_str(data_frame_rows):
"""
Get a complete code str that creates a DF with random value
"""
- test_code = cleandoc("""
+ test_code = cleandoc(f"""
import pandas as pd
import numpy as np
from numpy.random import randint
@@ -305,7 +305,7 @@ def get_single_df_creation_str(data_frame_rows):
group_col_3 = pd.Series(random.choices(categories, k={data_frame_rows}))
df = pd.DataFrame(zip(a, b, c, d, group_col_1, group_col_2, group_col_3), columns=['A', 'B', 'C', 'D',
'group_col_1', 'group_col_2', 'group_col_3'])
""".format(data_frame_rows=data_frame_rows))
""")
return test_code


@@ -339,7 +339,7 @@ def get_multiple_dfs_creation_str(data_frame_rows):
assert sizes_before_join - start_with_offset == data_frame_rows

# mlinspect does not support some ast nodes yet like *, /, and {}, so we need to avoid them
- test_code = cleandoc("""
+ test_code = cleandoc(f"""
import pandas as pd
import numpy as np
from numpy.random import randint, shuffle
@@ -362,8 +362,7 @@ def get_multiple_dfs_creation_str(data_frame_rows):
df_a = pd.DataFrame(zip(id_a, a, b, group_col_1, group_col_2, group_col_3), columns=['id', 'A', 'B',
'group_col_1', 'group_col_2', 'group_col_3'])
df_b = pd.DataFrame(zip(id_b, c, d), columns=['id', 'C', 'D'])
""".format(sizes_before_join=sizes_before_join, start_with_offset=start_with_offset,
end_with_offset=end_with_offset))
""")
return test_code
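Note: the comment above about unsupported AST nodes explains the shape of these templates: the arithmetic runs in the host script before the string is built, so the generated pipeline code only ever contains the resulting constants. The f-string conversion preserves this, since interpolation happens when the template is constructed, not when the generated code executes. A small illustration with hypothetical values:

    data_frame_rows = 1000                    # hypothetical
    start_with_offset = data_frame_rows // 2  # division happens here, in the host script
    # The generated snippet carries only constants, so the instrumented
    # pipeline code contains no '*' or '/' nodes:
    generated = f"id_a = list(range({start_with_offset}, {data_frame_rows}))"
    print(generated)  # id_a = list(range(500, 1000))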


@@ -381,15 +380,15 @@ def get_np_cat_array_str(data_frame_rows):
"""
Get a complete code str that creates a np array with random values
"""
- test_code = cleandoc("""
+ test_code = cleandoc(f"""
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import random
categories = ['cat_a', 'cat_b', 'cat_c']
group_col_1 = pd.Series(random.choices(categories, k={data_frame_rows}))
df = pd.DataFrame(zip(group_col_1), columns=["group_col_1"])
""".format(data_frame_rows=data_frame_rows))
""")
return test_code


@@ -408,14 +407,14 @@ def get_np_num_array_str(data_frame_rows):
"""
Get a complete code str that creates a np array with random values
"""
- test_code = cleandoc("""
+ test_code = cleandoc(f"""
from sklearn.preprocessing import StandardScaler
import pandas as pd
from numpy.random import randint
- series = randint(0,100,size=({}))
+ series = randint(0,100,size=({data_frame_rows}))
df = pd.DataFrame(series, columns=["num"])
""".format(data_frame_rows))
""")
return test_code


@@ -434,7 +433,7 @@ def get_estimator_train_data_str(data_frame_rows):
"""
Get a complete code str that creates a np array with random values
"""
- test_code = cleandoc("""
+ test_code = cleandoc(f"""
from sklearn.preprocessing import StandardScaler
import pandas as pd
from numpy.random import randint
@@ -447,7 +446,7 @@ def get_estimator_train_data_str(data_frame_rows):
data_df = pd.DataFrame(data)
target_df = pd.DataFrame(target)
""".format(data_frame_rows=data_frame_rows))
""")
return test_code


@@ -466,7 +465,7 @@ def get_adult_simple_py_str():
"""
Get the code str for the adult_easy pipeline
"""
- with open(ADULT_SIMPLE_PY) as file:
+ with open(ADULT_SIMPLE_PY, encoding="utf-8") as file:
test_code = file.read()
return test_code

@@ -475,7 +474,7 @@ def get_adult_complex_py_str():
"""
Get the code str for the adult_easy pipeline
"""
- with open(ADULT_COMPLEX_PY) as file:
+ with open(ADULT_COMPLEX_PY, encoding="utf-8") as file:
test_code = file.read()
return test_code

@@ -484,7 +483,7 @@ def get_compas_py_str():
"""
Get the code str for the adult_easy pipeline
"""
- with open(COMPAS_PY) as file:
+ with open(COMPAS_PY, encoding="utf-8") as file:
test_code = file.read()
return test_code

@@ -493,6 +492,6 @@ def get_healthcare_py_str():
"""
Get the code str for the adult_easy pipeline
"""
- with open(HEALTHCARE_PY) as file:
+ with open(HEALTHCARE_PY, encoding="utf-8") as file:
test_code = file.read()
return test_code
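Note: the open() changes above add an explicit encoding, the fix for pylint's unspecified-encoding warning; without the argument, Python falls back to the platform's preferred locale encoding, so file reads can differ between machines. A minimal sketch with a hypothetical path:

    PIPELINE_PY = "example_pipelines/healthcare/healthcare.py"  # hypothetical path

    # Deterministic across platforms; omitting encoding= uses the locale default.
    with open(PIPELINE_PY, encoding="utf-8") as file:
        test_code = file.read()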
7 changes: 5 additions & 2 deletions mlinspect/backends/_backend.py
@@ -30,15 +30,18 @@ class Backend(metaclass=abc.ABCMeta):
The Interface for the different instrumentation backends
"""

+ @staticmethod
@abc.abstractmethod
- def before_call(self, operator_context, input_infos: List[AnnotatedDfObject]) \
+ def before_call(operator_context, input_infos: List[AnnotatedDfObject]) \
-> List[AnnotatedDfObject]:
"""The value or module a function may be called on"""
# pylint: disable=too-many-arguments, unused-argument
raise NotImplementedError


+ @staticmethod
@abc.abstractmethod
- def after_call(self, operator_context, input_infos: List[AnnotatedDfObject], return_value,
+ def after_call(operator_context, input_infos: List[AnnotatedDfObject], return_value,
non_data_function_args: Dict[str, any] = MappingProxyType({})) -> BackendResult:
"""The return value of some function"""
# pylint: disable=too-many-arguments, unused-argument
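Note: these changes turn the abstract hooks into static methods, dropping the self parameter that the old signatures mixed with @abc.abstractmethod. The decorator order shown is the required one: abstractmethod must be the innermost decorator. A minimal self-contained sketch, independent of mlinspect's real backends:

    import abc

    class Backend(abc.ABC):
        @staticmethod
        @abc.abstractmethod  # abstractmethod must be innermost
        def before_call(operator_context, input_infos):
            """The value or module a function may be called on"""
            raise NotImplementedError

    class DummyBackend(Backend):
        @staticmethod
        def before_call(operator_context, input_infos):
            return input_infos  # concrete override, also without self

    print(DummyBackend.before_call(None, []))  # -> []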
5 changes: 3 additions & 2 deletions mlinspect/backends/_backend_utils.py
@@ -1,6 +1,7 @@
"""
Some utility functions the different instrumentation backends
"""
+ # pylint: disable=unnecessary-dunder-call
import itertools

import numpy
@@ -62,7 +63,7 @@ def get_iterator_for_type(data, np_nditer_with_refs=False, columns=None):
elif isinstance(data, list):
iterator = get_list_row_iterator(data, columns)
else:
raise NotImplementedError("TODO: Support type {}!".format(type(data)))
raise NotImplementedError(f"TODO: Support type {type(data)}!")
return iterator


@@ -89,7 +90,7 @@ def create_wrapper_with_annotations(annotations_df, return_value) -> AnnotatedDf
elif return_value is None:
new_return_value = AnnotatedDfObject(None, annotations_df)
else:
raise NotImplementedError("A type that is still unsupported was found: {}".format(return_value))
raise NotImplementedError(f"A type that is still unsupported was found: {return_value}")
return new_return_value


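Note: rather than rewriting the dunder calls in this module, the commit silences the rule file-wide with the pragma added at the top. A sketch of what such a pragma covers, with a hypothetical class:

    # pylint: disable=unnecessary-dunder-call
    class Annotations:
        def __init__(self, data):
            self.data = data

        def equals(self, other):
            # Explicit dunder call the rule would normally flag;
            # the idiomatic spelling is `self.data == other.data`.
            return self.data.__eq__(other.data)

    print(Annotations(1).equals(Annotations(1)))  # -> True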
4 changes: 2 additions & 2 deletions mlinspect/backends/_pandas_backend.py
@@ -77,8 +77,8 @@ def after_call(operator_context, input_infos: List[AnnotatedDfObject], return_va
input_infos[1].result_data.drop("mlinspect_index_y", axis=1, inplace=True)

else:
raise NotImplementedError("PandasBackend doesn't know any operations of type '{}' yet!"
.format(operator_context.operator))
raise NotImplementedError(f"PandasBackend doesn't know any operations of type "
f"'{operator_context.operator}' yet!")

return return_value

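Note: when a long message is split across lines with implicit string concatenation, every fragment that interpolates needs its own f prefix — without the second one here, '{operator_context.operator}' would appear literally in the error text. A minimal sketch with an illustrative value:

    operator = "join"  # illustrative
    message = (f"PandasBackend doesn't know any operations of type "
               f"'{operator}' yet!")
    print(message)  # ... operations of type 'join' yet!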
2 changes: 1 addition & 1 deletion mlinspect/checks/_check.py
@@ -63,4 +63,4 @@ def __hash__(self):

def __repr__(self):
"""Checks must have a str representation"""
return "{}({})".format(self.__class__.__name__, self.check_id or "")
return f"{self.__class__.__name__}({self.check_id or ''})"
9 changes: 4 additions & 5 deletions mlinspect/checks/_no_bias_introduced_for.py
@@ -78,10 +78,9 @@ def evaluate(self, inspection_result: InspectionResult) -> CheckResult:
column_result = self.get_histograms_for_node_and_column(column, histograms, node, parents)
column_results[column] = column_result
if not column_result.acceptable_change:
issue = "A {} causes a min_relative_ratio_change of '{}' by {}, a value below the " \
"configured minimum threshold {}!" \
.format(node.operator_info.operator.value, column, column_result.min_relative_ratio_change,
self.min_allowed_relative_ratio_change)
issue = f"A {node.operator_info.operator.value} causes a min_relative_ratio_change of '{column}' " \
f"by {column_result.min_relative_ratio_change}, a value below the " \
f"configured minimum threshold {self.min_allowed_relative_ratio_change}!"
issue_list.append(issue)
check_status = CheckStatus.FAILURE

@@ -185,7 +184,7 @@ def get_distribution_changes_overview_as_df(no_bias_check_result: NoBiasIntroduc
descriptions = []
assert isinstance(no_bias_check_result.check, NoBiasIntroducedFor)
sensitive_column_names = no_bias_check_result.check.sensitive_columns
sensitive_column_names = ["'{}' distribution change below the configured minimum test threshold".format(name)
sensitive_column_names = [f"'{name}' distribution change below the configured minimum test threshold"
for name in sensitive_column_names]
sensitive_columns = []
for _ in range(len(sensitive_column_names)):
2 changes: 1 addition & 1 deletion mlinspect/checks/_no_illegal_features.py
@@ -54,7 +54,7 @@ def evaluate(self, inspection_result: InspectionResult) -> CheckResult:
forbidden_columns = {*ILLEGAL_FEATURES, *self.additional_illegal_feature_names}
used_illegal_columns = list(set(used_columns).intersection(forbidden_columns))
if used_illegal_columns:
description = "Used illegal columns: {}".format(used_illegal_columns)
description = f"Used illegal columns: {used_illegal_columns}"
result = NoIllegalFeaturesResult(self, CheckStatus.FAILURE, description, used_illegal_columns)
else:
result = NoIllegalFeaturesResult(self, CheckStatus.SUCCESS, None, [])
11 changes: 5 additions & 6 deletions mlinspect/checks/_similar_removal_probabilities_for.py
@@ -78,10 +78,9 @@ def evaluate(self, inspection_result: InspectionResult) -> CheckResult:
column_result = self.get_histograms_for_node_and_column(column, histograms, node, parents)
column_results[column] = column_result
if not column_result.acceptable_probability_difference:
issue = "A {} causes a max_probability_difference of '{}' by {}, a value above the " \
"configured maximum threshold {}!" \
.format(node.operator_info.operator.value, column, column_result.max_probability_difference,
self.max_allowed_probability_difference)
issue = f"A {node.operator_info.operator.value} causes a max_probability_difference of " \
f"'{column}' by {column_result.max_probability_difference}, a value above the " \
f"configured maximum threshold {self.max_allowed_probability_difference}!"
issue_list.append(issue)
check_status = CheckStatus.FAILURE

@@ -224,8 +223,8 @@ def get_removal_probabilities_overview_as_df(removal_probab_check_result: Simila
assert isinstance(removal_probab_check_result.check, SimilarRemovalProbabilitiesFor)
sensitive_column_names = []
for name in removal_probab_check_result.check.sensitive_columns:
- removal_probability_column_name = "'{}' probability difference below the configured maximum test " \
-                                   "threshold".format(name)
+ removal_probability_column_name = f"'{name}' probability difference below the configured maximum test " \
+                                   "threshold"
sensitive_column_names.append(removal_probability_column_name)

sensitive_columns = []
2 changes: 1 addition & 1 deletion mlinspect/inspections/_inspection.py
@@ -41,4 +41,4 @@ def __hash__(self):

def __repr__(self):
"""Inspections must have a str representation"""
return "{}({})".format(self.__class__.__name__, self.inspection_id)
return f"{self.__class__.__name__}({self.inspection_id})"