Pylint cleanup
gherka committed Nov 24, 2023
1 parent 0eceb59 commit e637252
Showing 27 changed files with 208 additions and 195 deletions.
15 changes: 8 additions & 7 deletions .pylintrc
@@ -23,13 +23,14 @@
# no Warning level messages displayed, use "--disable=all --enable=classes
# --disable=W".
disable=C0303, # trailing whitespace
-C0330, # hanging indent,
C0103, # invalid name - too many false positives
+C3001, # Lambda expression assigned to a variable
E1136, # value is unsubscriptable
E1101, # no-member; too many false positives
W0212, # access to protected member
E1130, # invalid unary operand (numpy)
-R0201, # no self use - false positives in test modules
+W0622, # redefine builtins - __package__
+W0640, # variable defined in loop


[BASIC]
@@ -104,7 +105,7 @@ indent-after-paren=4
indent-string=' '

# Maximum number of characters on a single line.
-max-line-length=88
+max-line-length=100

# Maximum number of lines in a module.
max-module-lines=1000
@@ -236,7 +237,7 @@ valid-metaclass-classmethod-first-arg=cls
[DESIGN]

# Maximum number of arguments for function / method.
-max-args=6
+max-args=8

# Maximum number of attributes for a class (see R0902).
max-attributes=15
@@ -248,7 +249,7 @@ max-bool-expr=5
max-branches=12

# Maximum number of locals for function / method body.
-max-locals=20
+max-locals=25

# Maximum number of parents for a class (see R0901).
max-parents=7
@@ -279,5 +280,5 @@ deprecated-modules=

# Exceptions that will emit a warning when being caught. Defaults to
# "BaseException, Exception".
-overgeneral-exceptions=BaseException,
-Exception
+overgeneral-exceptions=builtins.BaseException,
+builtins.Exception
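
A side note on the overgeneral-exceptions change: newer pylint releases expect fully qualified names here (hence the builtins. prefix), and this option feeds the broad-exception checks. A minimal sketch of the kind of handler those checks flag; load_config is a hypothetical name, not part of exhibit:

    def load_config(path):
        '''Toy example only: catching Exception is "overgeneral".'''
        try:
            with open(path, encoding="utf-8") as f:
                return f.read()
        except Exception:  # pylint would report broad-exception-caught (W0718)
            return None
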
4 changes: 3 additions & 1 deletion exhibit/command/tests/test_performance.py
@@ -3,6 +3,8 @@
regular unit / reference testing
'''

+#pylint: disable=W0201
+
# Standard library imports
import string
import unittest
@@ -25,7 +27,7 @@
from memory_profiler import memory_usage
except ImportError:
memory_usage = None
print(f"memory_profiler not found. Make sure exhibit is installed in [dev] mode")
print("memory_profiler not found. Make sure exhibit is installed in [dev] mode")

class performanceTests(unittest.TestCase):
'''
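
The try/except ImportError guard above is the usual pattern for optional dev dependencies: the module still imports when memory_profiler is absent, and callers can check for None. A compact sketch of the same idea; profile_run is a made-up consumer, not an exhibit function:

    try:
        from memory_profiler import memory_usage
    except ImportError:
        memory_usage = None

    def profile_run(func):
        '''Return peak memory of func() in MiB, or None if profiling is unavailable.'''
        if memory_usage is None:
            return None
        return max(memory_usage((func, (), {})))
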
2 changes: 1 addition & 1 deletion exhibit/core/constants.py
@@ -8,4 +8,4 @@
ORIGINAL_VALUES_PAIRED = "See paired column"
ORIGINAL_VALUES_REGEX = "regex"
MISSING_DATA_STR = "Missing data"
-EXHIBIT_DB_LOCAL = join("db", "exhibit.db")
\ No newline at end of file
+EXHIBIT_DB_LOCAL = join("db", "exhibit.db")
24 changes: 15 additions & 9 deletions exhibit/core/constraints.py
@@ -1,6 +1,9 @@
'''
Module for various derived and user-set constraints
'''
+
+# pylint: disable=C0302
+
# Standard library imports
from collections import namedtuple
from datetime import datetime
@@ -166,7 +169,7 @@ def process_custom_constraints(self, custom_constraints):

cc_filter = constraint.get("filter", None)
cc_partitions = constraint.get("partition", None)
-cc_targets = constraint.get("targets", dict())
+cc_targets = constraint.get("targets", {})

clean_cc_filter = clean_up_constraint_string(cc_filter)
cc_filter_mask = get_constraint_mask(output_df, clean_cc_filter)
@@ -447,7 +450,7 @@ def _within_group_outliers(series):
whether the value is divisible by 2 without remainder.
'''

-q25, q50, q75 = np.percentile(series, [25, 50, 75])
+q25, _, q75 = np.percentile(series, [25, 50, 75])
iqr = q75 - q25

if iqr == 0:
@@ -761,9 +764,11 @@ def _make_almost_same(group):
ulinked_df = generate_user_linked_anon_df(
self.spec_dict, user_linked_cols, new_df.shape[0], starting_col_matrix)

+non_user_linked_cols = [x for x in df.columns if x not in user_linked_cols]
+
new_df = pd.concat(
[ulinked_df.set_index(new_df.index)] +
-[df.loc[filter_idx, [x for x in df.columns if x not in user_linked_cols]]],
+[df.loc[filter_idx, non_user_linked_cols]],
axis=1
).reindex(columns=df.columns)

@@ -865,7 +870,7 @@ def _generate_ordered_values(target_sequence, ordered_list, ordered_probs):
else:
pointer = 0

-result = sorted(unordered_result, key=lambda x: ordered_list.index(x))
+result = sorted(unordered_result, key=ordered_list.index)

return result

@@ -1025,7 +1030,7 @@ def _make_skewed_series(group):

# add nulls based on the miss_probability of the skew column
miss_pct = self.spec_dict["columns"][skew_col]["miss_probability"]
-miss_val = pd.NA if group.dtype =='Int64' else np.nan
+miss_val = pd.NA if group.dtype =="Int64" else np.nan
skewed_result = np.where(
rng.random(size=nrows) < miss_pct,
miss_val, result.values)
@@ -1037,7 +1042,8 @@

target_cols = [x.strip() for x in target_str.split(",")]
if len(target_cols) != 2: # pragma: no cover
raise Exception(f"{self.current_action} requires exactly 2 target columns.")
raise RuntimeError(
f"{self.current_action} requires exactly 2 target columns.")

if partition_cols is not None:
partition_cols = [x.strip() for x in partition_cols.split(",") if x]
@@ -1281,7 +1287,7 @@ def find_basic_constraint_columns(df):

return output

-def clean_up_constraint_string(raw_string, type="cc_filter"):
+def clean_up_constraint_string(raw_string):
'''
The default way to handle column names with whitespace in eval strings
is to enclose them in backticks. However, the default tokeniser will
@@ -1379,8 +1385,8 @@ def get_constraint_mask(df, clean_string):
.rename(lambda x: x.replace(" ", "__"), axis="columns")
.eval(clean_string, engine="python"))

-except SyntaxError: #pragma: no cover
-raise SyntaxError("Invalid filter expression supplied to custom action.")
+except SyntaxError as e: #pragma: no cover
+raise SyntaxError("Invalid filter expression supplied to custom action.") from e

return mask
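
The "raise ... from e" form used above chains the new error to the original, so both tracebacks are reported and pylint's raise-missing-from (W0707) stays quiet. A minimal, self-contained sketch of the same pattern; parse_filter is a made-up name, not an exhibit function:

    def parse_filter(expression):
        '''Toy example of explicit exception chaining.'''
        try:
            return compile(expression, "<filter>", "eval")
        except SyntaxError as e:
            # "from e" stores the original error on __cause__
            raise SyntaxError("Invalid filter expression supplied.") from e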

31 changes: 17 additions & 14 deletions exhibit/core/exhibit.py
@@ -96,22 +96,22 @@ def __init__(
skip_columns=None, linked_columns=None,
uuid_columns=None, discrete_columns=None,
save_probabilities=None, derived_columns_first=False,
-verbose=False, **kwargs):
+verbose=False):
'''
Initialise either from the CLI or by instantiating directly
'''

# Basic error checking on the arguments
if linked_columns is not None and len(linked_columns) < 2:
raise Exception("Please provide at least two linked columns")
raise RuntimeError("Please provide at least two linked columns")

self.command = command
self.source = source
self.output = output
self.inline_limit = inline_limit
self.equal_weights = equal_weights
self.skip_columns = skip_columns or set()
-self.linked_columns= linked_columns or list()
+self.linked_columns= linked_columns or []
self.uuid_columns= uuid_columns or set()
self.discrete_columns = discrete_columns or set()
self.save_probabilities = save_probabilities or set()
@@ -179,7 +179,7 @@ def write_spec(self, spec_yaml=None):
else:
output_path = self.output

-with open(output_path, "w") as f:
+with open(output_path, "w", encoding="utf-8") as f:
f.write(spec_yaml)

print("Exhibit ready to view")
@@ -208,7 +208,7 @@ def read_spec(self):
self.source = path_checker(self.source)

if self.source.suffix == ".yml":
-with open(self.source) as f:
+with open(self.source, encoding="utf-8") as f:
self.spec_dict = yaml.safe_load(f)
else: #pragma: no cover
raise TypeError("Specification is not in .yml format")
@@ -219,13 +219,13 @@
# these NONE values early and change them into empty sequences.
for key, value in self.spec_dict["metadata"].items():
if "columns" in key and value is None:
self.spec_dict["metadata"][key] = list()
self.spec_dict["metadata"][key] = []

if self.spec_dict.get("linked_columns", None) is None:
self.spec_dict["linked_columns"] = list()
self.spec_dict["linked_columns"] = []

if self.spec_dict.get("derived_columns", None) is None:
self.spec_dict["derived_columns"] = dict()
self.spec_dict["derived_columns"] = {}

for col in self.spec_dict["metadata"]["categorical_columns"]:

@@ -253,6 +253,7 @@ def validate_spec(self):
self.spec_dict = validated_spec
return validated_spec is not None

+#pylint: disable=R0912, R0915
def execute_spec(self):
'''
Function only runs if validate_spec returned True
@@ -310,7 +311,7 @@ def execute_spec(self):
for num_col in self.spec_dict["metadata"]["numerical_columns"]:

# skip derived columns; they need main columns (inc. nulls) generated first
-if num_col in (self.spec_dict.get("derived_columns", dict()) or dict()):
+if num_col in (self.spec_dict.get("derived_columns", {}) or {}):
continue

anon_df[num_col] = generate_continuous_column(
@@ -432,7 +433,8 @@
col_name=num_col
)
# see comments above as to why we're re-generating derived columns
-for derived_col, derived_def in self.spec_dict["derived_columns"].items(): #pragma: no cover
+derived = self.spec_dict["derived_columns"].items()
+for derived_col, derived_def in derived: #pragma: no cover
for num_col in num_cols:
if num_col in derived_def:
anon_df[derived_col] = generate_derived_column(anon_df, derived_def)
@@ -508,7 +510,7 @@ def generate(self):
self.write_spec()
return None

-else:
+if self.command == "fromspec":
self.read_spec()
if self.validate_spec():
self.execute_spec()
@@ -520,8 +522,9 @@
self.write_data()
return None

-# technically unreachable code because validation failures will raise
-return None #pragma: no cover
+raise RuntimeError( #pragma: no cover
+"Generation command not recognized. Please initialise Exhibit "
+"with a valid command.")

class Specification(UserDict): #pragma: no cover
'''
@@ -541,5 +544,5 @@ def write_spec(self, path):

spec_yaml = generate_YAML_string(self.data)

with open(path, "w") as f:
with open(path, "w", encoding="utf-8") as f:
f.write(spec_yaml)
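
The encoding="utf-8" arguments added throughout this file address pylint's unspecified-encoding check (W1514): without an explicit codec, open() falls back to the platform's locale encoding, so a spec written on one machine may not read back cleanly on another. A small sketch of the round trip, assuming a YAML payload as in exhibit; round_trip is a made-up helper:

    import yaml

    def round_trip(path, data):
        '''Write then re-read a mapping with an explicit codec.'''
        with open(path, "w", encoding="utf-8") as f:
            yaml.safe_dump(data, f)
        with open(path, encoding="utf-8") as f:
            return yaml.safe_load(f)
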
14 changes: 7 additions & 7 deletions exhibit/core/formatters.py
@@ -19,7 +19,7 @@ class FormattedList(list):
separate processing for these formatted values from a basic list
of values passed to original_values during manual column creation
'''
-pass
+

def format_header(dataframe, series_name, prefix=None):
'''
@@ -162,7 +162,7 @@ def build_list_of_probability_vectors(dataframe, original_series_name, ew=False)

vectors = temp_vectors.values.tolist()

-string_vectors = ["{0:.3f}".format(x).ljust(len(HEADER)) for x in vectors]
+string_vectors = [f"{x:.3f}".ljust(len(HEADER)) for x in vectors]

return string_vectors

@@ -187,7 +187,7 @@ def build_list_of_column_weights(weights):

for key in sorted(weights):

-padded_key = ["{0:.3f}".format(x).ljust(len(key)) for x in weights[key]]
+padded_key = [f"{x:.3f}".ljust(len(key)) for x in weights[key]]
sorted_temp.append(padded_key)

sorted_final = [" | ".join(y for y in x).rstrip() for x in zip(*sorted_temp)]
@@ -333,15 +333,15 @@ def build_list_of_uuid_frequencies(df, target_col):
counts = Counter(df[target_col].value_counts())

freq_df = pd.DataFrame(
-[(frequency, count) for frequency, count in counts.items()],
+list(counts.items()),
columns=["frequency", "count"]
).sort_values("frequency")

freq_df["pct"] = freq_df["count"] / freq_df["count"].sum()
freq_df.loc[:, "pct"] = freq_df["count"] / freq_df["count"].sum()

freq_list = (
freq_df["frequency"].astype(str).str.ljust(9)
-.str.cat(freq_df["pct"].transform(lambda x: "{0:.3f}".format(x)), sep=' | ')
+.str.cat(freq_df["pct"].transform(lambda x: f"{x:.3f}"), sep=" | ")
.tolist()
)

@@ -359,6 +359,6 @@ def format_df_for_export(df):

for column in df.columns:
if df[column].dtype == "timedelta64[ns]":
-df[column] = df[column].astype(str).str.replace('0 days ', '')
+df[column] = df[column].astype(str).str.replace("0 days ", "")

return df
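
One note on the freq_df.loc[:, "pct"] change above: assigning through .loc targets the frame itself, whereas chained indexing can write to a temporary copy and trigger pandas' SettingWithCopyWarning. A minimal illustration with made-up data:

    import pandas as pd

    df = pd.DataFrame({"frequency": [1, 2, 3], "count": [10, 5, 1]})

    # df[df["count"] > 1]["pct"] = ... would modify a temporary slice;
    # .loc makes the assignment unambiguous and warning-free
    df.loc[:, "pct"] = df["count"] / df["count"].sum()
    print(df)
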