From e6372523641ab79d1361db2ef0cbb8c5b27a48e7 Mon Sep 17 00:00:00 2001
From: gherka
Date: Fri, 24 Nov 2023 17:08:32 +0000
Subject: [PATCH] Pylint cleanup

---
 .pylintrc                                    |  15 +-
 exhibit/command/tests/test_performance.py    |   4 +-
 exhibit/core/constants.py                    |   2 +-
 exhibit/core/constraints.py                  |  24 ++--
 exhibit/core/exhibit.py                      |  31 +++--
 exhibit/core/formatters.py                   |  14 +-
 exhibit/core/generate/categorical.py         |  20 +--
 exhibit/core/generate/geo.py                 |  13 +-
 exhibit/core/generate/missing.py             |  12 +-
 exhibit/core/generate/regex.py               |   2 +-
 .../core/generate/tests/test_categorical.py  |  22 +--
 .../core/generate/tests/test_continuous.py   |   6 +-
 exhibit/core/generate/tests/test_derived.py  |   6 +-
 exhibit/core/generate/uuids.py               |   4 +-
 exhibit/core/generate/yaml.py                |   1 -
 exhibit/core/linkage/hierarchical.py         |   2 +-
 exhibit/core/linkage/matrix.py               |   9 +-
 .../tests/test_linkage_hierarchical.py       |   2 +-
 exhibit/core/spec.py                         |  22 +--
 exhibit/core/sql.py                          |   2 +
 exhibit/core/tests/test_constraints.py       |   6 +-
 exhibit/core/tests/test_exhibit.py           | 131 +++++++++---------
 exhibit/core/tests/test_formatters.py        |   2 +-
 exhibit/core/tests/test_reference.py         |  29 ++--
 exhibit/core/utils.py                        |   4 +-
 exhibit/core/validator.py                    |  16 +--
 exhibit/sample/sample.py                     |   2 +-
 27 files changed, 208 insertions(+), 195 deletions(-)

diff --git a/.pylintrc b/.pylintrc
index c941f28..eafaa17 100644
--- a/.pylintrc
+++ b/.pylintrc
@@ -23,13 +23,14 @@
 # no Warning level messages displayed, use "--disable=all --enable=classes
 # --disable=W".
 disable=C0303, # trailing whitespace
-        C0330, # hanging indent,
         C0103, # invalid name - too many false positives
+        C3001, # Lambda expression assigned to a variable
         E1136, # value is unsubscriptable
+        E1101, # no-member; too many false positives
         W0212, # access to protected member
         E1130, # invalid unary operand (numpy)
-        R0201, # no self use - false positives in test modules
         W0622, # redefine builtins - __package__
+        W0640, # variable defined in loop

 [BASIC]

@@ -104,7 +105,7 @@ indent-after-paren=4
 indent-string='    '

 # Maximum number of characters on a single line.
-max-line-length=88
+max-line-length=100

 # Maximum number of lines in a module.
 max-module-lines=1000
@@ -236,7 +237,7 @@ valid-metaclass-classmethod-first-arg=cls
 [DESIGN]

 # Maximum number of arguments for function / method.
-max-args=6
+max-args=8

 # Maximum number of attributes for a class (see R0902).
 max-attributes=15
@@ -248,7 +249,7 @@ max-bool-expr=5
 max-branches=12

 # Maximum number of locals for function / method body.
-max-locals=20
+max-locals=25

 # Maximum number of parents for a class (see R0901).
 max-parents=7
@@ -279,5 +280,5 @@ deprecated-modules=

 # Exceptions that will emit a warning when being caught. Defaults to
 # "BaseException, Exception".
-overgeneral-exceptions=BaseException,
-                       Exception
+overgeneral-exceptions=builtins.BaseException,
+                       builtins.Exception
diff --git a/exhibit/command/tests/test_performance.py b/exhibit/command/tests/test_performance.py
index ccc088f..94f65a0 100644
--- a/exhibit/command/tests/test_performance.py
+++ b/exhibit/command/tests/test_performance.py
@@ -3,6 +3,8 @@
 regular unit / reference testing
 '''

+#pylint: disable=W0201
+
 # Standard library imports
 import string
 import unittest
@@ -25,7 +27,7 @@
     from memory_profiler import memory_usage
 except ImportError:
     memory_usage = None
-    print(f"memory_profiler not found. Make sure exhibit is installed in [dev] mode")
+    print("memory_profiler not found. Make sure exhibit is installed in [dev] mode")

 class performanceTests(unittest.TestCase):
     '''
diff --git a/exhibit/core/constants.py b/exhibit/core/constants.py
index 3eef5ee..a96afca 100644
--- a/exhibit/core/constants.py
+++ b/exhibit/core/constants.py
@@ -8,4 +8,4 @@
 ORIGINAL_VALUES_PAIRED = "See paired column"
 ORIGINAL_VALUES_REGEX = "regex"
 MISSING_DATA_STR = "Missing data"
-EXHIBIT_DB_LOCAL = join("db", "exhibit.db")
\ No newline at end of file
+EXHIBIT_DB_LOCAL = join("db", "exhibit.db")
diff --git a/exhibit/core/constraints.py b/exhibit/core/constraints.py
index 6380fc7..3012a67 100644
--- a/exhibit/core/constraints.py
+++ b/exhibit/core/constraints.py
@@ -1,6 +1,9 @@
 '''
 Module for various derived and user-set constraints
 '''
+
+# pylint: disable=C0302
+
 # Standard library imports
 from collections import namedtuple
 from datetime import datetime
@@ -166,7 +169,7 @@ def process_custom_constraints(self, custom_constraints):

             cc_filter = constraint.get("filter", None)
             cc_partitions = constraint.get("partition", None)
-            cc_targets = constraint.get("targets", dict())
+            cc_targets = constraint.get("targets", {})

             clean_cc_filter = clean_up_constraint_string(cc_filter)
             cc_filter_mask = get_constraint_mask(output_df, clean_cc_filter)
@@ -447,7 +450,7 @@ def _within_group_outliers(series):
         whether the value is divisible by 2 without remainder.
         '''

-        q25, q50, q75 = np.percentile(series, [25, 50, 75])
+        q25, _, q75 = np.percentile(series, [25, 50, 75])
         iqr = q75 - q25

         if iqr == 0:
@@ -761,9 +764,11 @@ def _make_almost_same(group):
         ulinked_df = generate_user_linked_anon_df(
             self.spec_dict, user_linked_cols, new_df.shape[0], starting_col_matrix)

+        non_user_linked_cols = [x for x in df.columns if x not in user_linked_cols]
+
         new_df = pd.concat(
             [ulinked_df.set_index(new_df.index)] +
-            [df.loc[filter_idx, [x for x in df.columns if x not in user_linked_cols]]],
+            [df.loc[filter_idx, non_user_linked_cols]],
             axis=1
         ).reindex(columns=df.columns)
@@ -865,7 +870,7 @@ def _generate_ordered_values(target_sequence, ordered_list, ordered_probs):
             else:
                 pointer = 0

-        result = sorted(unordered_result, key=lambda x: ordered_list.index(x))
+        result = sorted(unordered_result, key=ordered_list.index)

         return result
@@ -1025,7 +1030,7 @@ def _make_skewed_series(group):

             # add nulls based on the miss_probability of the skew column
             miss_pct = self.spec_dict["columns"][skew_col]["miss_probability"]
-            miss_val = pd.NA if group.dtype =='Int64' else np.nan
+            miss_val = pd.NA if group.dtype =="Int64" else np.nan
             skewed_result = np.where(
                 rng.random(size=nrows) < miss_pct, miss_val, result.values)
@@ -1037,7 +1042,8 @@
         target_cols = [x.strip() for x in target_str.split(",")]

         if len(target_cols) != 2: #pragma: no cover
-            raise Exception(f"{self.current_action} requires exactly 2 target columns.")
+            raise RuntimeError(
+                f"{self.current_action} requires exactly 2 target columns.")

         if partition_cols is not None:
             partition_cols = [x.strip() for x in partition_cols.split(",") if x]
@@ -1281,7 +1287,7 @@ def find_basic_constraint_columns(df):

     return output

-def clean_up_constraint_string(raw_string, type="cc_filter"):
+def clean_up_constraint_string(raw_string):
     '''
     The default way to handle column names with whitespace in eval strings
     is to enclose them in backticks. However, the default tokeniser will
@@ -1379,8 +1385,8 @@ def get_constraint_mask(df, clean_string):
             .rename(lambda x: x.replace(" ", "__"), axis="columns")
             .eval(clean_string, engine="python"))

-    except SyntaxError: #pragma: no cover
-        raise SyntaxError("Invalid filter expression supplied to custom action.")
+    except SyntaxError as e: #pragma: no cover
+        raise SyntaxError("Invalid filter expression supplied to custom action.") from e

     return mask
diff --git a/exhibit/core/exhibit.py b/exhibit/core/exhibit.py
index 635ff38..91f9a98 100644
--- a/exhibit/core/exhibit.py
+++ b/exhibit/core/exhibit.py
@@ -96,14 +96,14 @@ def __init__(
         skip_columns=None, linked_columns=None, uuid_columns=None,
         discrete_columns=None, save_probabilities=None, derived_columns_first=False,
-        verbose=False, **kwargs):
+        verbose=False):
         '''
         Initialise either from the CLI or by instantiating directly
         '''

         # Basic error checking on the arguments
         if linked_columns is not None and len(linked_columns) < 2:
-            raise Exception("Please provide at least two linked columns")
+            raise RuntimeError("Please provide at least two linked columns")

         self.command = command
         self.source = source
@@ -111,7 +111,7 @@ def __init__(
         self.inline_limit = inline_limit
         self.equal_weights = equal_weights
         self.skip_columns = skip_columns or set()
-        self.linked_columns= linked_columns or list()
+        self.linked_columns= linked_columns or []
         self.uuid_columns= uuid_columns or set()
         self.discrete_columns = discrete_columns or set()
         self.save_probabilities = save_probabilities or set()
@@ -179,7 +179,7 @@ def write_spec(self, spec_yaml=None):
         else:
             output_path = self.output

-        with open(output_path, "w") as f:
+        with open(output_path, "w", encoding="utf-8") as f:
             f.write(spec_yaml)

         print("Exhibit ready to view")
@@ -208,7 +208,7 @@ def read_spec(self):
         self.source = path_checker(self.source)

         if self.source.suffix == ".yml":
-            with open(self.source) as f:
+            with open(self.source, encoding="utf-8") as f:
                 self.spec_dict = yaml.safe_load(f)
         else: #pragma: no cover
             raise TypeError("Specification is not in .yml format")
@@ -219,13 +219,13 @@
         # these NONE values early and change them into empty sequences.
         for key, value in self.spec_dict["metadata"].items():
             if "columns" in key and value is None:
-                self.spec_dict["metadata"][key] = list()
+                self.spec_dict["metadata"][key] = []

         if self.spec_dict.get("linked_columns", None) is None:
-            self.spec_dict["linked_columns"] = list()
+            self.spec_dict["linked_columns"] = []

         if self.spec_dict.get("derived_columns", None) is None:
-            self.spec_dict["derived_columns"] = dict()
+            self.spec_dict["derived_columns"] = {}

         for col in self.spec_dict["metadata"]["categorical_columns"]:
@@ -253,6 +253,7 @@ def validate_spec(self):
             self.spec_dict = validated_spec
         return validated_spec is not None

+    #pylint: disable=R0912, R0915
     def execute_spec(self):
         '''
         Function only runs if validate_spec returned True
@@ -310,7 +311,7 @@
         for num_col in self.spec_dict["metadata"]["numerical_columns"]:

             # skip derived columns; they need main columns (inc. nulls) generated first
-            if num_col in (self.spec_dict.get("derived_columns", dict()) or dict()):
+            if num_col in (self.spec_dict.get("derived_columns", {}) or {}):
                 continue

             anon_df[num_col] = generate_continuous_column(
@@ -432,7 +433,8 @@
                 col_name=num_col
             )
         # see comments above as to why we're re-generating derived columns
-        for derived_col, derived_def in self.spec_dict["derived_columns"].items(): #pragma: no cover
+        derived = self.spec_dict["derived_columns"].items()
+        for derived_col, derived_def in derived: #pragma: no cover
             for num_col in num_cols:
                 if num_col in derived_def:
                     anon_df[derived_col] = generate_derived_column(anon_df, derived_def)
@@ -508,7 +510,7 @@ def generate(self):
             self.write_spec()
             return None

-        else:
+        if self.command == "fromspec":
             self.read_spec()
             if self.validate_spec():
                 self.execute_spec()
@@ -520,8 +522,9 @@
                 self.write_data()
             return None

-        # technically unreachable code because validation failures will raise
-        return None #pragma: no cover
+        raise RuntimeError( #pragma: no cover
+            "Generation command not recognized. Please initialise Exhibit "
+            "with a valid command.")

 class Specification(UserDict): #pragma: no cover
     '''
@@ -541,5 +544,5 @@ def write_spec(self, path):

         spec_yaml = generate_YAML_string(self.data)

-        with open(path, "w") as f:
+        with open(path, "w", encoding="utf-8") as f:
             f.write(spec_yaml)
diff --git a/exhibit/core/formatters.py b/exhibit/core/formatters.py
index 0b9525a..1d548f5 100644
--- a/exhibit/core/formatters.py
+++ b/exhibit/core/formatters.py
@@ -19,7 +19,7 @@ class FormattedList(list):
     separate processing for these formatted values from a basic list of values
     passed to original_values during manual column creation
     '''
-    pass
+

 def format_header(dataframe, series_name, prefix=None):
     '''
@@ -162,7 +162,7 @@ def build_list_of_probability_vectors(dataframe, original_series_name, ew=False)

     vectors = temp_vectors.values.tolist()

-    string_vectors = ["{0:.3f}".format(x).ljust(len(HEADER)) for x in vectors]
+    string_vectors = [f"{x:.3f}".ljust(len(HEADER)) for x in vectors]

     return string_vectors
@@ -187,7 +187,7 @@ def build_list_of_column_weights(weights):

     for key in sorted(weights):

-        padded_key = ["{0:.3f}".format(x).ljust(len(key)) for x in weights[key]]
+        padded_key = [f"{x:.3f}".ljust(len(key)) for x in weights[key]]
         sorted_temp.append(padded_key)

     sorted_final = [" | ".join(y for y in x).rstrip() for x in zip(*sorted_temp)]
@@ -333,15 +333,15 @@ def build_list_of_uuid_frequencies(df, target_col):
     counts = Counter(df[target_col].value_counts())

     freq_df = pd.DataFrame(
-        [(frequency, count) for frequency, count in counts.items()],
+        list(counts.items()),
         columns=["frequency", "count"]
     ).sort_values("frequency")

-    freq_df["pct"] = freq_df["count"] / freq_df["count"].sum()
+    freq_df.loc[:, "pct"] = freq_df["count"] / freq_df["count"].sum()

     freq_list = (
         freq_df["frequency"].astype(str).str.ljust(9)
-        .str.cat(freq_df["pct"].transform(lambda x: "{0:.3f}".format(x)), sep=' | ')
+        .str.cat(freq_df["pct"].transform(lambda x: f"{x:.3f}"), sep=" | ")
         .tolist()
     )
@@ -359,6 +359,6 @@ def format_df_for_export(df):

     for column in df.columns:
         if df[column].dtype == "timedelta64[ns]":
-            df[column] = df[column].astype(str).str.replace('0 days ', '')
+            df[column] = df[column].astype(str).str.replace("0 days ", "")

     return df
diff --git a/exhibit/core/generate/categorical.py b/exhibit/core/generate/categorical.py
index ae38dc8..79b6ec5 100644
--- a/exhibit/core/generate/categorical.py
+++ b/exhibit/core/generate/categorical.py
@@ -46,6 +46,7 @@ def __init__(self, spec_dict, core_rows, anon_df=None):
         self.fixed_anon_sets = ["random", "mountains", "patients", "birds", "dates"]
         # we need UUID dataset (if it exists) for possible conditional SQL that
         # references already-generated columns in the spec
+        self.generated_dfs = []
         self.anon_df = anon_df

         (self.all_cols,
@@ -62,10 +63,8 @@ def generate(self):
         A dataframe with all categorical columns
         '''

-        self.generated_dfs = []
-
         #1) GENERATE LINKED DFs FROM EACH LINKED COLUMNS GROUP
-        for linked_group in (self.spec_dict.get("linked_columns") or list()):
+        for linked_group in (self.spec_dict.get("linked_columns") or []):

             # zero-numbered linked group is reserved for user-defined groupings
             if linked_group[0] == 0:
@@ -298,7 +297,7 @@ def _generate_from_sql(self, col_name, col_attrs, complete=False, db_path=None):

         anon_set = col_attrs["anonymising_set"]
         uniques = col_attrs["uniques"]
-        paired_cols = col_attrs["paired_columns"] or list()
+        paired_cols = col_attrs["paired_columns"] or []

         #1) QUERY SQL TO GET VALUES USED TO BUILD THE DATAFRAME
         if anon_set == "random":
@@ -424,12 +423,12 @@ def _get_column_types(self):
         # there might be cases when you want to generate just the date columns or just
         # the categorical columns so they might be missing from the metadata section
         all_cols = (
-            (self.spec_dict["metadata"].get("categorical_columns", list())) +
-            (self.spec_dict["metadata"].get("date_columns", list()))
+            (self.spec_dict["metadata"].get("categorical_columns", [])) +
+            (self.spec_dict["metadata"].get("date_columns", []))
         )

         nested_linked_cols = [
-            sublist for n, sublist in (self.spec_dict.get("linked_columns") or list())
+            sublist for n, sublist in (self.spec_dict.get("linked_columns") or [])
         ]

         complete_cols = [c for c, v in get_attr_values(
@@ -520,7 +519,7 @@ def _generate_using_external_table(self, col_name, anon_set):
         # unless we make an explicit copy of the de-duplicated dataframe, Pandas will
         # trigger SettingWithCopy warning when trying to change any values.
         existing_data_distinct = existing_data.drop_duplicates(subset=join_columns).copy()
-        existing_data_cols = [c for c in existing_data.columns]
+        existing_data_cols = list(existing_data.columns)

         # this function converts list of tuples into a dataframe anyway
         create_temp_table(
@@ -572,7 +571,8 @@
             # having a COALESCE in SQL would fix it, but in case it's also missing,
             # we try to catch this edge case in code as well.
            try:
-                new_data = self.rng.choice(a=probas[group_key][0], p=probas[group_key][1], size=len(group_index))
+                new_data = self.rng.choice(
+                    a=probas[group_key][0], p=probas[group_key][1], size=len(group_index))
             except KeyError: #pragma: no cover
                 new_data = [np.nan] * len(group_index)

@@ -582,7 +582,7 @@

         # ensure we return the correct type for date columns
         col_type = self.spec_dict["columns"][col_name]["type"]
-        if col_type == 'date':
+        if col_type == "date":
             final_result = final_result.astype("datetime64[ns]")

         return final_result
diff --git a/exhibit/core/generate/geo.py b/exhibit/core/generate/geo.py
index 967a98c..02d0e9a 100644
--- a/exhibit/core/generate/geo.py
+++ b/exhibit/core/generate/geo.py
@@ -97,7 +97,7 @@ def geo_make_regions(
     '''

     if not partition_cols: #pragma: no cover
-        raise Exception("make_geo_regions action requires at least one partition")
+        raise RuntimeError("make_geo_regions action requires at least one partition")

     geo_target_cols = [x.strip() for x in target_str.split(",")]
     partition_cols = [x.strip() for x in partition_cols.split(",") if x]
@@ -110,7 +110,8 @@
         geo_target_cols_table_names.append(h3_table_name)

     if len(set(geo_target_cols_table_names)) != 1: #pragma: no cover
-        raise Exception("columns used for make_geo_regions action rely on different h3 tables")
+        raise RuntimeError(
+            "Columns used for make_geo_regions action rely on different h3 tables")

     # add placeholders for output columns
     target_cols = []
@@ -138,7 +139,7 @@

     # add H3 centroid coordinates
     geo_df["lat"], geo_df["long"] = zip(
-        *geo_df["h3"].transform(lambda x: h3.h3_to_geo(x)))
+        *geo_df["h3"].transform(h3.h3_to_geo))

     # create initial region indices based on the N of values in level=0
     n_regions = grouped_idx.get_level_values(level=0).nunique()
@@ -267,7 +268,7 @@ def _create_contiguous_regions(
     aspect_ratio = height/width

     line = LineString([p1, p2])
-    scaled_line = scale(line, xfact=15.0, yfact=15.0, zfact=1.0, origin='center')
+    scaled_line = scale(line, xfact=15.0, yfact=15.0, zfact=1.0, origin="center")

     # to avoid very thin regions, change the rotation angle of the cutting line based
     # on the aspect rato; 2 is a magic number; another option is to use a tighter "crop"
@@ -306,7 +307,7 @@

         if retries == 5:
             print("Regions created: ", len(final_regions_idx))
-            raise Exception("Can't create a subregion.")
+            raise RuntimeError("Can't create a subregion.")

         rotated_line = rotate(scaled_line, angle=int(rng.uniform(low=0, high=180)))
         result = _cut_polygon_by_line(polygon, rotated_line)
@@ -323,7 +324,7 @@
         final_regions_idx.extend([idx_child_1, idx_child_2])

         parent_idx = None
-        for i in range(len(final_regions_idx)):
+        for i, _ in enumerate(final_regions_idx):
             if final_regions_idx[i].equals(idx_child_1.union(idx_child_2)):
                 parent_idx = i
diff --git a/exhibit/core/generate/missing.py b/exhibit/core/generate/missing.py
index 79a7c99..4cd048f 100644
--- a/exhibit/core/generate/missing.py
+++ b/exhibit/core/generate/missing.py
@@ -184,7 +184,7 @@ def add_missing_data(self):

             # if it's already NA, don't re-generate; it's NA for a reason!
            num_mask = self.nan_data[num_col].isna()
-            mask = (cat_mask & ~num_mask)
+            mask = cat_mask & ~num_mask

             # it's possible to have the left side be Int64 type and the right side
             # to be float64 (newly generated, unscaled); assigning different types
@@ -256,7 +256,7 @@ def _find_columns_with_linked_missing_data(self):
                 pairs.update([col] + attrs["paired_columns"])

         # linked groups
-        for i, linked_group in (self.spec_dict["linked_columns"] or list()):
+        for i, linked_group in (self.spec_dict["linked_columns"] or []):
             # zero numbered linked group is reserved for user defined linkage
             if i == 0:
                 continue
@@ -289,14 +289,14 @@ def _find_make_null_idx(self):
         original data passed in to the generator.
         '''

-        cc = self.spec_dict["constraints"]["custom_constraints"] or dict()
+        cc = self.spec_dict["constraints"]["custom_constraints"] or {}

         make_null_idx = []

         for _, constraint in cc.items():

             cc_filter = constraint.get("filter", None)
-            cc_targets = constraint.get("targets", dict())
+            cc_targets = constraint.get("targets", {})

             clean_cc_filter = clean_up_constraint_string(cc_filter)
             cc_mask = get_constraint_mask(self.nan_data, clean_cc_filter)
@@ -319,14 +319,14 @@ def _find_not_null_idx(self):
         Doc string
         '''

-        cc = self.spec_dict["constraints"]["custom_constraints"] or dict()
+        cc = self.spec_dict["constraints"]["custom_constraints"] or {}

         not_null_idx = []

         for _, constraint in cc.items():

             cc_filter = constraint.get("filter", None)
-            cc_targets = constraint.get("targets", dict())
+            cc_targets = constraint.get("targets", {})

             clean_cc_filter = clean_up_constraint_string(cc_filter)
             cc_mask = get_constraint_mask(self.nan_data, clean_cc_filter)
diff --git a/exhibit/core/generate/regex.py b/exhibit/core/generate/regex.py
index bfdce70..4fbf9f8 100644
--- a/exhibit/core/generate/regex.py
+++ b/exhibit/core/generate/regex.py
@@ -24,7 +24,7 @@ def generate_regex_column(anon_pattern, name, size, target_uniques=None):
     Returns pd.Series
     '''
     # ensure that each column gets a unique (except for anagrams) seed
-    column_seed = sum([ord(x) for x in name])
+    column_seed = sum(ord(x) for x in name)

     static_quant_pattern = r"[^\]]\{\d+\}"
     static_string = anon_pattern
diff --git a/exhibit/core/generate/tests/test_categorical.py b/exhibit/core/generate/tests/test_categorical.py
index 5b5e883..e9a0439 100644
--- a/exhibit/core/generate/tests/test_categorical.py
+++ b/exhibit/core/generate/tests/test_categorical.py
@@ -267,9 +267,9 @@ def test_column_with_categorical_values_based_on_conditonal_sql(self):
         result = gen.generate()

         self.assertTrue(
-            (result.query("gender == 'F'")["linked_condition"] == 'C').all())
+            (result.query("gender == 'F'")["linked_condition"] == "C").all())
         self.assertFalse(
-            (result.query("gender == 'M'")["linked_condition"] == 'C').any())
+            (result.query("gender == 'M'")["linked_condition"] == "C").any())

     def test_column_with_external_date_values_in_conditonal_sql(self):
         '''
@@ -282,8 +282,8 @@
         FROM temp_main
         JOIN temp_linked ON temp_main.gender = temp_linked.gender
         '''

-        m_dates = pd.date_range(start='2022-01-01', periods=3, freq='D')
-        f_dates = pd.date_range(start='2023-01-01', periods=3, freq='D')
+        m_dates = pd.date_range(start="2022-01-01", periods=3, freq="D")
+        f_dates = pd.date_range(start="2023-01-01", periods=3, freq="D")
         dates = m_dates.union(f_dates)

         linked_data = pd.DataFrame(data={
@@ -355,10 +355,10 @@ def test_column_with_source_date_values_in_conditonal_sql(self):
             "columns": {
                 "source_date": {
                     "type": "date",
-                    "from": '2023-01-01',
-                    "to" : '2023-02-01',
+                    "from": "2023-01-01",
+                    "to" : "2023-02-01",
                     "uniques" : 5,
-                    "frequency" : 'D',
+                    "frequency" : "D",
                     "cross_join_all_unique_values" : False,
                 },
                 "conditional_date": {
@@ -373,7 +373,7 @@

         result = gen.generate()

         self.assertTrue((result["conditional_date"] > result["source_date"]).all())
-        self.assertTrue((result["conditional_date"] < '2023-03-01').all())
+        self.assertTrue((result["conditional_date"] < "2023-03-01").all())

     def test_column_with_using_case_statement_in_conditonal_sql(self):
         '''
@@ -445,10 +445,10 @@ def test_date_column_with_impossible_combination_of_from_to_and_period(self):
             "columns": {
                 "source_date": {
                     "type": "date",
-                    "from": '2023-01-01',
-                    "to" : '2023-02-01',
+                    "from": "2023-01-01",
+                    "to" : "2023-02-01",
                     "uniques" : 60,
-                    "frequency" : 'D',
+                    "frequency" : "D",
                     "cross_join_all_unique_values" : False,
                 },
             }
diff --git a/exhibit/core/generate/tests/test_continuous.py b/exhibit/core/generate/tests/test_continuous.py
index 9988597..1a822bb 100644
--- a/exhibit/core/generate/tests/test_continuous.py
+++ b/exhibit/core/generate/tests/test_continuous.py
@@ -28,16 +28,16 @@ def test_apply_dispersion(self):

         #zero dispersion returns original value
         test_case_1 = tm._apply_dispersion(5, 0, rng)
-        expected_1 = (test_case_1 == 5)
+        expected_1 = test_case_1 == 5

         #basic interval picking
         test_case_2 = tm._apply_dispersion(10, 0.5, rng)
-        expected_2 = (5 <= test_case_2 <= 15)
+        expected_2 = 5 <= test_case_2 <= 15

         #avoid negative interval for values of zero where all
         #values are expected to be greater or equal to zero
         test_case_3 = tm._apply_dispersion(0, 0.2, rng)
-        expected_3 = (0 <= test_case_3 <= 2)
+        expected_3 = 0 <= test_case_3 <= 2

         #na returns na
         test_case_4 = tm._apply_dispersion(pd.NA, 0.2, rng)
diff --git a/exhibit/core/generate/tests/test_derived.py b/exhibit/core/generate/tests/test_derived.py
index 7b4080b..a3be948 100644
--- a/exhibit/core/generate/tests/test_derived.py
+++ b/exhibit/core/generate/tests/test_derived.py
@@ -81,9 +81,9 @@ def test_generate_derived_column_with_timestamp(self):
         calc = "@create_timestamp(hours, minutes, seconds)"

         expected = pd.Series([
-            '2022-01-31 01:00:00',
-            '2022-02-28 02:00:01',
-            '2022-03-31 00:59:10'
+            "2022-01-31 01:00:00",
+            "2022-02-28 02:00:01",
+            "2022-03-31 00:59:10"
         ])

         # can add dates and timedelta timestamps easily
diff --git a/exhibit/core/generate/uuids.py b/exhibit/core/generate/uuids.py
index 47779c4..43e5d2f 100644
--- a/exhibit/core/generate/uuids.py
+++ b/exhibit/core/generate/uuids.py
@@ -131,11 +131,11 @@ def _generate_pseudo_chis(n, seed=0):
     while len(result) < n:
         pseudo_chi = (
             str(random.randint(0,31)) + # day will be zero padded if total length < 10
-            '13' + # ensure no accidental collissions
+            "13" + # ensure no accidental collisions
             str(random.randint(20, 99)) +
             str(random.randint(0,9999)).zfill(4) # no specific logic for 9th digit
         ).zfill(10)
         result.add(pseudo_chi)

-    return sorted(list(result))
\ No newline at end of file
+    return sorted(list(result))
diff --git a/exhibit/core/generate/yaml.py b/exhibit/core/generate/yaml.py
index 3c9971c..a20a9f0 100644
--- a/exhibit/core/generate/yaml.py
+++ b/exhibit/core/generate/yaml.py
@@ -21,7 +21,6 @@ class ExhibitDumper(yaml.SafeDumper):
     Columns are subclassed dictionaries, but YAML's safe_dump will not recognize
     them as such unless you add a specific representer for each "special" class.
    '''
-    pass

 def generate_YAML_string(spec_dict):
diff --git a/exhibit/core/linkage/hierarchical.py b/exhibit/core/linkage/hierarchical.py
index 163fcc5..0e66171 100644
--- a/exhibit/core/linkage/hierarchical.py
+++ b/exhibit/core/linkage/hierarchical.py
@@ -742,7 +742,7 @@ def add_paired_columns(self, linked_df):
         for c in self.linked_cols:

             #just generate a DF with duplicate paired columns
-            for pair in self.spec_dict["columns"][c]["paired_columns"] or list():
+            for pair in self.spec_dict["columns"][c]["paired_columns"] or []:

                 #overwrite linked_df
                 linked_df = pd.concat(
diff --git a/exhibit/core/linkage/matrix.py b/exhibit/core/linkage/matrix.py
index b1dd5f9..379cf19 100644
--- a/exhibit/core/linkage/matrix.py
+++ b/exhibit/core/linkage/matrix.py
@@ -160,10 +160,11 @@ def generate_user_linked_anon_df(
     # get the numerical mapping as expected, also changing the dtype for performance.
     if starting_col_matrix is not None:
-        starting_col_matrix = (
-            pd.DataFrame(starting_col_matrix)
-            .fillna(MISSING_DATA_STR)
-            .applymap(lambda x: rev_label_lookup.get(x, x)).values.astype(np.int16))
+        starting_col_matrix = (
+            pd.DataFrame(starting_col_matrix)
+            .fillna(MISSING_DATA_STR)
+            .applymap(lambda x: rev_label_lookup.get(x, x)).values.astype(np.int16)
+        )

     else:
         starting_col_matrix = np.full(
diff --git a/exhibit/core/linkage/tests/test_linkage_hierarchical.py b/exhibit/core/linkage/tests/test_linkage_hierarchical.py
index 6b4c3ba..f908302 100644
--- a/exhibit/core/linkage/tests/test_linkage_hierarchical.py
+++ b/exhibit/core/linkage/tests/test_linkage_hierarchical.py
@@ -214,7 +214,7 @@ def test_hierarchically_linked_columns_with_missing_data(self):
         # lists with equal elements, ignoring order
         self.assertCountEqual(
             tm.find_hierarchically_linked_columns(test_df, test_spec),
-            [('C1', 'C4'), ("C3", "C4")]
+            [("C1", "C4"), ("C3", "C4")]
         )

     def test_1_to_1_linked_columns(self):
diff --git a/exhibit/core/spec.py b/exhibit/core/spec.py
index 9419ff5..206f1e6 100644
--- a/exhibit/core/spec.py
+++ b/exhibit/core/spec.py
@@ -80,11 +80,11 @@ def __init__(self, data=None, inline_limit=30, ew=False, random_seed=0, **kwargs
         self.output = {
             "metadata": {
                 "number_of_rows" : 0,
-                "uuid_columns" : list(),
-                "categorical_columns" : list(),
-                "numerical_columns" : list(),
-                "date_columns" : list(),
-                "geospatial_columns" : list(),
+                "uuid_columns" : [],
+                "categorical_columns" : [],
+                "numerical_columns" : [],
+                "date_columns" : [],
+                "geospatial_columns" : [],
                 "inline_limit" : self.inline_limit,
                 "random_seed" : self.random_seed,
                 "id" : "",
@@ -92,8 +92,8 @@
             "columns": {},
             "constraints": {
                 "allow_duplicates" : True,
-                "basic_constraints" : list(),
-                "custom_constraints" : list()
+                "basic_constraints" : [],
+                "custom_constraints" : []
             },
             "linked_columns" : [],
             "derived_columns": {},
@@ -104,7 +104,7 @@

         self.df = data.copy()
         self.ew = ew
-        self.user_linked_cols = kwargs.get("user_linked_cols", list())
+        self.user_linked_cols = kwargs.get("user_linked_cols", [])
         self.uuid_cols = kwargs.get("uuid_cols", set())
         self.db_prob_cols = kwargs.get("save_probabilities", set())
         self.id = generate_table_id()
@@ -135,7 +135,7 @@
         meta["categorical_columns"] = sorted(list(self.cat_cols))
         meta["numerical_columns"] = sorted(list(self.numerical_cols))
         meta["date_columns"] = sorted(list(self.date_cols))
-        meta["geospatial_columns"] = list()
+        meta["geospatial_columns"] = []
         meta["inline_limit"] = self.inline_limit
         meta["random_seed"] = self.random_seed
         meta["id"] = self.id
@@ -431,7 +431,7 @@ def _original_values_path_resolver(self, path, wt, col):
             return output

         #if path is something else, raise exception
-        raise ValueError("Incorrect %s" % path) # pragma: no cover
+        raise ValueError(f"Incorrect {path}") # pragma: no cover

 class UUIDColumn(dict):
     '''
@@ -522,7 +522,7 @@ def __init__(self,
         self["type"] = "categorical"
         self["name"] = name
         self["original_values"] = original_values
-        self["paired_columns"] = list() if paired_columns is None else paired_columns
+        self["paired_columns"] = [] if paired_columns is None else paired_columns
         self["uniques"] = 0 if uniques is None else uniques
         self["cross_join_all_unique_values"] = cross_join
         self["miss_probability"] = miss_proba
diff --git a/exhibit/core/sql.py b/exhibit/core/sql.py
index 5e38018..e4b131b 100644
--- a/exhibit/core/sql.py
+++ b/exhibit/core/sql.py
@@ -1,6 +1,8 @@
 '''
 Module with functions to provide an interface with the exhibit database
 '''
+#false positive on engine.dispose()
+#pylint: disable=E1101

 # Standard library imports
 import os
diff --git a/exhibit/core/tests/test_constraints.py b/exhibit/core/tests/test_constraints.py
index d22a445..3a39885 100644
--- a/exhibit/core/tests/test_constraints.py
+++ b/exhibit/core/tests/test_constraints.py
@@ -2,6 +2,8 @@
 Test the code for parsing and enforcing constraints
 '''

+# pylint: disable=C0302
+
 # Standard library imports
 import unittest
 from datetime import datetime
@@ -809,7 +811,7 @@ def test_custom_constraints_make_distinct_with_date_columns(self):
         }

         test_data = pd.DataFrame(data={
-            "A" : list(pd.date_range(start='2020-01-01', periods=5)) * 4,
+            "A" : list(pd.date_range(start="2020-01-01", periods=5)) * 4,
             "B" : [True, False] * 10,
         })
@@ -1423,7 +1425,7 @@ def test_custom_constraints_make_almost_same(self):
         result = test_gen.process_constraints().query("B=='spam'")
         pct_B = result.value_counts().agg(lambda x: x / sum(x)).iloc[1]

-        self.assertTrue(pct_B > 0 and pct_B < 0.1)
+        self.assertTrue(0.1 > pct_B > 0)

     def test_custom_constraints_targeting_high_frequency_rows(self):
         '''
diff --git a/exhibit/core/tests/test_exhibit.py b/exhibit/core/tests/test_exhibit.py
index 5aac9fe..8627c61 100644
--- a/exhibit/core/tests/test_exhibit.py
+++ b/exhibit/core/tests/test_exhibit.py
@@ -47,12 +47,12 @@ def test_read_data_func_reads_csv_from_source_path(self):
         Send "mock" command line arguments to parse_args function
         and assert that the program reads the same data as ref_df.
        '''
-        args = dict(
-            command="fromdata",
-            source=Path(package_dir("sample", "_data", "inpatients.csv")),
-            verbose=True,
-            skip_columns=[]
-        )
+        args = {
+            "command" : "fromdata",
+            "source" : Path(package_dir("sample", "_data", "inpatients.csv")),
+            "skip_columns" : [],
+            "verbose" : True,
+        }

         xA = tm.Exhibit(**args)
         xA.read_data()
@@ -80,19 +80,19 @@ def test_output_spec_creates_file_with_o_argument(self):
         advantage of its methods that mimick the open() builtin
         '''

-        args = dict(
-            command="fromdata",
-            source="dummy.csv",
-            output="test.yml",
-            verbose=True,
-        )
+        args = {
+            "command" : "fromdata",
+            "source" : "dummy.csv",
+            "output" : "test.yml",
+            "verbose" : True,
+        }

         with patch("exhibit.core.exhibit.open", new=mock_open()) as mo:

             xA = tm.Exhibit(**args)
             xA.write_spec("hello")

-        mo.assert_called_with("test.yml", "w")
+        mo.assert_called_with("test.yml", "w", encoding="utf-8")
         mo.return_value.__enter__.return_value.write.assert_called_with("hello")

     def test_output_spec_creates_file_without_o_argument(self):
@@ -102,19 +102,19 @@
         the command: fromdata or fromspec.
         '''

-        args = dict(
-            command="fromdata",
-            source=Path("source_dataset.csv"),
-            output=None,
-            verbose=True,
-        )
+        args = {
+            "command" : "fromdata",
+            "source" : Path("source_dataset.csv"),
+            "output" : None,
+            "verbose" : True,
+        }

         with patch("exhibit.core.exhibit.open", new=mock_open()) as mo:

             xA = tm.Exhibit(**args)
             xA.write_spec("hello")

-        mo.assert_called_with("source_dataset_SPEC.yml", "w")
+        mo.assert_called_with("source_dataset_SPEC.yml", "w", encoding="utf-8")
         mo.return_value.__enter__.return_value.write.assert_called_with("hello")

     def test_output_spec_respectes_equal_weights_argument(self):
@@ -122,14 +122,14 @@
         Doc string
         '''

-        args = dict(
-            command="fromdata",
-            source=Path(package_dir("sample", "_data", "inpatients.csv")),
-            verbose=True,
-            inline_limit=30,
-            equal_weights=True,
-            skip_columns=[]
-        )
+        args = {
+            "command" : "fromdata",
+            "source" : Path(package_dir("sample", "_data", "inpatients.csv")),
+            "inline_limit" : 30,
+            "equal_weights" : True,
+            "skip_columns" : [],
+            "verbose" : True,
+        }

         xA = tm.Exhibit(**args)
         xA.read_data()
@@ -151,15 +151,15 @@ def test_spec_generation_with_predefined_linked_columns(self):

         user_linked_cols = ["sex", "age"]

-        args = dict(
-            command="fromdata",
-            source=Path(package_dir("sample", "_data", "inpatients.csv")),
-            verbose=True,
-            inline_limit=30,
-            equal_weights=True,
-            skip_columns=[],
-            linked_columns=user_linked_cols
-        )
+        args = {
+            "command" : "fromdata",
+            "source" : Path(package_dir("sample", "_data", "inpatients.csv")),
+            "inline_limit" : 30,
+            "equal_weights" : True,
+            "skip_columns" : [],
+            "linked_columns" : user_linked_cols,
+            "verbose" : True,
+        }

         xA = tm.Exhibit(**args)
         xA.read_data()
@@ -181,15 +181,15 @@ def test_overlapping_hierarchical_and_predefined_linked_columns(self):

         user_linked_cols = ["hb_name", "hb_code", "age"]

-        args = dict(
-            command="fromdata",
-            source=Path(package_dir("sample", "_data", "inpatients.csv")),
-            verbose=True,
-            inline_limit=30,
-            equal_weights=True,
-            skip_columns=[],
-            linked_columns=user_linked_cols
-        )
+        args = {
+            "command" : "fromdata",
+            "source" : Path(package_dir("sample", "_data", "inpatients.csv")),
+            "inline_limit" : 30,
+            "equal_weights" : True,
+            "skip_columns" : [],
+            "linked_columns" : user_linked_cols,
+            "verbose" : True,
+        }

         xA = tm.Exhibit(**args)
         xA.read_data()
@@ -208,15 +208,15 @@ def test_less_than_two_predefined_linked_columns_raiser_error(self):

         user_linked_cols = ["hb_name"]

-        args = dict(
-            command="fromdata",
-            source=Path(package_dir("sample", "_data", "inpatients.csv")),
-            verbose=True,
-            inline_limit=30,
-            equal_weights=True,
-            skip_columns=[],
-            linked_columns=user_linked_cols
-        )
+        args = {
+            "command" : "fromdata",
+            "source" : Path(package_dir("sample", "_data", "inpatients.csv")),
+            "inline_limit" : 30,
+            "equal_weights" : True,
+            "skip_columns" : [],
+            "linked_columns" : user_linked_cols,
+            "verbose" : True,
+        }

         self.assertRaises(Exception, tm.Exhibit, **args)

@@ -229,15 +229,15 @@ def test_uuid_columns_are_never_duplicated_in_other_column_types(self):

         uuid_columns = {"hb_name", "quarter_date", "stays"}

-        args = dict(
-            command="fromdata",
-            source=Path(package_dir("sample", "_data", "inpatients.csv")),
-            verbose=True,
-            inline_limit=30,
-            equal_weights=True,
-            skip_columns={},
-            uuid_columns=uuid_columns
-        )
+        args = {
+            "command" : "fromdata",
+            "source" : Path(package_dir("sample", "_data", "inpatients.csv")),
+            "inline_limit" : 30,
+            "equal_weights" : True,
+            "skip_columns" : [],
+            "uuid_columns" : uuid_columns,
+            "verbose" : True,
+        }

         xA = tm.Exhibit(**args)
         xA.read_data()
@@ -248,8 +248,7 @@

         metadata_uuid_columns = xA.spec_dict["metadata"]["uuid_columns"]

-        col_names_and_types = [
-            x for x in get_attr_values(xA.spec_dict, "type", col_names=True)]
+        col_names_and_types = list(get_attr_values(xA.spec_dict, "type", col_names=True))

         not_expected = []
         for col_name, col_type in col_names_and_types:
diff --git a/exhibit/core/tests/test_formatters.py b/exhibit/core/tests/test_formatters.py
index 8a894f1..2165cb7 100644
--- a/exhibit/core/tests/test_formatters.py
+++ b/exhibit/core/tests/test_formatters.py
@@ -42,7 +42,7 @@ def test_uuid_frequency_list_generation(self):
             "value" : range(7)
         })

-        test_col = 'id'
+        test_col = "id"

         expected_list = [
             "frequency | probability_vector",
diff --git a/exhibit/core/tests/test_reference.py b/exhibit/core/tests/test_reference.py
index 8fc6476..8071d0b 100644
--- a/exhibit/core/tests/test_reference.py
+++ b/exhibit/core/tests/test_reference.py
@@ -45,8 +45,6 @@ def temp_exhibit(
     test_spec_dict=None,
     return_spec=True,
     return_df=True,
-    *args,
-    **kwargs,
     ):
     '''
     A helper method to generate and read custom specifications
@@ -324,14 +322,13 @@ def test_reference_inpatient_anon_data(self):
         be different from those set in the spec as target sum is enforced
         BEFORE basic constraints are adjusted.
         '''
-
-
-        args = dict(
-            command="fromspec",
-            source=Path(package_dir("sample", "_spec", "inpatients_demo.yml")),
-            verbose=True,
-            skip_columns=[]
-        )
+
+        args = {
+            "command" : "fromspec",
+            "source" : Path(package_dir("sample", "_spec", "inpatients_demo.yml")),
+            "skip_columns" : [],
+            "verbose" : True,
+        }

         xA = tm.Exhibit(**args)
         xA.read_spec()
@@ -766,12 +763,12 @@ def test_reference_uuid_data(self):
         - no db
         '''

-        args = dict(
-            command="fromspec",
-            source=Path(package_dir("sample", "_spec", "uuid_demo.yml")),
-            verbose=True,
-            skip_columns=[]
-        )
+        args = {
+            "command" : "fromspec",
+            "source" : Path(package_dir("sample", "_spec", "uuid_demo.yml")),
+            "skip_columns" : [],
+            "verbose" : True,
+        }

         xA = tm.Exhibit(**args)
         xA.read_spec()
diff --git a/exhibit/core/utils.py b/exhibit/core/utils.py
index 362b1bb..95a7dae 100644
--- a/exhibit/core/utils.py
+++ b/exhibit/core/utils.py
@@ -418,7 +418,7 @@ def natural_key(string_):
     '''
     Thanks SO!
    '''
-    return [int(s) if s.isdigit() else s for s in re.split(r'(\d+)', string_)]
+    return [int(s) if s.isdigit() else s for s in re.split(r"(\d+)", string_)]

 def shuffle_data(data):
     '''
@@ -430,4 +430,4 @@
         .reset_index(drop=True)
     )

-    return shuffled_series
\ No newline at end of file
+    return shuffled_series
diff --git a/exhibit/core/validator.py b/exhibit/core/validator.py
index 149c8a1..9cbf36b 100644
--- a/exhibit/core/validator.py
+++ b/exhibit/core/validator.py
@@ -126,7 +126,7 @@ def validate_linked_cols(self, spec_dict=None):
        VALIDATION FAIL: linked columns must have matching attributes (%(err_attr)s)
        """)

-        for linked_col_group in spec_dict["linked_columns"] or list():
+        for linked_col_group in spec_dict["linked_columns"] or []:

             #linked_columns[0] is the index of linked group; actual columns are [1]
             linked_cols = linked_col_group[1]
@@ -341,7 +341,7 @@ def validate_no_repeating_columns_in_linked_groups(self, spec_dict=None):
        VALIDATION FAIL: Duplicate column(s) in linked groups
        """)

-        nested_list = spec_dict["linked_columns"] or list()
+        nested_list = spec_dict["linked_columns"] or []
         flat_list = list(chain(*[sublist for _, sublist in nested_list]))
         flat_set = set(flat_list)
@@ -369,11 +369,11 @@

         warn = False

-        m_uuid = spec_dict["metadata"].get("uuid_columns", list())
-        m_cat = spec_dict["metadata"].get("categorical_columns", list())
-        m_num = spec_dict["metadata"].get("numerical_columns", list())
-        m_date = spec_dict["metadata"].get("date_columns", list())
-        m_geo = spec_dict["metadata"].get("geospatial_columns", list())
+        m_uuid = spec_dict["metadata"].get("uuid_columns", [])
+        m_cat = spec_dict["metadata"].get("categorical_columns", [])
+        m_num = spec_dict["metadata"].get("numerical_columns", [])
+        m_date = spec_dict["metadata"].get("date_columns", [])
+        m_geo = spec_dict["metadata"].get("geospatial_columns", [])

         col_types = {
             "uuid" : m_uuid,
@@ -392,4 +392,4 @@
         if warn:
             print(warn_msg)

-        return True
\ No newline at end of file
+        return True
diff --git a/exhibit/sample/sample.py b/exhibit/sample/sample.py
index 455cca6..0643535 100644
--- a/exhibit/sample/sample.py
+++ b/exhibit/sample/sample.py
@@ -29,5 +29,5 @@
     package_dir("sample", "_data", "prescribing.csv"),
     parse_dates=["PaidDateMonth"])

 #Load specs
-with open(package_dir("sample", "_spec", "inpatients_demo.yml")) as f:
+with open(package_dir("sample", "_spec", "inpatients_demo.yml"), encoding="utf-8") as f:
     inpatients_spec = yaml.safe_load(f)