From e6372523641ab79d1361db2ef0cbb8c5b27a48e7 Mon Sep 17 00:00:00 2001
From: gherka
Date: Fri, 24 Nov 2023 17:08:32 +0000
Subject: [PATCH] Pylint cleanup

---
 .pylintrc                                    |  15 +-
 exhibit/command/tests/test_performance.py    |   4 +-
 exhibit/core/constants.py                    |   2 +-
 exhibit/core/constraints.py                  |  24 ++--
 exhibit/core/exhibit.py                      |  31 +++--
 exhibit/core/formatters.py                   |  14 +-
 exhibit/core/generate/categorical.py         |  20 +--
 exhibit/core/generate/geo.py                 |  13 +-
 exhibit/core/generate/missing.py             |  12 +-
 exhibit/core/generate/regex.py               |   2 +-
 .../core/generate/tests/test_categorical.py  |  22 +--
 .../core/generate/tests/test_continuous.py   |   6 +-
 exhibit/core/generate/tests/test_derived.py  |   6 +-
 exhibit/core/generate/uuids.py               |   4 +-
 exhibit/core/generate/yaml.py                |   1 -
 exhibit/core/linkage/hierarchical.py         |   2 +-
 exhibit/core/linkage/matrix.py               |   9 +-
 .../tests/test_linkage_hierarchical.py       |   2 +-
 exhibit/core/spec.py                         |  22 +--
 exhibit/core/sql.py                          |   2 +
 exhibit/core/tests/test_constraints.py       |   6 +-
 exhibit/core/tests/test_exhibit.py           | 131 +++++++++---------
 exhibit/core/tests/test_formatters.py        |   2 +-
 exhibit/core/tests/test_reference.py         |  29 ++--
 exhibit/core/utils.py                        |   4 +-
 exhibit/core/validator.py                    |  16 +--
 exhibit/sample/sample.py                     |   2 +-
 27 files changed, 208 insertions(+), 195 deletions(-)

diff --git a/.pylintrc b/.pylintrc
index c941f28..eafaa17 100644
--- a/.pylintrc
+++ b/.pylintrc
@@ -23,13 +23,14 @@
 # no Warning level messages displayed, use "--disable=all --enable=classes
 # --disable=W".
 disable=C0303, # trailing whitespace
-        C0330, # hanging indent,
         C0103, # invalid name - too many false positives
+        C3001, # Lambda expression assigned to a variable
         E1136, # value is unsubscriptable
+        E1101, # no-member; too many false positives
         W0212, # access to protected member
         E1130, # invalid unary operand (numpy)
-        R0201, # no self use - false positives in test modules
         W0622, # redefine builtins - __package__
+        W0640, # variable defined in loop

 [BASIC]

@@ -104,7 +105,7 @@ indent-after-paren=4
 indent-string='    '

 # Maximum number of characters on a single line.
-max-line-length=88
+max-line-length=100

 # Maximum number of lines in a module.
 max-module-lines=1000
@@ -236,7 +237,7 @@ valid-metaclass-classmethod-first-arg=cls
 [DESIGN]

 # Maximum number of arguments for function / method.
-max-args=6
+max-args=8

 # Maximum number of attributes for a class (see R0902).
 max-attributes=15
@@ -248,7 +249,7 @@ max-bool-expr=5
 max-branches=12

 # Maximum number of locals for function / method body.
-max-locals=20
+max-locals=25

 # Maximum number of parents for a class (see R0901).
 max-parents=7
@@ -279,5 +280,5 @@ deprecated-modules=

 # Exceptions that will emit a warning when being caught. Defaults to
 # "BaseException, Exception".
-overgeneral-exceptions=BaseException,
-                       Exception
+overgeneral-exceptions=builtins.BaseException,
+                       builtins.Exception
diff --git a/exhibit/command/tests/test_performance.py b/exhibit/command/tests/test_performance.py
index ccc088f..94f65a0 100644
--- a/exhibit/command/tests/test_performance.py
+++ b/exhibit/command/tests/test_performance.py
@@ -3,6 +3,8 @@
 regular unit / reference testing
 '''

+#pylint: disable=W0201
+
 # Standard library imports
 import string
 import unittest
@@ -25,7 +27,7 @@
     from memory_profiler import memory_usage
 except ImportError:
     memory_usage = None
-    print(f"memory_profiler not found. Make sure exhibit is installed in [dev] mode")
+    print("memory_profiler not found. Make sure exhibit is installed in [dev] mode")

 class performanceTests(unittest.TestCase):
     '''
diff --git a/exhibit/core/constants.py b/exhibit/core/constants.py
index 3eef5ee..a96afca 100644
--- a/exhibit/core/constants.py
+++ b/exhibit/core/constants.py
@@ -8,4 +8,4 @@
 ORIGINAL_VALUES_PAIRED = "See paired column"
 ORIGINAL_VALUES_REGEX = "regex"
 MISSING_DATA_STR = "Missing data"
-EXHIBIT_DB_LOCAL = join("db", "exhibit.db")
\ No newline at end of file
+EXHIBIT_DB_LOCAL = join("db", "exhibit.db")
diff --git a/exhibit/core/constraints.py b/exhibit/core/constraints.py
index 6380fc7..3012a67 100644
--- a/exhibit/core/constraints.py
+++ b/exhibit/core/constraints.py
@@ -1,6 +1,9 @@
 '''
 Module for various derived and user-set constraints
 '''
+
+# pylint: disable=C0302
+
 # Standard library imports
 from collections import namedtuple
 from datetime import datetime
@@ -166,7 +169,7 @@ def process_custom_constraints(self, custom_constraints):

             cc_filter = constraint.get("filter", None)
             cc_partitions = constraint.get("partition", None)
-            cc_targets = constraint.get("targets", dict())
+            cc_targets = constraint.get("targets", {})

             clean_cc_filter = clean_up_constraint_string(cc_filter)
             cc_filter_mask = get_constraint_mask(output_df, clean_cc_filter)
@@ -447,7 +450,7 @@ def _within_group_outliers(series):
         whether the value is divisible by 2 without remainder.
         '''

-        q25, q50, q75 = np.percentile(series, [25, 50, 75])
+        q25, _, q75 = np.percentile(series, [25, 50, 75])
         iqr = q75 - q25

         if iqr == 0:
@@ -761,9 +764,11 @@ def _make_almost_same(group):
         ulinked_df = generate_user_linked_anon_df(
             self.spec_dict, user_linked_cols, new_df.shape[0], starting_col_matrix)

+        non_user_linked_cols = [x for x in df.columns if x not in user_linked_cols]
+
         new_df = pd.concat(
             [ulinked_df.set_index(new_df.index)] +
-            [df.loc[filter_idx, [x for x in df.columns if x not in user_linked_cols]]],
+            [df.loc[filter_idx, non_user_linked_cols]],
             axis=1
         ).reindex(columns=df.columns)
@@ -865,7 +870,7 @@ def _generate_ordered_values(target_sequence, ordered_list, ordered_probs):
             else:
                 pointer = 0

-        result = sorted(unordered_result, key=lambda x: ordered_list.index(x))
+        result = sorted(unordered_result, key=ordered_list.index)

         return result
@@ -1025,7 +1030,7 @@ def _make_skewed_series(group):

             # add nulls based on the miss_probability of the skew column
             miss_pct = self.spec_dict["columns"][skew_col]["miss_probability"]
-            miss_val = pd.NA if group.dtype =='Int64' else np.nan
+            miss_val = pd.NA if group.dtype =="Int64" else np.nan
             skewed_result = np.where(
                 rng.random(size=nrows) < miss_pct, miss_val, result.values)
@@ -1037,7 +1042,8 @@
         target_cols = [x.strip() for x in target_str.split(",")]

         if len(target_cols) != 2: #pragma: no cover
-            raise Exception(f"{self.current_action} requires exactly 2 target columns.")
+            raise RuntimeError(
+                f"{self.current_action} requires exactly 2 target columns.")

         if partition_cols is not None:
             partition_cols = [x.strip() for x in partition_cols.split(",") if x]
@@ -1281,7 +1287,7 @@ def find_basic_constraint_columns(df):

     return output

-def clean_up_constraint_string(raw_string, type="cc_filter"):
+def clean_up_constraint_string(raw_string):
     '''
     The default way to handle column names with whitespace in eval strings
     is to enclose them in backticks. However, the default tokeniser will
@@ -1379,8 +1385,8 @@ def get_constraint_mask(df, clean_string):
             .rename(lambda x: x.replace(" ", "__"), axis="columns")
             .eval(clean_string, engine="python"))

-    except SyntaxError: #pragma: no cover
-        raise SyntaxError("Invalid filter expression supplied to custom action.")
+    except SyntaxError as e: #pragma: no cover
+        raise SyntaxError("Invalid filter expression supplied to custom action.") from e

     return mask
diff --git a/exhibit/core/exhibit.py b/exhibit/core/exhibit.py
index 635ff38..91f9a98 100644
--- a/exhibit/core/exhibit.py
+++ b/exhibit/core/exhibit.py
@@ -96,14 +96,14 @@ def __init__(
         skip_columns=None, linked_columns=None, uuid_columns=None,
         discrete_columns=None, save_probabilities=None, derived_columns_first=False,
-        verbose=False, **kwargs):
+        verbose=False):
         '''
         Initialise either from the CLI or by instantiating directly
         '''

         # Basic error checking on the arguments
         if linked_columns is not None and len(linked_columns) < 2:
-            raise Exception("Please provide at least two linked columns")
+            raise RuntimeError("Please provide at least two linked columns")

         self.command = command
         self.source = source
@@ -111,7 +111,7 @@ def __init__(
         self.inline_limit = inline_limit
         self.equal_weights = equal_weights
         self.skip_columns = skip_columns or set()
-        self.linked_columns= linked_columns or list()
+        self.linked_columns= linked_columns or []
         self.uuid_columns= uuid_columns or set()
         self.discrete_columns = discrete_columns or set()
         self.save_probabilities = save_probabilities or set()
@@ -179,7 +179,7 @@ def write_spec(self, spec_yaml=None):
         else:
             output_path = self.output

-        with open(output_path, "w") as f:
+        with open(output_path, "w", encoding="utf-8") as f:
             f.write(spec_yaml)

         print("Exhibit ready to view")
@@ -208,7 +208,7 @@ def read_spec(self):
         self.source = path_checker(self.source)

         if self.source.suffix == ".yml":
-            with open(self.source) as f:
+            with open(self.source, encoding="utf-8") as f:
                 self.spec_dict = yaml.safe_load(f)
         else: #pragma: no cover
             raise TypeError("Specification is not in .yml format")
@@ -219,13 +219,13 @@
         # these NONE values early and change them into empty sequences.
         for key, value in self.spec_dict["metadata"].items():
             if "columns" in key and value is None:
-                self.spec_dict["metadata"][key] = list()
+                self.spec_dict["metadata"][key] = []

         if self.spec_dict.get("linked_columns", None) is None:
-            self.spec_dict["linked_columns"] = list()
+            self.spec_dict["linked_columns"] = []

         if self.spec_dict.get("derived_columns", None) is None:
-            self.spec_dict["derived_columns"] = dict()
+            self.spec_dict["derived_columns"] = {}

         for col in self.spec_dict["metadata"]["categorical_columns"]:
@@ -253,6 +253,7 @@ def validate_spec(self):
             self.spec_dict = validated_spec
         return validated_spec is not None

+    #pylint: disable=R0912, R0915
     def execute_spec(self):
         '''
         Function only runs if validate_spec returned True
@@ -310,7 +311,7 @@
         for num_col in self.spec_dict["metadata"]["numerical_columns"]:

             # skip derived columns; they need main columns (inc. nulls) generated first
-            if num_col in (self.spec_dict.get("derived_columns", dict()) or dict()):
+            if num_col in (self.spec_dict.get("derived_columns", {}) or {}):
                 continue

             anon_df[num_col] = generate_continuous_column(
@@ -432,7 +433,8 @@
                 col_name=num_col
             )
         # see comments above as to why we're re-generating derived columns
-        for derived_col, derived_def in self.spec_dict["derived_columns"].items(): #pragma: no cover
+        derived = self.spec_dict["derived_columns"].items()
+        for derived_col, derived_def in derived: #pragma: no cover
             for num_col in num_cols:
                 if num_col in derived_def:
                     anon_df[derived_col] = generate_derived_column(anon_df, derived_def)
@@ -508,7 +510,7 @@ def generate(self):
             self.write_spec()
             return None

-        else:
+        if self.command == "fromspec":
             self.read_spec()
             if self.validate_spec():
                 self.execute_spec()
@@ -520,8 +522,9 @@
                 self.write_data()
             return None

-        # technically unreachable code because validation failures will raise
-        return None #pragma: no cover
+        raise RuntimeError( #pragma: no cover
+            "Generation command not recognized. Please initialise Exhibit "
+            "with a valid command.")

 class Specification(UserDict): #pragma: no cover
     '''
@@ -541,5 +544,5 @@ def write_spec(self, path):

         spec_yaml = generate_YAML_string(self.data)

-        with open(path, "w") as f:
+        with open(path, "w", encoding="utf-8") as f:
             f.write(spec_yaml)
diff --git a/exhibit/core/formatters.py b/exhibit/core/formatters.py
index 0b9525a..1d548f5 100644
--- a/exhibit/core/formatters.py
+++ b/exhibit/core/formatters.py
@@ -19,7 +19,7 @@ class FormattedList(list):
     separate processing for these formatted values from a basic list of values
     passed to original_values during manual column creation
     '''
-    pass
+

 def format_header(dataframe, series_name, prefix=None):
     '''
@@ -162,7 +162,7 @@ def build_list_of_probability_vectors(dataframe, original_series_name, ew=False)

     vectors = temp_vectors.values.tolist()

-    string_vectors = ["{0:.3f}".format(x).ljust(len(HEADER)) for x in vectors]
+    string_vectors = [f"{x:.3f}".ljust(len(HEADER)) for x in vectors]

     return string_vectors
@@ -187,7 +187,7 @@ def build_list_of_column_weights(weights):

     for key in sorted(weights):

-        padded_key = ["{0:.3f}".format(x).ljust(len(key)) for x in weights[key]]
+        padded_key = [f"{x:.3f}".ljust(len(key)) for x in weights[key]]
         sorted_temp.append(padded_key)

     sorted_final = [" | ".join(y for y in x).rstrip() for x in zip(*sorted_temp)]
@@ -333,15 +333,15 @@ def build_list_of_uuid_frequencies(df, target_col):
     counts = Counter(df[target_col].value_counts())

     freq_df = pd.DataFrame(
-        [(frequency, count) for frequency, count in counts.items()],
+        list(counts.items()),
         columns=["frequency", "count"]
     ).sort_values("frequency")

-    freq_df["pct"] = freq_df["count"] / freq_df["count"].sum()
+    freq_df.loc[:, "pct"] = freq_df["count"] / freq_df["count"].sum()

     freq_list = (
         freq_df["frequency"].astype(str).str.ljust(9)
-        .str.cat(freq_df["pct"].transform(lambda x: "{0:.3f}".format(x)), sep=' | ')
+        .str.cat(freq_df["pct"].transform(lambda x: f"{x:.3f}"), sep=" | ")
         .tolist()
     )
@@ -359,6 +359,6 @@ def format_df_for_export(df):

     for column in df.columns:
         if df[column].dtype == "timedelta64[ns]":
-            df[column] = df[column].astype(str).str.replace('0 days ', '')
+            df[column] = df[column].astype(str).str.replace("0 days ", "")

     return df
diff --git a/exhibit/core/generate/categorical.py b/exhibit/core/generate/categorical.py
index ae38dc8..79b6ec5 100644
--- a/exhibit/core/generate/categorical.py
+++ b/exhibit/core/generate/categorical.py
@@ -46,6 +46,7 @@ def __init__(self, spec_dict, core_rows, anon_df=None):
         self.fixed_anon_sets = ["random", "mountains", "patients", "birds", "dates"]
         # we need UUID dataset (if it exists) for possible conditional SQL that
         # references already-generated columns in the spec
+        self.generated_dfs = []
         self.anon_df = anon_df

         (self.all_cols,
@@ -62,10 +63,8 @@ def generate(self):
         A dataframe with all categorical columns
         '''

-        self.generated_dfs = []
-
         #1) GENERATE LINKED DFs FROM EACH LINKED COLUMNS GROUP
-        for linked_group in (self.spec_dict.get("linked_columns") or list()):
+        for linked_group in (self.spec_dict.get("linked_columns") or []):

             # zero-numbered linked group is reserved for user-defined groupings
             if linked_group[0] == 0:
@@ -298,7 +297,7 @@ def _generate_from_sql(self, col_name, col_attrs, complete=False, db_path=None):

         anon_set = col_attrs["anonymising_set"]
         uniques = col_attrs["uniques"]
-        paired_cols = col_attrs["paired_columns"] or list()
+        paired_cols = col_attrs["paired_columns"] or []

         #1) QUERY SQL TO GET VALUES USED TO BUILD THE DATAFRAME
         if anon_set == "random":
@@ -424,12 +423,12 @@ def _get_column_types(self):
         # there might be cases when you want to generate just the date columns or just
         # the categorical columns so they might be missing from the metadata section
         all_cols = (
-            (self.spec_dict["metadata"].get("categorical_columns", list())) +
-            (self.spec_dict["metadata"].get("date_columns", list()))
+            (self.spec_dict["metadata"].get("categorical_columns", [])) +
+            (self.spec_dict["metadata"].get("date_columns", []))
         )

         nested_linked_cols = [
-            sublist for n, sublist in (self.spec_dict.get("linked_columns") or list())
+            sublist for n, sublist in (self.spec_dict.get("linked_columns") or [])
         ]

         complete_cols = [c for c, v in get_attr_values(
@@ -520,7 +519,7 @@ def _generate_using_external_table(self, col_name, anon_set):
         # unless we make an explicit copy of the de-duplicated dataframe, Pandas will
         # trigger SettingWithCopy warning when trying to change any values.
         existing_data_distinct = existing_data.drop_duplicates(subset=join_columns).copy()
-        existing_data_cols = [c for c in existing_data.columns]
+        existing_data_cols = list(existing_data.columns)

         # this function converts list of tuples into a dataframe anyway
         create_temp_table(
@@ -572,7 +571,8 @@
             # having a COALESCE in SQL would fix it, but in case it's also missing,
             # we try to catch this edge case in code as well.
            try:
-                new_data = self.rng.choice(a=probas[group_key][0], p=probas[group_key][1], size=len(group_index))
+                new_data = self.rng.choice(
+                    a=probas[group_key][0], p=probas[group_key][1], size=len(group_index))
             except KeyError: #pragma: no cover
                 new_data = [np.nan] * len(group_index)

@@ -582,7 +582,7 @@

         # ensure we return the correct type for date columns
         col_type = self.spec_dict["columns"][col_name]["type"]
-        if col_type == 'date':
+        if col_type == "date":
             final_result = final_result.astype("datetime64[ns]")

         return final_result
diff --git a/exhibit/core/generate/geo.py b/exhibit/core/generate/geo.py
index 967a98c..02d0e9a 100644
--- a/exhibit/core/generate/geo.py
+++ b/exhibit/core/generate/geo.py
@@ -97,7 +97,7 @@ def geo_make_regions(
     '''

     if not partition_cols: #pragma: no cover
-        raise Exception("make_geo_regions action requires at least one partition")
+        raise RuntimeError("make_geo_regions action requires at least one partition")

     geo_target_cols = [x.strip() for x in target_str.split(",")]
     partition_cols = [x.strip() for x in partition_cols.split(",") if x]
@@ -110,7 +110,8 @@
         geo_target_cols_table_names.append(h3_table_name)

     if len(set(geo_target_cols_table_names)) != 1: #pragma: no cover
-        raise Exception("columns used for make_geo_regions action rely on different h3 tables")
+        raise RuntimeError(
+            "Columns used for make_geo_regions action rely on different h3 tables")

     # add placeholders for output columns
     target_cols = []
@@ -138,7 +139,7 @@

     # add H3 centroid coordinates
     geo_df["lat"], geo_df["long"] = zip(
-        *geo_df["h3"].transform(lambda x: h3.h3_to_geo(x)))
+        *geo_df["h3"].transform(h3.h3_to_geo))

     # create initial region indices based on the N of values in level=0
     n_regions = grouped_idx.get_level_values(level=0).nunique()
@@ -267,7 +268,7 @@ def _create_contiguous_regions(
     aspect_ratio = height/width

     line = LineString([p1, p2])
-    scaled_line = scale(line, xfact=15.0, yfact=15.0, zfact=1.0, origin='center')
+    scaled_line = scale(line, xfact=15.0, yfact=15.0, zfact=1.0, origin="center")

     # to avoid very thin regions, change the rotation angle of the cutting line based
     # on the aspect rato; 2 is a magic number; another option is to use a tighter "crop"
@@ -306,7 +307,7 @@

         if retries == 5:
             print("Regions created: ", len(final_regions_idx))
-            raise Exception("Can't create a subregion.")
+            raise RuntimeError("Can't create a subregion.")

         rotated_line = rotate(scaled_line, angle=int(rng.uniform(low=0, high=180)))
         result = _cut_polygon_by_line(polygon, rotated_line)
@@ -323,7 +324,7 @@
         final_regions_idx.extend([idx_child_1, idx_child_2])

         parent_idx = None
-        for i in range(len(final_regions_idx)):
+        for i, _ in enumerate(final_regions_idx):
             if final_regions_idx[i].equals(idx_child_1.union(idx_child_2)):
                 parent_idx = i
diff --git a/exhibit/core/generate/missing.py b/exhibit/core/generate/missing.py
index 79a7c99..4cd048f 100644
--- a/exhibit/core/generate/missing.py
+++ b/exhibit/core/generate/missing.py
@@ -184,7 +184,7 @@ def add_missing_data(self):

             # if it's already NA, don't re-generate; it's NA for a reason!
            num_mask = self.nan_data[num_col].isna()
-            mask = (cat_mask & ~num_mask)
+            mask = cat_mask & ~num_mask

             # it's possible to have the left side be Int64 type and the right side
             # to be float64 (newly generated, unscaled); assigning different types
@@ -256,7 +256,7 @@ def _find_columns_with_linked_missing_data(self):
                 pairs.update([col] + attrs["paired_columns"])

         # linked groups
-        for i, linked_group in (self.spec_dict["linked_columns"] or list()):
+        for i, linked_group in (self.spec_dict["linked_columns"] or []):
             # zero numbered linked group is reserved for user defined linkage
             if i == 0:
                 continue
@@ -289,14 +289,14 @@ def _find_make_null_idx(self):
         original data passed in to the generator.
         '''

-        cc = self.spec_dict["constraints"]["custom_constraints"] or dict()
+        cc = self.spec_dict["constraints"]["custom_constraints"] or {}

         make_null_idx = []

         for _, constraint in cc.items():

             cc_filter = constraint.get("filter", None)
-            cc_targets = constraint.get("targets", dict())
+            cc_targets = constraint.get("targets", {})

             clean_cc_filter = clean_up_constraint_string(cc_filter)
             cc_mask = get_constraint_mask(self.nan_data, clean_cc_filter)
@@ -319,14 +319,14 @@ def _find_not_null_idx(self):
         Doc string
         '''

-        cc = self.spec_dict["constraints"]["custom_constraints"] or dict()
+        cc = self.spec_dict["constraints"]["custom_constraints"] or {}

         not_null_idx = []

         for _, constraint in cc.items():

             cc_filter = constraint.get("filter", None)
-            cc_targets = constraint.get("targets", dict())
+            cc_targets = constraint.get("targets", {})

             clean_cc_filter = clean_up_constraint_string(cc_filter)
             cc_mask = get_constraint_mask(self.nan_data, clean_cc_filter)
diff --git a/exhibit/core/generate/regex.py b/exhibit/core/generate/regex.py
index bfdce70..4fbf9f8 100644
--- a/exhibit/core/generate/regex.py
+++ b/exhibit/core/generate/regex.py
@@ -24,7 +24,7 @@ def generate_regex_column(anon_pattern, name, size, target_uniques=None):
     Returns pd.Series
     '''
     # ensure that each column gets a unique (except for anagrams) seed
-    column_seed = sum([ord(x) for x in name])
+    column_seed = sum(ord(x) for x in name)

     static_quant_pattern = r"[^\]]\{\d+\}"
     static_string = anon_pattern
diff --git a/exhibit/core/generate/tests/test_categorical.py b/exhibit/core/generate/tests/test_categorical.py
index 5b5e883..e9a0439 100644
--- a/exhibit/core/generate/tests/test_categorical.py
+++ b/exhibit/core/generate/tests/test_categorical.py
@@ -267,9 +267,9 @@ def test_column_with_categorical_values_based_on_conditonal_sql(self):
         result = gen.generate()

         self.assertTrue(
-            (result.query("gender == 'F'")["linked_condition"] == 'C').all())
+            (result.query("gender == 'F'")["linked_condition"] == "C").all())
         self.assertFalse(
-            (result.query("gender == 'M'")["linked_condition"] == 'C').any())
+            (result.query("gender == 'M'")["linked_condition"] == "C").any())

     def test_column_with_external_date_values_in_conditonal_sql(self):
         '''
@@ -282,8 +282,8 @@
         FROM temp_main
         JOIN temp_linked ON temp_main.gender = temp_linked.gender
         '''

-        m_dates = pd.date_range(start='2022-01-01', periods=3, freq='D')
-        f_dates = pd.date_range(start='2023-01-01', periods=3, freq='D')
+        m_dates = pd.date_range(start="2022-01-01", periods=3, freq="D")
+        f_dates = pd.date_range(start="2023-01-01", periods=3, freq="D")
         dates = m_dates.union(f_dates)

         linked_data = pd.DataFrame(data={
@@ -355,10 +355,10 @@ def test_column_with_source_date_values_in_conditonal_sql(self):
             "columns": {
                 "source_date": {
                     "type": "date",
-                    "from": '2023-01-01',
-                    "to" : '2023-02-01',
+                    "from": "2023-01-01",
+                    "to" : "2023-02-01",
                     "uniques" : 5,
-                    "frequency" : 'D',
+                    "frequency" : "D",
                     "cross_join_all_unique_values" : False,
                 },
                 "conditional_date": {
@@ -373,7 +373,7 @@

         result = gen.generate()

         self.assertTrue((result["conditional_date"] > result["source_date"]).all())
-        self.assertTrue((result["conditional_date"] < '2023-03-01').all())
+        self.assertTrue((result["conditional_date"] < "2023-03-01").all())

     def test_column_with_using_case_statement_in_conditonal_sql(self):
         '''
@@ -445,10 +445,10 @@ def test_date_column_with_impossible_combination_of_from_to_and_period(self):
             "columns": {
                 "source_date": {
                     "type": "date",
-                    "from": '2023-01-01',
-                    "to" : '2023-02-01',
+                    "from": "2023-01-01",
+                    "to" : "2023-02-01",
                     "uniques" : 60,
-                    "frequency" : 'D',
+                    "frequency" : "D",
                     "cross_join_all_unique_values" : False,
                 },
             }
diff --git a/exhibit/core/generate/tests/test_continuous.py b/exhibit/core/generate/tests/test_continuous.py
index 9988597..1a822bb 100644
--- a/exhibit/core/generate/tests/test_continuous.py
+++ b/exhibit/core/generate/tests/test_continuous.py
@@ -28,16 +28,16 @@ def test_apply_dispersion(self):

         #zero dispersion returns original value
         test_case_1 = tm._apply_dispersion(5, 0, rng)
-        expected_1 = (test_case_1 == 5)
+        expected_1 = test_case_1 == 5

         #basic interval picking
         test_case_2 = tm._apply_dispersion(10, 0.5, rng)
-        expected_2 = (5 <= test_case_2 <= 15)
+        expected_2 = 5 <= test_case_2 <= 15

         #avoid negative interval for values of zero where all
         #values are expected to be greater or equal to zero
         test_case_3 = tm._apply_dispersion(0, 0.2, rng)
-        expected_3 = (0 <= test_case_3 <= 2)
+        expected_3 = 0 <= test_case_3 <= 2

         #na returns na
         test_case_4 = tm._apply_dispersion(pd.NA, 0.2, rng)
diff --git a/exhibit/core/generate/tests/test_derived.py b/exhibit/core/generate/tests/test_derived.py
index 7b4080b..a3be948 100644
--- a/exhibit/core/generate/tests/test_derived.py
+++ b/exhibit/core/generate/tests/test_derived.py
@@ -81,9 +81,9 @@ def test_generate_derived_column_with_timestamp(self):
         calc = "@create_timestamp(hours, minutes, seconds)"

         expected = pd.Series([
-            '2022-01-31 01:00:00',
-            '2022-02-28 02:00:01',
-            '2022-03-31 00:59:10'
+            "2022-01-31 01:00:00",
+            "2022-02-28 02:00:01",
+            "2022-03-31 00:59:10"
         ])

         # can add dates and timedelta timestamps easily
diff --git a/exhibit/core/generate/uuids.py b/exhibit/core/generate/uuids.py
index 47779c4..43e5d2f 100644
--- a/exhibit/core/generate/uuids.py
+++ b/exhibit/core/generate/uuids.py
@@ -131,11 +131,11 @@ def _generate_pseudo_chis(n, seed=0):
     while len(result) < n:
         pseudo_chi = (
             str(random.randint(0,31)) + # day will be zero padded if total length < 10
-            '13' + # ensure no accidental collissions
+            "13" + # ensure no accidental collisions
             str(random.randint(20, 99)) +
             str(random.randint(0,9999)).zfill(4) # no specific logic for 9th digit
         ).zfill(10)
         result.add(pseudo_chi)

-    return sorted(list(result))
\ No newline at end of file
+    return sorted(list(result))
diff --git a/exhibit/core/generate/yaml.py b/exhibit/core/generate/yaml.py
index 3c9971c..a20a9f0 100644
--- a/exhibit/core/generate/yaml.py
+++ b/exhibit/core/generate/yaml.py
@@ -21,7 +21,6 @@ class ExhibitDumper(yaml.SafeDumper):
     Columns are subclassed dictionaries, but YAML's safe_dump will not recognize
     them as such unless you add a specific representer for each "special" class.
    '''
-    pass

 def generate_YAML_string(spec_dict):
diff --git a/exhibit/core/linkage/hierarchical.py b/exhibit/core/linkage/hierarchical.py
index 163fcc5..0e66171 100644
--- a/exhibit/core/linkage/hierarchical.py
+++ b/exhibit/core/linkage/hierarchical.py
@@ -742,7 +742,7 @@ def add_paired_columns(self, linked_df):
         for c in self.linked_cols:

             #just generate a DF with duplicate paired columns
-            for pair in self.spec_dict["columns"][c]["paired_columns"] or list():
+            for pair in self.spec_dict["columns"][c]["paired_columns"] or []:

                 #overwrite linked_df
                 linked_df = pd.concat(
diff --git a/exhibit/core/linkage/matrix.py b/exhibit/core/linkage/matrix.py
index b1dd5f9..379cf19 100644
--- a/exhibit/core/linkage/matrix.py
+++ b/exhibit/core/linkage/matrix.py
@@ -160,10 +160,11 @@ def generate_user_linked_anon_df(
     # get the numerical mapping as expected, also changing the dtype for performance.
     if starting_col_matrix is not None:
-        starting_col_matrix = (
-            pd.DataFrame(starting_col_matrix)
-            .fillna(MISSING_DATA_STR)
-            .applymap(lambda x: rev_label_lookup.get(x, x)).values.astype(np.int16))
+        starting_col_matrix = (
+            pd.DataFrame(starting_col_matrix)
+            .fillna(MISSING_DATA_STR)
+            .applymap(lambda x: rev_label_lookup.get(x, x)).values.astype(np.int16)
+        )

     else:
         starting_col_matrix = np.full(
diff --git a/exhibit/core/linkage/tests/test_linkage_hierarchical.py b/exhibit/core/linkage/tests/test_linkage_hierarchical.py
index 6b4c3ba..f908302 100644
--- a/exhibit/core/linkage/tests/test_linkage_hierarchical.py
+++ b/exhibit/core/linkage/tests/test_linkage_hierarchical.py
@@ -214,7 +214,7 @@ def test_hierarchically_linked_columns_with_missing_data(self):
         # lists with equal elements, ignoring order
         self.assertCountEqual(
             tm.find_hierarchically_linked_columns(test_df, test_spec),
-            [('C1', 'C4'), ("C3", "C4")]
+            [("C1", "C4"), ("C3", "C4")]
         )

     def test_1_to_1_linked_columns(self):
diff --git a/exhibit/core/spec.py b/exhibit/core/spec.py
index 9419ff5..206f1e6 100644
--- a/exhibit/core/spec.py
+++ b/exhibit/core/spec.py
@@ -80,11 +80,11 @@ def __init__(self, data=None, inline_limit=30, ew=False, random_seed=0, **kwargs
         self.output = {
             "metadata": {
                 "number_of_rows" : 0,
-                "uuid_columns" : list(),
-                "categorical_columns" : list(),
-                "numerical_columns" : list(),
-                "date_columns" : list(),
-                "geospatial_columns" : list(),
+                "uuid_columns" : [],
+                "categorical_columns" : [],
+                "numerical_columns" : [],
+                "date_columns" : [],
+                "geospatial_columns" : [],
                 "inline_limit" : self.inline_limit,
                 "random_seed" : self.random_seed,
                 "id" : "",
@@ -92,8 +92,8 @@
             "columns": {},
             "constraints": {
                 "allow_duplicates" : True,
-                "basic_constraints" : list(),
-                "custom_constraints" : list()
+                "basic_constraints" : [],
+                "custom_constraints" : []
             },
             "linked_columns" : [],
             "derived_columns": {},
@@ -104,7 +104,7 @@

         self.df = data.copy()
         self.ew = ew
-        self.user_linked_cols = kwargs.get("user_linked_cols", list())
+        self.user_linked_cols = kwargs.get("user_linked_cols", [])
         self.uuid_cols = kwargs.get("uuid_cols", set())
         self.db_prob_cols = kwargs.get("save_probabilities", set())
         self.id = generate_table_id()
@@ -135,7 +135,7 @@
         meta["categorical_columns"] = sorted(list(self.cat_cols))
         meta["numerical_columns"] = sorted(list(self.numerical_cols))
         meta["date_columns"] = sorted(list(self.date_cols))
-        meta["geospatial_columns"] = list()
+        meta["geospatial_columns"] = []
         meta["inline_limit"] = self.inline_limit
         meta["random_seed"] = self.random_seed
         meta["id"] = self.id
@@ -431,7 +431,7 @@ def _original_values_path_resolver(self, path, wt, col):
             return output

         #if path is something else, raise exception
-        raise ValueError("Incorrect %s" % path) # pragma: no cover
+        raise ValueError(f"Incorrect {path}") # pragma: no cover

 class UUIDColumn(dict):
     '''
@@ -522,7 +522,7 @@ def __init__(self,
         self["type"] = "categorical"
         self["name"] = name
         self["original_values"] = original_values
-        self["paired_columns"] = list() if paired_columns is None else paired_columns
+        self["paired_columns"] = [] if paired_columns is None else paired_columns
         self["uniques"] = 0 if uniques is None else uniques
         self["cross_join_all_unique_values"] = cross_join
         self["miss_probability"] = miss_proba
diff --git a/exhibit/core/sql.py b/exhibit/core/sql.py
index 5e38018..e4b131b 100644
--- a/exhibit/core/sql.py
+++ b/exhibit/core/sql.py
@@ -1,6 +1,8 @@
 '''
 Module with functions to provide an interface with the exhibit database
 '''
+#false positive on engine.dispose()
+#pylint: disable=E1101

 # Standard library imports
 import os
diff --git a/exhibit/core/tests/test_constraints.py b/exhibit/core/tests/test_constraints.py
index d22a445..3a39885 100644
--- a/exhibit/core/tests/test_constraints.py
+++ b/exhibit/core/tests/test_constraints.py
@@ -2,6 +2,8 @@
 Test the code for parsing and enforcing constraints
 '''

+# pylint: disable=C0302
+
 # Standard library imports
 import unittest
 from datetime import datetime
@@ -809,7 +811,7 @@ def test_custom_constraints_make_distinct_with_date_columns(self):
         }

         test_data = pd.DataFrame(data={
-            "A" : list(pd.date_range(start='2020-01-01', periods=5)) * 4,
+            "A" : list(pd.date_range(start="2020-01-01", periods=5)) * 4,
             "B" : [True, False] * 10,
         })
@@ -1423,7 +1425,7 @@ def test_custom_constraints_make_almost_same(self):
         result = test_gen.process_constraints().query("B=='spam'")
         pct_B = result.value_counts().agg(lambda x: x / sum(x)).iloc[1]

-        self.assertTrue(pct_B > 0 and pct_B < 0.1)
+        self.assertTrue(0.1 > pct_B > 0)

     def test_custom_constraints_targeting_high_frequency_rows(self):
         '''
diff --git a/exhibit/core/tests/test_exhibit.py b/exhibit/core/tests/test_exhibit.py
index 5aac9fe..8627c61 100644
--- a/exhibit/core/tests/test_exhibit.py
+++ b/exhibit/core/tests/test_exhibit.py
@@ -47,12 +47,12 @@ def test_read_data_func_reads_csv_from_source_path(self):
         Send "mock" command line arguments to parse_args function
         and assert that the program reads the same data as ref_df.
        '''
-        args = dict(
-            command="fromdata",
-            source=Path(package_dir("sample", "_data", "inpatients.csv")),
-            verbose=True,
-            skip_columns=[]
-        )
+        args = {
+            "command" : "fromdata",
+            "source" : Path(package_dir("sample", "_data", "inpatients.csv")),
+            "skip_columns" : [],
+            "verbose" : True,
+        }

         xA = tm.Exhibit(**args)
         xA.read_data()
@@ -80,19 +80,19 @@ def test_output_spec_creates_file_with_o_argument(self):
         advantage of its methods that mimick the open() builtin
         '''

-        args = dict(
-            command="fromdata",
-            source="dummy.csv",
-            output="test.yml",
-            verbose=True,
-        )
+        args = {
+            "command" : "fromdata",
+            "source" : "dummy.csv",
+            "output" : "test.yml",
+            "verbose" : True,
+        }

         with patch("exhibit.core.exhibit.open", new=mock_open()) as mo:

             xA = tm.Exhibit(**args)
             xA.write_spec("hello")

-        mo.assert_called_with("test.yml", "w")
+        mo.assert_called_with("test.yml", "w", encoding="utf-8")
         mo.return_value.__enter__.return_value.write.assert_called_with("hello")

     def test_output_spec_creates_file_without_o_argument(self):
@@ -102,19 +102,19 @@
         the command: fromdata or fromspec.
         '''

-        args = dict(
-            command="fromdata",
-            source=Path("source_dataset.csv"),
-            output=None,
-            verbose=True,
-        )
+        args = {
+            "command" : "fromdata",
+            "source" : Path("source_dataset.csv"),
+            "output" : None,
+            "verbose" : True,
+        }

         with patch("exhibit.core.exhibit.open", new=mock_open()) as mo:

             xA = tm.Exhibit(**args)
             xA.write_spec("hello")

-        mo.assert_called_with("source_dataset_SPEC.yml", "w")
+        mo.assert_called_with("source_dataset_SPEC.yml", "w", encoding="utf-8")
         mo.return_value.__enter__.return_value.write.assert_called_with("hello")

     def test_output_spec_respectes_equal_weights_argument(self):
@@ -122,14 +122,14 @@
         Doc string
         '''

-        args = dict(
-            command="fromdata",
-            source=Path(package_dir("sample", "_data", "inpatients.csv")),
-            verbose=True,
-            inline_limit=30,
-            equal_weights=True,
-            skip_columns=[]
-        )
+        args = {
+            "command" : "fromdata",
+            "source" : Path(package_dir("sample", "_data", "inpatients.csv")),
+            "inline_limit" : 30,
+            "equal_weights" : True,
+            "skip_columns" : [],
+            "verbose" : True,
+        }

         xA = tm.Exhibit(**args)
         xA.read_data()
@@ -151,15 +151,15 @@ def test_spec_generation_with_predefined_linked_columns(self):

         user_linked_cols = ["sex", "age"]

-        args = dict(
-            command="fromdata",
-            source=Path(package_dir("sample", "_data", "inpatients.csv")),
-            verbose=True,
-            inline_limit=30,
-            equal_weights=True,
-            skip_columns=[],
-            linked_columns=user_linked_cols
-        )
+        args = {
+            "command" : "fromdata",
+            "source" : Path(package_dir("sample", "_data", "inpatients.csv")),
+            "inline_limit" : 30,
+            "equal_weights" : True,
+            "skip_columns" : [],
+            "linked_columns" : user_linked_cols,
+            "verbose" : True,
+        }

         xA = tm.Exhibit(**args)
         xA.read_data()
@@ -181,15 +181,15 @@ def test_overlapping_hierarchical_and_predefined_linked_columns(self):

         user_linked_cols = ["hb_name", "hb_code", "age"]

-        args = dict(
-            command="fromdata",
-            source=Path(package_dir("sample", "_data", "inpatients.csv")),
-            verbose=True,
-            inline_limit=30,
-            equal_weights=True,
-            skip_columns=[],
-            linked_columns=user_linked_cols
-        )
+        args = {
+            "command" : "fromdata",
+            "source" : Path(package_dir("sample", "_data", "inpatients.csv")),
+            "inline_limit" : 30,
+            "equal_weights" : True,
+            "skip_columns" : [],
+            "linked_columns" : user_linked_cols,
+            "verbose" : True,
+        }

         xA = tm.Exhibit(**args)
         xA.read_data()
@@ -208,15 +208,15 @@ def test_less_than_two_predefined_linked_columns_raiser_error(self):

         user_linked_cols = ["hb_name"]

-        args = dict(
-            command="fromdata",
-            source=Path(package_dir("sample", "_data", "inpatients.csv")),
-            verbose=True,
-            inline_limit=30,
-            equal_weights=True,
-            skip_columns=[],
-            linked_columns=user_linked_cols
-        )
+        args = {
+            "command" : "fromdata",
+            "source" : Path(package_dir("sample", "_data", "inpatients.csv")),
+            "inline_limit" : 30,
+            "equal_weights" : True,
+            "skip_columns" : [],
+            "linked_columns" : user_linked_cols,
+            "verbose" : True,
+        }

         self.assertRaises(Exception, tm.Exhibit, **args)

@@ -229,15 +229,15 @@ def test_uuid_columns_are_never_duplicated_in_other_column_types(self):

         uuid_columns = {"hb_name", "quarter_date", "stays"}

-        args = dict(
-            command="fromdata",
-            source=Path(package_dir("sample", "_data", "inpatients.csv")),
-            verbose=True,
-            inline_limit=30,
-            equal_weights=True,
-            skip_columns={},
-            uuid_columns=uuid_columns
-        )
+        args = {
+            "command" : "fromdata",
+            "source" : Path(package_dir("sample", "_data", "inpatients.csv")),
+            "inline_limit" : 30,
+            "equal_weights" : True,
+            "skip_columns" : [],
+            "uuid_columns" : uuid_columns,
+            "verbose" : True,
+        }

         xA = tm.Exhibit(**args)
         xA.read_data()
@@ -248,8 +248,7 @@

         metadata_uuid_columns = xA.spec_dict["metadata"]["uuid_columns"]

-        col_names_and_types = [
-            x for x in get_attr_values(xA.spec_dict, "type", col_names=True)]
+        col_names_and_types = list(get_attr_values(xA.spec_dict, "type", col_names=True))

         not_expected = []
         for col_name, col_type in col_names_and_types:
diff --git a/exhibit/core/tests/test_formatters.py b/exhibit/core/tests/test_formatters.py
index 8a894f1..2165cb7 100644
--- a/exhibit/core/tests/test_formatters.py
+++ b/exhibit/core/tests/test_formatters.py
@@ -42,7 +42,7 @@ def test_uuid_frequency_list_generation(self):
             "value" : range(7)
         })

-        test_col = 'id'
+        test_col = "id"

         expected_list = [
             "frequency | probability_vector",
diff --git a/exhibit/core/tests/test_reference.py b/exhibit/core/tests/test_reference.py
index 8fc6476..8071d0b 100644
--- a/exhibit/core/tests/test_reference.py
+++ b/exhibit/core/tests/test_reference.py
@@ -45,8 +45,6 @@ def temp_exhibit(
     test_spec_dict=None,
     return_spec=True,
     return_df=True,
-    *args,
-    **kwargs,
     ):
     '''
     A helper method to generate and read custom specifications
@@ -324,14 +322,13 @@ def test_reference_inpatient_anon_data(self):
         be different from those set in the spec as target sum is enforced
         BEFORE basic constraints are adjusted.
         '''
-
-
-        args = dict(
-            command="fromspec",
-            source=Path(package_dir("sample", "_spec", "inpatients_demo.yml")),
-            verbose=True,
-            skip_columns=[]
-        )
+
+        args = {
+            "command" : "fromspec",
+            "source" : Path(package_dir("sample", "_spec", "inpatients_demo.yml")),
+            "skip_columns" : [],
+            "verbose" : True,
+        }

         xA = tm.Exhibit(**args)
         xA.read_spec()
@@ -766,12 +763,12 @@ def test_reference_uuid_data(self):
         - no db
         '''

-        args = dict(
-            command="fromspec",
-            source=Path(package_dir("sample", "_spec", "uuid_demo.yml")),
-            verbose=True,
-            skip_columns=[]
-        )
+        args = {
+            "command" : "fromspec",
+            "source" : Path(package_dir("sample", "_spec", "uuid_demo.yml")),
+            "skip_columns" : [],
+            "verbose" : True,
+        }

         xA = tm.Exhibit(**args)
         xA.read_spec()
diff --git a/exhibit/core/utils.py b/exhibit/core/utils.py
index 362b1bb..95a7dae 100644
--- a/exhibit/core/utils.py
+++ b/exhibit/core/utils.py
@@ -418,7 +418,7 @@ def natural_key(string_):
     '''
     Thanks SO!
    '''
-    return [int(s) if s.isdigit() else s for s in re.split(r'(\d+)', string_)]
+    return [int(s) if s.isdigit() else s for s in re.split(r"(\d+)", string_)]

 def shuffle_data(data):
     '''
@@ -430,4 +430,4 @@
         .reset_index(drop=True)
     )

-    return shuffled_series
\ No newline at end of file
+    return shuffled_series
diff --git a/exhibit/core/validator.py b/exhibit/core/validator.py
index 149c8a1..9cbf36b 100644
--- a/exhibit/core/validator.py
+++ b/exhibit/core/validator.py
@@ -126,7 +126,7 @@ def validate_linked_cols(self, spec_dict=None):
        VALIDATION FAIL: linked columns must have matching attributes (%(err_attr)s)
        """)

-        for linked_col_group in spec_dict["linked_columns"] or list():
+        for linked_col_group in spec_dict["linked_columns"] or []:

             #linked_columns[0] is the index of linked group; actual columns are [1]
             linked_cols = linked_col_group[1]
@@ -341,7 +341,7 @@ def validate_no_repeating_columns_in_linked_groups(self, spec_dict=None):
        VALIDATION FAIL: Duplicate column(s) in linked groups
        """)

-        nested_list = spec_dict["linked_columns"] or list()
+        nested_list = spec_dict["linked_columns"] or []
         flat_list = list(chain(*[sublist for _, sublist in nested_list]))
         flat_set = set(flat_list)
@@ -369,11 +369,11 @@

         warn = False

-        m_uuid = spec_dict["metadata"].get("uuid_columns", list())
-        m_cat = spec_dict["metadata"].get("categorical_columns", list())
-        m_num = spec_dict["metadata"].get("numerical_columns", list())
-        m_date = spec_dict["metadata"].get("date_columns", list())
-        m_geo = spec_dict["metadata"].get("geospatial_columns", list())
+        m_uuid = spec_dict["metadata"].get("uuid_columns", [])
+        m_cat = spec_dict["metadata"].get("categorical_columns", [])
+        m_num = spec_dict["metadata"].get("numerical_columns", [])
+        m_date = spec_dict["metadata"].get("date_columns", [])
+        m_geo = spec_dict["metadata"].get("geospatial_columns", [])

         col_types = {
             "uuid" : m_uuid,
@@ -392,4 +392,4 @@
         if warn:
             print(warn_msg)

-        return True
\ No newline at end of file
+        return True
diff --git a/exhibit/sample/sample.py b/exhibit/sample/sample.py
index 455cca6..0643535 100644
--- a/exhibit/sample/sample.py
+++ b/exhibit/sample/sample.py
@@ -29,5 +29,5 @@
     package_dir("sample", "_data", "prescribing.csv"),
     parse_dates=["PaidDateMonth"])

 #Load specs
-with open(package_dir("sample", "_spec", "inpatients_demo.yml")) as f:
+with open(package_dir("sample", "_spec", "inpatients_demo.yml"), encoding="utf-8") as f:
     inpatients_spec = yaml.safe_load(f)