address flake8 and most mypy problems

dedupeio · Jun 20, 2024 · 59dce08 · 59dce08
1 parent f9ee163
commit 59dce08
Showing 13 changed files with 71 additions and 55 deletions.
diff --git a/.flake8 b/.flake8
@@ -1,3 +1,4 @@
 [flake8]
 max-line-length=160
 extend-ignore = E203
+per-file-ignores = dedupe/variables/__init__.py:F401
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -8,3 +8,8 @@ repos:
     hooks:
       - id: isort
         name: isort (python)
+  - repo: https://github.com/pycqa/flake8
+    rev: "7.1.0"
+    hooks:
+      - id: flake8
+        args: [--config=.flake8]
diff --git a/dedupe/_typing.py b/dedupe/_typing.py
@@ -4,7 +4,6 @@
     TYPE_CHECKING,
     Any,
     Callable,
-    Collection,
     Dict,
     FrozenSet,
     Iterable,
@@ -109,6 +108,20 @@ def close(self) -> None: ...
     def join(self) -> None: ...
 
 
+class Variable(Protocol):
+    name: str
+    predicates: list["Predicate"]
+    has_missing: bool
+    higher_vars: Sequence["Variable"]
+
+    def __len__(self) -> int: ...
+
+
+class FieldVariable(Variable, Protocol):
+    field: str
+    comparator: Comparator
+
+
 MapLike = Callable[[Callable[[Any], Any], Iterable], Iterable]
 
 PathLike = Union[str, os.PathLike]
diff --git a/dedupe/api.py b/dedupe/api.py
@@ -70,7 +70,7 @@
         Scores,
         TrainingData,
         TupleLinks,
-        VariableDefinition,
+        Variable,
     )
 
 logger = logging.getLogger(__name__)
@@ -1117,7 +1117,7 @@ class ActiveMatching(Matching):
 
     def __init__(
         self,
-        variable_definition: Collection[VariableDefinition],
+        variable_definition: Collection[Variable],
         num_cores: int | None = None,
         in_memory: bool = False,
         **kwargs,

diff --git a/dedupe/clustering.py b/dedupe/clustering.py
@@ -177,7 +177,7 @@ def union_find(scored_pairs: Scores) -> numpy.typing.NDArray[numpy.int_]:
 
 def condensedDistance(
     dupes: Scores,
-) -> tuple[dict[int, RecordID], numpy.typing.NDArray[numpy.float_], int]:
+) -> tuple[dict[int, RecordID], numpy.typing.NDArray[numpy.float64], int]:
     """
     Convert the pairwise list of distances in dupes to "condensed
     distance matrix" required by the hierarchical clustering
@@ -262,16 +262,16 @@ def cluster(
 
 def confidences(
     cluster: Sequence[int],
-    squared_distances: numpy.typing.NDArray[numpy.float_],
+    squared_distances: numpy.typing.NDArray[numpy.float64],
     d: int,
-) -> numpy.typing.NDArray[numpy.float_]:
+) -> numpy.typing.NDArray[numpy.float64]:
     """
     We calculate a per record score that is similar to a standard
     deviation.  The main reason is that these record scores can be
     used to calculate the standard deviation of an entire cluster,
     which is a reasonable metric for clusters.
     """
-    scores: numpy.typing.NDArray[numpy.float_]
+    scores: numpy.typing.NDArray[numpy.float64]
     scores_d = dict.fromkeys(cluster, 0.0)
     C = 2 * d - 3
     for i, j in itertools.combinations(cluster, 2):

diff --git a/dedupe/convenience.py b/dedupe/convenience.py
@@ -135,7 +135,7 @@ def console_label(deduper: dedupe.api.ActiveMatching) -> None:  # pragma: no cov
 
     finished = False
     use_previous = False
-    fields = unique(var.field for var in deduper.data_model.primary_variables)
+    fields = unique(var.field for var in deduper.data_model.field_variables)
 
     buffer_len = 1  # Max number of previous operations
     unlabeled: list[RecordDictPair] = []

diff --git a/dedupe/datamodel.py b/dedupe/datamodel.py
@@ -6,15 +6,19 @@
 
 import numpy
 
-import dedupe.variables
-from dedupe.variables.base import FieldType as FieldVariable
-from dedupe.variables.base import MissingDataType, Variable
+from dedupe.variables.base import MissingDataType
 from dedupe.variables.interaction import InteractionType
 
 if TYPE_CHECKING:
     from typing import Generator, Iterable, Sequence
 
-    from dedupe._typing import Comparator, RecordDict, RecordDictPair
+    from dedupe._typing import (
+        Comparator,
+        FieldVariable,
+        RecordDict,
+        RecordDictPair,
+        Variable,
+    )
     from dedupe.predicates import Predicate
 
 
@@ -27,25 +31,24 @@ def __init__(
         variable_definitions = list(variable_definitions)
         if not variable_definitions:
             raise ValueError("The variable definitions cannot be empty")
-        if all(
-            isinstance(variable, dedupe.variables.Custom)
-            for variable in variable_definitions
-        ):
+        if any(hasattr(variable, "predicates") for variable in variable_definitions):
             raise ValueError(
                 "At least one of the variable types needs to be a type"
                 "other than 'Custom'. 'Custom' types have no associated"
                 "blocking rules"
             )
 
-        self.field_variables = [
-            variable for variable in variable_definitions if hasattr(variable, "field")
+        self.field_variables: list[FieldVariable] = [
+            variable
+            for variable in variable_definitions
+            if hasattr(variable, "field") and hasattr(variable, "comparator")
         ]
 
         # We need to keep track of the ordering of variables in order
         # to calculate derived fields like interaction and missing
         # data fields. This code would be much better if there were
         # always a "columns" attribute on variables
-        columns = []
+        columns: list[Variable] = []
         for variable in self.field_variables:
             if hasattr(variable, "higher_vars"):
                 columns.extend(variable.higher_vars)
@@ -91,7 +94,7 @@ def predicates(self) -> set[Predicate]:
 
     def distances(
         self, record_pairs: Sequence[RecordDictPair]
-    ) -> numpy.typing.NDArray[numpy.float_]:
+    ) -> numpy.typing.NDArray[numpy.float64]:
         num_records = len(record_pairs)
 
         distances = numpy.empty((num_records, len(self)), "f4")
@@ -110,8 +113,8 @@ def distances(
         return distances
 
     def _add_derived_distances(
-        self, distances: numpy.typing.NDArray[numpy.float_]
-    ) -> numpy.typing.NDArray[numpy.float_]:
+        self, distances: numpy.typing.NDArray[numpy.float64]
+    ) -> numpy.typing.NDArray[numpy.float64]:
         current_column = self._derived_start
 
         for indices in self._interaction_indices:
@@ -161,11 +164,11 @@ def missing(variables: list[Variable]) -> list[MissingDataType]:
 
 
 def interactions(
-    variables: Iterable[Variable], primary_variables: list[FieldVariable]
+    variables: Iterable[Variable], primary_variables: Iterable[FieldVariable]
 ) -> list[InteractionType]:
     field_d = {field.name: field for field in primary_variables}
 
-    interactions = []
+    interactions: list[InteractionType] = []
     for variable in variables:
         if isinstance(variable, InteractionType):
             variable.expandInteractions(field_d)

diff --git a/dedupe/labeler.py b/dedupe/labeler.py
@@ -56,7 +56,7 @@ def fit(self, pairs: TrainingExamples, y: LabelsLike) -> None:
         """Train on the given data."""
 
     @abstractmethod
-    def candidate_scores(self) -> numpy.typing.NDArray[numpy.float_]:
+    def candidate_scores(self) -> numpy.typing.NDArray[numpy.float64]:
         """For each of self.candidates, return our current guess [0,1] of if a match."""
 
     @abstractmethod
@@ -92,7 +92,7 @@ def remove(self, index: int) -> None:
         self._candidates.pop(index)
         self._features = numpy.delete(self._features, index, axis=0)
 
-    def candidate_scores(self) -> numpy.typing.NDArray[numpy.float_]:
+    def candidate_scores(self) -> numpy.typing.NDArray[numpy.float64]:
         if not self._fitted:
             raise ValueError("Must call fit() before candidate_scores()")
         return self._classifier.predict_proba(self._features)[:, 1].reshape(-1, 1)
@@ -103,7 +103,7 @@ class BlockLearner(Learner):
 
     def __init__(self):
         self.current_predicates: tuple[Predicate, ...] = ()
-        self._cached_scores: numpy.typing.NDArray[numpy.float_] | None = None
+        self._cached_scores: numpy.typing.NDArray[numpy.float64] | None = None
         self._old_dupes: TrainingExamples = []
 
     def fit(self, pairs: TrainingExamples, y: LabelsLike) -> None:
@@ -121,7 +121,7 @@ def fit(self, pairs: TrainingExamples, y: LabelsLike) -> None:
             self._old_dupes = dupes
         self._fitted = True
 
-    def candidate_scores(self) -> numpy.typing.NDArray[numpy.float_]:
+    def candidate_scores(self) -> numpy.typing.NDArray[numpy.float64]:
         if not self._fitted:
             raise ValueError("Must call fit() before candidate_scores()")
         if self._cached_scores is None:

diff --git a/dedupe/variables/base.py b/dedupe/variables/base.py
@@ -31,12 +31,6 @@ def __eq__(self, other: Any) -> bool:
 
     def __init__(self, has_missing=False):
         self.has_missing = has_missing
-        if self.has_missing:
-            try:
-                exists_pred = predicates.ExistsPredicate(self.field)
-                self.predicates.append(exists_pred)
-            except AttributeError:
-                pass
 
     def __getstate__(self) -> dict[str, Any]:
         odict = self.__dict__.copy()
@@ -78,7 +72,7 @@ class FieldType(Variable):
     _Predicate: Type[predicates.SimplePredicate] = predicates.SimplePredicate
     comparator: Comparator
 
-    def __init__(self, field, name=None, **kwargs):
+    def __init__(self, field, name=None, has_missing=False):
         self.field = field
 
         if name is None:
@@ -94,7 +88,10 @@ def __init__(self, field, name=None, **kwargs):
             self._index_predicates, self._index_thresholds, self.field
         )
 
-        super().__init__(**kwargs)
+        self.has_missing = has_missing
+        if self.has_missing:
+            exists_pred = predicates.ExistsPredicate(self.field)
+            self.predicates.append(exists_pred)
 
 
 class CustomType(FieldType):

diff --git a/dedupe/variables/exists.py b/dedupe/variables/exists.py
@@ -4,7 +4,7 @@
 
 from categorical import CategoricalComparator
 
-from dedupe._typing import PredicateFunction, VariableDefinition
+from dedupe._typing import PredicateFunction
 from dedupe.variables.base import DerivedType
 from dedupe.variables.categorical_type import CategoricalType
 
@@ -13,16 +13,14 @@ class ExistsType(CategoricalType):
     type = "Exists"
     _predicate_functions: list[PredicateFunction] = []
 
-    def __init__(self, definition: VariableDefinition):
-        super(CategoricalType, self).__init__(definition)
+    def __init__(self, field, **kwargs):
+        super().__init__(field, **kwargs)
 
         self.cat_comparator = CategoricalComparator([0, 1])
 
         self.higher_vars = []
         for higher_var in self.cat_comparator.dummy_names:
-            dummy_var = DerivedType(
-                {"name": higher_var, "type": "Dummy", "has missing": self.has_missing}
-            )
+            dummy_var = DerivedType(higher_var, "Dummy", has_missing=self.has_missing)
             self.higher_vars.append(dummy_var)
 
     def comparator(self, field_1: Any, field_2: Any) -> list[int]:

diff --git a/dedupe/variables/interaction.py b/dedupe/variables/interaction.py
@@ -3,7 +3,7 @@
 import itertools
 from typing import Mapping
 
-from dedupe.variables.base import FieldType as FieldVariable
+from dedupe._typing import FieldVariable
 from dedupe.variables.base import Variable
 
 

diff --git a/dedupe/variables/set.py b/dedupe/variables/set.py
@@ -1,7 +1,6 @@
 from simplecosine.cosine import CosineSetSimilarity
 
 from dedupe import predicates
-from dedupe._typing import VariableDefinition
 from dedupe.variables.base import FieldType
 
 
@@ -24,10 +23,10 @@ class SetType(FieldType):
     )
     _index_thresholds = (0.2, 0.4, 0.6, 0.8)
 
-    def __init__(self, definition: VariableDefinition):
-        super(SetType, self).__init__(definition)
+    def __init__(self, field: str, corpus=None, **kwargs):
+        super().__init__(field, **kwargs)
 
-        if "corpus" not in definition:
-            definition["corpus"] = []
+        if corpus is None:
+            corpus = []
 
-        self.comparator = CosineSetSimilarity(definition["corpus"])  # type: ignore[assignment]
+        self.comparator = CosineSetSimilarity(corpus)  # type: ignore[assignment]
diff --git a/dedupe/variables/string.py b/dedupe/variables/string.py
@@ -5,7 +5,7 @@
 from simplecosine.cosine import CosineTextSimilarity
 
 from dedupe import predicates
-from dedupe._typing import PredicateFunction, VariableDefinition
+from dedupe._typing import PredicateFunction
 from dedupe.variables.base import FieldType, indexPredicates
 
 crfEd = CRFEditDistance()
@@ -98,10 +98,10 @@ class TextType(BaseStringType):
     ]
     _index_thresholds = (0.2, 0.4, 0.6, 0.8)
 
-    def __init__(self, definition: VariableDefinition):
-        super(TextType, self).__init__(definition)
+    def __init__(self, field, corpus=None, **kwargs):
+        super().__init__(field, **kwargs)
 
-        if "corpus" not in definition:
-            definition["corpus"] = []
+        if corpus is None:
+            corpus = []
 
-        self.comparator = CosineTextSimilarity(definition["corpus"])  # type: ignore[assignment]
+        self.comparator = CosineTextSimilarity(corpus)  # type: ignore[assignment]