Skip to content

Commit

Permalink
address flake8 and most mypy problems
Browse files Browse the repository at this point in the history
  • Loading branch information
fgregg committed Jun 20, 2024
1 parent f9ee163 commit 59dce08
Showing 13 changed files with 71 additions and 55 deletions.
1 change: 1 addition & 0 deletions .flake8
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
[flake8]
max-line-length=160
extend-ignore = E203
per-file-ignores = dedupe/variables/__init__.py:F401
5 changes: 5 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -8,3 +8,8 @@ repos:
hooks:
- id: isort
name: isort (python)
- repo: https://github.com/pycqa/flake8
rev: "7.1.0"
hooks:
- id: flake8
args: [--config=.flake8]
15 changes: 14 additions & 1 deletion dedupe/_typing.py
Original file line number Diff line number Diff line change
@@ -4,7 +4,6 @@
TYPE_CHECKING,
Any,
Callable,
Collection,
Dict,
FrozenSet,
Iterable,
@@ -109,6 +108,20 @@ def close(self) -> None: ...
def join(self) -> None: ...


class Variable(Protocol):
    """Structural interface describing a variable.

    Declares the attributes and the ``__len__`` method that an object must
    expose to be treated as a ``Variable`` by static type checkers.
    """

    # Name of the variable.
    name: str
    # Predicates attached to this variable.
    predicates: list["Predicate"]
    # NOTE(review): presumably marks that the underlying data may be
    # missing -- confirm against the concrete variable classes.
    has_missing: bool
    # Further ``Variable`` objects associated with this one.
    higher_vars: Sequence["Variable"]

    # NOTE(review): the meaning of a variable's length is not shown here;
    # presumably the number of columns it contributes -- confirm.
    def __len__(self) -> int: ...


class FieldVariable(Variable, Protocol):
    """A ``Variable`` that additionally exposes ``field`` and ``comparator``."""

    # NOTE(review): presumably the key of the record field this variable
    # reads -- confirm against the concrete variable classes.
    field: str
    # Typed by the ``Comparator`` alias; see that alias for the exact signature.
    comparator: Comparator


MapLike = Callable[[Callable[[Any], Any], Iterable], Iterable]

PathLike = Union[str, os.PathLike]
4 changes: 2 additions & 2 deletions dedupe/api.py
Original file line number Diff line number Diff line change
@@ -70,7 +70,7 @@
Scores,
TrainingData,
TupleLinks,
VariableDefinition,
Variable,
)

logger = logging.getLogger(__name__)
@@ -1117,7 +1117,7 @@ class ActiveMatching(Matching):

def __init__(
self,
variable_definition: Collection[VariableDefinition],
variable_definition: Collection[Variable],
num_cores: int | None = None,
in_memory: bool = False,
**kwargs,
8 changes: 4 additions & 4 deletions dedupe/clustering.py
Original file line number Diff line number Diff line change
@@ -177,7 +177,7 @@ def union_find(scored_pairs: Scores) -> numpy.typing.NDArray[numpy.int_]:

def condensedDistance(
dupes: Scores,
) -> tuple[dict[int, RecordID], numpy.typing.NDArray[numpy.float_], int]:
) -> tuple[dict[int, RecordID], numpy.typing.NDArray[numpy.float64], int]:
"""
Convert the pairwise list of distances in dupes to "condensed
distance matrix" required by the hierarchical clustering
@@ -262,16 +262,16 @@ def cluster(

def confidences(
cluster: Sequence[int],
squared_distances: numpy.typing.NDArray[numpy.float_],
squared_distances: numpy.typing.NDArray[numpy.float64],
d: int,
) -> numpy.typing.NDArray[numpy.float_]:
) -> numpy.typing.NDArray[numpy.float64]:
"""
We calculate a per record score that is similar to a standard
deviation. The main reason is that these record scores can be
used to calculate the standard deviation of an entire cluster,
which is a reasonable metric for clusters.
"""
scores: numpy.typing.NDArray[numpy.float_]
scores: numpy.typing.NDArray[numpy.float64]
scores_d = dict.fromkeys(cluster, 0.0)
C = 2 * d - 3
for i, j in itertools.combinations(cluster, 2):
2 changes: 1 addition & 1 deletion dedupe/convenience.py
Original file line number Diff line number Diff line change
@@ -135,7 +135,7 @@ def console_label(deduper: dedupe.api.ActiveMatching) -> None: # pragma: no cov

finished = False
use_previous = False
fields = unique(var.field for var in deduper.data_model.primary_variables)
fields = unique(var.field for var in deduper.data_model.field_variables)

buffer_len = 1 # Max number of previous operations
unlabeled: list[RecordDictPair] = []
35 changes: 19 additions & 16 deletions dedupe/datamodel.py
Original file line number Diff line number Diff line change
@@ -6,15 +6,19 @@

import numpy

import dedupe.variables
from dedupe.variables.base import FieldType as FieldVariable
from dedupe.variables.base import MissingDataType, Variable
from dedupe.variables.base import MissingDataType
from dedupe.variables.interaction import InteractionType

if TYPE_CHECKING:
from typing import Generator, Iterable, Sequence

from dedupe._typing import Comparator, RecordDict, RecordDictPair
from dedupe._typing import (
Comparator,
FieldVariable,
RecordDict,
RecordDictPair,
Variable,
)
from dedupe.predicates import Predicate


@@ -27,25 +31,24 @@ def __init__(
variable_definitions = list(variable_definitions)
if not variable_definitions:
raise ValueError("The variable definitions cannot be empty")
if all(
isinstance(variable, dedupe.variables.Custom)
for variable in variable_definitions
):
if any(hasattr(variable, "predicates") for variable in variable_definitions):
raise ValueError(
"At least one of the variable types needs to be a type"
"other than 'Custom'. 'Custom' types have no associated"
"blocking rules"
)

self.field_variables = [
variable for variable in variable_definitions if hasattr(variable, "field")
self.field_variables: list[FieldVariable] = [
variable
for variable in variable_definitions
if hasattr(variable, "field") and hasattr(variable, "comparator")
]

# we need to keep track of the ordering of variables in order
# to calculate derived fields like interaction and missing
# data fields. This code would be much better if there were
# always a "columns" attribute on variables
columns = []
columns: list[Variable] = []
for variable in self.field_variables:
if hasattr(variable, "higher_vars"):
columns.extend(variable.higher_vars)
@@ -91,7 +94,7 @@ def predicates(self) -> set[Predicate]:

def distances(
self, record_pairs: Sequence[RecordDictPair]
) -> numpy.typing.NDArray[numpy.float_]:
) -> numpy.typing.NDArray[numpy.float64]:
num_records = len(record_pairs)

distances = numpy.empty((num_records, len(self)), "f4")
@@ -110,8 +113,8 @@ def distances(
return distances

def _add_derived_distances(
self, distances: numpy.typing.NDArray[numpy.float_]
) -> numpy.typing.NDArray[numpy.float_]:
self, distances: numpy.typing.NDArray[numpy.float64]
) -> numpy.typing.NDArray[numpy.float64]:
current_column = self._derived_start

for indices in self._interaction_indices:
@@ -161,11 +164,11 @@ def missing(variables: list[Variable]) -> list[MissingDataType]:


def interactions(
variables: Iterable[Variable], primary_variables: list[FieldVariable]
variables: Iterable[Variable], primary_variables: Iterable[FieldVariable]
) -> list[InteractionType]:
field_d = {field.name: field for field in primary_variables}

interactions = []
interactions: list[InteractionType] = []
for variable in variables:
if isinstance(variable, InteractionType):
variable.expandInteractions(field_d)
8 changes: 4 additions & 4 deletions dedupe/labeler.py
Original file line number Diff line number Diff line change
@@ -56,7 +56,7 @@ def fit(self, pairs: TrainingExamples, y: LabelsLike) -> None:
"""Train on the given data."""

@abstractmethod
def candidate_scores(self) -> numpy.typing.NDArray[numpy.float_]:
def candidate_scores(self) -> numpy.typing.NDArray[numpy.float64]:
"""For each of self.candidates, return our current guess [0,1] of if a match."""

@abstractmethod
@@ -92,7 +92,7 @@ def remove(self, index: int) -> None:
self._candidates.pop(index)
self._features = numpy.delete(self._features, index, axis=0)

def candidate_scores(self) -> numpy.typing.NDArray[numpy.float_]:
def candidate_scores(self) -> numpy.typing.NDArray[numpy.float64]:
if not self._fitted:
raise ValueError("Must call fit() before candidate_scores()")
return self._classifier.predict_proba(self._features)[:, 1].reshape(-1, 1)
@@ -103,7 +103,7 @@ class BlockLearner(Learner):

def __init__(self):
self.current_predicates: tuple[Predicate, ...] = ()
self._cached_scores: numpy.typing.NDArray[numpy.float_] | None = None
self._cached_scores: numpy.typing.NDArray[numpy.float64] | None = None
self._old_dupes: TrainingExamples = []

def fit(self, pairs: TrainingExamples, y: LabelsLike) -> None:
@@ -121,7 +121,7 @@ def fit(self, pairs: TrainingExamples, y: LabelsLike) -> None:
self._old_dupes = dupes
self._fitted = True

def candidate_scores(self) -> numpy.typing.NDArray[numpy.float_]:
def candidate_scores(self) -> numpy.typing.NDArray[numpy.float64]:
if not self._fitted:
raise ValueError("Must call fit() before candidate_scores()")
if self._cached_scores is None:
13 changes: 5 additions & 8 deletions dedupe/variables/base.py
Original file line number Diff line number Diff line change
@@ -31,12 +31,6 @@ def __eq__(self, other: Any) -> bool:

def __init__(self, has_missing=False):
self.has_missing = has_missing
if self.has_missing:
try:
exists_pred = predicates.ExistsPredicate(self.field)
self.predicates.append(exists_pred)
except AttributeError:
pass

def __getstate__(self) -> dict[str, Any]:
odict = self.__dict__.copy()
@@ -78,7 +72,7 @@ class FieldType(Variable):
_Predicate: Type[predicates.SimplePredicate] = predicates.SimplePredicate
comparator: Comparator

def __init__(self, field, name=None, **kwargs):
def __init__(self, field, name=None, has_missing=False):
self.field = field

if name is None:
@@ -94,7 +88,10 @@ def __init__(self, field, name=None, **kwargs):
self._index_predicates, self._index_thresholds, self.field
)

super().__init__(**kwargs)
self.has_missing = has_missing
if self.has_missing:
exists_pred = predicates.ExistsPredicate(self.field)
self.predicates.append(exists_pred)


class CustomType(FieldType):
10 changes: 4 additions & 6 deletions dedupe/variables/exists.py
Original file line number Diff line number Diff line change
@@ -4,7 +4,7 @@

from categorical import CategoricalComparator

from dedupe._typing import PredicateFunction, VariableDefinition
from dedupe._typing import PredicateFunction
from dedupe.variables.base import DerivedType
from dedupe.variables.categorical_type import CategoricalType

@@ -13,16 +13,14 @@ class ExistsType(CategoricalType):
type = "Exists"
_predicate_functions: list[PredicateFunction] = []

def __init__(self, definition: VariableDefinition):
super(CategoricalType, self).__init__(definition)
def __init__(self, field, **kwargs):
super().__init__(field, **kwargs)

self.cat_comparator = CategoricalComparator([0, 1])

self.higher_vars = []
for higher_var in self.cat_comparator.dummy_names:
dummy_var = DerivedType(
{"name": higher_var, "type": "Dummy", "has missing": self.has_missing}
)
dummy_var = DerivedType(higher_var, "Dummy", has_missing=self.has_missing)
self.higher_vars.append(dummy_var)

def comparator(self, field_1: Any, field_2: Any) -> list[int]:
2 changes: 1 addition & 1 deletion dedupe/variables/interaction.py
Original file line number Diff line number Diff line change
@@ -3,7 +3,7 @@
import itertools
from typing import Mapping

from dedupe.variables.base import FieldType as FieldVariable
from dedupe._typing import FieldVariable
from dedupe.variables.base import Variable


11 changes: 5 additions & 6 deletions dedupe/variables/set.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from simplecosine.cosine import CosineSetSimilarity

from dedupe import predicates
from dedupe._typing import VariableDefinition
from dedupe.variables.base import FieldType


@@ -24,10 +23,10 @@ class SetType(FieldType):
)
_index_thresholds = (0.2, 0.4, 0.6, 0.8)

def __init__(self, definition: VariableDefinition):
super(SetType, self).__init__(definition)
def __init__(self, field: str, corpus=None, **kwargs):
super().__init__(field, **kwargs)

if "corpus" not in definition:
definition["corpus"] = []
if corpus is None:
corpus = []

self.comparator = CosineSetSimilarity(definition["corpus"]) # type: ignore[assignment]
self.comparator = CosineSetSimilarity(corpus) # type: ignore[assignment]
12 changes: 6 additions & 6 deletions dedupe/variables/string.py
Original file line number Diff line number Diff line change
@@ -5,7 +5,7 @@
from simplecosine.cosine import CosineTextSimilarity

from dedupe import predicates
from dedupe._typing import PredicateFunction, VariableDefinition
from dedupe._typing import PredicateFunction
from dedupe.variables.base import FieldType, indexPredicates

crfEd = CRFEditDistance()
@@ -98,10 +98,10 @@ class TextType(BaseStringType):
]
_index_thresholds = (0.2, 0.4, 0.6, 0.8)

def __init__(self, definition: VariableDefinition):
super(TextType, self).__init__(definition)
def __init__(self, field, corpus=None, **kwargs):
super().__init__(field, **kwargs)

if "corpus" not in definition:
definition["corpus"] = []
if corpus is None:
corpus = []

self.comparator = CosineTextSimilarity(definition["corpus"]) # type: ignore[assignment]
self.comparator = CosineTextSimilarity(corpus) # type: ignore[assignment]

0 comments on commit 59dce08

Please sign in to comment.