You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
I'm not entirely sure which records in my dataset caused this, so I can't offer much help in that regard, but there should probably be a guard against passing two empty strings to the function, along with a default score or behavior for cases like this.
/opt/conda/lib/python3.7/site-packages/pandas_dedupe/dedupe_dataframe.py in dedupe_dataframe(df, field_properties, canonicalize, config_name, update_model, threshold, sample_size, n_cores)
247 # Train or load the model
248 deduper = _train(settings_file, training_file, data_d, field_properties,
--> 249 sample_size, update_model, n_cores)
250
251 # Cluster the records
/opt/conda/lib/python3.7/site-packages/pandas_dedupe/dedupe_dataframe.py in _active_learning(data, sample_size, deduper, training_file, settings_file)
39 # To train dedupe, we feed it a sample of records.
40 sample_num = math.floor(len(data) * sample_size)
---> 41 deduper.prepare_training(data, sample_size=sample_num)
42
43 print('Starting active labeling...')
/opt/conda/lib/python3.7/site-packages/dedupe/datamodel.py in distances(self, record_pairs)
83 if record_1[field] is not None and record_2[field] is not None:
84 distances[i, start:stop] = compare(record_1[field],
---> 85 record_2[field])
86 elif hasattr(compare, 'missing'):
87 distances[i, start:stop] = compare(record_1[field],
affinegap/affinegap.pyx in affinegap.affinegap.normalizedAffineGapDistance()
affinegap/affinegap.pyx in affinegap.affinegap.normalizedAffineGapDistance()
ZeroDivisionError: normalizedAffineGapDistance cannot take two empty strings
The text was updated successfully, but these errors were encountered:
It can be "fixed" by forcing empty strings to be None, but there might be cases where the two actually mean different things in a given dataset (provided but empty vs. not provided at all), so a solution that avoids this problem without treating "" as equivalent to None would be preferable.
I am aware that this actually stems from the dedupe package itself, but they previously considered this issue and marked it as expected behavior.
I'm not entirely sure which records in my dataset caused this, so I can't offer much help in that regard, but there should probably be a guard against passing two empty strings to the function, along with a default score or behavior for cases like this.
/opt/conda/lib/python3.7/site-packages/pandas_dedupe/dedupe_dataframe.py in dedupe_dataframe(df, field_properties, canonicalize, config_name, update_model, threshold, sample_size, n_cores)
247 # Train or load the model
248 deduper = _train(settings_file, training_file, data_d, field_properties,
--> 249 sample_size, update_model, n_cores)
250
251 # Cluster the records
/opt/conda/lib/python3.7/site-packages/pandas_dedupe/dedupe_dataframe.py in _train(settings_file, training_file, data, field_properties, sample_size, update_model, n_cores)
106
107 # Launch active learning
--> 108 deduper = _active_learning(data, sample_size, deduper, training_file, settings_file)
109
110 else:
/opt/conda/lib/python3.7/site-packages/pandas_dedupe/dedupe_dataframe.py in _active_learning(data, sample_size, deduper, training_file, settings_file)
39 # To train dedupe, we feed it a sample of records.
40 sample_num = math.floor(len(data) * sample_size)
---> 41 deduper.prepare_training(data, sample_size=sample_num)
42
43 print('Starting active labeling...')
/opt/conda/lib/python3.7/site-packages/dedupe/api.py in prepare_training(self, data, training_file, sample_size, blocked_proportion)
1292 if training_file:
1293 self._read_training(training_file)
-> 1294 self._sample(data, sample_size, blocked_proportion)
1295
1296 def _sample(self,
/opt/conda/lib/python3.7/site-packages/dedupe/api.py in _sample(self, data, sample_size, blocked_proportion)
1322 blocked_proportion,
1323 sample_size,
-> 1324 index_include=examples)
1325
1326 self.active_learner.mark(examples, y)
/opt/conda/lib/python3.7/site-packages/dedupe/labeler.py in __init__(self, data_model, data, blocked_proportion, sample_size, index_include)
440
441 self.classifier = RLRLearner(self.data_model)
--> 442 self.classifier.candidates = self.candidates
443
444 self._common_init()
/opt/conda/lib/python3.7/site-packages/dedupe/labeler.py in candidates(self, new_candidates)
54 self._candidates = new_candidates
55
---> 56 self.distances = self.transform(self._candidates)
57
58 random_pair = random.choice(self._candidates)
/opt/conda/lib/python3.7/site-packages/dedupe/labeler.py in transform(self, pairs)
62
63 def transform(self, pairs):
---> 64 return self.data_model.distances(pairs)
65
66 def fit(self, X, y):
/opt/conda/lib/python3.7/site-packages/dedupe/datamodel.py in distances(self, record_pairs)
83 if record_1[field] is not None and record_2[field] is not None:
84 distances[i, start:stop] = compare(record_1[field],
---> 85 record_2[field])
86 elif hasattr(compare, 'missing'):
87 distances[i, start:stop] = compare(record_1[field],
affinegap/affinegap.pyx in affinegap.affinegap.normalizedAffineGapDistance()
affinegap/affinegap.pyx in affinegap.affinegap.normalizedAffineGapDistance()
ZeroDivisionError: normalizedAffineGapDistance cannot take two empty strings
The text was updated successfully, but these errors were encountered: