|
13 | 13 | DEFAULT_NGRAM_SIZE: int = 3 |
14 | 14 | DEFAULT_TFIDF_MATRIX_DTYPE: type = np.float32 # (only types np.float32 and np.float64 are allowed by sparse_dot_topn) |
15 | 15 | DEFAULT_REGEX: str = r'[,-./]|\s' |
16 | | -DEFAULT_MAX_N_MATCHES: int = 20 |
17 | 16 | DEFAULT_MIN_SIMILARITY: float = 0.8 # minimum cosine similarity for an item to be considered a match |
18 | 17 | DEFAULT_N_PROCESSES: int = multiprocessing.cpu_count() - 1 |
19 | 18 | DEFAULT_IGNORE_CASE: bool = True # ignores case by default |
@@ -209,7 +208,8 @@ class StringGrouperConfig(NamedTuple): |
209 | 208 | (Note: np.float32 often leads to faster processing and a smaller memory footprint, albeit with less precision |
210 | 209 | than np.float64.) |
211 | 210 | :param regex: str. The regex string used to clean up the input string. Default is '[,-./]|\s'. |
212 | | - :param max_n_matches: int. The maximum number of matches allowed per string. Default is 20. |
| 211 | + :param max_n_matches: int. The maximum number of matching strings in `master` allowed per string in |
| 212 | + `duplicates` (or `master` if `duplicates` is not given). If left as None, StringGrouper sets it to the number of strings in `master`. |
213 | 213 | :param min_similarity: float. The minimum cosine similarity for two strings to be considered a match. |
214 | 214 | Defaults to 0.8. |
215 | 215 | :param number_of_processes: int. The number of processes used by the cosine similarity calculation. |
@@ -297,7 +297,7 @@ def __init__(self, master: pd.Series, |
297 | 297 |
|
298 | 298 | self._config: StringGrouperConfig = StringGrouperConfig(**kwargs) |
299 | 299 | if self._config.max_n_matches is None: |
300 | | - self._max_n_matches = len(self._master) if self._duplicates is None else len(self._duplicates) |
| 300 | + self._max_n_matches = len(self._master) |
301 | 301 | else: |
302 | 302 | self._max_n_matches = self._config.max_n_matches |
303 | 303 |
|
|
0 commit comments