From 92436e351adcc58117199576ad8ee0884118d669 Mon Sep 17 00:00:00 2001 From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Mon, 26 Apr 2021 22:31:08 +0200 Subject: [PATCH 01/11] boosted _symmetrize_matches_list() (5x) and _get_matches_list() (33x) --- string_grouper/string_grouper.py | 48 +++++++--------------- string_grouper/test/test_string_grouper.py | 8 ++-- 2 files changed, 20 insertions(+), 36 deletions(-) diff --git a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py index 3ab8cc46..c022a1df 100644 --- a/string_grouper/string_grouper.py +++ b/string_grouper/string_grouper.py @@ -251,11 +251,11 @@ def fit(self) -> 'StringGrouper': master_matrix, duplicate_matrix = self._get_tf_idf_matrices() # Calculate the matches using the cosine similarity matches = self._build_matches(master_matrix, duplicate_matrix) + if self._duplicates is None: + # the matrix of matches needs to be symmetric!!! (i.e., if A != B and A matches B; then B matches A) + matches = StringGrouper._symmetrize_matrix(matches) # retrieve all matches self._matches_list = self._get_matches_list(matches) - if self._duplicates is None: - # the list of matches needs to be symmetric!!! 
(i.e., if A != B and A matches B; then B matches A) - self._symmetrize_matches_list() self.is_build = True return self @@ -450,18 +450,6 @@ def _build_matches(self, master_matrix: csr_matrix, duplicate_matrix: csr_matrix self._config.min_similarity, **optional_kwargs) - def _symmetrize_matches_list(self): - # [symmetrized matches_list] = [matches_list] UNION [transposed matches_list] (i.e., column-names swapped): - self._matches_list = self._matches_list.set_index(['master_side', 'dupe_side'])\ - .combine_first( - self._matches_list.rename( - columns={ - 'master_side': 'dupe_side', - 'dupe_side': 'master_side' - } - ).set_index(['master_side', 'dupe_side']) - ).reset_index() - def _get_non_matches_list(self, suppress_warning=False) -> pd.DataFrame: """Returns a list of all the indices of non-matching pairs (with similarity set to 0)""" m_sz, d_sz = len(self._master), len(self._master if self._duplicates is None else self._duplicates) @@ -480,25 +468,19 @@ def _get_non_matches_list(self, suppress_warning=False) -> pd.DataFrame: return missing_pairs @staticmethod - def _get_matches_list(matches) -> pd.DataFrame: + def _symmetrize_matrix(AA: csr_matrix) -> csr_matrix: + A = AA.tolil() + r, c = A.nonzero() + A[c, r] = A[r, c] + return A.tocsr() + + @staticmethod + def _get_matches_list(matches: csr_matrix) -> pd.DataFrame: """Returns a list of all the indices of matches""" - non_zeros = matches.nonzero() - - sparserows = non_zeros[0] - sparsecols = non_zeros[1] - nr_matches = sparsecols.size - master_side = np.empty([nr_matches], dtype=np.int64) - dupe_side = np.empty([nr_matches], dtype=np.int64) - similarity = np.zeros(nr_matches) - - for index in range(0, nr_matches): - master_side[index] = sparserows[index] - dupe_side[index] = sparsecols[index] - similarity[index] = matches.data[index] - - matches_list = pd.DataFrame({'master_side': master_side, - 'dupe_side': dupe_side, - 'similarity': similarity}) + r, c = matches.nonzero() + matches_list = 
pd.DataFrame({'master_side': r.astype(np.int64), + 'dupe_side': c.astype(np.int64), + 'similarity': matches.data}) return matches_list def _get_nearest_matches(self, diff --git a/string_grouper/test/test_string_grouper.py b/string_grouper/test/test_string_grouper.py index 723d3f22..6ff4cf5e 100644 --- a/string_grouper/test/test_string_grouper.py +++ b/string_grouper/test/test_string_grouper.py @@ -11,6 +11,8 @@ from unittest.mock import patch import warnings +def mock_symmetrize_matrix(A: csr_matrix) -> csr_matrix: + return A class SimpleExample(object): def __init__(self): @@ -197,14 +199,14 @@ def test_match_strings(self, mock_StringGouper): mock_StringGrouper_instance.get_matches.assert_called_once() self.assertEqual(df, 'whatever') - @patch('string_grouper.string_grouper.StringGrouper._symmetrize_matches_list') - def test_match_list_symmetry_without_symmetrize_function(self, mock_symmetrize_matches_list): + @patch('string_grouper.string_grouper.StringGrouper._symmetrize_matrix', side_effect=mock_symmetrize_matrix) + def test_match_list_symmetry_without_symmetrize_function(self, mock_symmetrize_matrix): """mocks StringGrouper._symmetrize_matches_list so that this test fails whenever _matches_list is **partially** symmetric which often occurs when the kwarg max_n_matches is too small""" simple_example = SimpleExample() df = simple_example.customers_df2['Customer Name'] sg = StringGrouper(df, max_n_matches=2).fit() - mock_symmetrize_matches_list.assert_called_once() + mock_symmetrize_matrix.assert_called_once() # obtain the upper and lower triangular parts of the matrix of matches: upper = sg._matches_list[sg._matches_list['master_side'] < sg._matches_list['dupe_side']] lower = sg._matches_list[sg._matches_list['master_side'] > sg._matches_list['dupe_side']] From 99545de3317a62620b1305a300b07e3456a178a8 Mon Sep 17 00:00:00 2001 From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Wed, 28 Apr 2021 11:49:11 +0200 Subject: [PATCH 02/11] 
made more pypi-friendly changes in README.md --- README.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 13f22127..3ddc43c7 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ The image displayed above is a visualization of the graph-structure of one of the groups of strings found by `string_grouper`. Each circle (node) represents a string, and each connecting arc (edge) represents a match between a pair of strings with a similarity score above a given threshold score (here `0.8`). -The ***centroid*** of the group, as determined by `string_grouper` (see [tutorials/group_representatives.md](tutorials/group_representatives.md) for an explanation), is the largest node, also with the most edges originating from it. A thick line in the image denotes a strong similarity between the nodes at its ends, while a faint thin line denotes weak similarity. +The ***centroid*** of the group, as determined by `string_grouper` (see [tutorials/group_representatives.md](https://github.com/Bergvca/string_grouper/blob/master/tutorials/group_representatives.md) for an explanation), is the largest node, also with the most edges originating from it. A thick line in the image denotes a strong similarity between the nodes at its ends, while a faint thin line denotes weak similarity. The power of `string_grouper` is discernible from this image: in large datasets, `string_grouper` is often able to resolve indirect associations between strings even when, say, due to memory-resource-limitations, direct matches between those strings cannot be computed using conventional methods with a lower threshold similarity score. @@ -85,7 +85,7 @@ In the rest of this document the names, `Series` and `DataFrame`, refer to the f 2. `'similarity'` whose column has the similarity-scores as values, and 3. 
The name of `duplicates` (or `master` if `duplicates` is not given) and the name(s) of its index (or index-levels) prefixed by the string `'right_'`. - Indexes (or their levels) only appear when the keyword argument `ignore_index=False` (the default). (See [tutorials/ignore_index_and_replace_na.md](tutorials/ignore_index_and_replace_na.md) for a demonstration.) + Indexes (or their levels) only appear when the keyword argument `ignore_index=False` (the default). (See [tutorials/ignore_index_and_replace_na.md](https://github.com/Bergvca/string_grouper/blob/master/tutorials/ignore_index_and_replace_na.md) for a demonstration.) If either `master` or `duplicates` has no name, it assumes the name `'side'` which is then prefixed as described above. Similarly, if any of the indexes (or index-levels) has no name it assumes its `pandas` default name (`'index'`, `'level_0'`, and so on) and is then prefixed as described above. @@ -101,7 +101,7 @@ In the rest of this document the names, `Series` and `DataFrame`, refer to the f The name of the output `Series` is the same as that of `master` prefixed with the string `'most_similar_'`. If `master` has no name, it is assumed to have the name `'master'` before being prefixed. - If `ignore_index=False` (the default), `match_most_similar` returns a `DataFrame` containing the same `Series` described above as one of its columns. So it inherits the same index and length as `duplicates`. The rest of its columns correspond to the index (or index-levels) of `master` and thus contain the index-labels of the most similar strings being output as values. If there are no similar strings in `master` for a given string in `duplicates` then the value(s) assigned to this index-column(s) for that string is `NaN` by default. However, if the keyword argument `replace_na=True`, then these `NaN` values are replaced with the index-label(s) of that string in `duplicates`. 
Note that such replacements can only occur if the indexes of `master` and `duplicates` have the same number of levels. (See [tutorials/ignore_index_and_replace_na.md](tutorials/ignore_index_and_replace_na.md#MMS) for a demonstration.) + If `ignore_index=False` (the default), `match_most_similar` returns a `DataFrame` containing the same `Series` described above as one of its columns. So it inherits the same index and length as `duplicates`. The rest of its columns correspond to the index (or index-levels) of `master` and thus contain the index-labels of the most similar strings being output as values. If there are no similar strings in `master` for a given string in `duplicates` then the value(s) assigned to this index-column(s) for that string is `NaN` by default. However, if the keyword argument `replace_na=True`, then these `NaN` values are replaced with the index-label(s) of that string in `duplicates`. Note that such replacements can only occur if the indexes of `master` and `duplicates` have the same number of levels. (See [tutorials/ignore_index_and_replace_na.md](https://github.com/Bergvca/string_grouper/blob/master/tutorials/ignore_index_and_replace_na.md#MMS) for a demonstration.) Each column-name of the output `DataFrame` has the same name as its corresponding column, index, or index-level of `master` prefixed with the string `'most_similar_'`. @@ -109,7 +109,7 @@ In the rest of this document the names, `Series` and `DataFrame`, refer to the f * #### `group_similar_strings` - Takes a single `Series` of strings (`strings_to_group`) and groups them by assigning to each string one string from `strings_to_group` chosen as the group-representative for each group of similar strings found. (See [tutorials/group_representatives.md](tutorials/group_representatives.md) for details on how the the group-representatives are chosen.) 
+ Takes a single `Series` of strings (`strings_to_group`) and groups them by assigning to each string one string from `strings_to_group` chosen as the group-representative for each group of similar strings found. (See [tutorials/group_representatives.md](https://github.com/Bergvca/string_grouper/blob/master/tutorials/group_representatives.md) for details on how the the group-representatives are chosen.) If `ignore_index=True`, the output is a `Series` (with the same name as `strings_to_group` prefixed by the string `'group_rep_'`) of the same length and index as `strings_to_group` containing the group-representative strings. If `strings_to_group` has no name then the name of the returned `Series` is `'group_rep'`. @@ -140,11 +140,11 @@ All functions are built using a class **`StringGrouper`**. This class can be use Defaults to `0.8` * **`number_of_processes`**: The number of processes used by the cosine similarity calculation. Defaults to `number of cores on a machine - 1.` - * **`ignore_index`**: Determines whether indexes are ignored or not. If `False` (the default), index-columns will appear in the output, otherwise not. (See [tutorials/ignore_index_and_replace_na.md](tutorials/ignore_index_and_replace_na.md) for a demonstration.) - * **`replace_na`**: For function `match_most_similar`, determines whether `NaN` values in index-columns are replaced or not by index-labels from `duplicates`. Defaults to `False`. (See [tutorials/ignore_index_and_replace_na.md](tutorials/ignore_index_and_replace_na.md) for a demonstration.) - * **`include_zeroes`**: When `min_similarity` ≤ 0, determines whether zero-similarity matches appear in the output. Defaults to `True`. (See [tutorials/zero_similarity.md](tutorials/zero_similarity.md) for a demonstration.) **Warning:** Make sure the kwarg `max_n_matches` is sufficiently high to capture ***all*** nonzero-similarity-matches, otherwise some zero-similarity-matches returned will be false. 
+ * **`ignore_index`**: Determines whether indexes are ignored or not. If `False` (the default), index-columns will appear in the output, otherwise not. (See [tutorials/ignore_index_and_replace_na.md](https://github.com/Bergvca/string_grouper/blob/master/tutorials/ignore_index_and_replace_na.md) for a demonstration.) + * **`replace_na`**: For function `match_most_similar`, determines whether `NaN` values in index-columns are replaced or not by index-labels from `duplicates`. Defaults to `False`. (See [tutorials/ignore_index_and_replace_na.md](https://github.com/Bergvca/string_grouper/blob/master/tutorials/ignore_index_and_replace_na.md) for a demonstration.) + * **`include_zeroes`**: When `min_similarity` ≤ 0, determines whether zero-similarity matches appear in the output. Defaults to `True`. (See [tutorials/zero_similarity.md](https://github.com/Bergvca/string_grouper/blob/master/tutorials/zero_similarity.md) for a demonstration.) **Warning:** Make sure the kwarg `max_n_matches` is sufficiently high to capture ***all*** nonzero-similarity-matches, otherwise some zero-similarity-matches returned will be false. * **`suppress_warning`**: when `min_similarity` ≤ 0 and `include_zeroes` is `True`, determines whether or not to suppress the message warning that `max_n_matches` may be too small. Defaults to `False`. - * **`group_rep`**: For function `group_similar_strings`, determines how group-representatives are chosen. Allowed values are `'centroid'` (the default) and `'first'`. See [tutorials/group_representatives.md](tutorials/group_representatives.md) for an explanation. + * **`group_rep`**: For function `group_similar_strings`, determines how group-representatives are chosen. Allowed values are `'centroid'` (the default) and `'first'`. See [tutorials/group_representatives.md](https://github.com/Bergvca/string_grouper/blob/master/tutorials/group_representatives.md) for an explanation. 
## Examples @@ -306,7 +306,7 @@ Out of the four company names in `duplicates`, three companies are found in the ### Finding duplicates from a (database extract to) DataFrame where IDs for rows are supplied. -A very common scenario is the case where duplicate records for an entity have been entered into a database. That is, there are two or more records where a name field has slightly different spelling. For example, "A.B. Corporation" and "AB Corporation". Using the optional 'ID' parameter in the `match_strings` function duplicates can be found easily. A [tutorial](tutorials/tutorial_1.md) that steps though the process with an example data set is available. +A very common scenario is the case where duplicate records for an entity have been entered into a database. That is, there are two or more records where a name field has slightly different spelling. For example, "A.B. Corporation" and "AB Corporation". Using the optional 'ID' parameter in the `match_strings` function duplicates can be found easily. A [tutorial](https://github.com/Bergvca/string_grouper/blob/master/tutorials/tutorial_1.md) that steps though the process with an example data set is available. 
### For a second data set, find only the most similar match From 35dddd9b2841e881075f83490c22a9451a3810a7 Mon Sep 17 00:00:00 2001 From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Tue, 4 May 2021 18:37:39 +0200 Subject: [PATCH 03/11] fixed bug related to single-valued input Series --- string_grouper/string_grouper.py | 18 +++++++++-------- string_grouper/test/test_string_grouper.py | 23 ++++++++++++++++++++-- 2 files changed, 31 insertions(+), 10 deletions(-) diff --git a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py index c022a1df..7e99b506 100644 --- a/string_grouper/string_grouper.py +++ b/string_grouper/string_grouper.py @@ -139,7 +139,7 @@ def match_strings(master: pd.Series, class StringGrouperConfig(NamedTuple): - """ + r""" Class with configuration variables. :param ngram_size: int. The amount of characters in each n-gram. Default is 3. @@ -253,7 +253,8 @@ def fit(self) -> 'StringGrouper': matches = self._build_matches(master_matrix, duplicate_matrix) if self._duplicates is None: # the matrix of matches needs to be symmetric!!! 
(i.e., if A != B and A matches B; then B matches A) - matches = StringGrouper._symmetrize_matrix(matches) + # and each of its diagonal components must be equal to 1 + matches = StringGrouper._symmetrize_matrix_and_fix_diagonal(matches) # retrieve all matches self._matches_list = self._get_matches_list(matches) self.is_build = True @@ -468,10 +469,12 @@ def _get_non_matches_list(self, suppress_warning=False) -> pd.DataFrame: return missing_pairs @staticmethod - def _symmetrize_matrix(AA: csr_matrix) -> csr_matrix: + def _symmetrize_matrix_and_fix_diagonal(AA: csr_matrix) -> csr_matrix: A = AA.tolil() r, c = A.nonzero() A[c, r] = A[r, c] + r = np.arange(A.shape[0]) + A[r, r] = 1 return A.tocsr() @staticmethod @@ -549,11 +552,10 @@ def _get_nearest_matches(self, dupes_max_sim = dupes_max_sim.sort_values('dupe_side').set_index('dupe_side') output = dupes_max_sim[index_column_list + required_column_list] output.index = self._duplicates.index - return output.squeeze() + return output.squeeze(axis=1) def _deduplicate(self, ignore_index=False) -> Union[pd.DataFrame, pd.Series]: - # discard self-matches: A matches A - pairs = self._matches_list[self._matches_list['master_side'] != self._matches_list['dupe_side']] + pairs = self._matches_list # rebuild graph adjacency matrix from already found matches: n = len(self._master) graph = csr_matrix( @@ -581,7 +583,7 @@ def _deduplicate(self, ignore_index=False) -> Union[pd.DataFrame, pd.Series]: graph.data = pairs['similarity'].to_numpy() # sum along the rows to obtain numpy 1D matrix of similarity aggregates then ... # ... 
convert to 1D numpy array (using asarray then squeeze) and then to Series: - group_of_master_index['weight'] = pd.Series(np.asarray(graph.sum(axis=1)).squeeze()) + group_of_master_index['weight'] = pd.Series(np.asarray(graph.sum(axis=1)).squeeze(axis=1)) method = 'idxmax' # Determine the group representatives AND merge with indices: @@ -605,7 +607,7 @@ def _deduplicate(self, ignore_index=False) -> Union[pd.DataFrame, pd.Series]: output_id = self._master_id.iloc[group_of_master_index.group_rep].rename(id_label).reset_index(drop=True) output = pd.concat([output_id, output], axis=1) output.index = self._master.index - return output.squeeze() + return output def _get_indices_of(self, master_side: str, dupe_side: str) -> Tuple[pd.Series, pd.Series]: master_strings = self._master diff --git a/string_grouper/test/test_string_grouper.py b/string_grouper/test/test_string_grouper.py index 6ff4cf5e..e7b39685 100644 --- a/string_grouper/test/test_string_grouper.py +++ b/string_grouper/test/test_string_grouper.py @@ -199,7 +199,10 @@ def test_match_strings(self, mock_StringGouper): mock_StringGrouper_instance.get_matches.assert_called_once() self.assertEqual(df, 'whatever') - @patch('string_grouper.string_grouper.StringGrouper._symmetrize_matrix', side_effect=mock_symmetrize_matrix) + @patch( + 'string_grouper.string_grouper.StringGrouper._symmetrize_matrix_and_fix_diagonal', + side_effect=mock_symmetrize_matrix + ) def test_match_list_symmetry_without_symmetrize_function(self, mock_symmetrize_matrix): """mocks StringGrouper._symmetrize_matches_list so that this test fails whenever _matches_list is **partially** symmetric which often occurs when the kwarg max_n_matches is too small""" @@ -238,17 +241,33 @@ def test_match_list_symmetry_with_symmetrize_function(self): # upper, upper_prime and their intersection should be identical. 
self.assertTrue(intersection.empty or len(upper) == len(upper_prime) == len(intersection)) - def test_match_list_diagonal(self): + @patch( + 'string_grouper.string_grouper.StringGrouper._symmetrize_matrix_and_fix_diagonal', + side_effect=mock_symmetrize_matrix + ) + def test_match_list_diagonal_without_the_fix(self, mock_symmetrize_matrix): """test fails whenever _matches_list's number of self-joins is not equal to the number of strings""" # This bug is difficult to reproduce -- I mostly encounter it while working with very large datasets; # for small datasets setting max_n_matches=1 reproduces the bug simple_example = SimpleExample() df = simple_example.customers_df['Customer Name'] matches = match_strings(df, max_n_matches=1) + mock_symmetrize_matrix.assert_called_once() num_self_joins = len(matches[matches['left_index'] == matches['right_index']]) num_strings = len(df) self.assertNotEqual(num_self_joins, num_strings) + def test_match_list_diagonal(self): + """This test ensures that all self-joins are present""" + # This bug is difficult to reproduce -- I mostly encounter it while working with very large datasets; + # for small datasets setting max_n_matches=1 reproduces the bug + simple_example = SimpleExample() + df = simple_example.customers_df['Customer Name'] + matches = match_strings(df, max_n_matches=1) + num_self_joins = len(matches[matches['left_index'] == matches['right_index']]) + num_strings = len(df) + self.assertEqual(num_self_joins, num_strings) + def test_zero_min_similarity(self): """Since sparse matrices exclude zero elements, this test ensures that zero similarity matches are returned when min_similarity <= 0. 
A bug related to this was first pointed out by @nbcvijanovic""" From bea485f8cfca65ec5ad9568b1cc1516e51ce5c6a Mon Sep 17 00:00:00 2001 From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Wed, 28 Apr 2021 11:49:11 +0200 Subject: [PATCH 04/11] made more pypi-friendly changes in README.md --- README.md | 18 +++++++++--------- string_grouper/string_grouper.py | 32 ++++++++++++++++---------------- 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index 13f22127..3ddc43c7 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ The image displayed above is a visualization of the graph-structure of one of the groups of strings found by `string_grouper`. Each circle (node) represents a string, and each connecting arc (edge) represents a match between a pair of strings with a similarity score above a given threshold score (here `0.8`). -The ***centroid*** of the group, as determined by `string_grouper` (see [tutorials/group_representatives.md](tutorials/group_representatives.md) for an explanation), is the largest node, also with the most edges originating from it. A thick line in the image denotes a strong similarity between the nodes at its ends, while a faint thin line denotes weak similarity. +The ***centroid*** of the group, as determined by `string_grouper` (see [tutorials/group_representatives.md](https://github.com/Bergvca/string_grouper/blob/master/tutorials/group_representatives.md) for an explanation), is the largest node, also with the most edges originating from it. A thick line in the image denotes a strong similarity between the nodes at its ends, while a faint thin line denotes weak similarity. 
The power of `string_grouper` is discernible from this image: in large datasets, `string_grouper` is often able to resolve indirect associations between strings even when, say, due to memory-resource-limitations, direct matches between those strings cannot be computed using conventional methods with a lower threshold similarity score. @@ -85,7 +85,7 @@ In the rest of this document the names, `Series` and `DataFrame`, refer to the f 2. `'similarity'` whose column has the similarity-scores as values, and 3. The name of `duplicates` (or `master` if `duplicates` is not given) and the name(s) of its index (or index-levels) prefixed by the string `'right_'`. - Indexes (or their levels) only appear when the keyword argument `ignore_index=False` (the default). (See [tutorials/ignore_index_and_replace_na.md](tutorials/ignore_index_and_replace_na.md) for a demonstration.) + Indexes (or their levels) only appear when the keyword argument `ignore_index=False` (the default). (See [tutorials/ignore_index_and_replace_na.md](https://github.com/Bergvca/string_grouper/blob/master/tutorials/ignore_index_and_replace_na.md) for a demonstration.) If either `master` or `duplicates` has no name, it assumes the name `'side'` which is then prefixed as described above. Similarly, if any of the indexes (or index-levels) has no name it assumes its `pandas` default name (`'index'`, `'level_0'`, and so on) and is then prefixed as described above. @@ -101,7 +101,7 @@ In the rest of this document the names, `Series` and `DataFrame`, refer to the f The name of the output `Series` is the same as that of `master` prefixed with the string `'most_similar_'`. If `master` has no name, it is assumed to have the name `'master'` before being prefixed. - If `ignore_index=False` (the default), `match_most_similar` returns a `DataFrame` containing the same `Series` described above as one of its columns. So it inherits the same index and length as `duplicates`. 
The rest of its columns correspond to the index (or index-levels) of `master` and thus contain the index-labels of the most similar strings being output as values. If there are no similar strings in `master` for a given string in `duplicates` then the value(s) assigned to this index-column(s) for that string is `NaN` by default. However, if the keyword argument `replace_na=True`, then these `NaN` values are replaced with the index-label(s) of that string in `duplicates`. Note that such replacements can only occur if the indexes of `master` and `duplicates` have the same number of levels. (See [tutorials/ignore_index_and_replace_na.md](tutorials/ignore_index_and_replace_na.md#MMS) for a demonstration.) + If `ignore_index=False` (the default), `match_most_similar` returns a `DataFrame` containing the same `Series` described above as one of its columns. So it inherits the same index and length as `duplicates`. The rest of its columns correspond to the index (or index-levels) of `master` and thus contain the index-labels of the most similar strings being output as values. If there are no similar strings in `master` for a given string in `duplicates` then the value(s) assigned to this index-column(s) for that string is `NaN` by default. However, if the keyword argument `replace_na=True`, then these `NaN` values are replaced with the index-label(s) of that string in `duplicates`. Note that such replacements can only occur if the indexes of `master` and `duplicates` have the same number of levels. (See [tutorials/ignore_index_and_replace_na.md](https://github.com/Bergvca/string_grouper/blob/master/tutorials/ignore_index_and_replace_na.md#MMS) for a demonstration.) Each column-name of the output `DataFrame` has the same name as its corresponding column, index, or index-level of `master` prefixed with the string `'most_similar_'`. 
@@ -109,7 +109,7 @@ In the rest of this document the names, `Series` and `DataFrame`, refer to the f * #### `group_similar_strings` - Takes a single `Series` of strings (`strings_to_group`) and groups them by assigning to each string one string from `strings_to_group` chosen as the group-representative for each group of similar strings found. (See [tutorials/group_representatives.md](tutorials/group_representatives.md) for details on how the the group-representatives are chosen.) + Takes a single `Series` of strings (`strings_to_group`) and groups them by assigning to each string one string from `strings_to_group` chosen as the group-representative for each group of similar strings found. (See [tutorials/group_representatives.md](https://github.com/Bergvca/string_grouper/blob/master/tutorials/group_representatives.md) for details on how the group-representatives are chosen.) If `ignore_index=True`, the output is a `Series` (with the same name as `strings_to_group` prefixed by the string `'group_rep_'`) of the same length and index as `strings_to_group` containing the group-representative strings. If `strings_to_group` has no name then the name of the returned `Series` is `'group_rep'`. @@ -140,11 +140,11 @@ All functions are built using a class **`StringGrouper`**. This class can be use Defaults to `0.8` * **`number_of_processes`**: The number of processes used by the cosine similarity calculation. Defaults to `number of cores on a machine - 1.` - * **`ignore_index`**: Determines whether indexes are ignored or not. If `False` (the default), index-columns will appear in the output, otherwise not. (See [tutorials/ignore_index_and_replace_na.md](tutorials/ignore_index_and_replace_na.md) for a demonstration.) - * **`replace_na`**: For function `match_most_similar`, determines whether `NaN` values in index-columns are replaced or not by index-labels from `duplicates`. Defaults to `False`.
(See [tutorials/ignore_index_and_replace_na.md](tutorials/ignore_index_and_replace_na.md) for a demonstration.) - * **`include_zeroes`**: When `min_similarity` ≤ 0, determines whether zero-similarity matches appear in the output. Defaults to `True`. (See [tutorials/zero_similarity.md](tutorials/zero_similarity.md) for a demonstration.) **Warning:** Make sure the kwarg `max_n_matches` is sufficiently high to capture ***all*** nonzero-similarity-matches, otherwise some zero-similarity-matches returned will be false. + * **`ignore_index`**: Determines whether indexes are ignored or not. If `False` (the default), index-columns will appear in the output, otherwise not. (See [tutorials/ignore_index_and_replace_na.md](https://github.com/Bergvca/string_grouper/blob/master/tutorials/ignore_index_and_replace_na.md) for a demonstration.) + * **`replace_na`**: For function `match_most_similar`, determines whether `NaN` values in index-columns are replaced or not by index-labels from `duplicates`. Defaults to `False`. (See [tutorials/ignore_index_and_replace_na.md](https://github.com/Bergvca/string_grouper/blob/master/tutorials/ignore_index_and_replace_na.md) for a demonstration.) + * **`include_zeroes`**: When `min_similarity` ≤ 0, determines whether zero-similarity matches appear in the output. Defaults to `True`. (See [tutorials/zero_similarity.md](https://github.com/Bergvca/string_grouper/blob/master/tutorials/zero_similarity.md) for a demonstration.) **Warning:** Make sure the kwarg `max_n_matches` is sufficiently high to capture ***all*** nonzero-similarity-matches, otherwise some zero-similarity-matches returned will be false. * **`suppress_warning`**: when `min_similarity` ≤ 0 and `include_zeroes` is `True`, determines whether or not to suppress the message warning that `max_n_matches` may be too small. Defaults to `False`. - * **`group_rep`**: For function `group_similar_strings`, determines how group-representatives are chosen. 
Allowed values are `'centroid'` (the default) and `'first'`. See [tutorials/group_representatives.md](tutorials/group_representatives.md) for an explanation. + * **`group_rep`**: For function `group_similar_strings`, determines how group-representatives are chosen. Allowed values are `'centroid'` (the default) and `'first'`. See [tutorials/group_representatives.md](https://github.com/Bergvca/string_grouper/blob/master/tutorials/group_representatives.md) for an explanation. ## Examples @@ -306,7 +306,7 @@ Out of the four company names in `duplicates`, three companies are found in the ### Finding duplicates from a (database extract to) DataFrame where IDs for rows are supplied. -A very common scenario is the case where duplicate records for an entity have been entered into a database. That is, there are two or more records where a name field has slightly different spelling. For example, "A.B. Corporation" and "AB Corporation". Using the optional 'ID' parameter in the `match_strings` function duplicates can be found easily. A [tutorial](tutorials/tutorial_1.md) that steps though the process with an example data set is available. +A very common scenario is the case where duplicate records for an entity have been entered into a database. That is, there are two or more records where a name field has slightly different spelling. For example, "A.B. Corporation" and "AB Corporation". Using the optional 'ID' parameter in the `match_strings` function duplicates can be found easily. A [tutorial](https://github.com/Bergvca/string_grouper/blob/master/tutorials/tutorial_1.md) that steps through the process with an example data set is available.
### For a second data set, find only the most similar match diff --git a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py index c022a1df..e1a2f80b 100644 --- a/string_grouper/string_grouper.py +++ b/string_grouper/string_grouper.py @@ -467,22 +467,6 @@ def _get_non_matches_list(self, suppress_warning=False) -> pd.DataFrame: missing_pairs['similarity'] = 0 return missing_pairs - @staticmethod - def _symmetrize_matrix(AA: csr_matrix) -> csr_matrix: - A = AA.tolil() - r, c = A.nonzero() - A[c, r] = A[r, c] - return A.tocsr() - - @staticmethod - def _get_matches_list(matches: csr_matrix) -> pd.DataFrame: - """Returns a list of all the indices of matches""" - r, c = matches.nonzero() - matches_list = pd.DataFrame({'master_side': r.astype(np.int64), - 'dupe_side': c.astype(np.int64), - 'similarity': matches.data}) - return matches_list - def _get_nearest_matches(self, ignore_index=False, replace_na=False) -> Union[pd.DataFrame, pd.Series]: @@ -633,6 +617,22 @@ def _validate_replace_na_and_drop(self): "index if the number of index-levels does not equal the number of index-columns." 
) + @staticmethod + def _symmetrize_matrix(AA: csr_matrix) -> csr_matrix: + A = AA.tolil() + r, c = A.nonzero() + A[c, r] = A[r, c] + return A.tocsr() + + @staticmethod + def _get_matches_list(matches: csr_matrix) -> pd.DataFrame: + """Returns a list of all the indices of matches""" + r, c = matches.nonzero() + matches_list = pd.DataFrame({'master_side': r.astype(np.int64), + 'dupe_side': c.astype(np.int64), + 'similarity': matches.data}) + return matches_list + @staticmethod def _make_symmetric(new_matches: pd.DataFrame) -> pd.DataFrame: columns_switched = pd.DataFrame({'master_side': new_matches.dupe_side, From 2e5d9a3530b0cc09607387f99fcaa284903a2724 Mon Sep 17 00:00:00 2001 From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Wed, 5 May 2021 14:45:19 +0200 Subject: [PATCH 05/11] added unittest for get_groups() with single-valued input Series --- string_grouper/test/test_string_grouper.py | 37 ++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/string_grouper/test/test_string_grouper.py b/string_grouper/test/test_string_grouper.py index e7b39685..518ebf52 100644 --- a/string_grouper/test/test_string_grouper.py +++ b/string_grouper/test/test_string_grouper.py @@ -491,6 +491,43 @@ def test_get_groups_single_df_group_rep_default(self): ) ) + def test_get_groups_single_valued_series(self): + """This test ensures that get_groups() returns a single-valued DataFrame or Series object + since the input-series is also single-valued. 
This test was created in response to a bug discovered + by George Walker""" + pd.testing.assert_frame_equal( + pd.DataFrame([(0, "hello")], columns=['group_rep_index', 'group_rep']), + group_similar_strings( + pd.Series(["hello"]), + min_similarity=0.6 + ) + ) + pd.testing.assert_series_equal( + pd.Series(["hello"], name='group_rep'), + group_similar_strings( + pd.Series(["hello"]), + min_similarity=0.6, + ignore_index=True + ) + ) + pd.testing.assert_frame_equal( + pd.DataFrame([(0, "hello")], columns=['most_similar_index', 'most_similar_master']), + match_most_similar( + pd.Series(["hello"]), + pd.Series(["hello"]), + min_similarity=0.6 + ) + ) + pd.testing.assert_series_equal( + pd.Series(["hello"], name='most_similar_master'), + match_most_similar( + pd.Series(["hello"]), + pd.Series(["hello"]), + min_similarity=0.6, + ignore_index=True + ) + ) + def test_get_groups_single_df_keep_index(self): """Should return a pd.Series object with the same length as the original df. The series object will contain a list of the grouped strings with their indexes displayed in columns""" From 4a0b22501920ef19c8faf64630d82c9b4a97efff Mon Sep 17 00:00:00 2001 From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Sat, 8 May 2021 10:16:43 +0200 Subject: [PATCH 06/11] fixed remaining squeeze() bugs --- string_grouper/string_grouper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py index d284d50e..86144715 100644 --- a/string_grouper/string_grouper.py +++ b/string_grouper/string_grouper.py @@ -266,7 +266,7 @@ def dot(self) -> pd.Series: raise Exception("To perform this function, both input Series must have the same length.") master_matrix, duplicate_matrix = self._get_tf_idf_matrices() # Calculate pairwise cosine similarities: - pairwise_similarities = np.asarray(master_matrix.multiply(duplicate_matrix).sum(axis=1)).squeeze() + pairwise_similarities = 
np.asarray(master_matrix.multiply(duplicate_matrix).sum(axis=1)).squeeze(axis=1) return pd.Series(pairwise_similarities, name='similarity', index=self._master.index) @validate_is_fit @@ -662,7 +662,7 @@ def _is_series_of_strings(series_to_test: pd.Series) -> bool: return False elif series_to_test.to_frame().applymap( lambda x: not isinstance(x, str) - ).squeeze().any(): + ).squeeze(axis=1).any(): return False return True From faa974cbc9d12f5a1026f743e64f9413546c6029 Mon Sep 17 00:00:00 2001 From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Sun, 9 May 2021 10:56:29 +0200 Subject: [PATCH 07/11] added error-handler to capture non-strings in input Series --- string_grouper/string_grouper.py | 186 ++++++++++++++++----- string_grouper/test/test_string_grouper.py | 35 +++- 2 files changed, 176 insertions(+), 45 deletions(-) diff --git a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py index 86144715..a619d486 100644 --- a/string_grouper/string_grouper.py +++ b/string_grouper/string_grouper.py @@ -41,10 +41,32 @@ # StringGrouper.get_nearest_matches GROUP_REP_PREFIX: str = 'group_rep_' # used to prefix and name columns of the output of StringGrouper._deduplicate + # High level functions -def compute_pairwise_similarities(string_series_1: pd.Series, +def who(bad_StringGrouper_param, param_1, param_name_1, param_2, param_name_2): + # Private utility function used by high-level functions (that call StringGrouper) to form a + # descriptive name for their series input parameter which caused the exception of type + # StringGrouperNotAllStringsException to occur + if bad_StringGrouper_param == 'master': + return f'\'{param_1.name}\' ({param_name_1})' if param_1.name else param_name_1 + else: + return f'\'{param_2.name}\' ({param_name_2})' if param_2.name else param_name_2 + + +def add_this_arg(func): + # Behind-the-scenes function-wrapper (to be used as decorator for high-level functions "func") + # that shifts the parameters of 
"func" to the right by one, inserting a reference to local + # function "this" in the first parameter position + def this(*args, **kwargs): + return func(this, *args, **kwargs) + return this + + +@add_this_arg +def compute_pairwise_similarities(this, + string_series_1: pd.Series, string_series_2: pd.Series, **kwargs) -> pd.Series: """ @@ -55,10 +77,21 @@ def compute_pairwise_similarities(string_series_1: pd.Series, :param kwargs: All other keyword arguments are passed to StringGrouperConfig :return: pandas.Series of similarity scores, the same length as string_series_1 and string_series_2 """ - return StringGrouper(string_series_1, string_series_2, **kwargs).dot() - - -def group_similar_strings(strings_to_group: pd.Series, + sg = StringGrouperPrime(string_series_1, string_series_2, **kwargs) + # error handler (for input Series with values that are not strings) + if sg.non_strings_present: + sname = who(sg.bad_series_name, + string_series_1, 'string_series_1', + string_series_2, 'string_series_2') + this.issues = sg.issues + this.issues.rename(f'Non-strings in Series {sname}', inplace=True) + raise TypeError(sg.error_msg(sname, 'compute_pairwise_similarities')) + return sg.dot() + + +@add_this_arg +def group_similar_strings(this, + strings_to_group: pd.Series, string_ids: Optional[pd.Series] = None, **kwargs) -> Union[pd.DataFrame, pd.Series]: """ @@ -76,11 +109,22 @@ def group_similar_strings(strings_to_group: pd.Series, :param kwargs: All other keyword arguments are passed to StringGrouperConfig. (Optional) :return: pandas.Series or pandas.DataFrame. 
""" - string_grouper = StringGrouper(strings_to_group, master_id=string_ids, **kwargs).fit() - return string_grouper.get_groups() - - -def match_most_similar(master: pd.Series, + sg = StringGrouperPrime(strings_to_group, master_id=string_ids, **kwargs) + # error handler (for input Series with values that are not strings) + if sg.non_strings_present: + sname = who(sg.bad_series_name, + strings_to_group, 'strings_to_group', + None, '') + this.issues = sg.issues + this.issues.rename(f'Non-strings in Series {sname}', inplace=True) + raise TypeError(sg.error_msg(sname, 'group_similar_strings')) + fit_sg = sg.fit() + return fit_sg.get_groups() + + +@add_this_arg +def match_most_similar(this, + master: pd.Series, duplicates: pd.Series, master_id: Optional[pd.Series] = None, duplicates_id: Optional[pd.Series] = None, @@ -105,15 +149,26 @@ def match_most_similar(master: pd.Series, :param kwargs: All other keyword arguments are passed to StringGrouperConfig. (Optional) :return: pandas.Series or pandas.DataFrame. 
""" - string_grouper = StringGrouper(master, - duplicates=duplicates, - master_id=master_id, - duplicates_id=duplicates_id, - **kwargs).fit() - return string_grouper.get_groups() - - -def match_strings(master: pd.Series, + sg = StringGrouperPrime(master, + duplicates=duplicates, + master_id=master_id, + duplicates_id=duplicates_id, + **kwargs) + # error handler (for input Series with values that are not strings) + if sg.non_strings_present: + sname = who(sg.bad_series_name, + master, 'master', + duplicates, 'duplicates') + this.issues = sg.issues + this.issues.rename(f'Non-strings in Series {sname}', inplace=True) + raise TypeError(sg.error_msg(sname, 'match_most_similar')) + fit_sg = sg.fit() + return fit_sg.get_groups() + + +@add_this_arg +def match_strings(this, + master: pd.Series, duplicates: Optional[pd.Series] = None, master_id: Optional[pd.Series] = None, duplicates_id: Optional[pd.Series] = None, @@ -130,12 +185,20 @@ def match_strings(master: pd.Series, :param kwargs: All other keyword arguments are passed to StringGrouperConfig. :return: pandas.Dataframe. 
""" - string_grouper = StringGrouper(master, - duplicates=duplicates, - master_id=master_id, - duplicates_id=duplicates_id, - **kwargs).fit() - return string_grouper.get_matches() + sg = StringGrouperPrime(master, + duplicates=duplicates, + master_id=master_id, + duplicates_id=duplicates_id, + **kwargs) + if sg.non_strings_present: + sname = who(sg.bad_series_name, + master, 'master', + duplicates, 'duplicates') + this.issues = sg.issues + this.issues.rename(f'Non-strings in Series {sname}', inplace=True) + raise TypeError(sg.error_msg(sname, 'match_strings')) + fit_sg = sg.fit() + return fit_sg.get_matches() class StringGrouperConfig(NamedTuple): @@ -194,6 +257,10 @@ class StringGrouperNotFitException(Exception): pass +class StringGrouperNotAllStringsException(TypeError): + """Raised when either input Series master or duplicates contains non-strings""" + pass + class StringGrouper(object): def __init__(self, master: pd.Series, duplicates: Optional[pd.Series] = None, @@ -213,9 +280,9 @@ def __init__(self, master: pd.Series, :param kwargs: All other keyword arguments are passed to StringGrouperConfig """ # Validate match strings input - if not StringGrouper._is_series_of_strings(master) or \ - (duplicates is not None and not StringGrouper._is_series_of_strings(duplicates)): - raise TypeError('Input does not consist of pandas.Series containing only Strings') + self.issues: pd.Series = None + self._check_string_series(master, 'master') + if (duplicates is not None): self._check_string_series(duplicates, 'duplicates') # Validate optional IDs input if not StringGrouper._is_input_data_combination_valid(duplicates, master_id, duplicates_id): raise Exception('List of data Series options is invalid') @@ -601,6 +668,21 @@ def _get_indices_of(self, master_side: str, dupe_side: str) -> Tuple[pd.Series, dupe_indices = dupe_strings[dupe_strings == dupe_side].index.to_series().reset_index(drop=True) return master_indices, dupe_indices + def _check_string_series(self, 
series_to_test: pd.Series, which: str): + self.bad_series_name = which + StringGrouper._check_type(series_to_test, which) + self._check_content(series_to_test, which) + + def _check_content(self, series_to_test: pd.Series, which: str): + non_strings_exist = series_to_test.to_frame().applymap( + lambda x: (not isinstance(x, str)) or len(x) == 0 + ).squeeze(axis=1) + if non_strings_exist.any(): + self.issues = series_to_test[non_strings_exist] + sname = f' {series_to_test.name}' if series_to_test.name else '' + self.issues.rename(f'Non-strings in {which} Series{sname}', inplace=True) + raise StringGrouperNotAllStringsException + def _validate_group_rep_specs(self): group_rep_options = (GROUP_REP_FIRST, GROUP_REP_CENTROID) if self._config.group_rep not in group_rep_options: @@ -617,6 +699,11 @@ def _validate_replace_na_and_drop(self): "index if the number of index-levels does not equal the number of index-columns." ) + @staticmethod + def _check_type(series_to_test: pd.Series, which: str): + if not isinstance(series_to_test, pd.Series): + raise TypeError(f'Input {which} is not a pandas.Series containing only Strings') + @staticmethod def _symmetrize_matrix_and_fix_diagonal(AA: csr_matrix) -> csr_matrix: A = AA.tolil() @@ -656,16 +743,6 @@ def _validate_strings_exist(master_side, dupe_side, master_strings, dupe_strings elif not dupe_strings.isin([dupe_side]).any(): raise ValueError(f'{dupe_side} not found in StringGrouper dupe string series') - @staticmethod - def _is_series_of_strings(series_to_test: pd.Series) -> bool: - if not isinstance(series_to_test, pd.Series): - return False - elif series_to_test.to_frame().applymap( - lambda x: not isinstance(x, str) - ).squeeze(axis=1).any(): - return False - return True - @staticmethod def _is_input_data_combination_valid(duplicates, master_id, duplicates_id) -> bool: if duplicates is None and (duplicates_id is not None) \ @@ -680,3 +757,36 @@ def _validate_id_data(master, duplicates, master_id, duplicates_id): raise 
Exception('Both master and master_id must be pandas.Series of the same length.') if duplicates is not None and duplicates_id is not None and len(duplicates) != len(duplicates_id): raise Exception('Both duplicates and duplicates_id must be pandas.Series of the same length.') + + +class StringGrouperPrime(StringGrouper): + # (To be used in high-level functions) + # Child class of StringGrouper that captures information about the input Series + # that caused the StringGrouperNotAllStringsException even when the StringGrouper + # instance is not fully initialized + def __init__(self, master: pd.Series, + duplicates: Optional[pd.Series] = None, + master_id: Optional[pd.Series] = None, + duplicates_id: Optional[pd.Series] = None, + **kwargs): + self.issues = None + self.non_strings_present = False + self.bad_series_name = None + try: + super().__init__(master, + duplicates=duplicates, + master_id=master_id, + duplicates_id=duplicates_id, + **kwargs) + except StringGrouperNotAllStringsException: + self.non_strings_present = True + + def error_msg(self, bad_series_name, function_name): + nl = ':\n' + return ( + f'\n\nERROR: Input pandas Series {bad_series_name} contains values that are not strings!\n' + f'Display the pandas Series \'{function_name}.issues\' to find where these values are' + f'{nl if 0 < len(self.issues) < 12 else "."}' + f'{self.issues.to_frame() if 0 < len(self.issues) < 12 else ""}' + ) + diff --git a/string_grouper/test/test_string_grouper.py b/string_grouper/test/test_string_grouper.py index 518ebf52..7c4cbb39 100644 --- a/string_grouper/test/test_string_grouper.py +++ b/string_grouper/test/test_string_grouper.py @@ -5,7 +5,8 @@ from string_grouper.string_grouper import DEFAULT_MIN_SIMILARITY, \ DEFAULT_MAX_N_MATCHES, DEFAULT_REGEX, \ DEFAULT_NGRAM_SIZE, DEFAULT_N_PROCESSES, DEFAULT_IGNORE_CASE, \ - StringGrouperConfig, StringGrouper, StringGrouperNotFitException, \ + StringGrouperConfig, StringGrouper, \ + StringGrouperNotFitException, 
StringGrouperNotAllStringsException, \ match_most_similar, group_similar_strings, match_strings,\ compute_pairwise_similarities from unittest.mock import patch @@ -144,12 +145,14 @@ def test_compute_pairwise_similarities_data_integrity(self): with self.assertRaises(Exception): _ = compute_pairwise_similarities(df1, df2[:-2]) - @patch('string_grouper.string_grouper.StringGrouper') + @patch('string_grouper.string_grouper.StringGrouperPrime') def test_group_similar_strings(self, mock_StringGouper): """mocks StringGrouper to test if the high-level function group_similar_strings utilizes it as expected""" mock_StringGrouper_instance = mock_StringGouper.return_value mock_StringGrouper_instance.fit.return_value = mock_StringGrouper_instance mock_StringGrouper_instance.get_groups.return_value = 'whatever' + mock_StringGrouper_instance.non_strings_present = False + mock_StringGrouper_instance.error_msg.return_value = 'mock_error' test_series_1 = None test_series_id_1 = None @@ -162,12 +165,14 @@ def test_group_similar_strings(self, mock_StringGouper): mock_StringGrouper_instance.get_groups.assert_called_once() self.assertEqual(df, 'whatever') - @patch('string_grouper.string_grouper.StringGrouper') + @patch('string_grouper.string_grouper.StringGrouperPrime') def test_match_most_similar(self, mock_StringGouper): """mocks StringGrouper to test if the high-level function match_most_similar utilizes it as expected""" mock_StringGrouper_instance = mock_StringGouper.return_value mock_StringGrouper_instance.fit.return_value = mock_StringGrouper_instance mock_StringGrouper_instance.get_groups.return_value = 'whatever' + mock_StringGrouper_instance.non_strings_present = False + mock_StringGrouper_instance.error_msg.return_value = 'mock_error' test_series_1 = None test_series_2 = None @@ -184,12 +189,14 @@ def test_match_most_similar(self, mock_StringGouper): mock_StringGrouper_instance.get_groups.assert_called_once() self.assertEqual(df, 'whatever') - 
@patch('string_grouper.string_grouper.StringGrouper') + @patch('string_grouper.string_grouper.StringGrouperPrime') def test_match_strings(self, mock_StringGouper): """mocks StringGrouper to test if the high-level function match_strings utilizes it as expected""" mock_StringGrouper_instance = mock_StringGouper.return_value mock_StringGrouper_instance.fit.return_value = mock_StringGrouper_instance mock_StringGrouper_instance.get_matches.return_value = 'whatever' + mock_StringGrouper_instance.non_strings_present = False + mock_StringGrouper_instance.error_msg.return_value = 'mock_error' test_series_1 = None test_series_id_1 = None @@ -792,10 +799,24 @@ def test_string_grouper_type_error(self): """StringGrouper should raise an typeerror master or duplicates are not a series of strings""" with self.assertRaises(TypeError): _ = StringGrouper('foo', 'bar') - with self.assertRaises(TypeError): + with self.assertRaises(StringGrouperNotAllStringsException): _ = StringGrouper(pd.Series(['foo', 'bar']), pd.Series(['foo', 1])) - with self.assertRaises(TypeError): + with self.assertRaises(StringGrouperNotAllStringsException): _ = StringGrouper(pd.Series(['foo', np.nan]), pd.Series(['foo', 'j'])) + with self.assertRaises(StringGrouperNotAllStringsException): + _ = StringGrouper(pd.Series(['foo', 'j']), pd.Series(['foo', np.nan])) + + def test_not_all_strings_exception_in_high_level_fucntions(self): + good_series = pd.Series(['foo', 'bar']) + bad_series = pd.Series([None, 'foo', 1, np.nan], name='dupes') + with self.assertRaises(TypeError): + _ = compute_pairwise_similarities(good_series, bad_series.rename_axis('dupes_id')) + with self.assertRaises(TypeError): + _ = group_similar_strings(bad_series.rename_axis('string_id')) + with self.assertRaises(TypeError): + _ = match_most_similar(bad_series.rename('master'), good_series) + with self.assertRaises(TypeError): + _ = match_strings(good_series, bad_series.rename('dupes').rename_axis('dupes_id')) def test_prior_matches_added(self): 
"""When a new match is added, any pre-existing matches should also be updated""" @@ -803,7 +824,7 @@ def test_prior_matches_added(self): 'microsoftoffice 365 home', 'microsoftoffice 365 pers', 'microsoft office' - ] + ] df = pd.DataFrame(sample, columns=['name']) From 0bc533f9352675ca7a2131525a137e6173eb8268 Mon Sep 17 00:00:00 2001 From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Tue, 11 May 2021 12:14:11 +0200 Subject: [PATCH 08/11] made PEP8-conforming modifications --- string_grouper/string_grouper.py | 94 ++++++++++--------- string_grouper/test/test_string_grouper.py | 24 ++--- string_grouper_utils/string_grouper_utils.py | 4 +- .../test/test_string_grouper_utils.py | 4 +- 4 files changed, 67 insertions(+), 59 deletions(-) diff --git a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py index a619d486..20ccbd43 100644 --- a/string_grouper/string_grouper.py +++ b/string_grouper/string_grouper.py @@ -18,27 +18,26 @@ DEFAULT_IGNORE_CASE: bool = True # ignores case by default DEFAULT_DROP_INDEX: bool = False # includes index-columns in output DEFAULT_REPLACE_NA: bool = False # when finding the most similar strings, does not replace NaN values in most - # similar string index-columns with corresponding duplicates-index values -DEFAULT_INCLUDE_ZEROES: bool = True # when the minimum cosine similarity <=0, determines whether zero-similarity - # matches appear in the output +# similar string index-columns with corresponding duplicates-index values +DEFAULT_INCLUDE_ZEROES: bool = True # when the minimum cosine similarity <=0, determines whether zero-similarity +# matches appear in the output DEFAULT_SUPPRESS_WARNING: bool = False # when the minimum cosine similarity <=0 and zero-similarity matches are - # requested, determines whether or not to suppress the message warning that - # max_n_matches may be too small +# requested, determines whether or not to suppress the message warning that max_n_matches may be too small 
GROUP_REP_CENTROID: str = 'centroid' # Option value to select the string in each group with the largest - # similarity aggregate as group-representative: +# similarity aggregate as group-representative: GROUP_REP_FIRST: str = 'first' # Option value to select the first string in each group as group-representative: -DEFAULT_GROUP_REP: str = GROUP_REP_CENTROID # chooses group centroid as group-representative by default +DEFAULT_GROUP_REP: str = GROUP_REP_CENTROID # chooses group centroid as group-representative by default # The following string constants are used by (but aren't [yet] options passed to) StringGrouper DEFAULT_COLUMN_NAME: str = 'side' # used to name non-index columns of the output of StringGrouper.get_matches -DEFAULT_ID_NAME: str = 'id' # used to name id-columns in the output of StringGrouper.get_matches +DEFAULT_ID_NAME: str = 'id' # used to name id-columns in the output of StringGrouper.get_matches LEFT_PREFIX: str = 'left_' # used to prefix columns on the left of the output of StringGrouper.get_matches RIGHT_PREFIX: str = 'right_' # used to prefix columns on the right of the output of StringGrouper.get_matches MOST_SIMILAR_PREFIX: str = 'most_similar_' # used to prefix columns of the output of - # StringGrouper._get_nearest_matches -DEFAULT_MASTER_NAME: str = 'master' # used to name non-index column of the output of StringGrouper.get_nearest_matches +# StringGrouper._get_nearest_matches +DEFAULT_MASTER_NAME: str = 'master' # used to name non-index column of the output of StringGrouper.get_nearest_matches DEFAULT_MASTER_ID_NAME: str = f'{DEFAULT_MASTER_NAME}_{DEFAULT_ID_NAME}' # used to name id-column of the output of - # StringGrouper.get_nearest_matches +# StringGrouper.get_nearest_matches GROUP_REP_PREFIX: str = 'group_rep_' # used to prefix and name columns of the output of StringGrouper._deduplicate @@ -65,7 +64,7 @@ def this(*args, **kwargs): @add_this_arg -def compute_pairwise_similarities(this, +def compute_pairwise_similarities(this, 
string_series_1: pd.Series, string_series_2: pd.Series, **kwargs) -> pd.Series: @@ -214,11 +213,11 @@ class StringGrouperConfig(NamedTuple): Defaults to number of cores on a machine - 1. :param ignore_case: bool. Whether or not case should be ignored. Defaults to True (ignore case). :param ignore_index: whether or not to exclude string Series index-columns in output. Defaults to False. - :param include_zeroes: when the minimum cosine similarity <=0, determines whether zero-similarity matches + :param include_zeroes: when the minimum cosine similarity <=0, determines whether zero-similarity matches appear in the output. Defaults to True. :param suppress_warning: when min_similarity <=0 and include_zeroes=True, determines whether or not to supress the message warning that max_n_matches may be too small. Defaults to False. - :param replace_na: whether or not to replace NaN values in most similar string index-columns with + :param replace_na: whether or not to replace NaN values in most similar string index-columns with corresponding duplicates-index values. Defaults to False. :param group_rep: str. The scheme to select the group-representative. Default is 'centroid'. The other choice is 'first'. 
@@ -261,6 +260,7 @@ class StringGrouperNotAllStringsException(TypeError): """Raised when either input Series master or duplicates contains non-strings""" pass + class StringGrouper(object): def __init__(self, master: pd.Series, duplicates: Optional[pd.Series] = None, @@ -282,7 +282,8 @@ def __init__(self, master: pd.Series, # Validate match strings input self.issues: pd.Series = None self._check_string_series(master, 'master') - if (duplicates is not None): self._check_string_series(duplicates, 'duplicates') + if (duplicates is not None): + self._check_string_series(duplicates, 'duplicates') # Validate optional IDs input if not StringGrouper._is_input_data_combination_valid(duplicates, master_id, duplicates_id): raise Exception('List of data Series options is invalid') @@ -320,7 +321,7 @@ def fit(self) -> 'StringGrouper': matches = self._build_matches(master_matrix, duplicate_matrix) if self._duplicates is None: # the matrix of matches needs to be symmetric!!! (i.e., if A != B and A matches B; then B matches A) - # and each of its diagonal components must be equal to 1 + # and each of its diagonal components must be equal to 1 matches = StringGrouper._symmetrize_matrix_and_fix_diagonal(matches) # retrieve all matches self._matches_list = self._get_matches_list(matches) @@ -339,15 +340,15 @@ def dot(self) -> pd.Series: @validate_is_fit def get_matches(self, ignore_index: Optional[bool] = None, - include_zeroes: Optional[bool]=None, - suppress_warning: Optional[bool]=None) -> pd.DataFrame: + include_zeroes: Optional[bool] = None, + suppress_warning: Optional[bool] = None) -> pd.DataFrame: """ Returns a DataFrame with all the matches and their cosine similarity. If optional IDs are used, returned as extra columns with IDs matched to respective data rows - :param ignore_index: whether or not to exclude string Series index-columns in output. Defaults to + :param ignore_index: whether or not to exclude string Series index-columns in output. 
Defaults to self._config.ignore_index. - :param include_zeroes: when the minimum cosine similarity <=0, determines whether zero-similarity matches + :param include_zeroes: when the minimum cosine similarity <=0, determines whether zero-similarity matches appear in the output. Defaults to self._config.include_zeroes. :param suppress_warning: when min_similarity <=0 and include_zeroes=True, determines whether or not to suppress the message warning that max_n_matches may be too small. Defaults to self._config.suppress_warning. @@ -372,19 +373,22 @@ def prefix_column_names(data: Union[pd.Series, pd.DataFrame], prefix: str): else: return data.rename(f"{prefix}{data.name}") - if ignore_index is None: ignore_index = self._config.ignore_index - if include_zeroes is None: include_zeroes = self._config.include_zeroes - if suppress_warning is None: suppress_warning = self._config.suppress_warning + if ignore_index is None: + ignore_index = self._config.ignore_index + if include_zeroes is None: + include_zeroes = self._config.include_zeroes + if suppress_warning is None: + suppress_warning = self._config.suppress_warning if self._config.min_similarity > 0 or not include_zeroes: matches_list = self._matches_list elif include_zeroes: # Here's a fix to a bug pointed out by one GitHub user (@nbcvijanovic): - # the fix includes zero-similarity matches that are missing by default - # in _matches_list due to our use of sparse matrices + # the fix includes zero-similarity matches that are missing by default + # in _matches_list due to our use of sparse matrices non_matches_list = self._get_non_matches_list(suppress_warning) matches_list = self._matches_list if non_matches_list.empty else \ pd.concat([self._matches_list, non_matches_list], axis=0, ignore_index=True) - + left_side, right_side = get_both_sides(self._master, self._duplicates, drop_index=ignore_index) similarity = matches_list.similarity.reset_index(drop=True) if self._master_id is None: @@ -426,16 +430,18 @@ def 
get_groups(self, If there are IDs (master_id and/or duplicates_id) then the IDs corresponding to the string outputs above are returned as well altogether in a DataFrame. - :param ignore_index: whether or not to exclude string Series index-columns in output. Defaults to + :param ignore_index: whether or not to exclude string Series index-columns in output. Defaults to self._config.ignore_index. - :param replace_na: whether or not to replace NaN values in most similar string index-columns with + :param replace_na: whether or not to replace NaN values in most similar string index-columns with corresponding duplicates-index values. Defaults to self._config.replace_na. """ - if ignore_index is None: ignore_index = self._config.ignore_index + if ignore_index is None: + ignore_index = self._config.ignore_index if self._duplicates is None: return self._deduplicate(ignore_index=ignore_index) else: - if replace_na is None: replace_na = self._config.replace_na + if replace_na is None: + replace_na = self._config.replace_na return self._get_nearest_matches(ignore_index=ignore_index, replace_na=replace_na) @validate_is_fit @@ -524,7 +530,8 @@ def _get_non_matches_list(self, suppress_warning=False) -> pd.DataFrame: all_pairs = pd.MultiIndex.from_product([range(m_sz), range(d_sz)], names=['master_side', 'dupe_side']) matched_pairs = pd.MultiIndex.from_frame(self._matches_list[['master_side', 'dupe_side']]) missing_pairs = all_pairs.difference(matched_pairs) - if missing_pairs.empty: return pd.DataFrame() + if missing_pairs.empty: + return pd.DataFrame() if (self._config.max_n_matches < d_sz) and not suppress_warning: warnings.warn(f'WARNING: max_n_matches={self._config.max_n_matches} may be too small!\n' f'\t\t Some zero-similarity matches returned may be false!\n' @@ -542,8 +549,8 @@ def _get_nearest_matches(self, master_label = f'{prefix}{self._master.name if self._master.name else DEFAULT_MASTER_NAME}' master = self._master.rename(master_label).reset_index(drop=ignore_index) 
dupes = self._duplicates.rename('duplicates').reset_index(drop=ignore_index) - - # Rename new master-columns to avoid possible conflict with new dupes-columns when later merging + + # Rename new master-columns to avoid possible conflict with new dupes-columns when later merging if isinstance(dupes, pd.DataFrame): master.rename( columns={col: f'{prefix}{col}' for col in master.columns if str(col) != master_label}, @@ -573,14 +580,14 @@ def _get_nearest_matches(self, if self._master_id is not None: # Also update the master_id-series with the duplicates_id in cases were there is no match dupes_max_sim.loc[rows_to_update, master_id_label] = dupes_max_sim[rows_to_update].duplicates_id - + # For some weird reason, pandas' merge function changes int-datatype columns to float when NaN values # appear within them. So here we change them back to their original datatypes if possible: if dupes_max_sim[master_id_label].dtype != self._master_id.dtype and \ - self._duplicates_id.dtype == self._master_id.dtype: + self._duplicates_id.dtype == self._master_id.dtype: dupes_max_sim.loc[:, master_id_label] = \ - dupes_max_sim.loc[:, master_id_label].astype(self._master_id.dtype) - + dupes_max_sim.loc[:, master_id_label].astype(self._master_id.dtype) + # Prepare the output: required_column_list = [master_label] if self._master_id is None else [master_id_label, master_label] index_column_list = \ @@ -590,13 +597,13 @@ def _get_nearest_matches(self, # Update the master index-columns with the duplicates index-column values in cases were there is no match dupes_index_columns = [col for col in dupes.columns if str(col) != 'duplicates'] dupes_max_sim.loc[rows_to_update, index_column_list] = \ - dupes_max_sim.loc[rows_to_update, dupes_index_columns].values - + dupes_max_sim.loc[rows_to_update, dupes_index_columns].values + # Restore their original datatypes if possible: for m, d in zip(index_column_list, dupes_index_columns): if dupes_max_sim[m].dtype != master[m].dtype and dupes[d].dtype == 
master[m].dtype: dupes_max_sim.loc[:, m] = dupes_max_sim.loc[:, m].astype(master[m].dtype) - + # Make sure to keep same order as duplicates dupes_max_sim = dupes_max_sim.sort_values('dupe_side').set_index('dupe_side') output = dupes_max_sim[index_column_list + required_column_list] @@ -667,9 +674,9 @@ def _get_indices_of(self, master_side: str, dupe_side: str) -> Tuple[pd.Series, master_indices = master_strings[master_strings == master_side].index.to_series().reset_index(drop=True) dupe_indices = dupe_strings[dupe_strings == dupe_side].index.to_series().reset_index(drop=True) return master_indices, dupe_indices - + def _check_string_series(self, series_to_test: pd.Series, which: str): - self.bad_series_name = which + self.bad_series_name = which StringGrouper._check_type(series_to_test, which) self._check_content(series_to_test, which) @@ -780,7 +787,7 @@ def __init__(self, master: pd.Series, **kwargs) except StringGrouperNotAllStringsException: self.non_strings_present = True - + def error_msg(self, bad_series_name, function_name): nl = ':\n' return ( @@ -789,4 +796,3 @@ def error_msg(self, bad_series_name, function_name): f'{nl if 0 < len(self.issues) < 12 else "."}' f'{self.issues.to_frame() if 0 < len(self.issues) < 12 else ""}' ) - diff --git a/string_grouper/test/test_string_grouper.py b/string_grouper/test/test_string_grouper.py index 7c4cbb39..1892a60c 100644 --- a/string_grouper/test/test_string_grouper.py +++ b/string_grouper/test/test_string_grouper.py @@ -12,8 +12,10 @@ from unittest.mock import patch import warnings -def mock_symmetrize_matrix(A: csr_matrix) -> csr_matrix: - return A + +def mock_symmetrize_matrix(x: csr_matrix) -> csr_matrix: + return x + class SimpleExample(object): def __init__(self): @@ -211,7 +213,7 @@ def test_match_strings(self, mock_StringGouper): side_effect=mock_symmetrize_matrix ) def test_match_list_symmetry_without_symmetrize_function(self, mock_symmetrize_matrix): - """mocks StringGrouper._symmetrize_matches_list so that 
this test fails whenever _matches_list is + """mocks StringGrouper._symmetrize_matches_list so that this test fails whenever _matches_list is **partially** symmetric which often occurs when the kwarg max_n_matches is too small""" simple_example = SimpleExample() df = simple_example.customers_df2['Customer Name'] @@ -225,7 +227,7 @@ def test_match_list_symmetry_without_symmetrize_function(self, mock_symmetrize_m # obtain the intersection between upper and upper_prime: intersection = upper_prime.merge(upper, how='inner', on=['master_side', 'dupe_side']) # if the intersection is empty then _matches_list is completely non-symmetric (this is acceptable) - # if the intersection is not empty then at least some matches are repeated. + # if the intersection is not empty then at least some matches are repeated. # To make sure all (and not just some) matches are repeated, the lengths of # upper, upper_prime and their intersection should be identical. self.assertFalse(intersection.empty or len(upper) == len(upper_prime) == len(intersection)) @@ -243,7 +245,7 @@ def test_match_list_symmetry_with_symmetrize_function(self): # Obtain the intersection between upper and upper_prime: intersection = upper_prime.merge(upper, how='inner', on=['master_side', 'dupe_side']) # If the intersection is empty this means _matches_list is completely non-symmetric (this is acceptable) - # If the intersection is not empty this means at least some matches are repeated. + # If the intersection is not empty this means at least some matches are repeated. # To make sure all (and not just some) matches are repeated, the lengths of # upper, upper_prime and their intersection should be identical. 
self.assertTrue(intersection.empty or len(upper) == len(upper_prime) == len(intersection)) @@ -276,7 +278,7 @@ def test_match_list_diagonal(self): self.assertEqual(num_self_joins, num_strings) def test_zero_min_similarity(self): - """Since sparse matrices exclude zero elements, this test ensures that zero similarity matches are + """Since sparse matrices exclude zero elements, this test ensures that zero similarity matches are returned when min_similarity <= 0. A bug related to this was first pointed out by @nbcvijanovic""" simple_example = SimpleExample() s_master = simple_example.customers_df['Customer Name'] @@ -285,7 +287,7 @@ def test_zero_min_similarity(self): pd.testing.assert_frame_equal(simple_example.expected_result_with_zeroes, matches) def test_zero_min_similarity_small_max_n_matches(self): - """This test ensures that a warning is issued when n_max_matches is suspected to be too small while + """This test ensures that a warning is issued when n_max_matches is suspected to be too small while min_similarity <= 0 and include_zeroes is True""" simple_example = SimpleExample() s_master = simple_example.customers_df['Customer Name'] @@ -675,9 +677,9 @@ def test_get_groups_4_df_same_similarity(self): test_series_2 = pd.Series(['foooo', 'bar', 'baz', 'foooob']) test_series_id_1 = pd.Series(['A0', 'A1', 'A2', 'A3']) test_series_id_2 = pd.Series(['B0', 'B1', 'B2', 'B3']) - sg = StringGrouper(test_series_1, - test_series_2, - master_id=test_series_id_1, + sg = StringGrouper(test_series_1, + test_series_2, + master_id=test_series_id_1, duplicates_id=test_series_id_2, ignore_index=True) sg = sg.fit() @@ -805,7 +807,7 @@ def test_string_grouper_type_error(self): _ = StringGrouper(pd.Series(['foo', np.nan]), pd.Series(['foo', 'j'])) with self.assertRaises(StringGrouperNotAllStringsException): _ = StringGrouper(pd.Series(['foo', 'j']), pd.Series(['foo', np.nan])) - + def test_not_all_strings_exception_in_high_level_fucntions(self): good_series = pd.Series(['foo', 
'bar']) bad_series = pd.Series([None, 'foo', 1, np.nan], name='dupes') diff --git a/string_grouper_utils/string_grouper_utils.py b/string_grouper_utils/string_grouper_utils.py index 11803a32..cc22cd1b 100644 --- a/string_grouper_utils/string_grouper_utils.py +++ b/string_grouper_utils/string_grouper_utils.py @@ -137,8 +137,8 @@ def get_column(col: Union[str, int, List[Union[str, int]]], data: pd.DataFrame): def parse_timestamps(timestamps: pd.Series, parserinfo=None, **kwargs) -> pd.Series: - error_msg = f"timestamps must be a Series of date-like or datetime-like strings" - error_msg += f" or datetime datatype or pandas Timestamp datatype or numbers" + error_msg = "timestamps must be a Series of date-like or datetime-like strings" + error_msg += " or datetime datatype or pandas Timestamp datatype or numbers" if is_series_of_type(str, timestamps): # if any of the strings is not datetime-like raise an exception if timestamps.to_frame().applymap(is_date).squeeze().all(): diff --git a/string_grouper_utils/test/test_string_grouper_utils.py b/string_grouper_utils/test/test_string_grouper_utils.py index 3798e3cd..0c8a8ee4 100644 --- a/string_grouper_utils/test/test_string_grouper_utils.py +++ b/string_grouper_utils/test/test_string_grouper_utils.py @@ -1,8 +1,8 @@ import unittest import pandas as pd from dateutil.parser import parse -from string_grouper_utils.string_grouper_utils import new_group_rep_by_earliest_timestamp, new_group_rep_by_completeness, \ - new_group_rep_by_highest_weight +from string_grouper_utils.string_grouper_utils import new_group_rep_by_earliest_timestamp, \ + new_group_rep_by_completeness, new_group_rep_by_highest_weight class SimpleExample(object): From 02ad0300aa5ad7f68f7003f52177dea962bbd217 Mon Sep 17 00:00:00 2001 From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Tue, 11 May 2021 13:48:20 +0200 Subject: [PATCH 09/11] updated string_grouper_utils.py to quell unittest deprecated warnings --- 
string_grouper_utils/string_grouper_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/string_grouper_utils/string_grouper_utils.py b/string_grouper_utils/string_grouper_utils.py index cc22cd1b..e674367b 100644 --- a/string_grouper_utils/string_grouper_utils.py +++ b/string_grouper_utils/string_grouper_utils.py @@ -1,7 +1,7 @@ -import numpy as np import pandas as pd from typing import List, Optional, Union from dateutil.parser import parse +from dateutil.tz import UTC from numbers import Number from datetime import datetime import re @@ -143,13 +143,13 @@ def parse_timestamps(timestamps: pd.Series, parserinfo=None, **kwargs) -> pd.Ser # if any of the strings is not datetime-like raise an exception if timestamps.to_frame().applymap(is_date).squeeze().all(): # convert strings to numpy datetime64 - return timestamps.transform(lambda x: np.datetime64(parse(x, parserinfo, **kwargs))) + return timestamps.transform(lambda x: parse(x, parserinfo, **kwargs).astimezone(UTC)) elif is_series_of_type(type(pd.Timestamp('15-1-2000')), timestamps): # convert pandas Timestamps to numpy datetime64 return timestamps.transform(lambda x: x.to_numpy()) elif is_series_of_type(datetime, timestamps): # convert python datetimes to numpy datetime64 - return timestamps.transform(lambda x: np.datetime64(x)) + return timestamps.transform(lambda x: x.astimezone(UTC)) elif is_series_of_type(Number, timestamps): return timestamps raise Exception(error_msg) From e4686e5dbff386a1d74e306d5d425ef3f09c6362 Mon Sep 17 00:00:00 2001 From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Sat, 3 Jul 2021 21:15:05 +0200 Subject: [PATCH 10/11] set max_n_matches=1 in match_most_similar() for a performance boost --- string_grouper/string_grouper.py | 9 +++++---- string_grouper/test/test_string_grouper.py | 12 ++++++------ 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py 
index d1612511..243446ee 100644 --- a/string_grouper/string_grouper.py +++ b/string_grouper/string_grouper.py @@ -103,6 +103,7 @@ def match_most_similar(master: pd.Series, :param kwargs: All other keyword arguments are passed to StringGrouperConfig. (Optional) :return: pandas.Series or pandas.DataFrame. """ + kwargs['max_n_matches'] = 1 string_grouper = StringGrouper(master, duplicates=duplicates, master_id=master_id, @@ -455,8 +456,8 @@ def _fit_vectorizer(self) -> TfidfVectorizer: def _build_matches(self, master_matrix: csr_matrix, duplicate_matrix: csr_matrix) -> csr_matrix: """Builds the cossine similarity matrix of two csr matrices""" - tf_idf_matrix_1 = master_matrix - tf_idf_matrix_2 = duplicate_matrix.transpose() + tf_idf_matrix_1 = duplicate_matrix + tf_idf_matrix_2 = master_matrix.transpose() optional_kwargs = { 'return_best_ntop': True, @@ -661,8 +662,8 @@ def _symmetrize_matrix(m_symmetric: lil_matrix) -> csr_matrix: def _get_matches_list(matches: csr_matrix) -> pd.DataFrame: """Returns a list of all the indices of matches""" r, c = matches.nonzero() - matches_list = pd.DataFrame({'master_side': r.astype(np.int64), - 'dupe_side': c.astype(np.int64), + matches_list = pd.DataFrame({'master_side': c.astype(np.int64), + 'dupe_side': r.astype(np.int64), 'similarity': matches.data}) return matches_list diff --git a/string_grouper/test/test_string_grouper.py b/string_grouper/test/test_string_grouper.py index f5f0aac8..c12b21db 100644 --- a/string_grouper/test/test_string_grouper.py +++ b/string_grouper/test/test_string_grouper.py @@ -405,8 +405,8 @@ def test_get_matches_single(self): sg = sg.fit() left_side = ['foo', 'foo', 'bar', 'baz', 'foo', 'foo'] right_side = ['foo', 'foo', 'bar', 'baz', 'foo', 'foo'] - left_index = [0, 0, 1, 2, 3, 3] - right_index = [0, 3, 1, 2, 0, 3] + left_index = [0, 3, 1, 2, 0, 3] + right_index = [0, 0, 1, 2, 3, 3] similarity = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] expected_df = pd.DataFrame({'left_index': left_index, 'left_side': 
left_side, 'similarity': similarity, @@ -420,11 +420,11 @@ def test_get_matches_1_series_1_id_series(self): sg = StringGrouper(test_series_1, master_id=test_series_id_1) sg = sg.fit() left_side = ['foo', 'foo', 'bar', 'baz', 'foo', 'foo'] - left_side_id = ['A0', 'A0', 'A1', 'A2', 'A3', 'A3'] - left_index = [0, 0, 1, 2, 3, 3] + left_side_id = ['A0', 'A3', 'A1', 'A2', 'A0', 'A3'] + left_index = [0, 3, 1, 2, 0, 3] right_side = ['foo', 'foo', 'bar', 'baz', 'foo', 'foo'] - right_side_id = ['A0', 'A3', 'A1', 'A2', 'A0', 'A3'] - right_index = [0, 3, 1, 2, 0, 3] + right_side_id = ['A0', 'A0', 'A1', 'A2', 'A3', 'A3'] + right_index = [0, 0, 1, 2, 3, 3] similarity = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] expected_df = pd.DataFrame({'left_index': left_index, 'left_side': left_side, 'left_id': left_side_id, 'similarity': similarity, From 859aa4b381d702ca8d16da4358fdcd9711a8d87c Mon Sep 17 00:00:00 2001 From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Mon, 5 Jul 2021 05:51:39 +0200 Subject: [PATCH 11/11] changed default value of kwarg max_n_matches to #strings in master --- CHANGELOG.md | 11 +++++++++++ README.md | 2 +- string_grouper/string_grouper.py | 6 +++--- 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e73d33fd..399a1b45 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Changed + +## [0.5.2?] - 2021-07-05 + +* Provided a more user-friendly error message to be issued when any entries in the input string-Series are not strings. + +## [0.5.1?] - 2021-07-05 + +* Improved the performance of the function `match_most_similar`. +* Changed the default value of the keyword argument `max_n_matches` to the total number of strings in `master`. (`max_n_matches` is now defined as the maximum number of matches allowed per string in `duplicates` \[or `master` if `duplicates` is not given\]). 
+ ## [0.5.0] - 2021-06-11 ### Added diff --git a/README.md b/README.md index 1b18c3c9..2da24cf5 100644 --- a/README.md +++ b/README.md @@ -136,7 +136,7 @@ All functions are built using a class **`StringGrouper`**. This class can be use * **`ngram_size`**: The amount of characters in each n-gram. Default is `3`. * **`tfidf_matrix_dtype`**: The datatype for the tf-idf values of the matrix components. Allowed values are `numpy.float32` and `numpy.float64`. Default is `numpy.float32`. (Note: `numpy.float32` often leads to faster processing and a smaller memory footprint albeit less numerical precision than `numpy.float64`.) * **`regex`**: The regex string used to clean-up the input string. Default is `"[,-./]|\s"`. - * **`max_n_matches`**: The maximum number of matches allowed per string in `master`. Default is the number of strings in `duplicates` (or `master`, if `duplicates` is not given). + * **`max_n_matches`**: The maximum number of matching strings in `master` allowed per string in `duplicates` (or `master` if `duplicates` is not given). Default is the total number of strings in `master`. * **`min_similarity`**: The minimum cosine similarity for two strings to be considered a match. Defaults to `0.8` * **`number_of_processes`**: The number of processes used by the cosine similarity calculation. 
Defaults to diff --git a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py index 2ac60825..f7fa7b75 100644 --- a/string_grouper/string_grouper.py +++ b/string_grouper/string_grouper.py @@ -13,7 +13,6 @@ DEFAULT_NGRAM_SIZE: int = 3 DEFAULT_TFIDF_MATRIX_DTYPE: type = np.float32 # (only types np.float32 and np.float64 are allowed by sparse_dot_topn) DEFAULT_REGEX: str = r'[,-./]|\s' -DEFAULT_MAX_N_MATCHES: int = 20 DEFAULT_MIN_SIMILARITY: float = 0.8 # minimum cosine similarity for an item to be considered a match DEFAULT_N_PROCESSES: int = multiprocessing.cpu_count() - 1 DEFAULT_IGNORE_CASE: bool = True # ignores case by default @@ -209,7 +208,8 @@ class StringGrouperConfig(NamedTuple): (Note: np.float32 often leads to faster processing and a smaller memory footprint albeit less precision than np.float64.) :param regex: str. The regex string used to cleanup the input string. Default is '[,-./]|\s'. - :param max_n_matches: int. The maximum number of matches allowed per string. Default is 20. + :param max_n_matches: int. The maximum number of matching strings in `master` allowed per string in + `duplicates` (or `master` if `duplicates` is not given). Default will be set by StringGrouper. :param min_similarity: float. The minimum cosine similarity for two strings to be considered a match. Defaults to 0.8. :param number_of_processes: int. The number of processes used by the cosine similarity calculation. @@ -297,7 +297,7 @@ def __init__(self, master: pd.Series, self._config: StringGrouperConfig = StringGrouperConfig(**kwargs) if self._config.max_n_matches is None: - self._max_n_matches = len(self._master) if self._duplicates is None else len(self._duplicates) + self._max_n_matches = len(self._master) else: self._max_n_matches = self._config.max_n_matches