From 92436e351adcc58117199576ad8ee0884118d669 Mon Sep 17 00:00:00 2001
From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com>
Date: Mon, 26 Apr 2021 22:31:08 +0200
Subject: [PATCH 01/11] boosted _symmetrize_matches_list() (5x) and
 _get_matches_list() (33x)

---
 string_grouper/string_grouper.py           | 48 +++++++---------------
 string_grouper/test/test_string_grouper.py |  8 ++--
 2 files changed, 20 insertions(+), 36 deletions(-)

diff --git a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py
index 3ab8cc46..c022a1df 100644
--- a/string_grouper/string_grouper.py
+++ b/string_grouper/string_grouper.py
@@ -251,11 +251,11 @@ def fit(self) -> 'StringGrouper':
         master_matrix, duplicate_matrix = self._get_tf_idf_matrices()
         # Calculate the matches using the cosine similarity
         matches = self._build_matches(master_matrix, duplicate_matrix)
+        if self._duplicates is None:
+            # the matrix of matches needs to be symmetric!!! (i.e., if A != B and A matches B; then B matches A)
+            matches = StringGrouper._symmetrize_matrix(matches)
         # retrieve all matches
         self._matches_list = self._get_matches_list(matches)
-        if self._duplicates is None:
-            # the list of matches needs to be symmetric!!! (i.e., if A != B and A matches B; then B matches A)
-            self._symmetrize_matches_list()
         self.is_build = True
         return self
 
@@ -450,18 +450,6 @@ def _build_matches(self, master_matrix: csr_matrix, duplicate_matrix: csr_matrix
                                    self._config.min_similarity,
                                    **optional_kwargs)
 
-    def _symmetrize_matches_list(self):
-        # [symmetrized matches_list] = [matches_list] UNION [transposed matches_list] (i.e., column-names swapped):
-        self._matches_list = self._matches_list.set_index(['master_side', 'dupe_side'])\
-            .combine_first(
-                self._matches_list.rename(
-                    columns={
-                        'master_side': 'dupe_side',
-                        'dupe_side': 'master_side'
-                    }
-                ).set_index(['master_side', 'dupe_side'])
-            ).reset_index()
-
     def _get_non_matches_list(self, suppress_warning=False) -> pd.DataFrame:
         """Returns a list of all the indices of non-matching pairs (with similarity set to 0)"""
         m_sz, d_sz = len(self._master), len(self._master if self._duplicates is None else self._duplicates)
@@ -480,25 +468,19 @@ def _get_non_matches_list(self, suppress_warning=False) -> pd.DataFrame:
         return missing_pairs
 
     @staticmethod
-    def _get_matches_list(matches) -> pd.DataFrame:
+    def _symmetrize_matrix(AA: csr_matrix) -> csr_matrix:
+        A = AA.tolil()
+        r, c = A.nonzero()
+        A[c, r] = A[r, c]
+        return A.tocsr()
+
+    @staticmethod
+    def _get_matches_list(matches: csr_matrix) -> pd.DataFrame:
         """Returns a list of all the indices of matches"""
-        non_zeros = matches.nonzero()
-
-        sparserows = non_zeros[0]
-        sparsecols = non_zeros[1]
-        nr_matches = sparsecols.size
-        master_side = np.empty([nr_matches], dtype=np.int64)
-        dupe_side = np.empty([nr_matches], dtype=np.int64)
-        similarity = np.zeros(nr_matches)
-
-        for index in range(0, nr_matches):
-            master_side[index] = sparserows[index]
-            dupe_side[index] = sparsecols[index]
-            similarity[index] = matches.data[index]
-
-        matches_list = pd.DataFrame({'master_side': master_side,
-                                     'dupe_side': dupe_side,
-                                     'similarity': similarity})
+        r, c = matches.nonzero()
+        matches_list = pd.DataFrame({'master_side': r.astype(np.int64),
+                                     'dupe_side': c.astype(np.int64),
+                                     'similarity': matches.data})
         return matches_list
 
     def _get_nearest_matches(self,
diff --git a/string_grouper/test/test_string_grouper.py b/string_grouper/test/test_string_grouper.py
index 723d3f22..6ff4cf5e 100644
--- a/string_grouper/test/test_string_grouper.py
+++ b/string_grouper/test/test_string_grouper.py
@@ -11,6 +11,8 @@
 from unittest.mock import patch
 import warnings
 
+def mock_symmetrize_matrix(A: csr_matrix) -> csr_matrix:
+    return A
 
 class SimpleExample(object):
     def __init__(self):
@@ -197,14 +199,14 @@ def test_match_strings(self, mock_StringGouper):
         mock_StringGrouper_instance.get_matches.assert_called_once()
         self.assertEqual(df, 'whatever')
 
-    @patch('string_grouper.string_grouper.StringGrouper._symmetrize_matches_list')
-    def test_match_list_symmetry_without_symmetrize_function(self, mock_symmetrize_matches_list):
+    @patch('string_grouper.string_grouper.StringGrouper._symmetrize_matrix', side_effect=mock_symmetrize_matrix)
+    def test_match_list_symmetry_without_symmetrize_function(self, mock_symmetrize_matrix):
         """mocks StringGrouper._symmetrize_matches_list so that this test fails whenever _matches_list is 
         **partially** symmetric which often occurs when the kwarg max_n_matches is too small"""
         simple_example = SimpleExample()
         df = simple_example.customers_df2['Customer Name']
         sg = StringGrouper(df, max_n_matches=2).fit()
-        mock_symmetrize_matches_list.assert_called_once()
+        mock_symmetrize_matrix.assert_called_once()
         # obtain the upper and lower triangular parts of the matrix of matches:
         upper = sg._matches_list[sg._matches_list['master_side'] < sg._matches_list['dupe_side']]
         lower = sg._matches_list[sg._matches_list['master_side'] > sg._matches_list['dupe_side']]

From 99545de3317a62620b1305a300b07e3456a178a8 Mon Sep 17 00:00:00 2001
From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com>
Date: Wed, 28 Apr 2021 11:49:11 +0200
Subject: [PATCH 02/11] made more pypi-friendly changes in README.md

---
 README.md | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index 13f22127..3ddc43c7 100644
--- a/README.md
+++ b/README.md
@@ -13,7 +13,7 @@
 
 The image displayed above is a visualization of the graph-structure of one of the groups of strings found by `string_grouper`.  Each circle (node) represents a string, and each connecting arc (edge) represents a match between a pair of strings with a similarity score above a given threshold score (here `0.8`).  
 
-The ***centroid*** of the group, as determined by `string_grouper` (see [tutorials/group_representatives.md](tutorials/group_representatives.md) for an explanation), is the largest node, also with the most edges originating from it.  A thick line in the image denotes a strong similarity between the nodes at its ends, while a faint thin line denotes weak similarity.
+The ***centroid*** of the group, as determined by `string_grouper` (see [tutorials/group_representatives.md](https://github.com/Bergvca/string_grouper/blob/master/tutorials/group_representatives.md) for an explanation), is the largest node, also with the most edges originating from it.  A thick line in the image denotes a strong similarity between the nodes at its ends, while a faint thin line denotes weak similarity.
 
 The power of `string_grouper` is discernible from this image: in large datasets, `string_grouper` is often able to resolve indirect associations between strings even when, say, due to memory-resource-limitations, direct matches between those strings cannot be computed using conventional methods with a lower threshold similarity score.    
 
@@ -85,7 +85,7 @@ In the rest of this document the names, `Series` and `DataFrame`, refer to the f
    2. `'similarity'` whose column has the similarity-scores as values, and 
    3. The name of `duplicates` (or `master` if `duplicates` is not given) and the name(s) of its index (or index-levels) prefixed by the string `'right_'`.
    
-   Indexes (or their levels) only appear when the keyword argument `ignore_index=False` (the default). (See [tutorials/ignore_index_and_replace_na.md](tutorials/ignore_index_and_replace_na.md) for a demonstration.)
+   Indexes (or their levels) only appear when the keyword argument `ignore_index=False` (the default). (See [tutorials/ignore_index_and_replace_na.md](https://github.com/Bergvca/string_grouper/blob/master/tutorials/ignore_index_and_replace_na.md) for a demonstration.)
    
    If either `master` or `duplicates` has no name, it assumes the name `'side'` which is then prefixed as described above.  Similarly, if any of the indexes (or index-levels) has no name it assumes its `pandas` default name (`'index'`, `'level_0'`, and so on) and is then prefixed as described above.
    
@@ -101,7 +101,7 @@ In the rest of this document the names, `Series` and `DataFrame`, refer to the f
    
    The name of the output `Series` is the same as that of `master` prefixed with the string `'most_similar_'`.  If `master` has no name, it is assumed to have the name `'master'` before being prefixed.
        
-   If `ignore_index=False` (the default), `match_most_similar` returns a `DataFrame` containing the same `Series` described above as one of its columns.  So it inherits the same index and length as `duplicates`.  The rest of its columns correspond to the index (or index-levels) of `master` and thus contain the index-labels of the most similar strings being output as values.  If there are no similar strings in `master` for a given string in `duplicates` then the value(s) assigned to this index-column(s) for that string is `NaN` by default.  However, if the keyword argument `replace_na=True`, then these `NaN` values are replaced with the index-label(s) of that string in `duplicates`.  Note that such replacements can only occur if the indexes of `master` and `duplicates` have the same number of levels.  (See [tutorials/ignore_index_and_replace_na.md](tutorials/ignore_index_and_replace_na.md#MMS) for a demonstration.)
+   If `ignore_index=False` (the default), `match_most_similar` returns a `DataFrame` containing the same `Series` described above as one of its columns.  So it inherits the same index and length as `duplicates`.  The rest of its columns correspond to the index (or index-levels) of `master` and thus contain the index-labels of the most similar strings being output as values.  If there are no similar strings in `master` for a given string in `duplicates` then the value(s) assigned to this index-column(s) for that string is `NaN` by default.  However, if the keyword argument `replace_na=True`, then these `NaN` values are replaced with the index-label(s) of that string in `duplicates`.  Note that such replacements can only occur if the indexes of `master` and `duplicates` have the same number of levels.  (See [tutorials/ignore_index_and_replace_na.md](https://github.com/Bergvca/string_grouper/blob/master/tutorials/ignore_index_and_replace_na.md#MMS) for a demonstration.)
    
    Each column-name of the output `DataFrame` has the same name as its corresponding column, index, or index-level of `master` prefixed with the string `'most_similar_'`.
   
@@ -109,7 +109,7 @@ In the rest of this document the names, `Series` and `DataFrame`, refer to the f
 
 
 * #### `group_similar_strings` 
-  Takes a single `Series` of strings (`strings_to_group`) and groups them by assigning to each string one string from `strings_to_group` chosen as the group-representative for each group of similar strings found. (See [tutorials/group_representatives.md](tutorials/group_representatives.md) for details on how the the group-representatives are chosen.)   
+  Takes a single `Series` of strings (`strings_to_group`) and groups them by assigning to each string one string from `strings_to_group` chosen as the group-representative for each group of similar strings found. (See [tutorials/group_representatives.md](https://github.com/Bergvca/string_grouper/blob/master/tutorials/group_representatives.md) for details on how the group-representatives are chosen.)   
   
   If `ignore_index=True`, the output is a `Series` (with the same name as `strings_to_group` prefixed by the string `'group_rep_'`) of the same length and index as `strings_to_group` containing the group-representative strings.  If `strings_to_group` has no name then the name of the returned `Series` is `'group_rep'`.  
    
@@ -140,11 +140,11 @@ All functions are built using a class **`StringGrouper`**. This class can be use
     Defaults to `0.8`
    * **`number_of_processes`**: The number of processes used by the cosine similarity calculation. Defaults to
     `number of cores on a machine - 1.`
-   * **`ignore_index`**: Determines whether indexes are ignored or not.  If `False` (the default), index-columns will appear in the output, otherwise not.  (See [tutorials/ignore_index_and_replace_na.md](tutorials/ignore_index_and_replace_na.md) for a demonstration.)
-   * **`replace_na`**: For function `match_most_similar`, determines whether `NaN` values in index-columns are replaced or not by index-labels from `duplicates`. Defaults to `False`.  (See [tutorials/ignore_index_and_replace_na.md](tutorials/ignore_index_and_replace_na.md) for a demonstration.)
-   * **`include_zeroes`**: When `min_similarity` &le; 0, determines whether zero-similarity matches appear in the output.  Defaults to `True`.  (See [tutorials/zero_similarity.md](tutorials/zero_similarity.md) for a demonstration.)  **Warning:** Make sure the kwarg `max_n_matches` is sufficiently high to capture ***all*** nonzero-similarity-matches, otherwise some zero-similarity-matches returned will be false.
+   * **`ignore_index`**: Determines whether indexes are ignored or not.  If `False` (the default), index-columns will appear in the output, otherwise not.  (See [tutorials/ignore_index_and_replace_na.md](https://github.com/Bergvca/string_grouper/blob/master/tutorials/ignore_index_and_replace_na.md) for a demonstration.)
+   * **`replace_na`**: For function `match_most_similar`, determines whether `NaN` values in index-columns are replaced or not by index-labels from `duplicates`. Defaults to `False`.  (See [tutorials/ignore_index_and_replace_na.md](https://github.com/Bergvca/string_grouper/blob/master/tutorials/ignore_index_and_replace_na.md) for a demonstration.)
+   * **`include_zeroes`**: When `min_similarity` &le; 0, determines whether zero-similarity matches appear in the output.  Defaults to `True`.  (See [tutorials/zero_similarity.md](https://github.com/Bergvca/string_grouper/blob/master/tutorials/zero_similarity.md) for a demonstration.)  **Warning:** Make sure the kwarg `max_n_matches` is sufficiently high to capture ***all*** nonzero-similarity-matches, otherwise some zero-similarity-matches returned will be false.
    * **`suppress_warning`**: when `min_similarity` &le; 0 and `include_zeroes`  is `True`, determines whether or not to suppress the message warning that `max_n_matches` may be too small.  Defaults to `False`.
-   * **`group_rep`**: For function `group_similar_strings`, determines how group-representatives are chosen.  Allowed values are `'centroid'` (the default) and `'first'`.  See [tutorials/group_representatives.md](tutorials/group_representatives.md) for an explanation.
+   * **`group_rep`**: For function `group_similar_strings`, determines how group-representatives are chosen.  Allowed values are `'centroid'` (the default) and `'first'`.  See [tutorials/group_representatives.md](https://github.com/Bergvca/string_grouper/blob/master/tutorials/group_representatives.md) for an explanation.
 
 ## Examples
 
@@ -306,7 +306,7 @@ Out of the four company names in `duplicates`, three companies are found in the
 
 ### Finding duplicates from a (database extract to) DataFrame where IDs for rows are supplied.
 
-A very common scenario is the case where duplicate records for an entity have been entered into a database. That is, there are two or more records where a name field has slightly different spelling. For example, "A.B. Corporation" and "AB Corporation". Using the optional 'ID' parameter in the `match_strings` function duplicates can be found easily. A [tutorial](tutorials/tutorial_1.md) that steps though the process with an example data set is available.
+A very common scenario is the case where duplicate records for an entity have been entered into a database. That is, there are two or more records where a name field has slightly different spelling. For example, "A.B. Corporation" and "AB Corporation". Using the optional 'ID' parameter in the `match_strings` function, duplicates can be found easily. A [tutorial](https://github.com/Bergvca/string_grouper/blob/master/tutorials/tutorial_1.md) that steps through the process with an example data set is available.
 
 
 ### For a second data set, find only the most similar match

From 35dddd9b2841e881075f83490c22a9451a3810a7 Mon Sep 17 00:00:00 2001
From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com>
Date: Tue, 4 May 2021 18:37:39 +0200
Subject: [PATCH 03/11] fixed bug related to single-valued input Series

---
 string_grouper/string_grouper.py           | 18 +++++++++--------
 string_grouper/test/test_string_grouper.py | 23 ++++++++++++++++++++--
 2 files changed, 31 insertions(+), 10 deletions(-)

diff --git a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py
index c022a1df..7e99b506 100644
--- a/string_grouper/string_grouper.py
+++ b/string_grouper/string_grouper.py
@@ -139,7 +139,7 @@ def match_strings(master: pd.Series,
 
 
 class StringGrouperConfig(NamedTuple):
-    """
+    r"""
     Class with configuration variables.
 
     :param ngram_size: int. The amount of characters in each n-gram. Default is 3.
@@ -253,7 +253,8 @@ def fit(self) -> 'StringGrouper':
         matches = self._build_matches(master_matrix, duplicate_matrix)
         if self._duplicates is None:
             # the matrix of matches needs to be symmetric!!! (i.e., if A != B and A matches B; then B matches A)
-            matches = StringGrouper._symmetrize_matrix(matches)
+            # and each of its diagonal components must be equal to 1 
+            matches = StringGrouper._symmetrize_matrix_and_fix_diagonal(matches)
         # retrieve all matches
         self._matches_list = self._get_matches_list(matches)
         self.is_build = True
@@ -468,10 +469,12 @@ def _get_non_matches_list(self, suppress_warning=False) -> pd.DataFrame:
         return missing_pairs
 
     @staticmethod
-    def _symmetrize_matrix(AA: csr_matrix) -> csr_matrix:
+    def _symmetrize_matrix_and_fix_diagonal(AA: csr_matrix) -> csr_matrix:
         A = AA.tolil()
         r, c = A.nonzero()
         A[c, r] = A[r, c]
+        r = np.arange(A.shape[0])
+        A[r, r] = 1
         return A.tocsr()
 
     @staticmethod
@@ -549,11 +552,10 @@ def _get_nearest_matches(self,
         dupes_max_sim = dupes_max_sim.sort_values('dupe_side').set_index('dupe_side')
         output = dupes_max_sim[index_column_list + required_column_list]
         output.index = self._duplicates.index
-        return output.squeeze()
+        return output.squeeze(axis=1)
 
     def _deduplicate(self, ignore_index=False) -> Union[pd.DataFrame, pd.Series]:
-        # discard self-matches: A matches A
-        pairs = self._matches_list[self._matches_list['master_side'] != self._matches_list['dupe_side']]
+        pairs = self._matches_list
         # rebuild graph adjacency matrix from already found matches:
         n = len(self._master)
         graph = csr_matrix(
@@ -581,7 +583,7 @@ def _deduplicate(self, ignore_index=False) -> Union[pd.DataFrame, pd.Series]:
             graph.data = pairs['similarity'].to_numpy()
             # sum along the rows to obtain numpy 1D matrix of similarity aggregates then ...
             # ... convert to 1D numpy array (using asarray then squeeze) and then to Series:
-            group_of_master_index['weight'] = pd.Series(np.asarray(graph.sum(axis=1)).squeeze())
+            group_of_master_index['weight'] = pd.Series(np.asarray(graph.sum(axis=1)).squeeze(axis=1))
             method = 'idxmax'
 
         # Determine the group representatives AND merge with indices:
@@ -605,7 +607,7 @@ def _deduplicate(self, ignore_index=False) -> Union[pd.DataFrame, pd.Series]:
             output_id = self._master_id.iloc[group_of_master_index.group_rep].rename(id_label).reset_index(drop=True)
             output = pd.concat([output_id, output], axis=1)
         output.index = self._master.index
-        return output.squeeze()
+        return output
 
     def _get_indices_of(self, master_side: str, dupe_side: str) -> Tuple[pd.Series, pd.Series]:
         master_strings = self._master
diff --git a/string_grouper/test/test_string_grouper.py b/string_grouper/test/test_string_grouper.py
index 6ff4cf5e..e7b39685 100644
--- a/string_grouper/test/test_string_grouper.py
+++ b/string_grouper/test/test_string_grouper.py
@@ -199,7 +199,10 @@ def test_match_strings(self, mock_StringGouper):
         mock_StringGrouper_instance.get_matches.assert_called_once()
         self.assertEqual(df, 'whatever')
 
-    @patch('string_grouper.string_grouper.StringGrouper._symmetrize_matrix', side_effect=mock_symmetrize_matrix)
+    @patch(
+        'string_grouper.string_grouper.StringGrouper._symmetrize_matrix_and_fix_diagonal',
+        side_effect=mock_symmetrize_matrix
+    )
     def test_match_list_symmetry_without_symmetrize_function(self, mock_symmetrize_matrix):
         """mocks StringGrouper._symmetrize_matches_list so that this test fails whenever _matches_list is 
         **partially** symmetric which often occurs when the kwarg max_n_matches is too small"""
@@ -238,17 +241,33 @@ def test_match_list_symmetry_with_symmetrize_function(self):
         # upper, upper_prime and their intersection should be identical.
         self.assertTrue(intersection.empty or len(upper) == len(upper_prime) == len(intersection))
 
-    def test_match_list_diagonal(self):
+    @patch(
+        'string_grouper.string_grouper.StringGrouper._symmetrize_matrix_and_fix_diagonal',
+        side_effect=mock_symmetrize_matrix
+    )
+    def test_match_list_diagonal_without_the_fix(self, mock_symmetrize_matrix):
         """test fails whenever _matches_list's number of self-joins is not equal to the number of strings"""
         # This bug is difficult to reproduce -- I mostly encounter it while working with very large datasets;
         # for small datasets setting max_n_matches=1 reproduces the bug
         simple_example = SimpleExample()
         df = simple_example.customers_df['Customer Name']
         matches = match_strings(df, max_n_matches=1)
+        mock_symmetrize_matrix.assert_called_once()
         num_self_joins = len(matches[matches['left_index'] == matches['right_index']])
         num_strings = len(df)
         self.assertNotEqual(num_self_joins, num_strings)
 
+    def test_match_list_diagonal(self):
+        """This test ensures that all self-joins are present"""
+        # This bug is difficult to reproduce -- I mostly encounter it while working with very large datasets;
+        # for small datasets setting max_n_matches=1 reproduces the bug
+        simple_example = SimpleExample()
+        df = simple_example.customers_df['Customer Name']
+        matches = match_strings(df, max_n_matches=1)
+        num_self_joins = len(matches[matches['left_index'] == matches['right_index']])
+        num_strings = len(df)
+        self.assertEqual(num_self_joins, num_strings)
+
     def test_zero_min_similarity(self):
         """Since sparse matrices exclude zero elements, this test ensures that zero similarity matches are 
         returned when min_similarity <= 0.  A bug related to this was first pointed out by @nbcvijanovic"""

From bea485f8cfca65ec5ad9568b1cc1516e51ce5c6a Mon Sep 17 00:00:00 2001
From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com>
Date: Wed, 28 Apr 2021 11:49:11 +0200
Subject: [PATCH 04/11] made more pypi-friendly changes in README.md

---
 README.md                        | 18 +++++++++---------
 string_grouper/string_grouper.py | 32 ++++++++++++++++----------------
 2 files changed, 25 insertions(+), 25 deletions(-)

diff --git a/README.md b/README.md
index 13f22127..3ddc43c7 100644
--- a/README.md
+++ b/README.md
@@ -13,7 +13,7 @@
 
 The image displayed above is a visualization of the graph-structure of one of the groups of strings found by `string_grouper`.  Each circle (node) represents a string, and each connecting arc (edge) represents a match between a pair of strings with a similarity score above a given threshold score (here `0.8`).  
 
-The ***centroid*** of the group, as determined by `string_grouper` (see [tutorials/group_representatives.md](tutorials/group_representatives.md) for an explanation), is the largest node, also with the most edges originating from it.  A thick line in the image denotes a strong similarity between the nodes at its ends, while a faint thin line denotes weak similarity.
+The ***centroid*** of the group, as determined by `string_grouper` (see [tutorials/group_representatives.md](https://github.com/Bergvca/string_grouper/blob/master/tutorials/group_representatives.md) for an explanation), is the largest node, also with the most edges originating from it.  A thick line in the image denotes a strong similarity between the nodes at its ends, while a faint thin line denotes weak similarity.
 
 The power of `string_grouper` is discernible from this image: in large datasets, `string_grouper` is often able to resolve indirect associations between strings even when, say, due to memory-resource-limitations, direct matches between those strings cannot be computed using conventional methods with a lower threshold similarity score.    
 
@@ -85,7 +85,7 @@ In the rest of this document the names, `Series` and `DataFrame`, refer to the f
    2. `'similarity'` whose column has the similarity-scores as values, and 
    3. The name of `duplicates` (or `master` if `duplicates` is not given) and the name(s) of its index (or index-levels) prefixed by the string `'right_'`.
    
-   Indexes (or their levels) only appear when the keyword argument `ignore_index=False` (the default). (See [tutorials/ignore_index_and_replace_na.md](tutorials/ignore_index_and_replace_na.md) for a demonstration.)
+   Indexes (or their levels) only appear when the keyword argument `ignore_index=False` (the default). (See [tutorials/ignore_index_and_replace_na.md](https://github.com/Bergvca/string_grouper/blob/master/tutorials/ignore_index_and_replace_na.md) for a demonstration.)
    
    If either `master` or `duplicates` has no name, it assumes the name `'side'` which is then prefixed as described above.  Similarly, if any of the indexes (or index-levels) has no name it assumes its `pandas` default name (`'index'`, `'level_0'`, and so on) and is then prefixed as described above.
    
@@ -101,7 +101,7 @@ In the rest of this document the names, `Series` and `DataFrame`, refer to the f
    
    The name of the output `Series` is the same as that of `master` prefixed with the string `'most_similar_'`.  If `master` has no name, it is assumed to have the name `'master'` before being prefixed.
        
-   If `ignore_index=False` (the default), `match_most_similar` returns a `DataFrame` containing the same `Series` described above as one of its columns.  So it inherits the same index and length as `duplicates`.  The rest of its columns correspond to the index (or index-levels) of `master` and thus contain the index-labels of the most similar strings being output as values.  If there are no similar strings in `master` for a given string in `duplicates` then the value(s) assigned to this index-column(s) for that string is `NaN` by default.  However, if the keyword argument `replace_na=True`, then these `NaN` values are replaced with the index-label(s) of that string in `duplicates`.  Note that such replacements can only occur if the indexes of `master` and `duplicates` have the same number of levels.  (See [tutorials/ignore_index_and_replace_na.md](tutorials/ignore_index_and_replace_na.md#MMS) for a demonstration.)
+   If `ignore_index=False` (the default), `match_most_similar` returns a `DataFrame` containing the same `Series` described above as one of its columns.  So it inherits the same index and length as `duplicates`.  The rest of its columns correspond to the index (or index-levels) of `master` and thus contain the index-labels of the most similar strings being output as values.  If there are no similar strings in `master` for a given string in `duplicates` then the value(s) assigned to this index-column(s) for that string is `NaN` by default.  However, if the keyword argument `replace_na=True`, then these `NaN` values are replaced with the index-label(s) of that string in `duplicates`.  Note that such replacements can only occur if the indexes of `master` and `duplicates` have the same number of levels.  (See [tutorials/ignore_index_and_replace_na.md](https://github.com/Bergvca/string_grouper/blob/master/tutorials/ignore_index_and_replace_na.md#MMS) for a demonstration.)
    
    Each column-name of the output `DataFrame` has the same name as its corresponding column, index, or index-level of `master` prefixed with the string `'most_similar_'`.
   
@@ -109,7 +109,7 @@ In the rest of this document the names, `Series` and `DataFrame`, refer to the f
 
 
 * #### `group_similar_strings` 
-  Takes a single `Series` of strings (`strings_to_group`) and groups them by assigning to each string one string from `strings_to_group` chosen as the group-representative for each group of similar strings found. (See [tutorials/group_representatives.md](tutorials/group_representatives.md) for details on how the the group-representatives are chosen.)   
+  Takes a single `Series` of strings (`strings_to_group`) and groups them by assigning to each string one string from `strings_to_group` chosen as the group-representative for each group of similar strings found. (See [tutorials/group_representatives.md](https://github.com/Bergvca/string_grouper/blob/master/tutorials/group_representatives.md) for details on how the group-representatives are chosen.)   
   
   If `ignore_index=True`, the output is a `Series` (with the same name as `strings_to_group` prefixed by the string `'group_rep_'`) of the same length and index as `strings_to_group` containing the group-representative strings.  If `strings_to_group` has no name then the name of the returned `Series` is `'group_rep'`.  
    
@@ -140,11 +140,11 @@ All functions are built using a class **`StringGrouper`**. This class can be use
     Defaults to `0.8`
    * **`number_of_processes`**: The number of processes used by the cosine similarity calculation. Defaults to
     `number of cores on a machine - 1.`
-   * **`ignore_index`**: Determines whether indexes are ignored or not.  If `False` (the default), index-columns will appear in the output, otherwise not.  (See [tutorials/ignore_index_and_replace_na.md](tutorials/ignore_index_and_replace_na.md) for a demonstration.)
-   * **`replace_na`**: For function `match_most_similar`, determines whether `NaN` values in index-columns are replaced or not by index-labels from `duplicates`. Defaults to `False`.  (See [tutorials/ignore_index_and_replace_na.md](tutorials/ignore_index_and_replace_na.md) for a demonstration.)
-   * **`include_zeroes`**: When `min_similarity` &le; 0, determines whether zero-similarity matches appear in the output.  Defaults to `True`.  (See [tutorials/zero_similarity.md](tutorials/zero_similarity.md) for a demonstration.)  **Warning:** Make sure the kwarg `max_n_matches` is sufficiently high to capture ***all*** nonzero-similarity-matches, otherwise some zero-similarity-matches returned will be false.
+   * **`ignore_index`**: Determines whether indexes are ignored or not.  If `False` (the default), index-columns will appear in the output, otherwise not.  (See [tutorials/ignore_index_and_replace_na.md](https://github.com/Bergvca/string_grouper/blob/master/tutorials/ignore_index_and_replace_na.md) for a demonstration.)
+   * **`replace_na`**: For function `match_most_similar`, determines whether `NaN` values in index-columns are replaced or not by index-labels from `duplicates`. Defaults to `False`.  (See [tutorials/ignore_index_and_replace_na.md](https://github.com/Bergvca/string_grouper/blob/master/tutorials/ignore_index_and_replace_na.md) for a demonstration.)
+   * **`include_zeroes`**: When `min_similarity` &le; 0, determines whether zero-similarity matches appear in the output.  Defaults to `True`.  (See [tutorials/zero_similarity.md](https://github.com/Bergvca/string_grouper/blob/master/tutorials/zero_similarity.md) for a demonstration.)  **Warning:** Make sure the kwarg `max_n_matches` is sufficiently high to capture ***all*** nonzero-similarity-matches, otherwise some zero-similarity-matches returned will be false.
    * **`suppress_warning`**: when `min_similarity` &le; 0 and `include_zeroes`  is `True`, determines whether or not to suppress the message warning that `max_n_matches` may be too small.  Defaults to `False`.
-   * **`group_rep`**: For function `group_similar_strings`, determines how group-representatives are chosen.  Allowed values are `'centroid'` (the default) and `'first'`.  See [tutorials/group_representatives.md](tutorials/group_representatives.md) for an explanation.
+   * **`group_rep`**: For function `group_similar_strings`, determines how group-representatives are chosen.  Allowed values are `'centroid'` (the default) and `'first'`.  See [tutorials/group_representatives.md](https://github.com/Bergvca/string_grouper/blob/master/tutorials/group_representatives.md) for an explanation.
 
 ## Examples
 
@@ -306,7 +306,7 @@ Out of the four company names in `duplicates`, three companies are found in the
 
 ### Finding duplicates from a (database extract to) DataFrame where IDs for rows are supplied.
 
-A very common scenario is the case where duplicate records for an entity have been entered into a database. That is, there are two or more records where a name field has slightly different spelling. For example, "A.B. Corporation" and "AB Corporation". Using the optional 'ID' parameter in the `match_strings` function duplicates can be found easily. A [tutorial](tutorials/tutorial_1.md) that steps though the process with an example data set is available.
+A very common scenario is the case where duplicate records for an entity have been entered into a database. That is, there are two or more records where a name field has slightly different spelling. For example, "A.B. Corporation" and "AB Corporation". Using the optional 'ID' parameter in the `match_strings` function, duplicates can be found easily. A [tutorial](https://github.com/Bergvca/string_grouper/blob/master/tutorials/tutorial_1.md) that steps through the process with an example data set is available.
 
 
 ### For a second data set, find only the most similar match
diff --git a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py
index c022a1df..e1a2f80b 100644
--- a/string_grouper/string_grouper.py
+++ b/string_grouper/string_grouper.py
@@ -467,22 +467,6 @@ def _get_non_matches_list(self, suppress_warning=False) -> pd.DataFrame:
         missing_pairs['similarity'] = 0
         return missing_pairs
 
-    @staticmethod
-    def _symmetrize_matrix(AA: csr_matrix) -> csr_matrix:
-        A = AA.tolil()
-        r, c = A.nonzero()
-        A[c, r] = A[r, c]
-        return A.tocsr()
-
-    @staticmethod
-    def _get_matches_list(matches: csr_matrix) -> pd.DataFrame:
-        """Returns a list of all the indices of matches"""
-        r, c = matches.nonzero()
-        matches_list = pd.DataFrame({'master_side': r.astype(np.int64),
-                                     'dupe_side': c.astype(np.int64),
-                                     'similarity': matches.data})
-        return matches_list
-
     def _get_nearest_matches(self,
                              ignore_index=False,
                              replace_na=False) -> Union[pd.DataFrame, pd.Series]:
@@ -633,6 +617,22 @@ def _validate_replace_na_and_drop(self):
                 "index if the number of index-levels does not equal the number of index-columns."
             )
 
+    @staticmethod
+    def _symmetrize_matrix(AA: csr_matrix) -> csr_matrix:
+        A = AA.tolil()
+        r, c = A.nonzero()
+        A[c, r] = A[r, c]
+        return A.tocsr()
+
+    @staticmethod
+    def _get_matches_list(matches: csr_matrix) -> pd.DataFrame:
+        """Returns a list of all the indices of matches"""
+        r, c = matches.nonzero()
+        matches_list = pd.DataFrame({'master_side': r.astype(np.int64),
+                                     'dupe_side': c.astype(np.int64),
+                                     'similarity': matches.data})
+        return matches_list
+
     @staticmethod
     def _make_symmetric(new_matches: pd.DataFrame) -> pd.DataFrame:
         columns_switched = pd.DataFrame({'master_side': new_matches.dupe_side,

From 2e5d9a3530b0cc09607387f99fcaa284903a2724 Mon Sep 17 00:00:00 2001
From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com>
Date: Wed, 5 May 2021 14:45:19 +0200
Subject: [PATCH 05/11] added unittest for get_groups() with single-valued
 input Series

---
 string_grouper/test/test_string_grouper.py | 37 ++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/string_grouper/test/test_string_grouper.py b/string_grouper/test/test_string_grouper.py
index e7b39685..518ebf52 100644
--- a/string_grouper/test/test_string_grouper.py
+++ b/string_grouper/test/test_string_grouper.py
@@ -491,6 +491,43 @@ def test_get_groups_single_df_group_rep_default(self):
             )
         )
 
+    def test_get_groups_single_valued_series(self):
+        """This test ensures that get_groups() returns a single-valued DataFrame or Series object
+        since the input-series is also single-valued.  This test was created in response to a bug discovered
+        by George Walker"""
+        pd.testing.assert_frame_equal(
+            pd.DataFrame([(0, "hello")], columns=['group_rep_index', 'group_rep']),
+            group_similar_strings(
+                pd.Series(["hello"]),
+                min_similarity=0.6
+            )
+        )
+        pd.testing.assert_series_equal(
+            pd.Series(["hello"], name='group_rep'),
+            group_similar_strings(
+                pd.Series(["hello"]),
+                min_similarity=0.6,
+                ignore_index=True
+            )
+        )
+        pd.testing.assert_frame_equal(
+            pd.DataFrame([(0, "hello")], columns=['most_similar_index', 'most_similar_master']),
+            match_most_similar(
+                pd.Series(["hello"]),
+                pd.Series(["hello"]),
+                min_similarity=0.6
+            )
+        )
+        pd.testing.assert_series_equal(
+            pd.Series(["hello"], name='most_similar_master'),
+            match_most_similar(
+                pd.Series(["hello"]),
+                pd.Series(["hello"]),
+                min_similarity=0.6,
+                ignore_index=True
+            )
+        )
+
     def test_get_groups_single_df_keep_index(self):
         """Should return a pd.Series object with the same length as the original df. The series object will contain
         a list of the grouped strings with their indexes displayed in columns"""

From 4a0b22501920ef19c8faf64630d82c9b4a97efff Mon Sep 17 00:00:00 2001
From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com>
Date: Sat, 8 May 2021 10:16:43 +0200
Subject: [PATCH 06/11] fixed remaining squeeze() bugs

---
 string_grouper/string_grouper.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py
index d284d50e..86144715 100644
--- a/string_grouper/string_grouper.py
+++ b/string_grouper/string_grouper.py
@@ -266,7 +266,7 @@ def dot(self) -> pd.Series:
             raise Exception("To perform this function, both input Series must have the same length.")
         master_matrix, duplicate_matrix = self._get_tf_idf_matrices()
         # Calculate pairwise cosine similarities:
-        pairwise_similarities = np.asarray(master_matrix.multiply(duplicate_matrix).sum(axis=1)).squeeze()
+        pairwise_similarities = np.asarray(master_matrix.multiply(duplicate_matrix).sum(axis=1)).squeeze(axis=1)
         return pd.Series(pairwise_similarities, name='similarity', index=self._master.index)
 
     @validate_is_fit
@@ -662,7 +662,7 @@ def _is_series_of_strings(series_to_test: pd.Series) -> bool:
             return False
         elif series_to_test.to_frame().applymap(
                     lambda x: not isinstance(x, str)
-                ).squeeze().any():
+                ).squeeze(axis=1).any():
             return False
         return True
 

From faa974cbc9d12f5a1026f743e64f9413546c6029 Mon Sep 17 00:00:00 2001
From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com>
Date: Sun, 9 May 2021 10:56:29 +0200
Subject: [PATCH 07/11] added error-handler to capture non-strings in input
 Series

---
 string_grouper/string_grouper.py           | 186 ++++++++++++++++-----
 string_grouper/test/test_string_grouper.py |  35 +++-
 2 files changed, 176 insertions(+), 45 deletions(-)

diff --git a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py
index 86144715..a619d486 100644
--- a/string_grouper/string_grouper.py
+++ b/string_grouper/string_grouper.py
@@ -41,10 +41,32 @@
                                                                             # StringGrouper.get_nearest_matches
 GROUP_REP_PREFIX: str = 'group_rep_'    # used to prefix and name columns of the output of StringGrouper._deduplicate
 
+
 # High level functions
 
 
-def compute_pairwise_similarities(string_series_1: pd.Series,
+def who(bad_StringGrouper_param, param_1, param_name_1, param_2, param_name_2):
+    # Private utility function used by high-level functions (that call StringGrouper) to form a
+    # descriptive name for their series input parameter which caused the exception of type
+    # StringGrouperNotAllStringsException to occur
+    if bad_StringGrouper_param == 'master':
+        return f'\'{param_1.name}\' ({param_name_1})' if param_1.name else param_name_1
+    else:
+        return f'\'{param_2.name}\' ({param_name_2})' if param_2.name else param_name_2
+
+
+def add_this_arg(func):
+    # Behind-the-scenes function-wrapper (to be used as decorator for high-level functions "func")
+    # that shifts the parameters of "func" to the right by one, inserting a reference to local
+    # function "this" in the first parameter position
+    def this(*args, **kwargs):
+        return func(this, *args, **kwargs)
+    return this
+
+
+@add_this_arg
+def compute_pairwise_similarities(this, 
+                                  string_series_1: pd.Series,
                                   string_series_2: pd.Series,
                                   **kwargs) -> pd.Series:
     """
@@ -55,10 +77,21 @@ def compute_pairwise_similarities(string_series_1: pd.Series,
     :param kwargs: All other keyword arguments are passed to StringGrouperConfig
     :return: pandas.Series of similarity scores, the same length as string_series_1 and string_series_2
     """
-    return StringGrouper(string_series_1, string_series_2, **kwargs).dot()
-
-
-def group_similar_strings(strings_to_group: pd.Series,
+    sg = StringGrouperPrime(string_series_1, string_series_2, **kwargs)
+    # error handler (for input Series with values that are not strings)
+    if sg.non_strings_present:
+        sname = who(sg.bad_series_name,
+                    string_series_1, 'string_series_1',
+                    string_series_2, 'string_series_2')
+        this.issues = sg.issues
+        this.issues.rename(f'Non-strings in Series {sname}', inplace=True)
+        raise TypeError(sg.error_msg(sname, 'compute_pairwise_similarities'))
+    return sg.dot()
+
+
+@add_this_arg
+def group_similar_strings(this,
+                          strings_to_group: pd.Series,
                           string_ids: Optional[pd.Series] = None,
                           **kwargs) -> Union[pd.DataFrame, pd.Series]:
     """
@@ -76,11 +109,22 @@ def group_similar_strings(strings_to_group: pd.Series,
     :param kwargs: All other keyword arguments are passed to StringGrouperConfig. (Optional)
     :return: pandas.Series or pandas.DataFrame.
     """
-    string_grouper = StringGrouper(strings_to_group, master_id=string_ids, **kwargs).fit()
-    return string_grouper.get_groups()
-
-
-def match_most_similar(master: pd.Series,
+    sg = StringGrouperPrime(strings_to_group, master_id=string_ids, **kwargs)
+    # error handler (for input Series with values that are not strings)
+    if sg.non_strings_present:
+        sname = who(sg.bad_series_name,
+                    strings_to_group, 'strings_to_group',
+                    None, '')
+        this.issues = sg.issues
+        this.issues.rename(f'Non-strings in Series {sname}', inplace=True)
+        raise TypeError(sg.error_msg(sname, 'group_similar_strings'))
+    fit_sg = sg.fit()
+    return fit_sg.get_groups()
+
+
+@add_this_arg
+def match_most_similar(this,
+                       master: pd.Series,
                        duplicates: pd.Series,
                        master_id: Optional[pd.Series] = None,
                        duplicates_id: Optional[pd.Series] = None,
@@ -105,15 +149,26 @@ def match_most_similar(master: pd.Series,
     :param kwargs: All other keyword arguments are passed to StringGrouperConfig. (Optional)
     :return: pandas.Series or pandas.DataFrame.
     """
-    string_grouper = StringGrouper(master,
-                                   duplicates=duplicates,
-                                   master_id=master_id,
-                                   duplicates_id=duplicates_id,
-                                   **kwargs).fit()
-    return string_grouper.get_groups()
-
-
-def match_strings(master: pd.Series,
+    sg = StringGrouperPrime(master,
+                            duplicates=duplicates,
+                            master_id=master_id,
+                            duplicates_id=duplicates_id,
+                            **kwargs)
+    # error handler (for input Series with values that are not strings)
+    if sg.non_strings_present:
+        sname = who(sg.bad_series_name,
+                    master, 'master',
+                    duplicates, 'duplicates')
+        this.issues = sg.issues
+        this.issues.rename(f'Non-strings in Series {sname}', inplace=True)
+        raise TypeError(sg.error_msg(sname, 'match_most_similar'))
+    fit_sg = sg.fit()
+    return fit_sg.get_groups()
+
+
+@add_this_arg
+def match_strings(this,
+                  master: pd.Series,
                   duplicates: Optional[pd.Series] = None,
                   master_id: Optional[pd.Series] = None,
                   duplicates_id: Optional[pd.Series] = None,
@@ -130,12 +185,20 @@ def match_strings(master: pd.Series,
     :param kwargs: All other keyword arguments are passed to StringGrouperConfig.
     :return: pandas.Dataframe.
     """
-    string_grouper = StringGrouper(master,
-                                   duplicates=duplicates,
-                                   master_id=master_id,
-                                   duplicates_id=duplicates_id,
-                                   **kwargs).fit()
-    return string_grouper.get_matches()
+    sg = StringGrouperPrime(master,
+                            duplicates=duplicates,
+                            master_id=master_id,
+                            duplicates_id=duplicates_id,
+                            **kwargs)
+    if sg.non_strings_present:
+        sname = who(sg.bad_series_name,
+                    master, 'master',
+                    duplicates, 'duplicates')
+        this.issues = sg.issues
+        this.issues.rename(f'Non-strings in Series {sname}', inplace=True)
+        raise TypeError(sg.error_msg(sname, 'match_strings'))
+    fit_sg = sg.fit()
+    return fit_sg.get_matches()
 
 
 class StringGrouperConfig(NamedTuple):
@@ -194,6 +257,10 @@ class StringGrouperNotFitException(Exception):
     pass
 
 
+class StringGrouperNotAllStringsException(TypeError):
+    """Raised when either input Series master or duplicates contains non-strings"""
+    pass
+
 class StringGrouper(object):
     def __init__(self, master: pd.Series,
                  duplicates: Optional[pd.Series] = None,
@@ -213,9 +280,9 @@ def __init__(self, master: pd.Series,
         :param kwargs: All other keyword arguments are passed to StringGrouperConfig
         """
         # Validate match strings input
-        if not StringGrouper._is_series_of_strings(master) or \
-                (duplicates is not None and not StringGrouper._is_series_of_strings(duplicates)):
-            raise TypeError('Input does not consist of pandas.Series containing only Strings')
+        self.issues: pd.Series = None
+        self._check_string_series(master, 'master')
+        if (duplicates is not None): self._check_string_series(duplicates, 'duplicates')
         # Validate optional IDs input
         if not StringGrouper._is_input_data_combination_valid(duplicates, master_id, duplicates_id):
             raise Exception('List of data Series options is invalid')
@@ -601,6 +668,21 @@ def _get_indices_of(self, master_side: str, dupe_side: str) -> Tuple[pd.Series,
         dupe_indices = dupe_strings[dupe_strings == dupe_side].index.to_series().reset_index(drop=True)
         return master_indices, dupe_indices
     
+    def _check_string_series(self, series_to_test: pd.Series, which: str):
+        self.bad_series_name = which 
+        StringGrouper._check_type(series_to_test, which)
+        self._check_content(series_to_test, which)
+
+    def _check_content(self, series_to_test: pd.Series, which: str):
+        non_strings_exist = series_to_test.to_frame().applymap(
+            lambda x: (not isinstance(x, str)) or len(x) == 0
+        ).squeeze(axis=1)
+        if non_strings_exist.any():
+            self.issues = series_to_test[non_strings_exist]
+            sname = f' {series_to_test.name}' if series_to_test.name else ''
+            self.issues.rename(f'Non-strings in {which} Series{sname}', inplace=True)
+            raise StringGrouperNotAllStringsException
+
     def _validate_group_rep_specs(self):
         group_rep_options = (GROUP_REP_FIRST, GROUP_REP_CENTROID)
         if self._config.group_rep not in group_rep_options:
@@ -617,6 +699,11 @@ def _validate_replace_na_and_drop(self):
                 "index if the number of index-levels does not equal the number of index-columns."
             )
 
+    @staticmethod
+    def _check_type(series_to_test: pd.Series, which: str):
+        if not isinstance(series_to_test, pd.Series):
+            raise TypeError(f'Input {which} is not a  pandas.Series containing only Strings')
+
     @staticmethod
     def _symmetrize_matrix_and_fix_diagonal(AA: csr_matrix) -> csr_matrix:
         A = AA.tolil()
@@ -656,16 +743,6 @@ def _validate_strings_exist(master_side, dupe_side, master_strings, dupe_strings
         elif not dupe_strings.isin([dupe_side]).any():
             raise ValueError(f'{dupe_side} not found in StringGrouper dupe string series')
 
-    @staticmethod
-    def _is_series_of_strings(series_to_test: pd.Series) -> bool:
-        if not isinstance(series_to_test, pd.Series):
-            return False
-        elif series_to_test.to_frame().applymap(
-                    lambda x: not isinstance(x, str)
-                ).squeeze(axis=1).any():
-            return False
-        return True
-
     @staticmethod
     def _is_input_data_combination_valid(duplicates, master_id, duplicates_id) -> bool:
         if duplicates is None and (duplicates_id is not None) \
@@ -680,3 +757,36 @@ def _validate_id_data(master, duplicates, master_id, duplicates_id):
             raise Exception('Both master and master_id must be pandas.Series of the same length.')
         if duplicates is not None and duplicates_id is not None and len(duplicates) != len(duplicates_id):
             raise Exception('Both duplicates and duplicates_id must be pandas.Series of the same length.')
+
+
+class StringGrouperPrime(StringGrouper):
+    # (To be used in high-level functions)
+    # Child class of StringGrouper that captures information about the input Series
+    # that caused the StringGrouperNotAllStringsException even when the StringGrouper
+    # instance is not fully initialized
+    def __init__(self, master: pd.Series,
+                 duplicates: Optional[pd.Series] = None,
+                 master_id: Optional[pd.Series] = None,
+                 duplicates_id: Optional[pd.Series] = None,
+                 **kwargs):
+        self.issues = None
+        self.non_strings_present = False
+        self.bad_series_name = None
+        try:
+            super().__init__(master,
+                             duplicates=duplicates,
+                             master_id=master_id,
+                             duplicates_id=duplicates_id,
+                             **kwargs)
+        except StringGrouperNotAllStringsException:
+            self.non_strings_present = True
+            
+    def error_msg(self, bad_series_name, function_name):
+        nl = ':\n'
+        return (
+            f'\n\nERROR: Input pandas Series {bad_series_name} contains values that are not strings!\n'
+            f'Display the pandas Series \'{function_name}.issues\' to find where these values are'
+            f'{nl if 0 < len(self.issues) < 12 else "."}'
+            f'{self.issues.to_frame() if 0 < len(self.issues) < 12 else ""}'
+        )
+        
diff --git a/string_grouper/test/test_string_grouper.py b/string_grouper/test/test_string_grouper.py
index 518ebf52..7c4cbb39 100644
--- a/string_grouper/test/test_string_grouper.py
+++ b/string_grouper/test/test_string_grouper.py
@@ -5,7 +5,8 @@
 from string_grouper.string_grouper import DEFAULT_MIN_SIMILARITY, \
     DEFAULT_MAX_N_MATCHES, DEFAULT_REGEX, \
     DEFAULT_NGRAM_SIZE, DEFAULT_N_PROCESSES, DEFAULT_IGNORE_CASE, \
-    StringGrouperConfig, StringGrouper, StringGrouperNotFitException, \
+    StringGrouperConfig, StringGrouper, \
+    StringGrouperNotFitException, StringGrouperNotAllStringsException, \
     match_most_similar, group_similar_strings, match_strings,\
     compute_pairwise_similarities
 from unittest.mock import patch
@@ -144,12 +145,14 @@ def test_compute_pairwise_similarities_data_integrity(self):
         with self.assertRaises(Exception):
             _ = compute_pairwise_similarities(df1, df2[:-2])
 
-    @patch('string_grouper.string_grouper.StringGrouper')
+    @patch('string_grouper.string_grouper.StringGrouperPrime')
     def test_group_similar_strings(self, mock_StringGouper):
         """mocks StringGrouper to test if the high-level function group_similar_strings utilizes it as expected"""
         mock_StringGrouper_instance = mock_StringGouper.return_value
         mock_StringGrouper_instance.fit.return_value = mock_StringGrouper_instance
         mock_StringGrouper_instance.get_groups.return_value = 'whatever'
+        mock_StringGrouper_instance.non_strings_present = False
+        mock_StringGrouper_instance.error_msg.return_value = 'mock_error'
 
         test_series_1 = None
         test_series_id_1 = None
@@ -162,12 +165,14 @@ def test_group_similar_strings(self, mock_StringGouper):
         mock_StringGrouper_instance.get_groups.assert_called_once()
         self.assertEqual(df, 'whatever')
 
-    @patch('string_grouper.string_grouper.StringGrouper')
+    @patch('string_grouper.string_grouper.StringGrouperPrime')
     def test_match_most_similar(self, mock_StringGouper):
         """mocks StringGrouper to test if the high-level function match_most_similar utilizes it as expected"""
         mock_StringGrouper_instance = mock_StringGouper.return_value
         mock_StringGrouper_instance.fit.return_value = mock_StringGrouper_instance
         mock_StringGrouper_instance.get_groups.return_value = 'whatever'
+        mock_StringGrouper_instance.non_strings_present = False
+        mock_StringGrouper_instance.error_msg.return_value = 'mock_error'
 
         test_series_1 = None
         test_series_2 = None
@@ -184,12 +189,14 @@ def test_match_most_similar(self, mock_StringGouper):
         mock_StringGrouper_instance.get_groups.assert_called_once()
         self.assertEqual(df, 'whatever')
 
-    @patch('string_grouper.string_grouper.StringGrouper')
+    @patch('string_grouper.string_grouper.StringGrouperPrime')
     def test_match_strings(self, mock_StringGouper):
         """mocks StringGrouper to test if the high-level function match_strings utilizes it as expected"""
         mock_StringGrouper_instance = mock_StringGouper.return_value
         mock_StringGrouper_instance.fit.return_value = mock_StringGrouper_instance
         mock_StringGrouper_instance.get_matches.return_value = 'whatever'
+        mock_StringGrouper_instance.non_strings_present = False
+        mock_StringGrouper_instance.error_msg.return_value = 'mock_error'
 
         test_series_1 = None
         test_series_id_1 = None
@@ -792,10 +799,24 @@ def test_string_grouper_type_error(self):
         """StringGrouper should raise an typeerror master or duplicates are not a series of strings"""
         with self.assertRaises(TypeError):
             _ = StringGrouper('foo', 'bar')
-        with self.assertRaises(TypeError):
+        with self.assertRaises(StringGrouperNotAllStringsException):
             _ = StringGrouper(pd.Series(['foo', 'bar']), pd.Series(['foo', 1]))
-        with self.assertRaises(TypeError):
+        with self.assertRaises(StringGrouperNotAllStringsException):
             _ = StringGrouper(pd.Series(['foo', np.nan]), pd.Series(['foo', 'j']))
+        with self.assertRaises(StringGrouperNotAllStringsException):
+            _ = StringGrouper(pd.Series(['foo', 'j']), pd.Series(['foo', np.nan]))
+            
+    def test_not_all_strings_exception_in_high_level_fucntions(self):
+        good_series = pd.Series(['foo', 'bar'])
+        bad_series = pd.Series([None, 'foo', 1, np.nan], name='dupes')
+        with self.assertRaises(TypeError):
+            _ = compute_pairwise_similarities(good_series, bad_series.rename_axis('dupes_id'))
+        with self.assertRaises(TypeError):
+            _ = group_similar_strings(bad_series.rename_axis('string_id'))
+        with self.assertRaises(TypeError):
+            _ = match_most_similar(bad_series.rename('master'), good_series)
+        with self.assertRaises(TypeError):
+            _ = match_strings(good_series, bad_series.rename('dupes').rename_axis('dupes_id'))
 
     def test_prior_matches_added(self):
         """When a new match is added, any pre-existing matches should also be updated"""
@@ -803,7 +824,7 @@ def test_prior_matches_added(self):
             'microsoftoffice 365 home',
             'microsoftoffice 365 pers',
             'microsoft office'
-            ]
+        ]
 
         df = pd.DataFrame(sample, columns=['name'])
 

From 0bc533f9352675ca7a2131525a137e6173eb8268 Mon Sep 17 00:00:00 2001
From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com>
Date: Tue, 11 May 2021 12:14:11 +0200
Subject: [PATCH 08/11] made PEP8-conforming modifications

---
 string_grouper/string_grouper.py              | 94 ++++++++++---------
 string_grouper/test/test_string_grouper.py    | 24 ++---
 string_grouper_utils/string_grouper_utils.py  |  4 +-
 .../test/test_string_grouper_utils.py         |  4 +-
 4 files changed, 67 insertions(+), 59 deletions(-)

diff --git a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py
index a619d486..20ccbd43 100644
--- a/string_grouper/string_grouper.py
+++ b/string_grouper/string_grouper.py
@@ -18,27 +18,26 @@
 DEFAULT_IGNORE_CASE: bool = True  # ignores case by default
 DEFAULT_DROP_INDEX: bool = False  # includes index-columns in output
 DEFAULT_REPLACE_NA: bool = False    # when finding the most similar strings, does not replace NaN values in most
-                                    # similar string index-columns with corresponding duplicates-index values
-DEFAULT_INCLUDE_ZEROES: bool = True # when the minimum cosine similarity <=0, determines whether zero-similarity
-                                    # matches appear in the output 
+# similar string index-columns with corresponding duplicates-index values
+DEFAULT_INCLUDE_ZEROES: bool = True  # when the minimum cosine similarity <=0, determines whether zero-similarity
+# matches appear in the output
 DEFAULT_SUPPRESS_WARNING: bool = False  # when the minimum cosine similarity <=0 and zero-similarity matches are
-                                        # requested, determines whether or not to suppress the message warning that 
-                                        # max_n_matches may be too small 
+# requested, determines whether or not to suppress the message warning that max_n_matches may be too small
 GROUP_REP_CENTROID: str = 'centroid'    # Option value to select the string in each group with the largest
-                                        # similarity aggregate as group-representative:
+# similarity aggregate as group-representative:
 GROUP_REP_FIRST: str = 'first'  # Option value to select the first string in each group as group-representative:
-DEFAULT_GROUP_REP: str = GROUP_REP_CENTROID # chooses group centroid as group-representative by default
+DEFAULT_GROUP_REP: str = GROUP_REP_CENTROID  # chooses group centroid as group-representative by default
 
 # The following string constants are used by (but aren't [yet] options passed to) StringGrouper
 DEFAULT_COLUMN_NAME: str = 'side'   # used to name non-index columns of the output of StringGrouper.get_matches
-DEFAULT_ID_NAME: str = 'id' # used to name id-columns in the output of StringGrouper.get_matches
+DEFAULT_ID_NAME: str = 'id'  # used to name id-columns in the output of StringGrouper.get_matches
 LEFT_PREFIX: str = 'left_'  # used to prefix columns on the left of the output of StringGrouper.get_matches
 RIGHT_PREFIX: str = 'right_'    # used to prefix columns on the right of the output of StringGrouper.get_matches
 MOST_SIMILAR_PREFIX: str = 'most_similar_'  # used to prefix columns of the output of
-                                            # StringGrouper._get_nearest_matches
-DEFAULT_MASTER_NAME: str = 'master' # used to name non-index column of the output of StringGrouper.get_nearest_matches
+# StringGrouper._get_nearest_matches
+DEFAULT_MASTER_NAME: str = 'master'  # used to name non-index column of the output of StringGrouper.get_nearest_matches
 DEFAULT_MASTER_ID_NAME: str = f'{DEFAULT_MASTER_NAME}_{DEFAULT_ID_NAME}'    # used to name id-column of the output of
-                                                                            # StringGrouper.get_nearest_matches
+# StringGrouper.get_nearest_matches
 GROUP_REP_PREFIX: str = 'group_rep_'    # used to prefix and name columns of the output of StringGrouper._deduplicate
 
 
@@ -65,7 +64,7 @@ def this(*args, **kwargs):
 
 
 @add_this_arg
-def compute_pairwise_similarities(this, 
+def compute_pairwise_similarities(this,
                                   string_series_1: pd.Series,
                                   string_series_2: pd.Series,
                                   **kwargs) -> pd.Series:
@@ -214,11 +213,11 @@ class StringGrouperConfig(NamedTuple):
     Defaults to number of cores on a machine - 1.
     :param ignore_case: bool. Whether or not case should be ignored. Defaults to True (ignore case).
     :param ignore_index: whether or not to exclude string Series index-columns in output.  Defaults to False.
-    :param include_zeroes: when the minimum cosine similarity <=0, determines whether zero-similarity matches 
+    :param include_zeroes: when the minimum cosine similarity <=0, determines whether zero-similarity matches
     appear in the output.  Defaults to True.
     :param suppress_warning: when min_similarity <=0 and include_zeroes=True, determines whether or not to supress
     the message warning that max_n_matches may be too small.  Defaults to False.
-    :param replace_na: whether or not to replace NaN values in most similar string index-columns with 
+    :param replace_na: whether or not to replace NaN values in most similar string index-columns with
     corresponding duplicates-index values. Defaults to False.
     :param group_rep: str.  The scheme to select the group-representative.  Default is 'centroid'.
     The other choice is 'first'.
@@ -261,6 +260,7 @@ class StringGrouperNotAllStringsException(TypeError):
     """Raised when either input Series master or duplicates contains non-strings"""
     pass
 
+
 class StringGrouper(object):
     def __init__(self, master: pd.Series,
                  duplicates: Optional[pd.Series] = None,
@@ -282,7 +282,8 @@ def __init__(self, master: pd.Series,
         # Validate match strings input
         self.issues: pd.Series = None
         self._check_string_series(master, 'master')
-        if (duplicates is not None): self._check_string_series(duplicates, 'duplicates')
+        if (duplicates is not None):
+            self._check_string_series(duplicates, 'duplicates')
         # Validate optional IDs input
         if not StringGrouper._is_input_data_combination_valid(duplicates, master_id, duplicates_id):
             raise Exception('List of data Series options is invalid')
@@ -320,7 +321,7 @@ def fit(self) -> 'StringGrouper':
         matches = self._build_matches(master_matrix, duplicate_matrix)
         if self._duplicates is None:
             # the matrix of matches needs to be symmetric!!! (i.e., if A != B and A matches B; then B matches A)
-            # and each of its diagonal components must be equal to 1 
+            # and each of its diagonal components must be equal to 1
             matches = StringGrouper._symmetrize_matrix_and_fix_diagonal(matches)
         # retrieve all matches
         self._matches_list = self._get_matches_list(matches)
@@ -339,15 +340,15 @@ def dot(self) -> pd.Series:
     @validate_is_fit
     def get_matches(self,
                     ignore_index: Optional[bool] = None,
-                    include_zeroes: Optional[bool]=None,
-                    suppress_warning: Optional[bool]=None) -> pd.DataFrame:
+                    include_zeroes: Optional[bool] = None,
+                    suppress_warning: Optional[bool] = None) -> pd.DataFrame:
         """
         Returns a DataFrame with all the matches and their cosine similarity.
         If optional IDs are used, returned as extra columns with IDs matched to respective data rows
 
-        :param ignore_index: whether or not to exclude string Series index-columns in output.  Defaults to 
+        :param ignore_index: whether or not to exclude string Series index-columns in output.  Defaults to
         self._config.ignore_index.
-        :param include_zeroes: when the minimum cosine similarity <=0, determines whether zero-similarity matches 
+        :param include_zeroes: when the minimum cosine similarity <=0, determines whether zero-similarity matches
         appear in the output.  Defaults to self._config.include_zeroes.
         :param suppress_warning: when min_similarity <=0 and include_zeroes=True, determines whether or not to suppress
         the message warning that max_n_matches may be too small.  Defaults to self._config.suppress_warning.
@@ -372,19 +373,22 @@ def prefix_column_names(data: Union[pd.Series, pd.DataFrame], prefix: str):
             else:
                 return data.rename(f"{prefix}{data.name}")
 
-        if ignore_index is None: ignore_index = self._config.ignore_index
-        if include_zeroes is None: include_zeroes = self._config.include_zeroes
-        if suppress_warning is None: suppress_warning = self._config.suppress_warning
+        if ignore_index is None:
+            ignore_index = self._config.ignore_index
+        if include_zeroes is None:
+            include_zeroes = self._config.include_zeroes
+        if suppress_warning is None:
+            suppress_warning = self._config.suppress_warning
         if self._config.min_similarity > 0 or not include_zeroes:
             matches_list = self._matches_list
         elif include_zeroes:
             # Here's a fix to a bug pointed out by one GitHub user (@nbcvijanovic):
-            # the fix includes zero-similarity matches that are missing by default 
-            # in _matches_list due to our use of sparse matrices 
+            # the fix includes zero-similarity matches that are missing by default
+            # in _matches_list due to our use of sparse matrices
             non_matches_list = self._get_non_matches_list(suppress_warning)
             matches_list = self._matches_list if non_matches_list.empty else \
                 pd.concat([self._matches_list, non_matches_list], axis=0, ignore_index=True)
-            
+
         left_side, right_side = get_both_sides(self._master, self._duplicates, drop_index=ignore_index)
         similarity = matches_list.similarity.reset_index(drop=True)
         if self._master_id is None:
@@ -426,16 +430,18 @@ def get_groups(self,
          If there are IDs (master_id and/or duplicates_id) then the IDs corresponding to the string outputs
          above are returned as well altogether in a DataFrame.
 
-        :param ignore_index: whether or not to exclude string Series index-columns in output.  Defaults to 
+        :param ignore_index: whether or not to exclude string Series index-columns in output.  Defaults to
         self._config.ignore_index.
-        :param replace_na: whether or not to replace NaN values in most similar string index-columns with 
+        :param replace_na: whether or not to replace NaN values in most similar string index-columns with
         corresponding duplicates-index values. Defaults to self._config.replace_na.
          """
-        if ignore_index is None: ignore_index = self._config.ignore_index
+        if ignore_index is None:
+            ignore_index = self._config.ignore_index
         if self._duplicates is None:
             return self._deduplicate(ignore_index=ignore_index)
         else:
-            if replace_na is None: replace_na = self._config.replace_na
+            if replace_na is None:
+                replace_na = self._config.replace_na
             return self._get_nearest_matches(ignore_index=ignore_index, replace_na=replace_na)
 
     @validate_is_fit
@@ -524,7 +530,8 @@ def _get_non_matches_list(self, suppress_warning=False) -> pd.DataFrame:
         all_pairs = pd.MultiIndex.from_product([range(m_sz), range(d_sz)], names=['master_side', 'dupe_side'])
         matched_pairs = pd.MultiIndex.from_frame(self._matches_list[['master_side', 'dupe_side']])
         missing_pairs = all_pairs.difference(matched_pairs)
-        if missing_pairs.empty: return pd.DataFrame()
+        if missing_pairs.empty:
+            return pd.DataFrame()
         if (self._config.max_n_matches < d_sz) and not suppress_warning:
             warnings.warn(f'WARNING: max_n_matches={self._config.max_n_matches} may be too small!\n'
                           f'\t\t Some zero-similarity matches returned may be false!\n'
@@ -542,8 +549,8 @@ def _get_nearest_matches(self,
         master_label = f'{prefix}{self._master.name if self._master.name else DEFAULT_MASTER_NAME}'
         master = self._master.rename(master_label).reset_index(drop=ignore_index)
         dupes = self._duplicates.rename('duplicates').reset_index(drop=ignore_index)
-        
-        # Rename new master-columns to avoid possible conflict with new dupes-columns when later merging 
+
+        # Rename new master-columns to avoid possible conflict with new dupes-columns when later merging
         if isinstance(dupes, pd.DataFrame):
             master.rename(
                 columns={col: f'{prefix}{col}' for col in master.columns if str(col) != master_label},
@@ -573,14 +580,14 @@ def _get_nearest_matches(self,
         if self._master_id is not None:
             # Also update the master_id-series with the duplicates_id in cases were there is no match
             dupes_max_sim.loc[rows_to_update, master_id_label] = dupes_max_sim[rows_to_update].duplicates_id
-            
+
             # For some weird reason, pandas' merge function changes int-datatype columns to float when NaN values
             # appear within them. So here we change them back to their original datatypes if possible:
             if dupes_max_sim[master_id_label].dtype != self._master_id.dtype and \
-                self._duplicates_id.dtype == self._master_id.dtype:
+                    self._duplicates_id.dtype == self._master_id.dtype:
                 dupes_max_sim.loc[:, master_id_label] = \
-                dupes_max_sim.loc[:, master_id_label].astype(self._master_id.dtype)
-            
+                    dupes_max_sim.loc[:, master_id_label].astype(self._master_id.dtype)
+
         # Prepare the output:
         required_column_list = [master_label] if self._master_id is None else [master_id_label, master_label]
         index_column_list = \
@@ -590,13 +597,13 @@ def _get_nearest_matches(self,
             # Update the master index-columns with the duplicates index-column values in cases were there is no match
             dupes_index_columns = [col for col in dupes.columns if str(col) != 'duplicates']
             dupes_max_sim.loc[rows_to_update, index_column_list] = \
-            dupes_max_sim.loc[rows_to_update, dupes_index_columns].values
-            
+                dupes_max_sim.loc[rows_to_update, dupes_index_columns].values
+
             # Restore their original datatypes if possible:
             for m, d in zip(index_column_list, dupes_index_columns):
                 if dupes_max_sim[m].dtype != master[m].dtype and dupes[d].dtype == master[m].dtype:
                     dupes_max_sim.loc[:, m] = dupes_max_sim.loc[:, m].astype(master[m].dtype)
-                    
+
         # Make sure to keep same order as duplicates
         dupes_max_sim = dupes_max_sim.sort_values('dupe_side').set_index('dupe_side')
         output = dupes_max_sim[index_column_list + required_column_list]
@@ -667,9 +674,9 @@ def _get_indices_of(self, master_side: str, dupe_side: str) -> Tuple[pd.Series,
         master_indices = master_strings[master_strings == master_side].index.to_series().reset_index(drop=True)
         dupe_indices = dupe_strings[dupe_strings == dupe_side].index.to_series().reset_index(drop=True)
         return master_indices, dupe_indices
-    
+
     def _check_string_series(self, series_to_test: pd.Series, which: str):
-        self.bad_series_name = which 
+        self.bad_series_name = which
         StringGrouper._check_type(series_to_test, which)
         self._check_content(series_to_test, which)
 
@@ -780,7 +787,7 @@ def __init__(self, master: pd.Series,
                              **kwargs)
         except StringGrouperNotAllStringsException:
             self.non_strings_present = True
-            
+
     def error_msg(self, bad_series_name, function_name):
         nl = ':\n'
         return (
@@ -789,4 +796,3 @@ def error_msg(self, bad_series_name, function_name):
             f'{nl if 0 < len(self.issues) < 12 else "."}'
             f'{self.issues.to_frame() if 0 < len(self.issues) < 12 else ""}'
         )
-        
diff --git a/string_grouper/test/test_string_grouper.py b/string_grouper/test/test_string_grouper.py
index 7c4cbb39..1892a60c 100644
--- a/string_grouper/test/test_string_grouper.py
+++ b/string_grouper/test/test_string_grouper.py
@@ -12,8 +12,10 @@
 from unittest.mock import patch
 import warnings
 
-def mock_symmetrize_matrix(A: csr_matrix) -> csr_matrix:
-    return A
+
+def mock_symmetrize_matrix(x: csr_matrix) -> csr_matrix:
+    return x
+
 
 class SimpleExample(object):
     def __init__(self):
@@ -211,7 +213,7 @@ def test_match_strings(self, mock_StringGouper):
         side_effect=mock_symmetrize_matrix
     )
     def test_match_list_symmetry_without_symmetrize_function(self, mock_symmetrize_matrix):
-        """mocks StringGrouper._symmetrize_matches_list so that this test fails whenever _matches_list is 
+        """mocks StringGrouper._symmetrize_matches_list so that this test fails whenever _matches_list is
         **partially** symmetric which often occurs when the kwarg max_n_matches is too small"""
         simple_example = SimpleExample()
         df = simple_example.customers_df2['Customer Name']
@@ -225,7 +227,7 @@ def test_match_list_symmetry_without_symmetrize_function(self, mock_symmetrize_m
         # obtain the intersection between upper and upper_prime:
         intersection = upper_prime.merge(upper, how='inner', on=['master_side', 'dupe_side'])
         # if the intersection is empty then _matches_list is completely non-symmetric (this is acceptable)
-        # if the intersection is not empty then at least some matches are repeated.  
+        # if the intersection is not empty then at least some matches are repeated.
         # To make sure all (and not just some) matches are repeated, the lengths of
         # upper, upper_prime and their intersection should be identical.
         self.assertFalse(intersection.empty or len(upper) == len(upper_prime) == len(intersection))
@@ -243,7 +245,7 @@ def test_match_list_symmetry_with_symmetrize_function(self):
         # Obtain the intersection between upper and upper_prime:
         intersection = upper_prime.merge(upper, how='inner', on=['master_side', 'dupe_side'])
         # If the intersection is empty this means _matches_list is completely non-symmetric (this is acceptable)
-        # If the intersection is not empty this means at least some matches are repeated.  
+        # If the intersection is not empty this means at least some matches are repeated.
         # To make sure all (and not just some) matches are repeated, the lengths of
         # upper, upper_prime and their intersection should be identical.
         self.assertTrue(intersection.empty or len(upper) == len(upper_prime) == len(intersection))
@@ -276,7 +278,7 @@ def test_match_list_diagonal(self):
         self.assertEqual(num_self_joins, num_strings)
 
     def test_zero_min_similarity(self):
-        """Since sparse matrices exclude zero elements, this test ensures that zero similarity matches are 
+        """Since sparse matrices exclude zero elements, this test ensures that zero similarity matches are
         returned when min_similarity <= 0.  A bug related to this was first pointed out by @nbcvijanovic"""
         simple_example = SimpleExample()
         s_master = simple_example.customers_df['Customer Name']
@@ -285,7 +287,7 @@ def test_zero_min_similarity(self):
         pd.testing.assert_frame_equal(simple_example.expected_result_with_zeroes, matches)
 
     def test_zero_min_similarity_small_max_n_matches(self):
-        """This test ensures that a warning is issued when n_max_matches is suspected to be too small while 
+        """This test ensures that a warning is issued when n_max_matches is suspected to be too small while
         min_similarity <= 0 and include_zeroes is True"""
         simple_example = SimpleExample()
         s_master = simple_example.customers_df['Customer Name']
@@ -675,9 +677,9 @@ def test_get_groups_4_df_same_similarity(self):
         test_series_2 = pd.Series(['foooo', 'bar', 'baz', 'foooob'])
         test_series_id_1 = pd.Series(['A0', 'A1', 'A2', 'A3'])
         test_series_id_2 = pd.Series(['B0', 'B1', 'B2', 'B3'])
-        sg = StringGrouper(test_series_1, 
-                           test_series_2, 
-                           master_id=test_series_id_1, 
+        sg = StringGrouper(test_series_1,
+                           test_series_2,
+                           master_id=test_series_id_1,
                            duplicates_id=test_series_id_2,
                            ignore_index=True)
         sg = sg.fit()
@@ -805,7 +807,7 @@ def test_string_grouper_type_error(self):
             _ = StringGrouper(pd.Series(['foo', np.nan]), pd.Series(['foo', 'j']))
         with self.assertRaises(StringGrouperNotAllStringsException):
             _ = StringGrouper(pd.Series(['foo', 'j']), pd.Series(['foo', np.nan]))
-            
+
     def test_not_all_strings_exception_in_high_level_fucntions(self):
         good_series = pd.Series(['foo', 'bar'])
         bad_series = pd.Series([None, 'foo', 1, np.nan], name='dupes')
diff --git a/string_grouper_utils/string_grouper_utils.py b/string_grouper_utils/string_grouper_utils.py
index 11803a32..cc22cd1b 100644
--- a/string_grouper_utils/string_grouper_utils.py
+++ b/string_grouper_utils/string_grouper_utils.py
@@ -137,8 +137,8 @@ def get_column(col: Union[str, int, List[Union[str, int]]], data: pd.DataFrame):
 
 
 def parse_timestamps(timestamps: pd.Series, parserinfo=None, **kwargs) -> pd.Series:
-    error_msg = f"timestamps must be a Series of date-like or datetime-like strings"
-    error_msg += f" or datetime datatype or pandas Timestamp datatype or numbers"
+    error_msg = "timestamps must be a Series of date-like or datetime-like strings"
+    error_msg += " or datetime datatype or pandas Timestamp datatype or numbers"
     if is_series_of_type(str, timestamps):
         # if any of the strings is not datetime-like raise an exception
         if timestamps.to_frame().applymap(is_date).squeeze().all():
diff --git a/string_grouper_utils/test/test_string_grouper_utils.py b/string_grouper_utils/test/test_string_grouper_utils.py
index 3798e3cd..0c8a8ee4 100644
--- a/string_grouper_utils/test/test_string_grouper_utils.py
+++ b/string_grouper_utils/test/test_string_grouper_utils.py
@@ -1,8 +1,8 @@
 import unittest
 import pandas as pd
 from dateutil.parser import parse
-from string_grouper_utils.string_grouper_utils import new_group_rep_by_earliest_timestamp, new_group_rep_by_completeness, \
-    new_group_rep_by_highest_weight
+from string_grouper_utils.string_grouper_utils import new_group_rep_by_earliest_timestamp, \
+    new_group_rep_by_completeness, new_group_rep_by_highest_weight
 
 
 class SimpleExample(object):

From 02ad0300aa5ad7f68f7003f52177dea962bbd217 Mon Sep 17 00:00:00 2001
From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com>
Date: Tue, 11 May 2021 13:48:20 +0200
Subject: [PATCH 09/11] updated string_grouper_utils.py to quell unittest
 deprecation warnings

---
 string_grouper_utils/string_grouper_utils.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/string_grouper_utils/string_grouper_utils.py b/string_grouper_utils/string_grouper_utils.py
index cc22cd1b..e674367b 100644
--- a/string_grouper_utils/string_grouper_utils.py
+++ b/string_grouper_utils/string_grouper_utils.py
@@ -1,7 +1,7 @@
-import numpy as np
 import pandas as pd
 from typing import List, Optional, Union
 from dateutil.parser import parse
+from dateutil.tz import UTC
 from numbers import Number
 from datetime import datetime
 import re
@@ -143,13 +143,13 @@ def parse_timestamps(timestamps: pd.Series, parserinfo=None, **kwargs) -> pd.Ser
         # if any of the strings is not datetime-like raise an exception
         if timestamps.to_frame().applymap(is_date).squeeze().all():
             # convert strings to numpy datetime64
-            return timestamps.transform(lambda x: np.datetime64(parse(x, parserinfo, **kwargs)))
+            return timestamps.transform(lambda x: parse(x, parserinfo, **kwargs).astimezone(UTC))
     elif is_series_of_type(type(pd.Timestamp('15-1-2000')), timestamps):
         # convert pandas Timestamps to numpy datetime64
         return timestamps.transform(lambda x: x.to_numpy())
     elif is_series_of_type(datetime, timestamps):
         # convert python datetimes to numpy datetime64
-        return timestamps.transform(lambda x: np.datetime64(x))
+        return timestamps.transform(lambda x: x.astimezone(UTC))
     elif is_series_of_type(Number, timestamps):
         return timestamps
     raise Exception(error_msg)

From e4686e5dbff386a1d74e306d5d425ef3f09c6362 Mon Sep 17 00:00:00 2001
From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com>
Date: Sat, 3 Jul 2021 21:15:05 +0200
Subject: [PATCH 10/11] set max_n_matches=1 in match_most_similar() for a
 performance boost

---
 string_grouper/string_grouper.py           |  9 +++++----
 string_grouper/test/test_string_grouper.py | 12 ++++++------
 2 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py
index d1612511..243446ee 100644
--- a/string_grouper/string_grouper.py
+++ b/string_grouper/string_grouper.py
@@ -103,6 +103,7 @@ def match_most_similar(master: pd.Series,
     :param kwargs: All other keyword arguments are passed to StringGrouperConfig. (Optional)
     :return: pandas.Series or pandas.DataFrame.
     """
+    kwargs['max_n_matches'] = 1
     string_grouper = StringGrouper(master,
                                    duplicates=duplicates,
                                    master_id=master_id,
@@ -455,8 +456,8 @@ def _fit_vectorizer(self) -> TfidfVectorizer:
 
     def _build_matches(self, master_matrix: csr_matrix, duplicate_matrix: csr_matrix) -> csr_matrix:
         """Builds the cossine similarity matrix of two csr matrices"""
-        tf_idf_matrix_1 = master_matrix
-        tf_idf_matrix_2 = duplicate_matrix.transpose()
+        tf_idf_matrix_1 = duplicate_matrix
+        tf_idf_matrix_2 = master_matrix.transpose()
 
         optional_kwargs = {
             'return_best_ntop': True,
@@ -661,8 +662,8 @@ def _symmetrize_matrix(m_symmetric: lil_matrix) -> csr_matrix:
     def _get_matches_list(matches: csr_matrix) -> pd.DataFrame:
         """Returns a list of all the indices of matches"""
         r, c = matches.nonzero()
-        matches_list = pd.DataFrame({'master_side': r.astype(np.int64),
-                                     'dupe_side': c.astype(np.int64),
+        matches_list = pd.DataFrame({'master_side': c.astype(np.int64),
+                                     'dupe_side': r.astype(np.int64),
                                      'similarity': matches.data})
         return matches_list
 
diff --git a/string_grouper/test/test_string_grouper.py b/string_grouper/test/test_string_grouper.py
index f5f0aac8..c12b21db 100644
--- a/string_grouper/test/test_string_grouper.py
+++ b/string_grouper/test/test_string_grouper.py
@@ -405,8 +405,8 @@ def test_get_matches_single(self):
         sg = sg.fit()
         left_side = ['foo', 'foo', 'bar', 'baz', 'foo', 'foo']
         right_side = ['foo', 'foo', 'bar', 'baz', 'foo', 'foo']
-        left_index = [0, 0, 1, 2, 3, 3]
-        right_index = [0, 3, 1, 2, 0, 3]
+        left_index = [0, 3, 1, 2, 0, 3]
+        right_index = [0, 0, 1, 2, 3, 3]
         similarity = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
         expected_df = pd.DataFrame({'left_index': left_index, 'left_side': left_side,
                                     'similarity': similarity,
@@ -420,11 +420,11 @@ def test_get_matches_1_series_1_id_series(self):
         sg = StringGrouper(test_series_1, master_id=test_series_id_1)
         sg = sg.fit()
         left_side = ['foo', 'foo', 'bar', 'baz', 'foo', 'foo']
-        left_side_id = ['A0', 'A0', 'A1', 'A2', 'A3', 'A3']
-        left_index = [0, 0, 1, 2, 3, 3]
+        left_side_id = ['A0', 'A3', 'A1', 'A2', 'A0', 'A3']
+        left_index = [0, 3, 1, 2, 0, 3]
         right_side = ['foo', 'foo', 'bar', 'baz', 'foo', 'foo']
-        right_side_id = ['A0', 'A3', 'A1', 'A2', 'A0', 'A3']
-        right_index = [0, 3, 1, 2, 0, 3]
+        right_side_id = ['A0', 'A0', 'A1', 'A2', 'A3', 'A3']
+        right_index = [0, 0, 1, 2, 3, 3]
         similarity = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
         expected_df = pd.DataFrame({'left_index': left_index, 'left_side': left_side, 'left_id': left_side_id,
                                     'similarity': similarity,

From 859aa4b381d702ca8d16da4358fdcd9711a8d87c Mon Sep 17 00:00:00 2001
From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com>
Date: Mon, 5 Jul 2021 05:51:39 +0200
Subject: [PATCH 11/11] changed default value of kwarg max_n_matches to
 number of strings in master

---
 CHANGELOG.md                     | 11 +++++++++++
 README.md                        |  2 +-
 string_grouper/string_grouper.py |  6 +++---
 3 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e73d33fd..399a1b45 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Changed
+
+## [0.5.2?] - 2021-07-05
+
+* Provided a more user-friendly error message to be issued when any entries in the input string-Series are not strings.
+
+## [0.5.1?] - 2021-07-05
+
+* Improved the performance of the function `match_most_similar`.
+* Changed the default value of the keyword argument `max_n_matches` to the total number of strings in `master`.  (`max_n_matches` is now defined as the maximum number of matches allowed per string in `duplicates` \[or `master` if `duplicates` is not given\]).
+
 ## [0.5.0] - 2021-06-11
 
 ### Added
diff --git a/README.md b/README.md
index 1b18c3c9..2da24cf5 100644
--- a/README.md
+++ b/README.md
@@ -136,7 +136,7 @@ All functions are built using a class **`StringGrouper`**. This class can be use
    * **`ngram_size`**: The amount of characters in each n-gram. Default is `3`.
    * **`tfidf_matrix_dtype`**: The datatype for the tf-idf values of the matrix components. Allowed values are `numpy.float32` and `numpy.float64`.  Default is `numpy.float32`.  (Note: `numpy.float32` often leads to faster processing and a smaller memory footprint albeit less numerical precision than `numpy.float64`.)
    * **`regex`**: The regex string used to clean-up the input string. Default is `"[,-./]|\s"`.
-   * **`max_n_matches`**: The maximum number of matches allowed per string in `master`. Default is the number of strings in `duplicates` (or `master`, if `duplicates` is not given).
+   * **`max_n_matches`**: The maximum number of matching strings in `master` allowed per string in `duplicates` (or `master` if `duplicates` is not given). Default is the total number of strings in `master`.
    * **`min_similarity`**: The minimum cosine similarity for two strings to be considered a match.
     Defaults to `0.8`
    * **`number_of_processes`**: The number of processes used by the cosine similarity calculation. Defaults to
diff --git a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py
index 2ac60825..f7fa7b75 100644
--- a/string_grouper/string_grouper.py
+++ b/string_grouper/string_grouper.py
@@ -13,7 +13,6 @@
 DEFAULT_NGRAM_SIZE: int = 3
 DEFAULT_TFIDF_MATRIX_DTYPE: type = np.float32   # (only types np.float32 and np.float64 are allowed by sparse_dot_topn)
 DEFAULT_REGEX: str = r'[,-./]|\s'
-DEFAULT_MAX_N_MATCHES: int = 20
 DEFAULT_MIN_SIMILARITY: float = 0.8  # minimum cosine similarity for an item to be considered a match
 DEFAULT_N_PROCESSES: int = multiprocessing.cpu_count() - 1
 DEFAULT_IGNORE_CASE: bool = True  # ignores case by default
@@ -209,7 +208,8 @@ class StringGrouperConfig(NamedTuple):
     (Note: np.float32 often leads to faster processing and a smaller memory footprint albeit less precision
     than np.float64.)
     :param regex: str. The regex string used to cleanup the input string. Default is '[,-./]|\s'.
-    :param max_n_matches: int. The maximum number of matches allowed per string. Default is 20.
+    :param max_n_matches: int. The maximum number of matching strings in `master` allowed per string in
+    `duplicates` (or `master` if `duplicates` is not given). Default will be set by StringGrouper.
     :param min_similarity: float. The minimum cosine similarity for two strings to be considered a match.
     Defaults to 0.8.
     :param number_of_processes: int. The number of processes used by the cosine similarity calculation.
@@ -297,7 +297,7 @@ def __init__(self, master: pd.Series,
 
         self._config: StringGrouperConfig = StringGrouperConfig(**kwargs)
         if self._config.max_n_matches is None:
-            self._max_n_matches = len(self._master) if self._duplicates is None else len(self._duplicates)
+            self._max_n_matches = len(self._master)
         else:
             self._max_n_matches = self._config.max_n_matches