4141 # StringGrouper.get_nearest_matches
4242GROUP_REP_PREFIX : str = 'group_rep_' # used to prefix and name columns of the output of StringGrouper._deduplicate
4343
44+
4445# High level functions
4546
4647
47- def compute_pairwise_similarities (string_series_1 : pd .Series ,
def who(bad_StringGrouper_param, param_1, param_name_1, param_2, param_name_2):
    """Build a descriptive label for the input Series that failed validation.

    Private utility used by the high-level functions (those that call
    StringGrouper) to name the series parameter that caused a
    StringGrouperNotAllStringsException.

    :param bad_StringGrouper_param: name of the offending StringGrouper parameter;
        'master' selects the first (series, name) pair, anything else selects the second.
    :param param_1: object passed by the caller as its first series parameter.
    :param param_name_1: the caller's own name for param_1.
    :param param_2: object passed by the caller as its second series parameter (may be None).
    :param param_name_2: the caller's own name for param_2.
    :return: "'<series name>' (<parameter name>)" when the selected series is named,
        otherwise just the parameter name.
    """
    if bad_StringGrouper_param == 'master':
        series, param_name = param_1, param_name_1
    else:
        series, param_name = param_2, param_name_2
    # getattr: callers may pass None (or another object without .name) for a
    # parameter they do not have; that must not mask the real error being reported.
    series_name = getattr(series, 'name', None)
    # Only None/'' mean "unnamed"; a falsy-but-valid name such as the integer
    # column label 0 is still worth showing.
    if series_name is None or series_name == '':
        return param_name
    return f"'{series_name}' ({param_name})"
56+
57+
def add_this_arg(func):
    """Decorator that passes the decorated function a reference to its own wrapper.

    Behind-the-scenes function-wrapper (to be used as decorator for high-level
    functions "func") that shifts the parameters of "func" to the right by one,
    inserting a reference to the local wrapper function "this" in the first
    parameter position.  "func" can then attach attributes to "this", and callers
    read them back from the public (decorated) name.

    :param func: function whose first positional parameter receives the wrapper.
    :return: the wrapper, carrying func's name and docstring.
    """
    # Function-scope imports keep this block self-contained.
    import functools
    import inspect

    @functools.wraps(func)  # preserve __name__/__doc__ so help() stays useful
    def this(*args, **kwargs):
        return func(this, *args, **kwargs)

    # Hide the injected first parameter from the reported signature, since callers
    # never supply it.  This is purely cosmetic, so if the signature cannot be
    # introspected the wrapper is returned without the adjustment.
    try:
        signature = inspect.signature(func)
    except (TypeError, ValueError):
        return this
    parameters = list(signature.parameters.values())
    positional_kinds = (inspect.Parameter.POSITIONAL_ONLY,
                        inspect.Parameter.POSITIONAL_OR_KEYWORD)
    if parameters and parameters[0].kind in positional_kinds:
        this.__signature__ = signature.replace(parameters=parameters[1:])
    return this
65+
66+
67+ @add_this_arg
68+ def compute_pairwise_similarities (this ,
69+ string_series_1 : pd .Series ,
4870 string_series_2 : pd .Series ,
4971 ** kwargs ) -> pd .Series :
5072 """
@@ -55,10 +77,21 @@ def compute_pairwise_similarities(string_series_1: pd.Series,
5577 :param kwargs: All other keyword arguments are passed to StringGrouperConfig
5678 :return: pandas.Series of similarity scores, the same length as string_series_1 and string_series_2
5779 """
58- return StringGrouper (string_series_1 , string_series_2 , ** kwargs ).dot ()
59-
60-
61- def group_similar_strings (strings_to_group : pd .Series ,
80+ sg = StringGrouperPrime (string_series_1 , string_series_2 , ** kwargs )
81+ # error handler (for input Series with values that are not strings)
82+ if sg .non_strings_present :
83+ sname = who (sg .bad_series_name ,
84+ string_series_1 , 'string_series_1' ,
85+ string_series_2 , 'string_series_2' )
86+ this .issues = sg .issues
87+ this .issues .rename (f'Non-strings in Series { sname } ' , inplace = True )
88+ raise TypeError (sg .error_msg (sname , 'compute_pairwise_similarities' ))
89+ return sg .dot ()
90+
91+
92+ @add_this_arg
93+ def group_similar_strings (this ,
94+ strings_to_group : pd .Series ,
6295 string_ids : Optional [pd .Series ] = None ,
6396 ** kwargs ) -> Union [pd .DataFrame , pd .Series ]:
6497 """
@@ -76,11 +109,22 @@ def group_similar_strings(strings_to_group: pd.Series,
76109 :param kwargs: All other keyword arguments are passed to StringGrouperConfig. (Optional)
77110 :return: pandas.Series or pandas.DataFrame.
78111 """
79- string_grouper = StringGrouper (strings_to_group , master_id = string_ids , ** kwargs ).fit ()
80- return string_grouper .get_groups ()
81-
82-
83- def match_most_similar (master : pd .Series ,
112+ sg = StringGrouperPrime (strings_to_group , master_id = string_ids , ** kwargs )
113+ # error handler (for input Series with values that are not strings)
114+ if sg .non_strings_present :
115+ sname = who (sg .bad_series_name ,
116+ strings_to_group , 'strings_to_group' ,
117+ None , '' )
118+ this .issues = sg .issues
119+ this .issues .rename (f'Non-strings in Series { sname } ' , inplace = True )
120+ raise TypeError (sg .error_msg (sname , 'group_similar_strings' ))
121+ fit_sg = sg .fit ()
122+ return fit_sg .get_groups ()
123+
124+
125+ @add_this_arg
126+ def match_most_similar (this ,
127+ master : pd .Series ,
84128 duplicates : pd .Series ,
85129 master_id : Optional [pd .Series ] = None ,
86130 duplicates_id : Optional [pd .Series ] = None ,
@@ -105,15 +149,26 @@ def match_most_similar(master: pd.Series,
105149 :param kwargs: All other keyword arguments are passed to StringGrouperConfig. (Optional)
106150 :return: pandas.Series or pandas.DataFrame.
107151 """
108- string_grouper = StringGrouper (master ,
109- duplicates = duplicates ,
110- master_id = master_id ,
111- duplicates_id = duplicates_id ,
112- ** kwargs ).fit ()
113- return string_grouper .get_groups ()
114-
115-
116- def match_strings (master : pd .Series ,
152+ sg = StringGrouperPrime (master ,
153+ duplicates = duplicates ,
154+ master_id = master_id ,
155+ duplicates_id = duplicates_id ,
156+ ** kwargs )
157+ # error handler (for input Series with values that are not strings)
158+ if sg .non_strings_present :
159+ sname = who (sg .bad_series_name ,
160+ master , 'master' ,
161+ duplicates , 'duplicates' )
162+ this .issues = sg .issues
163+ this .issues .rename (f'Non-strings in Series { sname } ' , inplace = True )
164+ raise TypeError (sg .error_msg (sname , 'match_most_similar' ))
165+ fit_sg = sg .fit ()
166+ return fit_sg .get_groups ()
167+
168+
169+ @add_this_arg
170+ def match_strings (this ,
171+ master : pd .Series ,
117172 duplicates : Optional [pd .Series ] = None ,
118173 master_id : Optional [pd .Series ] = None ,
119174 duplicates_id : Optional [pd .Series ] = None ,
@@ -130,12 +185,20 @@ def match_strings(master: pd.Series,
130185 :param kwargs: All other keyword arguments are passed to StringGrouperConfig.
131186 :return: pandas.Dataframe.
132187 """
133- string_grouper = StringGrouper (master ,
134- duplicates = duplicates ,
135- master_id = master_id ,
136- duplicates_id = duplicates_id ,
137- ** kwargs ).fit ()
138- return string_grouper .get_matches ()
188+ sg = StringGrouperPrime (master ,
189+ duplicates = duplicates ,
190+ master_id = master_id ,
191+ duplicates_id = duplicates_id ,
192+ ** kwargs )
193+ if sg .non_strings_present :
194+ sname = who (sg .bad_series_name ,
195+ master , 'master' ,
196+ duplicates , 'duplicates' )
197+ this .issues = sg .issues
198+ this .issues .rename (f'Non-strings in Series { sname } ' , inplace = True )
199+ raise TypeError (sg .error_msg (sname , 'match_strings' ))
200+ fit_sg = sg .fit ()
201+ return fit_sg .get_matches ()
139202
140203
141204class StringGrouperConfig (NamedTuple ):
@@ -194,6 +257,10 @@ class StringGrouperNotFitException(Exception):
194257 pass
195258
196259
class StringGrouperNotAllStringsException(TypeError):
    """Raised when either input Series master or duplicates contains non-strings

    Derives from TypeError, so handlers that catch TypeError also catch it.
    """
263+
197264class StringGrouper (object ):
198265 def __init__ (self , master : pd .Series ,
199266 duplicates : Optional [pd .Series ] = None ,
@@ -213,9 +280,9 @@ def __init__(self, master: pd.Series,
213280 :param kwargs: All other keyword arguments are passed to StringGrouperConfig
214281 """
215282 # Validate match strings input
216- if not StringGrouper . _is_series_of_strings ( master ) or \
217- ( duplicates is not None and not StringGrouper . _is_series_of_strings ( duplicates )):
218- raise TypeError ( 'Input does not consist of pandas.Series containing only Strings ' )
283+ self . issues : pd . Series = None
284+ self . _check_string_series ( master , 'master' )
285+ if ( duplicates is not None ): self . _check_string_series ( duplicates , 'duplicates ' )
219286 # Validate optional IDs input
220287 if not StringGrouper ._is_input_data_combination_valid (duplicates , master_id , duplicates_id ):
221288 raise Exception ('List of data Series options is invalid' )
@@ -601,6 +668,21 @@ def _get_indices_of(self, master_side: str, dupe_side: str) -> Tuple[pd.Series,
601668 dupe_indices = dupe_strings [dupe_strings == dupe_side ].index .to_series ().reset_index (drop = True )
602669 return master_indices , dupe_indices
603670
671+ def _check_string_series (self , series_to_test : pd .Series , which : str ):
672+ self .bad_series_name = which
673+ StringGrouper ._check_type (series_to_test , which )
674+ self ._check_content (series_to_test , which )
675+
676+ def _check_content (self , series_to_test : pd .Series , which : str ):
677+ non_strings_exist = series_to_test .to_frame ().applymap (
678+ lambda x : (not isinstance (x , str )) or len (x ) == 0
679+ ).squeeze (axis = 1 )
680+ if non_strings_exist .any ():
681+ self .issues = series_to_test [non_strings_exist ]
682+ sname = f' { series_to_test .name } ' if series_to_test .name else ''
683+ self .issues .rename (f'Non-strings in { which } Series{ sname } ' , inplace = True )
684+ raise StringGrouperNotAllStringsException
685+
604686 def _validate_group_rep_specs (self ):
605687 group_rep_options = (GROUP_REP_FIRST , GROUP_REP_CENTROID )
606688 if self ._config .group_rep not in group_rep_options :
@@ -617,6 +699,11 @@ def _validate_replace_na_and_drop(self):
617699 "index if the number of index-levels does not equal the number of index-columns."
618700 )
619701
702+ @staticmethod
703+ def _check_type (series_to_test : pd .Series , which : str ):
704+ if not isinstance (series_to_test , pd .Series ):
705+ raise TypeError (f'Input { which } is not a pandas.Series containing only Strings' )
706+
620707 @staticmethod
621708 def _symmetrize_matrix_and_fix_diagonal (AA : csr_matrix ) -> csr_matrix :
622709 A = AA .tolil ()
@@ -656,16 +743,6 @@ def _validate_strings_exist(master_side, dupe_side, master_strings, dupe_strings
656743 elif not dupe_strings .isin ([dupe_side ]).any ():
657744 raise ValueError (f'{ dupe_side } not found in StringGrouper dupe string series' )
658745
659- @staticmethod
660- def _is_series_of_strings (series_to_test : pd .Series ) -> bool :
661- if not isinstance (series_to_test , pd .Series ):
662- return False
663- elif series_to_test .to_frame ().applymap (
664- lambda x : not isinstance (x , str )
665- ).squeeze (axis = 1 ).any ():
666- return False
667- return True
668-
669746 @staticmethod
670747 def _is_input_data_combination_valid (duplicates , master_id , duplicates_id ) -> bool :
671748 if duplicates is None and (duplicates_id is not None ) \
@@ -680,3 +757,36 @@ def _validate_id_data(master, duplicates, master_id, duplicates_id):
680757 raise Exception ('Both master and master_id must be pandas.Series of the same length.' )
681758 if duplicates is not None and duplicates_id is not None and len (duplicates ) != len (duplicates_id ):
682759 raise Exception ('Both duplicates and duplicates_id must be pandas.Series of the same length.' )
760+
761+
class StringGrouperPrime(StringGrouper):
    """StringGrouper variant that records, rather than propagates, non-string input errors.

    (To be used in high-level functions.)  Child class of StringGrouper that
    captures information about the input Series that caused the
    StringGrouperNotAllStringsException, even when the StringGrouper instance is
    not fully initialized.  Callers must check `non_strings_present` before using
    the instance any further.
    """

    # error_msg() prints the offending rows inline only when there are fewer than this many.
    _INLINE_ISSUES_LIMIT = 12

    def __init__(self, master: pd.Series,
                 duplicates: Optional[pd.Series] = None,
                 master_id: Optional[pd.Series] = None,
                 duplicates_id: Optional[pd.Series] = None,
                 **kwargs):
        """Initialize like StringGrouper, swallowing StringGrouperNotAllStringsException.

        Parameters are forwarded unchanged to StringGrouper.__init__.  Any other
        exception raised by the parent initializer still propagates.
        """
        # Defaults are assigned first so these attributes exist even if the
        # parent initializer raises before setting them.
        self.issues = None
        self.non_strings_present = False
        self.bad_series_name = None
        try:
            super().__init__(master,
                             duplicates=duplicates,
                             master_id=master_id,
                             duplicates_id=duplicates_id,
                             **kwargs)
        except StringGrouperNotAllStringsException:
            self.non_strings_present = True

    def error_msg(self, bad_series_name, function_name):
        """Compose the user-facing message describing the non-string input.

        :param bad_series_name: descriptive name of the offending Series.
        :param function_name: name of the high-level function whose `.issues`
            attribute holds the offending values.
        :return: the message; the offending rows are appended when there are
            between 1 and 11 of them, otherwise the message ends with a period.
        """
        issues = self.issues
        # issues stays None when no offending values were captured; treat that
        # like "nothing to show" instead of failing on len(None).
        show_inline = issues is not None and 0 < len(issues) < self._INLINE_ISSUES_LIMIT
        message = (
            f'\n\nERROR: Input pandas Series {bad_series_name} contains values that are not strings!\n'
            f'Display the pandas Series \'{function_name}.issues\' to find where these values are'
        )
        if show_inline:
            return f'{message}:\n{issues.to_frame()}'
        return f'{message}.'
792+
0 commit comments