Skip to content

Commit 1c5bf66

Browse files
committed
parser Intents() clean() now has a "deep" cleaning function that also removes Intents found in "inc" attribute
- fixed major bug in parser Intents() clean() - added deep cleaning function for parser Intents() clean that also removes Intents in "inc" attribute if both the parent Intent and at least one of the "inc" Intents were found - parser Intents() clean() now removes "typo_stem" matches as well and returns the merged results - test_entities now takes accents, improved regex declarations, accidental regex declarations, and affixes and prefixes not declared as lists into account and prints a warning message - increased version number to 1.1.0 due to major update
1 parent 43aeb77 commit 1c5bf66

File tree

3 files changed

+63
-21
lines changed

3 files changed

+63
-21
lines changed

lara/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
# Lara - Lingusitic Aim Recognizer API
44

55
__all__ = 'nlp','parser','stemmer','entities'
6-
__version__ = '1.0.8'
6+
__version__ = '1.1.0'
77
__version_info__ = tuple(int(num) for num in __version__.split('.'))
88

99
import sys

lara/parser.py

Lines changed: 30 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -271,21 +271,22 @@ def match_set(self, text=""):
271271
return set()
272272

273273
# Remove matches from text
274-
def clean(self, text=""):
274+
def clean(self, text="", deep=False):
275275
if text:
276-
return self._get_clean_text(text)
276+
return self._get_clean_text(text,deep)
277277
else:
278278
return ""
279279

280280
# Returns text without the inflected forms of matched intents
281-
def _get_clean_text(self, text):
282-
text = lara.nlp.trim(text)
281+
def _get_clean_text(self, text, deep):
282+
text = lara.nlp.trim(text)
283283
typo_text = lara.nlp.strip_accents(lara.nlp.remove_double_letters(text))
284-
fix_text = text
284+
c_text = text
285+
c_typo_text = typo_text
285286
if text:
286287
for key, value in self.intents.items():
287288
ignore = False
288-
allow = -1
289+
allow = -1
289290
for item in self.intents[key]:
290291
if 'exc' in item and item['exc']:
291292
for exc in item['exc']:
@@ -301,14 +302,25 @@ def _get_clean_text(self, text):
301302
allow = 1
302303
elif self._match_pattern(typo_text,inc,True)[0]: # typo_stem
303304
allow = 1
304-
if not ignore and allow in (-1,1):
305-
max_words = _re.words(text)
306-
for item in self.intents[key]:
305+
if not ignore and allow in (-1,1):
306+
max_words = _re.words(text)
307307
if item['max_words'] <= max_words:
308-
fix_text = self._match_pattern(fix_text,item,False,True) # stem
309-
fix_text = self._match_pattern(fix_text,item,True,True) # typo_stem
310-
311-
return fix_text
308+
c_text = self._match_pattern(c_text,item,False,True,deep) # stem
309+
c_typo_text = self._match_pattern(c_typo_text,item,True,True,deep) # typo_stem
310+
# attempt to merge results
311+
c_text = lara.nlp.trim(c_text).split()
312+
c_typo_text = lara.nlp.trim(c_typo_text).split()
313+
fix_text = []
314+
last = 0
315+
for word in c_text:
316+
x_word = lara.nlp.strip_accents(lara.nlp.remove_double_letters(word))
317+
for i in range(last,len(c_typo_text)):
318+
if x_word==c_typo_text[i]:
319+
fix_text.append(word)
320+
last = i
321+
break
322+
return ' '.join(fix_text)
323+
return text
312324

313325
# Get score for intents in text
314326
def _get_score(self, text, greedy=True):
@@ -355,12 +367,12 @@ def _get_score(self, text, greedy=True):
355367
return score
356368

357369
# Find an intent in text
358-
def _match_pattern(self, text, item, is_clean=False, delete=False):
370+
def _match_pattern(self, text, item, is_clean=False, delete=False, deep=False):
359371
if text:
360372
if not delete and item['max_words']:
361373
if _re.words(text)>item['max_words']:
362374
return (False,0)
363-
375+
364376
if is_clean:
365377
select = 'typo_'
366378
else:
@@ -383,6 +395,9 @@ def _match_pattern(self, text, item, is_clean=False, delete=False):
383395
match = match[0]
384396
if item['match_stem'] or (item['ignorecase'] and match.lower() != item[select+'stem'].lower()) or (match.lower() != item[select+'stem']):
385397
tmp = _re.sub(boundary+r'('+re.escape(match)+r')'+boundary,re.IGNORECASE,'',tmp)
398+
if deep and 'inc' in item:
399+
for inc in item['inc']:
400+
tmp = self._match_pattern(tmp, inc, is_clean, delete, deep)
386401
return tmp
387402
else:
388403
if not item['match_stem']:

tests/test_entities.py

Lines changed: 32 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,17 +8,22 @@
88
valid_keys = set(['stem','clean_stem','affix','clean_affix','prefix','clean_prefix','wordclass','inc','exc','score','clean_score','match_stem','ignorecase','boundary','max_words'])
99
valid_class = set(['noun','verb','adjective','regex','emoji','special'])
1010
is_regex = set(['|','(',')','+','*','+','?','\\','[',']','{','}'])
11+
accents = set(['á','Á','é','É','í','Í','ü','Ü','ű','Ű','ú','Ú','ö','Ö','ő','Ő','ó','Ó'])
1112

1213
def validate_intent(intents):
1314
for intent,declaration in intents.items():
14-
for item in declaration:
15+
for char in intent:
16+
if char in accents:
17+
print(intent,"key has accents in declaration")
18+
break
19+
for item in declaration:
1520
validate_intent_item(item,intent)
1621
if 'inc' in item:
1722
for sub_item in item['inc']:
1823
validate_intent_item(sub_item,intent)
1924
if 'exc' in item:
2025
for sub_item in item['exc']:
21-
validate_intent_item(sub_item,intent)
26+
validate_intent_item(sub_item,intent)
2227

2328
def validate_intent_item(item,intent):
2429
for key in item:
@@ -27,6 +32,12 @@ def validate_intent_item(item,intent):
2732
if 'wordclass' in item:
2833
if item['wordclass'] not in valid_class:
2934
print(intent,'has invalid "wordclass" declared')
35+
if 'affix' in item:
36+
if not isinstance(item['affix'], list) and not isinstance(item['affix'], tuple):
37+
print(intent,'has "affix" declared, but not as a list:',item['stem'])
38+
if 'prefix' in item:
39+
if not isinstance(item['prefix'], list) and not isinstance(item['prefix'], tuple):
40+
print(intent,'has "prefix" declared, but not as a list:',item['stem'])
3041
if 'stem' not in item:
3142
print(intent,'missing "stem" key')
3243
else:
@@ -40,7 +51,7 @@ def validate_intent_item(item,intent):
4051
switch = True
4152
elif char==']':
4253
switch = False
43-
elif char in ('á','Á','é','É','í','Í','ü','Ü','ű','Ű','ú','Ú','ö','Ö','ő','Ő','ó','Ó'):
54+
elif char in accents:
4455
if not switch:
4556
print(intent,'has accents declared in regular expression without counterparts:',item['stem'])
4657
break
@@ -51,13 +62,29 @@ def validate_intent_item(item,intent):
5162
last = last+char
5263
else:
5364
last = char
65+
elif 'wordclass' in item and item['wordclass']=='regex':
66+
really = False
67+
for char in item['stem']:
68+
if not char.isalnum() and char not in (' ','-'):
69+
really = True
70+
break
71+
if not really:
72+
print(intent,'probably has a regex "wordclass" declared by accident in',item['stem'])
5473
if any(test in item['stem'] for test in is_regex):
5574
if 'wordclass' not in item or item['wordclass']!='regex':
5675
print(intent,'probably has a regex "wordclass" declared otherwise in',item['stem'])
57-
76+
5877
@pytest.mark.parametrize("entity", [
5978
"common","commands","counties","dow","smalltalk","emoji","disallow","tone"
6079
])
6180
def test_entities(entity):
6281
parenthesis_check = eval('parser.Intents(entities.'+entity+'()).match_set("test")')
63-
eval('validate_intent(entities.'+entity+'())')
82+
eval('validate_intent(entities.'+entity+'())')
83+
84+
85+
86+
valid_keys = set(['stem','clean_stem','affix','clean_affix','prefix','clean_prefix','wordclass','inc','exc','score','clean_score','match_stem','ignorecase','boundary','max_words'])
87+
valid_class = set(['noun','verb','adjective','regex','emoji','special'])
88+
is_regex = set(['|','(',')','+','*','+','?','\\','[',']','{','}'])
89+
accents = set(['á','Á','é','É','í','Í','ü','Ü','ű','Ű','ú','Ú','ö','Ö','ő','Ő','ó','Ó'])
90+

0 commit comments

Comments
 (0)