Skip to content

Commit 1c5bf66

Browse files
committed
parser Intents() clean() now has a "deep" cleaning function that also removes Intents found in "inc" attribute
- fixed major bug in parser Intents() clean() - added deep cleaning function for parser Intents() clean that also removes Intents in "inc" attribute if both the parent Intent and at least one of the "inc" Intents were found - parser Intents() clean() now removes "typo_stem" matches as well and returns the merged results - test_entities now takes accents, improved regex declarations, accidental regex declarations, and affixes and prefixes not declared as lists into account and prints a warning message - increased version number to 1.1.0 due to major update
1 parent 43aeb77 commit 1c5bf66

File tree

3 files changed

+63
-21
lines changed

3 files changed

+63
-21
lines changed

lara/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
# Lara - Lingusitic Aim Recognizer API
44

55
__all__ = 'nlp','parser','stemmer','entities'
6-
__version__ = '1.0.8'
6+
__version__ = '1.1.0'
77
__version_info__ = tuple(int(num) for num in __version__.split('.'))
88

99
import sys

lara/parser.py

Lines changed: 30 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -271,21 +271,22 @@ def match_set(self, text=""):
271271
return set()
272272

273273
# Remove matches from text
274-
def clean(self, text=""):
274+
def clean(self, text="", deep=False):
275275
if text:
276-
return self._get_clean_text(text)
276+
return self._get_clean_text(text,deep)
277277
else:
278278
return ""
279279

280280
# Returns text without the inflected forms of matched intents
281-
def _get_clean_text(self, text):
282-
text = lara.nlp.trim(text)
281+
def _get_clean_text(self, text, deep):
282+
text = lara.nlp.trim(text)
283283
typo_text = lara.nlp.strip_accents(lara.nlp.remove_double_letters(text))
284-
fix_text = text
284+
c_text = text
285+
c_typo_text = typo_text
285286
if text:
286287
for key, value in self.intents.items():
287288
ignore = False
288-
allow = -1
289+
allow = -1
289290
for item in self.intents[key]:
290291
if 'exc' in item and item['exc']:
291292
for exc in item['exc']:
@@ -301,14 +302,25 @@ def _get_clean_text(self, text):
301302
allow = 1
302303
elif self._match_pattern(typo_text,inc,True)[0]: # typo_stem
303304
allow = 1
304-
if not ignore and allow in (-1,1):
305-
max_words = _re.words(text)
306-
for item in self.intents[key]:
305+
if not ignore and allow in (-1,1):
306+
max_words = _re.words(text)
307307
if item['max_words'] <= max_words:
308-
fix_text = self._match_pattern(fix_text,item,False,True) # stem
309-
fix_text = self._match_pattern(fix_text,item,True,True) # typo_stem
310-
311-
return fix_text
308+
c_text = self._match_pattern(c_text,item,False,True,deep) # stem
309+
c_typo_text = self._match_pattern(c_typo_text,item,True,True,deep) # typo_stem
310+
# attempt to merge results
311+
c_text = lara.nlp.trim(c_text).split()
312+
c_typo_text = lara.nlp.trim(c_typo_text).split()
313+
fix_text = []
314+
last = 0
315+
for word in c_text:
316+
x_word = lara.nlp.strip_accents(lara.nlp.remove_double_letters(word))
317+
for i in range(last,len(c_typo_text)):
318+
if x_word==c_typo_text[i]:
319+
fix_text.append(word)
320+
last = i
321+
break
322+
return ' '.join(fix_text)
323+
return text
312324

313325
# Get score for intents in text
314326
def _get_score(self, text, greedy=True):
@@ -355,12 +367,12 @@ def _get_score(self, text, greedy=True):
355367
return score
356368

357369
# Find an intent in text
358-
def _match_pattern(self, text, item, is_clean=False, delete=False):
370+
def _match_pattern(self, text, item, is_clean=False, delete=False, deep=False):
359371
if text:
360372
if not delete and item['max_words']:
361373
if _re.words(text)>item['max_words']:
362374
return (False,0)
363-
375+
364376
if is_clean:
365377
select = 'typo_'
366378
else:
@@ -383,6 +395,9 @@ def _match_pattern(self, text, item, is_clean=False, delete=False):
383395
match = match[0]
384396
if item['match_stem'] or (item['ignorecase'] and match.lower() != item[select+'stem'].lower()) or (match.lower() != item[select+'stem']):
385397
tmp = _re.sub(boundary+r'('+re.escape(match)+r')'+boundary,re.IGNORECASE,'',tmp)
398+
if deep and 'inc' in item:
399+
for inc in item['inc']:
400+
tmp = self._match_pattern(tmp, inc, is_clean, delete, deep)
386401
return tmp
387402
else:
388403
if not item['match_stem']:

tests/test_entities.py

Lines changed: 32 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,17 +8,22 @@
88
valid_keys = set(['stem','clean_stem','affix','clean_affix','prefix','clean_prefix','wordclass','inc','exc','score','clean_score','match_stem','ignorecase','boundary','max_words'])
99
valid_class = set(['noun','verb','adjective','regex','emoji','special'])
1010
is_regex = set(['|','(',')','+','*','+','?','\\','[',']','{','}'])
11+
accents = set(['á','Á','é','É','í','Í','ü','Ü','ű','Ű','ú','Ú','ö','Ö','ő','Ő','ó','Ó'])
1112

1213
def validate_intent(intents):
1314
for intent,declaration in intents.items():
14-
for item in declaration:
15+
for char in intent:
16+
if char in accents:
17+
print(intent,"key has accents in declaration")
18+
break
19+
for item in declaration:
1520
validate_intent_item(item,intent)
1621
if 'inc' in item:
1722
for sub_item in item['inc']:
1823
validate_intent_item(sub_item,intent)
1924
if 'exc' in item:
2025
for sub_item in item['exc']:
21-
validate_intent_item(sub_item,intent)
26+
validate_intent_item(sub_item,intent)
2227

2328
def validate_intent_item(item,intent):
2429
for key in item:
@@ -27,6 +32,12 @@ def validate_intent_item(item,intent):
2732
if 'wordclass' in item:
2833
if item['wordclass'] not in valid_class:
2934
print(intent,'has invalid "wordclass" declared')
35+
if 'affix' in item:
36+
if not isinstance(item['affix'], list) and not isinstance(item['affix'], tuple):
37+
print(intent,'has "affix" declared, but not as a list:',item['stem'])
38+
if 'prefix' in item:
39+
if not isinstance(item['prefix'], list) and not isinstance(item['prefix'], tuple):
40+
print(intent,'has "prefix" declared, but not as a list:',item['stem'])
3041
if 'stem' not in item:
3142
print(intent,'missing "stem" key')
3243
else:
@@ -40,7 +51,7 @@ def validate_intent_item(item,intent):
4051
switch = True
4152
elif char==']':
4253
switch = False
43-
elif char in ('á','Á','é','É','í','Í','ü','Ü','ű','Ű','ú','Ú','ö','Ö','ő','Ő','ó','Ó'):
54+
elif char in accents:
4455
if not switch:
4556
print(intent,'has accents declared in regular expression without counterparts:',item['stem'])
4657
break
@@ -51,13 +62,29 @@ def validate_intent_item(item,intent):
5162
last = last+char
5263
else:
5364
last = char
65+
elif 'wordclass' in item and item['wordclass']=='regex':
66+
really = False
67+
for char in item['stem']:
68+
if not char.isalnum() and char not in (' ','-'):
69+
really = True
70+
break
71+
if not really:
72+
print(intent,'probably has a regex "wordclass" declared by accident in',item['stem'])
5473
if any(test in item['stem'] for test in is_regex):
5574
if 'wordclass' not in item or item['wordclass']!='regex':
5675
print(intent,'probably has a regex "wordclass" declared otherwise in',item['stem'])
57-
76+
5877
@pytest.mark.parametrize("entity", [
5978
"common","commands","counties","dow","smalltalk","emoji","disallow","tone"
6079
])
6180
def test_entities(entity):
6281
parenthesis_check = eval('parser.Intents(entities.'+entity+'()).match_set("test")')
63-
eval('validate_intent(entities.'+entity+'())')
82+
eval('validate_intent(entities.'+entity+'())')
83+
84+
85+
86+
valid_keys = set(['stem','clean_stem','affix','clean_affix','prefix','clean_prefix','wordclass','inc','exc','score','clean_score','match_stem','ignorecase','boundary','max_words'])
87+
valid_class = set(['noun','verb','adjective','regex','emoji','special'])
88+
is_regex = set(['|','(',')','+','*','+','?','\\','[',']','{','}'])
89+
accents = set(['á','Á','é','É','í','Í','ü','Ü','ű','Ű','ú','Ú','ö','Ö','ő','Ő','ó','Ó'])
90+

0 commit comments

Comments
 (0)