version 1.0.5 released

sedthh · sedthh · commit 20f838067042 · 2018-04-20T18:02:25.000+02:00
- the tippmix stemmer is now called as stemmer.tippmix (previously tippmix.stemmer), because more stemmer functions are expected to be added to the stemmer module
- updated example_stemmer
- updated README.md example for tippmix stemmer
- updated lara/__init__.py (__all__ and imports now reference stemmer) and bumped the version to 1.0.5
- updated unpleasant entities
diff --git a/README.md b/README.md
@@ -135,7 +135,7 @@ elif 'are_you_a_robot' in chitchat_match:
 Can create features from short Hungarian texts for Machine Learning models, without large dictionaries:
 
 ```python
-from lara import tippmix, nlp
+from lara import stemmer, nlp
 
 text 	= '''
 	A szövegbányászat a strukturálatlan vagy kis mértékben strukturált 
@@ -147,7 +147,7 @@ text 	= '''
 	'''
 
 clean	= nlp.remove_stopwords(text)
-stems	= tippmix.stemmer(clean)
+stems	= stemmer.tippmix(clean)
 bigrams = nlp.ngram(stems,2)
 print(bigrams)
 
diff --git a/examples/example_stemmer.py b/examples/example_stemmer.py
@@ -2,7 +2,7 @@
 
 import os.path, sys
 sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), os.pardir))
-from lara import tippmix, nlp
+from lara import stemmer, nlp
 
 ''' Stemmer and n-gram example '''
 
@@ -22,7 +22,7 @@
 	print(bigrams)
 	
 	# A szöveg szavait stemmeli és ezekből bigramokat generál
-	stems	= tippmix.stemmer(text)
+	stems	= stemmer.tippmix(text)
 	bigrams = nlp.ngram(stems,2)
 	print(bigrams)
 	
@@ -33,6 +33,6 @@
 	print(bigrams)
 	
 	# A stopszavak eltávolítása után megmaradt szavakat stemmeli és ezekből generál bigramokat
-	stems	= tippmix.stemmer(text)
+	stems	= stemmer.tippmix(text)
 	bigrams = nlp.ngram(stems,2)
 	print(bigrams)
diff --git a/lara/__init__.py b/lara/__init__.py
@@ -2,12 +2,12 @@
 
 # Lara - Lingusitic Aim Recognizer API
 
-__all__				= 'nlp','parser','tippmix','entities'
-__version__ 		= '1.0.4'
+__all__				= 'nlp','parser','stemmer','entities'
+__version__ 		= '1.0.5'
 __version_info__	= tuple(int(num) for num in __version__.split('.'))
 
 import sys
 import lara.nlp
 import lara.parser
-import lara.tippmix
+import lara.stemmer
 import lara.entities
diff --git a/lara/entities.py b/lara/entities.py
@@ -159,7 +159,7 @@ def disallow():
 		"obscene"			: [{"stem":"(fel|le|meg|r[aá]|ki|be|oda|[oö]s+ze|bele|hoz+[aá])?bas*z+d?\s?(at)?(hat)?(us|a[dk]?|n?[aá][kl]|[aá]?t[aáo][lkm]?|ot+|ni|n[aá]n?[dlkm]?|va|meg)?","wordclass":"regex","exc":[{"stem":"megye"}]},{"stem":"((l[oó]|agy)?fasz|fas+z+op[oó]|geci\w*|kurv[aá]([eé]let|an+yj?[aá])?|(be)?fos|ribanc|(be)?szar|buzi|k[oö]cs[oö]g|pin[aá]|pics[aá]|p[oö]cs|p[eé]nisz|(kurva)?any[aá]d)([oö]?k)?r?[aáeoö]?(\w{0,2}[aeoöőu][dnklt]*)?(n[ae]k)?\b","wordclass":"regex","boundary":False},{"stem":"((mother)?f\s?u\s?c\s?k|shit(as{2})?|bitch|pus{2}y|cunt|fag(g?[eo]t)?|penis|blowjob|but{2}(plug|head)?|as{2}|arse|homo|gay|dyke|cock|dick(pic)?)(e?s|ing|e?r)?","wordclass":"regex"}],
 		"racist"				: [{"stem":"(fek[aá]|nig+(er|a)|n[aá]ci|cig[oó]|cig[aá]n+y|gypsy|dzsip[oó]|zsidr?[ó])[aáeégklnmstv]*","wordclass":"regex","boundary":False}],
 		"erotic"				: [{"stem":"(sz?ex|an[aá]l|[bv]agina|[bp][eé][np]isz?|creampie|cum|sperma?|fuck|homo(kos|sexu[aá]l(is)?)?|milf|bisexual|gay|dild[oó]|vibr[aá]tor|fel+atio|blow\s?job|whore|geci|pus{2}y|pics[aá]|pin[aá]|fasz|pis{2}|boner|dick(pic)?|x{3,}|hentai|catgirl|ec+hi|yaoi|loli|shot[aá]|\w*porn[oó]?(film)?)[aáeéioöőuüdgklmnprstvz]*","wordclass":"regex","boundary":False},{"stem":"maki verem"}],
-		"unpleasant"		: [{"stem":"AIDS","wordclass":"noun"},{"stem":"HIV","ignorecase":False},{"stem":"Hitler","wordclass":"noun"},{"stem":"(Sz?t[aá]lin|Len+in)\w*","wordclass":"regex"},{"stem":"pedof[ií]l(i[aá])?[aokltv]*","wordclass":"regex"},{"stem":"(fur{2}y|bestiality|yif{2}y?)[aáeégklnmstv]*","wordclass":"regex"}],
+		"unpleasant"		: [{"stem":"AIDS","wordclass":"noun"},{"stem":"HIV","ignorecase":False},{"stem":"Hitler","wordclass":"noun"},{"stem":"(Sz?t[aá]lin|Len+in)\w*","wordclass":"regex"},{"stem":"pedof[ií]l(i[aá])?[aokltv]*","wordclass":"regex"},{"stem":"(fur{2}y|bestiality|yif{2}y?)[aáeégklnmstv]*","wordclass":"regex"},{"stem":"mej?i?n\s?kamp+f+\w*","wordclass":"regex"},{"stem":"(any[aá]d|gy[oö]k[eé]r)\w*","wordclass":"regex"}],
 	}
 	
 # decide whether user is talking to you in a formal or informal way
diff --git a/lara/stemmer.py b/lara/stemmer.py
@@ -8,7 +8,7 @@
 
 
 # a stemmer that's slightly better than random guessing
-def stemmer(text):
+def tippmix(text):
 	if text:
 		word_list	= lara.nlp.tokenize(text)
 		if word_list:

Original file line number	Diff line number	Diff line change
`@@ -159,7 +159,7 @@ def disallow():`
`159`	`159`	`"obscene" : [{"stem":"(fel\|le\|meg\|r[aá]\|ki\|be\|oda\|[oö]s+ze\|bele\|hoz+[aá])?bas*z+d?\s?(at)?(hat)?(us\|a[dk]?\|n?[aá][kl]\|[aá]?t[aáo][lkm]?\|ot+\|ni\|n[aá]n?[dlkm]?\|va\|meg)?","wordclass":"regex","exc":[{"stem":"megye"}]},{"stem":"((l[oó]\|agy)?fasz\|fas+z+op[oó]\|geci\w*\|kurv[aá]([eé]let\|an+yj?[aá])?\|(be)?fos\|ribanc\|(be)?szar\|buzi\|k[oö]cs[oö]g\|pin[aá]\|pics[aá]\|p[oö]cs\|p[eé]nisz\|(kurva)?any[aá]d)([oö]?k)?r?[aáeoö]?(\w{0,2}[aeoöőu][dnklt]*)?(n[ae]k)?\b","wordclass":"regex","boundary":False},{"stem":"((mother)?f\s?u\s?c\s?k\|shit(as{2})?\|bitch\|pus{2}y\|cunt\|fag(g?[eo]t)?\|penis\|blowjob\|but{2}(plug\|head)?\|as{2}\|arse\|homo\|gay\|dyke\|cock\|dick(pic)?)(e?s\|ing\|e?r)?","wordclass":"regex"}],`
`160`	`160`	`"racist" : [{"stem":"(fek[aá]\|nig+(er\|a)\|n[aá]ci\|cig[oó]\|cig[aá]n+y\|gypsy\|dzsip[oó]\|zsidr?[ó])[aáeégklnmstv]*","wordclass":"regex","boundary":False}],`
`161`	`161`	`"erotic" : [{"stem":"(sz?ex\|an[aá]l\|[bv]agina\|[bp][eé][np]isz?\|creampie\|cum\|sperma?\|fuck\|homo(kos\|sexu[aá]l(is)?)?\|milf\|bisexual\|gay\|dild[oó]\|vibr[aá]tor\|fel+atio\|blow\s?job\|whore\|geci\|pus{2}y\|pics[aá]\|pin[aá]\|fasz\|pis{2}\|boner\|dick(pic)?\|x{3,}\|hentai\|catgirl\|ec+hi\|yaoi\|loli\|shot[aá]\|\w*porn[oó]?(film)?)[aáeéioöőuüdgklmnprstvz]*","wordclass":"regex","boundary":False},{"stem":"maki verem"}],`
`162`		`- "unpleasant" : [{"stem":"AIDS","wordclass":"noun"},{"stem":"HIV","ignorecase":False},{"stem":"Hitler","wordclass":"noun"},{"stem":"(Sz?t[aá]lin\|Len+in)\w*","wordclass":"regex"},{"stem":"pedof[ií]l(i[aá])?[aokltv]*","wordclass":"regex"},{"stem":"(fur{2}y\|bestiality\|yif{2}y?)[aáeégklnmstv]*","wordclass":"regex"}],`
	`162`	`+ "unpleasant" : [{"stem":"AIDS","wordclass":"noun"},{"stem":"HIV","ignorecase":False},{"stem":"Hitler","wordclass":"noun"},{"stem":"(Sz?t[aá]lin\|Len+in)\w*","wordclass":"regex"},{"stem":"pedof[ií]l(i[aá])?[aokltv]*","wordclass":"regex"},{"stem":"(fur{2}y\|bestiality\|yif{2}y?)[aáeégklnmstv]*","wordclass":"regex"},{"stem":"mej?i?n\s?kamp+f+\w*","wordclass":"regex"},{"stem":"(any[aá]d\|gy[oö]k[eé]r)\w*","wordclass":"regex"}],`
`163`	`163`	`}`
`164`	`164`
`165`	`165`	`# decide whether user is talking to you in a formal or informal way`