Skip to content

Commit 20f8380

Browse files
committed
version 1.0.5 released
- tippmix stemmer is now stemmer.tippmix instead of the other way around as more stemmer functions are to be expected - updated example_stemmer - updated README.md example for tippmix stemmer - updated init and increased version - updated unpleasant entities
1 parent dac36f4 commit 20f8380

File tree

5 files changed

+10
-10
lines changed

5 files changed

+10
-10
lines changed

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,7 @@ elif 'are_you_a_robot' in chitchat_match:
135135
Can create features from short Hungarian texts for Machine Learning models, without large dictionaries:
136136

137137
```python
138-
from lara import tippmix, nlp
138+
from lara import stemmer, nlp
139139

140140
text = '''
141141
A szövegbányászat a strukturálatlan vagy kis mértékben strukturált
@@ -147,7 +147,7 @@ text = '''
147147
'''
148148

149149
clean = nlp.remove_stopwords(text)
150-
stems = tippmix.stemmer(clean)
150+
stems = stemmer.tippmix(clean)
151151
bigrams = nlp.ngram(stems,2)
152152
print(bigrams)
153153

examples/example_stemmer.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
import os.path, sys
44
sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), os.pardir))
5-
from lara import tippmix, nlp
5+
from lara import stemmer, nlp
66

77
''' Stemmer and n-gram example '''
88

@@ -22,7 +22,7 @@
2222
print(bigrams)
2323

2424
# A szöveg szavait stemmeli és ezekből bigramokat generál
25-
stems = tippmix.stemmer(text)
25+
stems = stemmer.tippmix(text)
2626
bigrams = nlp.ngram(stems,2)
2727
print(bigrams)
2828

@@ -33,6 +33,6 @@
3333
print(bigrams)
3434

3535
# A stopszavak eltávolítása után megmaradt szavakat stemmeli és ezekből generál bigramokat
36-
stems = tippmix.stemmer(text)
36+
stems = stemmer.tippmix(text)
3737
bigrams = nlp.ngram(stems,2)
3838
print(bigrams)

lara/__init__.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,12 @@
22

33
# Lara - Linguistic Aim Recognizer API
44

5-
__all__ = 'nlp','parser','tippmix','entities'
6-
__version__ = '1.0.4'
5+
__all__ = 'nlp','parser','stemmer','entities'
6+
__version__ = '1.0.5'
77
__version_info__ = tuple(int(num) for num in __version__.split('.'))
88

99
import sys
1010
import lara.nlp
1111
import lara.parser
12-
import lara.tippmix
12+
import lara.stemmer
1313
import lara.entities

lara/entities.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,7 @@ def disallow():
159159
"obscene" : [{"stem":"(fel|le|meg|r[aá]|ki|be|oda|[oö]s+ze|bele|hoz+[aá])?bas*z+d?\s?(at)?(hat)?(us|a[dk]?|n?[aá][kl]|[aá]?t[aáo][lkm]?|ot+|ni|n[aá]n?[dlkm]?|va|meg)?","wordclass":"regex","exc":[{"stem":"megye"}]},{"stem":"((l[oó]|agy)?fasz|fas+z+op[oó]|geci\w*|kurv[aá]([eé]let|an+yj?[aá])?|(be)?fos|ribanc|(be)?szar|buzi|k[oö]cs[oö]g|pin[aá]|pics[aá]|p[oö]cs|p[eé]nisz|(kurva)?any[aá]d)([oö]?k)?r?[aáeoö]?(\w{0,2}[aeoöőu][dnklt]*)?(n[ae]k)?\b","wordclass":"regex","boundary":False},{"stem":"((mother)?f\s?u\s?c\s?k|shit(as{2})?|bitch|pus{2}y|cunt|fag(g?[eo]t)?|penis|blowjob|but{2}(plug|head)?|as{2}|arse|homo|gay|dyke|cock|dick(pic)?)(e?s|ing|e?r)?","wordclass":"regex"}],
160160
"racist" : [{"stem":"(fek[aá]|nig+(er|a)|n[aá]ci|cig[oó]|cig[aá]n+y|gypsy|dzsip[oó]|zsidr?[ó])[aáeégklnmstv]*","wordclass":"regex","boundary":False}],
161161
"erotic" : [{"stem":"(sz?ex|an[aá]l|[bv]agina|[bp][eé][np]isz?|creampie|cum|sperma?|fuck|homo(kos|sexu[aá]l(is)?)?|milf|bisexual|gay|dild[oó]|vibr[aá]tor|fel+atio|blow\s?job|whore|geci|pus{2}y|pics[aá]|pin[aá]|fasz|pis{2}|boner|dick(pic)?|x{3,}|hentai|catgirl|ec+hi|yaoi|loli|shot[aá]|\w*porn[oó]?(film)?)[aáeéioöőuüdgklmnprstvz]*","wordclass":"regex","boundary":False},{"stem":"maki verem"}],
162-
"unpleasant" : [{"stem":"AIDS","wordclass":"noun"},{"stem":"HIV","ignorecase":False},{"stem":"Hitler","wordclass":"noun"},{"stem":"(Sz?t[aá]lin|Len+in)\w*","wordclass":"regex"},{"stem":"pedof[ií]l(i[aá])?[aokltv]*","wordclass":"regex"},{"stem":"(fur{2}y|bestiality|yif{2}y?)[aáeégklnmstv]*","wordclass":"regex"}],
162+
"unpleasant" : [{"stem":"AIDS","wordclass":"noun"},{"stem":"HIV","ignorecase":False},{"stem":"Hitler","wordclass":"noun"},{"stem":"(Sz?t[aá]lin|Len+in)\w*","wordclass":"regex"},{"stem":"pedof[ií]l(i[aá])?[aokltv]*","wordclass":"regex"},{"stem":"(fur{2}y|bestiality|yif{2}y?)[aáeégklnmstv]*","wordclass":"regex"},{"stem":"mej?i?n\s?kamp+f+\w*","wordclass":"regex"},{"stem":"(any[aá]d|gy[oö]k[eé]r)\w*","wordclass":"regex"}],
163163
}
164164

165165
# decide whether user is talking to you in a formal or informal way

lara/tippmix.py renamed to lara/stemmer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99

1010
# a stemmer that's slightly better than random guessing
11-
def stemmer(text):
11+
def tippmix(text):
1212
if text:
1313
word_list = lara.nlp.tokenize(text)
1414
if word_list:

0 commit comments

Comments
 (0)