-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpygenus.py
115 lines (86 loc) · 3.31 KB
/
pygenus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import string
import sys
from itertools import chain
import nltk
from nltk.corpus import names
MALE_PRONOUN_SEQ = ('he', 'him', 'his', 'himself')
FEMALE_PRONOUN_SEQ = ('she', 'her', 'hers', 'herself')
NEUTRAL_PRONOUN_SEQ = ('they', 'them', 'their', 'theirs', 'themselves')
ABBREVIATION_SEQ = ('i.e',)
"""Abbreviation last period gets removed by tokenizer."""
PREPOSITION_SEQ = ('for',)
CLASSIFIER_CACHE = None
"""Cache trained classifier."""
class Classifier:
def __init__(self, classifier=None):
self.classifier = classifier
def classify(self, text):
pos_tag_list = nltk.pos_tag(nltk.word_tokenize(text))
for pos_tag in pos_tag_list:
classification = self.classify_word(pos_tag)
if classification not in ('neutral', 'excess'):
yield pos_tag[0], classification
def classify_word(self, pos_tag):
return self.classifier.classify(_gender_features(pos_tag))
def classify(text):
_classifier = Classifier(classifier=new_naive_bayes_classifier())
return _classifier.classify(text)
def new_naive_bayes_classifier():
# Create feature set consisting of male and female names for training
global CLASSIFIER_CACHE
if CLASSIFIER_CACHE:
return CLASSIFIER_CACHE
else:
male_word_seq = _new_training_set(
'male', names.words('male.txt'), MALE_PRONOUN_SEQ)
female_word_seq = _new_training_set(
'female', names.words('female.txt'), FEMALE_PRONOUN_SEQ)
neutral_pronoun_seq = _new_training_set(
'neutral', NEUTRAL_PRONOUN_SEQ)
excess_seq = _new_training_set(
'excess', ABBREVIATION_SEQ, PREPOSITION_SEQ, string.punctuation,
('looking', 'is'),
)
featureset_seq = (
(_gender_features(word), gender)
for word, gender in chain(
male_word_seq,
female_word_seq,
neutral_pronoun_seq,
excess_seq,
))
CLASSIFIER_CACHE = nltk.NaiveBayesClassifier.train(featureset_seq)
return CLASSIFIER_CACHE
def _new_training_set(tag, *word_seq):
return (((word, pos_tag), tag)
for word, pos_tag in
nltk.pos_tag(list(chain.from_iterable(word_seq))))
def _gender_features(pos_tag):
word_raw, tag = pos_tag
word = _normalize_word(word_raw)
feature_dict = {'pos_tag': tag}
if word in MALE_PRONOUN_SEQ + FEMALE_PRONOUN_SEQ + NEUTRAL_PRONOUN_SEQ:
feature_dict = {'pronoun': word}
elif word in ABBREVIATION_SEQ:
# XXX: for abbreviations we could check for punctuation and length
feature_dict = {'abbr': word}
elif word in PREPOSITION_SEQ:
feature_dict = {'prep': word}
elif word in string.punctuation:
feature_dict = {'punctuation': word}
elif any(map(tag.startswith, ['VB', 'TO', 'DT', 'IN'])):
feature_dict.update({'some_oher_stuff': word})
else:
feature_dict.update({'last_letters': word[-2:]})
return feature_dict
def _normalize_word(word_raw):
return word_raw.lower()
def main():
"""Take input from stdin and print all matches as word, tag tuples."""
try:
for match in classify(sys.stdin.read()):
print(match)
except KeyboardInterrupt:
pass
if __name__ == '__main__':
main()