-
Notifications
You must be signed in to change notification settings - Fork 42
/
Copy pathsa_nltk_sklearn.py
137 lines (118 loc) · 5.49 KB
/
sa_nltk_sklearn.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import collections
import random
import sys
from datetime import datetime
import nltk
from nltk import SklearnClassifier
from nltk.metrics.scores import f_measure, precision, recall
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.svm import LinearSVC, SVC
from textutil import *
def _random_sample(population, size):
    """Return up to *size* random items; never raises when *population* is small."""
    items = list(population)
    return random.sample(items, min(size, len(items)))


def do_sa(n, classifier, classifier_name):
    """Train and evaluate a sentiment classifier on the Arabic tweet TSV files.

    Loads the positive/negative train and test files, selects the features
    that occur more than ``min_freq`` times, trains ``classifier`` on the
    training documents and prints accuracy, precision, recall and F-score
    for the 'pos' and 'neg' classes. All progress and results are written
    with ``print``; nothing is returned.

    Args:
        n: n-gram setting forwarded to ``load_tsv``.
        classifier: object exposing ``train(feature_sets)`` and
            ``classify(feats)`` (the script passes an nltk
            ``SklearnClassifier``).
        classifier_name: human-readable name printed in the report.

    Raises:
        ValueError: if no features at all were loaded from the input files.

    NOTE(review): ``load_tsv`` and ``document_features`` come from
    ``textutil``; this code assumes ``load_tsv(path, label, n)`` returns
    ``(documents, features)`` where documents are ``(doc, label)`` pairs and
    features is a flat sequence -- confirm against ``textutil``.
    """
    pos_train_file = 'arabic_tweets_tsv/train_pos_20181206_1k.tsv'
    neg_train_file = 'arabic_tweets_tsv/train_neg_20181206_1k.tsv'
    pos_test_file = 'arabic_tweets_tsv/test_pos_20181206_1k.tsv'
    neg_test_file = 'arabic_tweets_tsv/test_neg_20181206_1k.tsv'

    print('files')
    print('train file (pos)', pos_train_file)
    print('train file (neg)', neg_train_file)
    print('test file (pos)', pos_test_file)
    print('test file (neg)', neg_test_file)
    print('------------------------------------')

    print('parameters')
    min_freq = 5
    print('n grams:', n)
    print('min freq:', min_freq)
    print('------------------------------------')

    # Train and test files are loaded with the same (path, label, n) call
    # shape; the label is what later ends up as `c` in each (d, c) pair.
    print('loading train data ....')
    pos_train_data, pos_train_feat = load_tsv(pos_train_file, 'pos', n)
    neg_train_data, neg_train_feat = load_tsv(neg_train_file, 'neg', n)
    print('loading test data ....')
    pos_test_data, pos_test_feat = load_tsv(pos_test_file, 'pos', n)
    neg_test_data, neg_test_feat = load_tsv(neg_test_file, 'neg', n)
    print('------------------------------------')

    print('train data info')
    train_data = pos_train_data + neg_train_data
    print('train data size', len(train_data))
    print('# of positive', len(pos_train_data))
    print('# of negative', len(neg_train_data))
    print('------------------------------------')

    sample_size = 100
    print('{} random tweets .... '.format(sample_size))
    print(_random_sample(train_data, sample_size))
    print('------------------------------------')
    print('------------------------------------')

    print('test data info')
    test_data = pos_test_data + neg_test_data
    print('test data size', len(test_data))
    print('# of positive', len(pos_test_data))
    print('# of negative', len(neg_test_data))
    print('------------------------------------')

    print('merging all features ... ')
    # NOTE(review): test-set features take part in vocabulary selection here;
    # consider restricting this to the training features to avoid leakage.
    all_features = pos_train_feat + neg_train_feat + \
        pos_test_feat + neg_test_feat
    print('len(all_features):', len(all_features))
    if not all_features:
        raise ValueError('no features were loaded; check the input files')
    print('{} sample features ...'.format(sample_size))
    print(_random_sample(all_features, sample_size))
    print('------------------------------------')

    print('compute frequencies')
    all_features_freq = nltk.FreqDist(all_features)
    print(all_features_freq)
    print('sample frequencies')
    print(all_features_freq.most_common(20))
    word = 'في'
    print('freq of word {} is {}'.format(word, all_features_freq.freq(word)))
    print('features frequencies are computed')
    print('------------------------------------')

    thr = min_freq / len(all_features)
    print('threshold:', thr)
    print('selecting features ...')
    # Keep features whose relative frequency exceeds the threshold.
    # freq(w) > min_freq / N is the same test as count(w) > min_freq, so the
    # raw counts are compared once per distinct feature instead of calling
    # FreqDist.freq() for every token occurrence.
    my_features = {
        feature
        for feature, count in all_features_freq.items()
        if count > min_freq
    }
    print(len(my_features), 'are kept out of', len(all_features))
    print('features are selected')
    print('------------------------------------')

    print('{} sample of selected features:'.format(sample_size))
    print(_random_sample(my_features, sample_size))
    print('------------------------------------')

    print('generating features for training documents ...')
    feature_sets = [(document_features(d, my_features), c) for (d, c) in train_data]
    print('------------------------------------')

    print('training ...')
    classifier.train(feature_sets)
    print('classifier: {}'.format(classifier_name))
    print('training is done')
    print('------------------------------------')

    print('generating features for test documents ...')
    test_features = [(document_features(d, my_features), c) for (d, c) in test_data]

    # Index sets per label: ref_sets holds the gold labels, test_sets the
    # predicted ones; precision/recall/f_measure compare the two sets.
    ref_sets = collections.defaultdict(set)
    test_sets = collections.defaultdict(set)
    for i, (feats, label) in enumerate(test_features):
        ref_sets[label].add(i)
        observed = classifier.classify(feats)
        test_sets[observed].add(i)

    print('test results:')
    print('accuracy: ', nltk.classify.accuracy(classifier, test_features))
    print('pos precision: ', precision(ref_sets['pos'], test_sets['pos']))
    print('pos recall:', recall(ref_sets['pos'], test_sets['pos']))
    print('neg precision: ', precision(ref_sets['neg'], test_sets['neg']))
    print('neg recall:', recall(ref_sets['neg'], test_sets['neg']))
    print('positive f-score:', f_measure(ref_sets['pos'], test_sets['pos']))
    print('negative f-score:', f_measure(ref_sets['neg'], test_sets['neg']))
if __name__ == '__main__':
    # Imported locally so this entry-point block carries its own dependency.
    from contextlib import redirect_stdout

    # A fixed '_' stands in for a timestamp, so reruns overwrite the same
    # result files rather than creating new ones.
    time_stamp = '_'
    ngrams = (1, 2, 3)
    algorithms = [LinearSVC(), SVC(), MultinomialNB(), BernoulliNB(), SGDClassifier()]
    for alg in algorithms:
        alg_name = alg.__class__.__name__
        for n in ngrams:
            # sys.argv[0][:-2] drops the last two characters of the script
            # path (presumably the 'py' of '.py', keeping the dot).
            outfile = sys.argv[0][:-2] + time_stamp + '_' + alg_name + '_' + str(n) + '.result'
            # Each run's report goes to its own file; the file is closed and
            # sys.stdout restored after every run, even if do_sa raises.
            with open(outfile, mode='w', encoding='utf-8') as result_file, \
                    redirect_stdout(result_file):
                do_sa(n, SklearnClassifier(alg), alg_name)