sts.py
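"""Sentence-level semantic textual similarity (STS).

Pipeline: each sentence is tokenized, stemmed, stopword-filtered, and
POS-filtered; every remaining word is aligned with its most similar word in
the other sentence (LSA similarity combined with a WordNet shortest-path
measure); the alignment scores are then averaged, weighted by each word's
information content. Depends on the local modules lsaWordSim, wordNet, wn3,
and extract.
"""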
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import lsaWordSim
import nltk
import math
import wordNet
import wn3  # only needed by the commented-out alternative in wordMatching
import extract
import pickle


class sts():
    def __init__(self, S1=None, S2=None):
        self.lsa = lsaWordSim.lsaWordSim()
        self.CustomizedWordNet = wordNet.wordNet()
        self.S1 = S1
        self.S2 = S2
        # Penn Treebank tags to keep after preprocessing: nouns, verbs,
        # adverbs, particles, and adjectives.
        self.allowedTags = ['NN', 'NNP', 'NNPS', 'NNS',
                            'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ',
                            'RB', 'RBR', 'RBS', 'RP',
                            'JJ', 'JJR', 'JJS']
        # Total token count of the LSA corpus, used for information content.
        self.count = self.totalWordsInDictionary(self.lsa.wordCount)

    def preprocessing(self, text=None):
        # Tokenize on word characters, lowercase and stem, drop stopwords,
        # then keep only tokens whose POS tag is in allowedTags.
        tokenizer = RegexpTokenizer(r'\w+')
        tokens = tokenizer.tokenize(text)
        ps = PorterStemmer()
        tokens = [ps.stem(word.lower()) for word in tokens]
        stops = set(stopwords.words('english'))
        tokens = [word for word in tokens if word not in stops]
        tagged = nltk.pos_tag(tokens)
        return [pair for pair in tagged if pair[1] in self.allowedTags]

    def steps(self, S1, S2):
        S1 = self.preprocessing(S1)
        S2 = self.preprocessing(S2)
        print(S1, S2)
        # Align every word with its closest word in the other sentence,
        # in both directions.
        sentClose1 = self.wordMatching(S1, S2)
        sentClose2 = self.wordMatching(S2, S1)
        print(sentClose1, sentClose2)
        # Attach an information-content weight to each aligned pair.
        sent1 = self.infoContent(sentClose1)
        sent2 = self.infoContent(sentClose2)
        print(sent1, sent2)
        # Sentence score: the IC-weighted mean of the alignment scores,
        # averaged over the two directions.
        termAlignScore1 = self.TermAlignment(sent1)
        termAlignScore2 = self.TermAlignment(sent2)
        avg = (termAlignScore1 + termAlignScore2) / 2.0
        print(termAlignScore1, termAlignScore2, avg)
        return termAlignScore1, termAlignScore2, avg

    def saveToFile(self, fileName, variable):
        with open(fileName, "wb") as handle:
            pickle.dump(variable, handle)

    def retrieveFromFile(self, fileName):
        with open(fileName, "rb") as handle:
            return pickle.load(handle)

    def TermAlignment(self, S1):
        # Weighted average over aligned pairs [word1, word2, similarity, ic]:
        # sum(similarity * ic) / sum(ic). Guard against an empty alignment.
        divisor = 0.0
        numerator = 0.0
        for pair in S1:
            divisor += pair[3]
            numerator += pair[2] * pair[3]
        return numerator / divisor if divisor else 0.0

    def wordMatching(self, S1, S2):
        # For each word in S1, find the most similar word in S2.
        similarityList = []
        for i in range(len(S1)):
            maxSimi = 0
            maxSimiIndex = 0
            for j in range(len(S2)):
                if S1[i][0] == S2[j][0]:
                    # Identical stems match exactly; stop searching.
                    maxSimi = 1
                    maxSimiIndex = j
                    similarityList.append([S1[i][0], S2[j][0], maxSimi])
                    break
                # Combine LSA similarity with a WordNet path measure,
                # 0.5 * exp(-0.25 * shortestPathLength), capped at 1.
                similarityScore = self.lsa.calculateSimilarity(S1[i][0], S2[j][0])
                similarityScore += 0.5 * math.exp(-0.25 * self.CustomizedWordNet.findShortestPath(S1[i][0], S2[j][0]))
                similarityScore = min(1, similarityScore)
                #similarityScore = wn3.returnWordSim(S1[i][0], S2[j][0])
                if maxSimi < similarityScore:
                    maxSimi = similarityScore
                    maxSimiIndex = j
            if maxSimi != 1 and S2:
                similarityList.append([S1[i][0], S2[maxSimiIndex][0], maxSimi])
        return similarityList

    def totalWordsInDictionary(self, dictionary):
        # Total token count; float so later divisions stay real-valued.
        return float(sum(dictionary.values()))

    def infoContent(self, S1):
        # Append each word's information content, IC(w) = log(total / count(w)).
        # Out-of-vocabulary words get a fixed weight of 4.0.
        for i in range(len(S1)):
            if S1[i][0] not in self.lsa.wordCount:
                S1[i].append(4.0)
                continue
            count = self.lsa.wordCount[S1[i][0]]
            if count != 0:
                S1[i].append(math.log(self.count / count))
            else:
                S1[i].append(0)
        return S1


if __name__ == '__main__':
    matrix = []
    model = sts()  # renamed so the instance does not shadow the class name
    termAlignScore1 = []
    termAlignScore2 = []
    database = extract.extract()
    for i in database:
        print(i)
        term1, term2, value = model.steps(i[0], i[1])
        matrix.append([i[2], value])
        termAlignScore1.append(term1)
        termAlignScore2.append(term2)
    print(matrix)
    model.saveToFile("term1.pickle", termAlignScore1)
    model.saveToFile("term2.pickle", termAlignScore2)
    model.saveToFile("avg.pickle", matrix)
'''
Sample pairs for manual testing:

S1 = "A cemetery is a place where dead people's bodies or their ashes are buried."
S2 = "A graveyard is an area of land, sometimes near a church, where dead people are buried."
#S1 = "People hate pope"
#S2 = "football is an adventurous sport"
S1 = "The problem is simpler than that"
S2 = "The problem is simple"
S1 = "Gameday for the orioles game is frozen for me."
S2 = "not astros or orioles bad Gameday for the orioles game is frozen for me."
S1 = "Tell us what the charges were"
S2 = "Yes what are his charges."
S1 = "Those are partial psychopaths."
S2 = "Was Ted Bundy a partial psychopath?"
#model = sts()
#model.steps(S1, S2)
'''
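
# A minimal usage sketch for scoring one pair (assumes the local lsaWordSim,
# wordNet, and extract modules and their data files are importable; the
# sentences are taken from the sample pairs above):
#
#   model = sts()
#   score1, score2, avg = model.steps("Tell us what the charges were",
#                                     "Yes what are his charges.")
#   print(avg)  # mean of the two directional alignment scores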