forked from messner1/Python-Delta
-
Notifications
You must be signed in to change notification settings - Fork 0
/
delta.py
152 lines (110 loc) · 5.97 KB
/
delta.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import nltk
import string
import sys
import argparse
import math
import re
import numpy
import matplotlib.pyplot as plt
class fullCorpus:
def __init__(self, textCollections, numFeatures, cullPercentage):
self.overallList = nltk.FreqDist()
self.standardDev = 0
self.nonPercent = {}
for collection in textCollections:
for text in collection.collection:
self.overallList.update(collection.collection[text].freqList) #build an overall list of word frequencies out of the individual text lists contained in the passed textcollections (candidate and target generally)
#cull overrepresented features here (as long as the option has been passed)
if cullPercentage > 0:
for feature in self.overallList:
for collection in textCollections:
for text in collection.collection:
if feature in collection.collection[text].freqList: #if the given feature shows up in the text
if (float(self.overallList[feature]) > 0) and (float(collection.collection[text].freqList[feature])/float(self.overallList[feature]) * 100) >= cullPercentage:
self.overallList.pop(feature) #if it meets the percentage threshold (IE one text accounts for at least 70 occurences of a feature of the overall corpus) cut it
#cull list down to numfeatures
tempList = nltk.FreqDist()
for r in range(0, numFeatures):
key, value = self.overallList.items()[r]
tempList.inc(key, value)
self.overallList = tempList
#get standard deviation of main corpus set
self.standardDev = numpy.std(self.overallList.values())
self.setPercentageDict() #build a dictionary of the percentage of each word
def getList(self): #returns the dict of token|number of times token occurs
return self.overallList
def setPercentageDict(self):
self.percentageDict = {key: self.overallList.freq(key) for key in self.overallList}
def getPercentageDict(self):
return self.percentageDict
class individualText:
def __init__(self, file): #an individual text is a single freq dist for a single inputted file
raw = file.read()
raw = re.sub('\.|,|\(|\)|;|:|!|&|\*|\'','', raw) #remove punctuation
self.freqList = nltk.FreqDist(word.lower() for word in nltk.word_tokenize(raw)) #tokenize
self.zDict = {}
self.setPercentageDict()#build a dictionary of the percentages
def getList(self): #returns the dict of token|number of times token occurs for an individual text
return self.freqList
def getPercentageDict(self):
return self.percentageDict
def setPercentageDict(self):
self.percentageDict = {key: self.freqList.freq(key) for key in self.freqList}
def setZScoreDict(self, fullCorpus): #take the overall corpus and using its percentage dictionary find the difference between this texts percentage of a feature and the overall, then make that into a dict
for dist in fullCorpus.percentageDict:
if dist in self.percentageDict:
self.zDict.update({dist : (self.percentageDict[dist] - fullCorpus.percentageDict[dist])/fullCorpus.standardDev}) #go right to an array of difference in percentages divided by standard dev of word (if that's right)
else:
self.zDict.update({dist : (0-fullCorpus.percentageDict[dist])/fullCorpus.standardDev}) #if the token appeared zero times in this text (but enough times to be under the feature limit in main corpus)
#outfile.write('\n' + self + " " + self.zDict + '\n')
def compareZScore(self, compareText):
testZMean = 0
for feature in self.zDict:
testZMean += abs(self.zDict[feature] - compareText.zDict[feature])
#this should be the final measure
#make more intelligible by getting out of sci note!
return testZMean/len(self.zDict) * 10000000
class textCollection:
def __init__(self, filelist):
self.collection = {file.name: individualText(file) for file in filelist} #a textCollection is a dictionary of individual texts
def getList(self): #returns the dict of all the individual texts (key is filename, value is the individualText object)
return self.collection
def getPercentages(self):
for text in self.collection:
print self.collection[text].getPercentageDict()
def getZScoreDicts(self, fullCorpus):
for text in self.collection:
self.collection[text].setZScoreDict(fullCorpus)
def compareZScores(self, compareCollection, outfile, plot):
zScoresList = []
textList = []
for text in self.collection:
outfile.write(text)
textList.append(text)
for compareText in compareCollection.collection:
zScoresList.append(self.collection[text].compareZScore(compareCollection.collection[compareText])) #compare each zscore in the passed collections
outfile.write("\n" + str(zScoresList[len(zScoresList)-1]) + "\n")
if plot:
plt.plot(zScoresList, 'ro')
plt.title(compareText)
plt.ylabel('Zscore')
plt.xlabel('text')
plt.xticks(numpy.arange(0, len(textList)), textList, size='small')
plt.show()
def main():
parser = argparse.ArgumentParser()
parser.add_argument('infiles', nargs='+', type=argparse.FileType('r'), default=sys.stdin)
parser.add_argument('-t', '--target', nargs='+', type=argparse.FileType('r'), help = "name of unknown file")
parser.add_argument("-f", '--features', type=int, default = 500, help="Set amount of features to use")
parser.add_argument("-c", '--cull', type=int, default = 0, help="Percentage for culling overrepresented words in main corpus")
parser.add_argument("-o", '--out', type=argparse.FileType('w'), default=sys.stdout, help="Optional file to write program output to. Otherwise writes to stdout")
parser.add_argument("-p", '--plot', type=bool, default = 0, help="Enable to display a plot of Zscores")
arguments = parser.parse_args()
candidateTexts = textCollection(arguments.infiles)
targetTexts = textCollection(arguments.target)
corpus = fullCorpus([candidateTexts,targetTexts], arguments.features, arguments.cull)
candidateTexts.getZScoreDicts(corpus)
targetTexts.getZScoreDicts(corpus)
candidateTexts.compareZScores(targetTexts, arguments.out, arguments.plot)
if __name__ == "__main__":
main()