-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwordcount.py
104 lines (87 loc) · 2.97 KB
/
wordcount.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# Document parsing script
# count up the number of words of each type
from collections import OrderedDict
filein = 'ffts_ps.html'
#filein = "test.txt"
fileout = 'wordcount.txt'
#fileout = 'testproduct.txt'
# number of characters to compare for each word
comp_len = 5

# every character in this string is deleted from the text before splitting
PUNCTUATION = '''"”“'’‘,;:.·()!?-*–%'''


def strip_tags(text):
    """Return *text* with every ``<...>`` span replaced by a single space.

    The text is scanned once from left to right. A tag runs from a ``<`` to
    the first ``>`` that follows it. Tags are replaced by whitespace (not
    removed outright) so that words on either side of a tag, as in
    ``one<br>two``, stay separate words. If a ``<`` has no closing ``>``,
    a warning is printed and the remainder of the text is kept unchanged.
    """
    pieces = []
    pos = 0
    while True:
        start = text.find('<', pos)
        if start == -1:
            # no more tags: keep the tail and stop
            pieces.append(text[pos:])
            break
        end = text.find('>', start)
        if end == -1:
            print("no close tag found, abort")
            pieces.append(text[pos:])
            break
        pieces.append(text[pos:start])
        # resume right after the tag so back-to-back tags are all seen
        pos = end + 1
    return ' '.join(pieces)


def strip_punctuation(text):
    """Return *text* with every character listed in PUNCTUATION deleted."""
    return text.translate(str.maketrans('', '', PUNCTUATION))


def normalize_word(word, max_len):
    """Return the counting key for *word*.

    The word is capitalized (first character upper case, the rest lower
    case). Words longer than three characters lose trailing ``s`` and
    apostrophe characters, a rough way of folding plurals and possessives
    together. The result is cut to at most *max_len* characters and may be
    an empty string.
    """
    key = word.capitalize()
    if len(key) > 3:
        key = key.rstrip("s'’‘")
    return key[:max_len]


def count_words(words, max_len):
    """Return a dict mapping each normalized key to its occurrence count.

    Keys are produced by normalize_word(); words whose key is empty are
    skipped. Keys appear in the dict in order of first occurrence.
    """
    counts = {}
    for word in words:
        key = normalize_word(word, max_len)
        if key == '':
            continue
        counts[key] = counts.get(key, 0) + 1
    return counts


def format_report(ranked, total_words, max_len):
    """Return the text report for *ranked*, a mapping of key -> count.

    Rows are emitted in the iteration order of *ranked*. The ppm column is
    the count divided by *total_words*, scaled to parts per million.
    """
    width = str(max_len)
    row = "{:>10,} : {:<" + width + "} : {:.2f}\n"
    lines = [
        "A total of {} words were parsed,\n".format(total_words),
        "of which {} were identified as unique.\n".format(len(ranked)),
        "Only the initial {} characters were considered.\n\n".format(max_len),
        (" qty : {:<" + width + "} : ppm\n\n").format("word"),
    ]
    for key, count in ranked.items():
        fraction = (count / total_words) * 1000000
        lines.append(row.format(count, key, fraction))
    return "".join(lines)


# Reading `filein` is currently disabled, so the script runs on the
# placeholder text below. To process the real document, replace the
# assignment with:
#     with open(filein, 'r') as f:
#         data = f.read()
data = '''Placeholder'''

# clean up the raw text: newlines, then tags, then punctuation
print("removing newlines")
data = data.replace('\n', ' ')
print("Removing tags")
data = strip_tags(data)
print("Removing Punctuation!")
data = strip_punctuation(data)

all_words = data.split()
full_len = len(all_words)
print(full_len, "words")

# count all the words
print("Counting the words")
word_count = count_words(all_words, comp_len)
print(len(word_count), "unique")

# sort the counts, most frequent first (ties keep first-seen order)
word_list = OrderedDict(sorted(word_count.items(), key=lambda t: t[1], reverse=True))
output = format_report(word_list, full_len, comp_len)

# save the file out; explicit UTF-8 so non-ASCII words can always be written
with open(fileout, 'w', encoding='utf-8') as f:
    f.write(output)
print("done!")