-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathinvert2.py
More file actions
158 lines (104 loc) · 4.19 KB
/
invert2.py
File metadata and controls
158 lines (104 loc) · 4.19 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
from collections import Counter

from PorterStemmer import PorterStemmer
def run(SW, STEMMER):
    """Build the inverted index for the CACM collection.

    SW      -- bool: remove stopwords when True
    STEMMER -- bool: apply Porter stemming when True

    Side effects: populates the module-level postings `dict` and writes
    documents.txt, dictionary.txt, postings.txt and inverted_index.txt.
    """
    global p, wDocs, STEMMER_ENABLED, SW_ENABLED, docs, stopwords, dict
    p = PorterStemmer()
    STEMMER_ENABLED = STEMMER
    SW_ENABLED = SW
    print(f"Word stemming: {STEMMER}")
    print(f"Remove stopwords: {SW}")
    # Load the whole collection into memory.
    # Fields to record: .I (doc ID), .T (title), .W (abstract),
    # .B (publication date), .A (author list)
    with open("cacm/cacm.all", 'r') as d:
        docs = d.read()
    with open("cacm/common_words", 'r') as sw:
        stopwords = sw.read().split()  # the stopword list, one word per token
    if STEMMER_ENABLED and SW_ENABLED:
        # Stem the stopwords too, so they still match stemmed document terms.
        stopwords = [p.stem(w, 0, len(w) - 1) for w in stopwords]
    stopwords = set(stopwords)
    dict = {}  # term -> postings string "(docId,freq), (docId,freq), ..."
    wDocs = open("documents.txt", 'w')
    try:
        updateDict()
    finally:
        wDocs.close()  # BUG FIX: the original never closed documents.txt
    writeFiles()
    print("\nFINISHED BUILDING DICTIONARY AND POSTINGS \n")
def updateDict():
    """Split the in-memory cacm file (global `docs`) into documents.

    Keeps only the wanted fields, writes each document's retained text to
    the global `wDocs` file, and feeds it to addTerms().
    """
    includedFields = {'.I', '.T', '.W', '.B', '.A'}
    excludedFields = {'.N', '.X', '.K', '.C'}
    Fields = includedFields | excludedFields
    currField = ""
    doc = ""
    lastId = None  # ID that will be assigned to the doc being accumulated
    for line in docs.splitlines():
        marker = line.split(' ')[0]
        if marker in Fields:
            currField = marker
            if currField == '.I':
                # A new document header: flush the previously accumulated
                # document (empty on the very first header).
                docId = int(line.split(' ')[1]) - 1
                wDocs.write(f"{docId}")
                wDocs.write(f"\n{doc}\n")
                addTerms(docId, doc)
                lastId = docId + 1  # the new document's own ID
                doc = ""
                continue
        if currField in excludedFields:
            continue
        doc += line + "\n"
    # BUG FIX: the original dropped the final document entirely, because a
    # document was only flushed when the *next* .I header appeared.
    if lastId is not None and doc:
        wDocs.write(f"{lastId}")
        wDocs.write(f"\n{doc}\n")
        addTerms(lastId, doc)
def addTerms(docId, doc):
    """Tokenize one document and merge its terms into the global postings `dict`.

    docId -- integer document ID
    doc   -- the document's retained text (title/abstract/etc.)

    Postings are stored as strings: dict[term] = "(docId,freq), (docId,freq), ..."
    """
    # NOTE: backslash and the space are intentionally part of this literal;
    # every punctuation character is mapped to a space before tokenizing.
    punctuation = '''!()-[]{};:'"\, <>./?@#$%^&*_~'''
    doc = doc.lower()  # case-fold the whole document
    # One C-level pass instead of one .replace() call per punctuation char.
    doc = doc.translate(str.maketrans({c: " " for c in punctuation}))
    words = doc.split()
    if STEMMER_ENABLED:
        # BUG FIX: the original stemmed via doc.replace(w, stem), which also
        # rewrote occurrences of w *inside* other words, corrupting tokens.
        # Stem each token individually instead.
        words = [p.stem(w, 0, len(w) - 1) for w in words]
    # Count each term once for this document -- O(n) instead of the
    # original's words.count(t) inside the loop (O(n^2)). Counter preserves
    # first-occurrence order, so dictionary insertion order is unchanged.
    for t, freq in Counter(words).items():
        if SW_ENABLED and t in stopwords:  # skip stopwords when enabled
            continue
        entry = f"({docId},{freq})"
        if t in dict:
            # BUG FIX: the original guarded with dict[t].find(str(docId)),
            # which false-matched the docId's digits inside frequency counts
            # or other IDs and silently dropped valid postings. Counter
            # yields each term once per document, so we can simply append.
            dict[t] = dict[t] + f", {entry}"
        else:
            dict[t] = entry
def writeFiles():
    """Write dictionary.txt, postings.txt and inverted_index.txt from the
    global postings `dict`, terms in sorted order.

    dictionary.txt     : "term [df]" per line
    postings.txt       : the postings string per line
    inverted_index.txt : "term [df] >> postings" per line
    """
    # BUG FIX: the original only closed dictionary.txt and leaked the
    # postings and inverted_index handles; `with` closes all three.
    with open("dictionary.txt", 'w') as dictionary, \
         open("postings.txt", 'w') as postings, \
         open("inverted_index.txt", 'w') as invertedindex:
        for term in sorted(dict):
            df = dict[term].count('(')  # document frequency = number of postings
            dictionary.write(f"{term} [{df}] \n")
            postings.write(f"{dict[term]} \n")
            invertedindex.write(f"{term} [{df}] >> {dict[term]} \n")