-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathinvert2.py
More file actions
158 lines (104 loc) · 4.19 KB
/
invert2.py
File metadata and controls
158 lines (104 loc) · 4.19 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
from collections import Counter

from PorterStemmer import PorterStemmer
def run(SW, STEMMER):
    """Build the inverted index for the CACM collection.

    SW      -- bool: remove stopwords when True
    STEMMER -- bool: apply Porter stemming when True

    Side effects: populates the module-level postings `dict` and writes
    documents.txt, dictionary.txt, postings.txt and inverted_index.txt.
    """
    global p, wDocs, STEMMER_ENABLED, SW_ENABLED, docs, stopwords, dict
    p = PorterStemmer()
    STEMMER_ENABLED = STEMMER
    SW_ENABLED = SW
    print(f"Word stemming: {STEMMER}")
    print(f"Remove stopwords: {SW}")
    # Load the whole collection into memory.
    # Fields to record: .I (doc ID), .T (title), .W (abstract),
    # .B (publication date), .A (author list)
    with open("cacm/cacm.all", 'r') as d:
        docs = d.read()
    with open("cacm/common_words", 'r') as sw:
        stopwords = sw.read().split()  # the stopword list, one word per token
    if STEMMER_ENABLED and SW_ENABLED:
        # Stem the stopwords too, so they still match stemmed document terms.
        stopwords = [p.stem(w, 0, len(w) - 1) for w in stopwords]
    stopwords = set(stopwords)
    dict = {}  # term -> postings string "(docId,freq), (docId,freq), ..."
    wDocs = open("documents.txt", 'w')
    try:
        updateDict()
    finally:
        wDocs.close()  # BUG FIX: the original never closed documents.txt
    writeFiles()
    print("\nFINISHED BUILDING DICTIONARY AND POSTINGS \n")
def updateDict():
    """Split the in-memory cacm file (global `docs`) into documents.

    Keeps only the wanted fields, writes each document's retained text to
    the global `wDocs` file, and feeds it to addTerms().
    """
    includedFields = {'.I', '.T', '.W', '.B', '.A'}
    excludedFields = {'.N', '.X', '.K', '.C'}
    Fields = includedFields | excludedFields
    currField = ""
    doc = ""
    lastId = None  # ID that will be assigned to the doc being accumulated
    for line in docs.splitlines():
        marker = line.split(' ')[0]
        if marker in Fields:
            currField = marker
            if currField == '.I':
                # A new document header: flush the previously accumulated
                # document (empty on the very first header).
                docId = int(line.split(' ')[1]) - 1
                wDocs.write(f"{docId}")
                wDocs.write(f"\n{doc}\n")
                addTerms(docId, doc)
                lastId = docId + 1  # the new document's own ID
                doc = ""
                continue
        if currField in excludedFields:
            continue
        doc += line + "\n"
    # BUG FIX: the original dropped the final document entirely, because a
    # document was only flushed when the *next* .I header appeared.
    if lastId is not None and doc:
        wDocs.write(f"{lastId}")
        wDocs.write(f"\n{doc}\n")
        addTerms(lastId, doc)
def addTerms(docId, doc):
    """Tokenize one document and merge its terms into the global postings `dict`.

    docId -- integer document ID
    doc   -- the document's retained text (title/abstract/etc.)

    Postings are stored as strings: dict[term] = "(docId,freq), (docId,freq), ..."
    """
    # NOTE: backslash and the space are intentionally part of this literal;
    # every punctuation character is mapped to a space before tokenizing.
    punctuation = '''!()-[]{};:'"\, <>./?@#$%^&*_~'''
    doc = doc.lower()  # case-fold the whole document
    # One C-level pass instead of one .replace() call per punctuation char.
    doc = doc.translate(str.maketrans({c: " " for c in punctuation}))
    words = doc.split()
    if STEMMER_ENABLED:
        # BUG FIX: the original stemmed via doc.replace(w, stem), which also
        # rewrote occurrences of w *inside* other words, corrupting tokens.
        # Stem each token individually instead.
        words = [p.stem(w, 0, len(w) - 1) for w in words]
    # Count each term once for this document -- O(n) instead of the
    # original's words.count(t) inside the loop (O(n^2)). Counter preserves
    # first-occurrence order, so dictionary insertion order is unchanged.
    for t, freq in Counter(words).items():
        if SW_ENABLED and t in stopwords:  # skip stopwords when enabled
            continue
        entry = f"({docId},{freq})"
        if t in dict:
            # BUG FIX: the original guarded with dict[t].find(str(docId)),
            # which false-matched the docId's digits inside frequency counts
            # or other IDs and silently dropped valid postings. Counter
            # yields each term once per document, so we can simply append.
            dict[t] = dict[t] + f", {entry}"
        else:
            dict[t] = entry
def writeFiles():
    """Write dictionary.txt, postings.txt and inverted_index.txt from the
    global postings `dict`, terms in sorted order.

    dictionary.txt     : "term [df]" per line
    postings.txt       : the postings string per line
    inverted_index.txt : "term [df] >> postings" per line
    """
    # BUG FIX: the original only closed dictionary.txt and leaked the
    # postings and inverted_index handles; `with` closes all three.
    with open("dictionary.txt", 'w') as dictionary, \
         open("postings.txt", 'w') as postings, \
         open("inverted_index.txt", 'w') as invertedindex:
        for term in sorted(dict):
            df = dict[term].count('(')  # document frequency = number of postings
            dictionary.write(f"{term} [{df}] \n")
            postings.write(f"{dict[term]} \n")
            invertedindex.write(f"{term} [{df}] >> {dict[term]} \n")