-
Notifications
You must be signed in to change notification settings - Fork 0
/
PositionalInvertedIndex.py
91 lines (70 loc) · 2.57 KB
/
PositionalInvertedIndex.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
'''
An index can retrieve postings for a term from a data structure associating terms and
the documents that contain them.
<term, <PositionalPosting1>, <PositionalPosting2>....>
<PositionalPosting1: <docId, [pos1, pos2]...>...>
<PositionalPosting2: <docId, [pos1, pos2...]>, <docId, [pos1, pos2...]>...>
'''
'''
Steps:
create an index obj
build the corpus
build the index --
for each doc in corpus
--> get content
--> add term into the index
'''
from Postings import PositionalPostings
class PositionalInvertedIndex:
    '''
    In-memory positional inverted index.

    Maps each term to a list of PositionalPostings (one posting per document
    that contains the term) and keeps per-document token and term counts.
    '''

    def __init__(self):
        # term -> list of PositionalPostings, in the order postings were added
        self.__index = {}
        # list of all terms (not populated by any method of this class)
        self.__mVocabulary = []
        # docId -> token count recorded through setTokens
        self.__tokens = {}
        # docId -> term count recorded through setTerms
        self.__terms = {}

    def getPostings(self, term: str):
        '''
        Return the posting list stored for term.

        An empty list is returned when the term is not in the index.
        '''
        return self.__index.get(term, [])

    def getVocabulary(self):
        '''Return every indexed term as a sorted list.'''
        return sorted(self.__index)

    def addTerm(self, term: str, docId: int, ind: int):
        '''
        Record that term occurs at position ind in document docId.

        Only the most recently added posting of the term is compared with
        docId: a match extends that posting with the new position, anything
        else appends a fresh posting to the end of the term's list.
        '''
        postingList = self.__index.get(term)

        # term appears again in the same document as its latest posting
        if postingList and postingList[-1].getDocumentId() == docId:
            postingList[-1].insertIndex(ind)
            return

        # first occurrence of the term in this document
        new_posting = PositionalPostings(docId)
        new_posting.insertIndex(ind)
        if postingList is None:
            # term was not in the dictionary yet
            self.__index[term] = [new_posting]
        else:
            postingList.append(new_posting)

    def setTokens(self, docId: int, count: int):
        '''Store count as the number of tokens of document docId.'''
        self.__tokens[docId] = count

    def getTokenSize(self, docId: int):
        '''
        Return the token count stored for document docId.

        Raises KeyError when no count was stored for docId.
        '''
        return self.__tokens[docId]

    def setTerms(self, docId: int, count: int):
        '''Store count as the number of terms of document docId.'''
        self.__terms[docId] = count

    def termSize(self, docId):
        '''
        Return the term count stored for document docId.

        Raises KeyError when no count was stored for docId.
        '''
        return self.__terms[docId]

    def getTokens(self):
        '''Return the sum of the token counts stored for all documents.'''
        # sum the stored counts (the dict values), not the document ids
        return sum(self.__tokens.values())