-
Notifications
You must be signed in to change notification settings - Fork 0
/
process_query11.py
51 lines (43 loc) · 2.05 KB
/
process_query11.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
from readers_11 import getDocs
from math import log2
import heapq
def perQscore(term, postings_file, pointers, numdocs, accumulators):
    """Add one query term's tf-idf contribution to the per-document scores.

    The postings for ``term`` are fetched with ``getDocs``. For every
    document listed, ``(1 + log2(tf)) * log2(1 + numdocs / pointers[term][1])``
    is added to that document's entry in ``accumulators``; a document seen
    for the first time starts from zero.

    Args:
        term: Query term to score.
        postings_file: Passed through unchanged to ``getDocs``.
        pointers: Mapping indexed as ``pointers[term][1]``; that value is the
            denominator of the idf ratio (presumably the term's document
            frequency -- confirm against the index builder).
        numdocs: Numerator of the idf ratio (presumably the collection size).
        accumulators: Dict of document number -> running score. Updated in
            place.

    Returns:
        The same ``accumulators`` dict that was passed in.
    """
    docnums_for_this_term, tfs = getDocs(term, pointers, postings_file)

    # Nothing to score: leave the accumulators alone and, as before, never
    # touch pointers[term] for a term without postings.
    if len(docnums_for_this_term) == 0:
        return accumulators

    # NOTE(review): a term missing from `pointers` borrows the statistics of
    # the term "a" for its idf. Kept as-is; the reason is not visible in this
    # file -- confirm it matches what getDocs does for unknown terms.
    if term not in pointers:
        term = "a"

    # The idf factor depends only on the term, so compute it once rather
    # than once per posting.
    idf = log2(1 + (numdocs / pointers[term][1]))

    for i, doc in enumerate(docnums_for_this_term):
        # Log-scaled term frequency for this posting.
        tf_weight = 1 + log2(tfs[i])
        accumulators[doc] = accumulators.get(doc, 0.0) + tf_weight * idf
    return accumulators
def acc_query(terms, postings_file, pointers, numdocs):
    """Build the score accumulators for a whole query.

    Starts from an empty dict and lets ``perQscore`` fold in the
    contribution of each term in ``terms``, in order.

    Returns:
        Dict of document number -> accumulated score.
    """
    scores = {}
    for query_term in terms:
        # perQscore updates `scores` in place; its return value is not needed.
        perQscore(query_term, postings_file, pointers, numdocs, scores)
    return scores
def normalise_documents(maptoDocs, accumulators):
    """Divide every accumulated score by its document's normalisation factor.

    Args:
        maptoDocs: Mapping indexed as ``maptoDocs[doc][1]``; that value is
            the divisor for the document's score (presumably a document
            length or norm -- confirm against the index builder).
        accumulators: Dict of document number -> score. Updated in place.

    Returns:
        The same ``accumulators`` dict that was passed in.
    """
    for doc_id, raw_score in accumulators.items():
        # Only existing values are reassigned (no keys added or removed),
        # so mutating while iterating over items() is safe.
        accumulators[doc_id] = raw_score / maptoDocs[doc_id][1]
    return accumulators
def rank_results(accumulators, mapIDtoDoc, top_k=100):
    """Return the highest-scoring documents, best first.

    Args:
        accumulators: Dict of document number -> score.
        mapIDtoDoc: Mapping indexed as ``mapIDtoDoc[doc][0]``; that value is
            what is reported for each document in the result (presumably its
            external name or ID).
        top_k: Maximum number of results to return. Defaults to 100, the
            cutoff that used to be hard-coded.

    Returns:
        A list of ``(mapIDtoDoc[doc][0], score)`` tuples in descending score
        order, at most ``top_k`` long. Equal scores are ordered by document
        number, larger first, because ranking compares ``(score, doc)`` pairs.
    """
    # Feed (score, doc) pairs lazily; nlargest only keeps top_k of them.
    scored = ((score, doc) for doc, score in accumulators.items())
    best = heapq.nlargest(top_k, scored)
    return [(mapIDtoDoc[doc][0], score) for score, doc in best]
def processQ(terms, postings_file, pointers, numdocs, maptoDocs):
    """Score, normalise and rank documents for one query.

    Runs the three stages in order: ``acc_query`` builds the raw per-document
    scores for ``terms``, ``normalise_documents`` rescales them using
    ``maptoDocs``, and ``rank_results`` picks the top entries.

    Returns:
        Whatever ``rank_results`` returns for the normalised scores.
    """
    raw_scores = acc_query(terms, postings_file, pointers, numdocs)
    return rank_results(normalise_documents(maptoDocs, raw_scores), maptoDocs)
# Example usage:
# terms = "issuance of general exclusion order".split()
# processQ(terms, postings_file, pointers, numdocs, mapToDocs)