IR HW#5 Code.py
import math
import json
import string
from matplotlib import pyplot as plt
# Problem 3:
# Part a:
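# JM_score below implements the Jelinek-Mercer smoothed query-likelihood ranking formula,
# scoring a document D against a query Q as
#   score(Q,D) = sum over w in (Q ∩ D) of  c(w,Q) * log2( 1 + ((1-Lambda)*c(w,D)) / (Lambda * p(w|C) * |D|) )
# where c(w,Q) and c(w,D) are term counts, |D| is the document length, and p(w|C) is the
# background (collection) unigram model passed in as background_ULM_dict.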
def JM_score(Lambda, background_ULM_dict, query, document):
    # tokenize the query and the document once:
    query_tokens = query.split()
    document_tokens = document.split()
    # find the set of words that are in BOTH the Query and the Document:
    common_words = set(query_tokens).intersection(set(document_tokens))
    #print(common_words)
    common_words_query_count_dict = {}     # key = word ... value = count in query, c(w,Q)
    common_words_document_count_dict = {}  # key = word ... value = count in document, c(w,D)
    # find c(w,Q) and c(w,D) for each word found in BOTH Query and Document
    for word in common_words:
        common_words_query_count_dict[word] = query_tokens.count(word)
        common_words_document_count_dict[word] = document_tokens.count(word)
    # find size of Document (number of words)
    document_size = len(document_tokens)
#calculating the score for the given query + document pair:
jm_score = 0
for word in common_words:
jm_score += common_words_query_count_dict[word] * math.log2(1 + (((1-Lambda) * common_words_document_count_dict[word]) / (Lambda * background_ULM_dict[word] * document_size)))
return jm_score
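# Example call (hypothetical toy inputs, not the assignment data):
#   background = {'boundary': 0.002, 'layer': 0.003}
#   JM_score(0.5, background, 'boundary layer', 'the boundary layer equations for laminar flow')
# only the shared words 'boundary' and 'layer' contribute to the score.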
#############################################################################################################################################################################
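# Dirichlet_score below implements the Dirichlet-prior smoothed query-likelihood ranking formula,
# scoring a document D against a query Q as
#   score(Q,D) = sum over w in (Q ∩ D) of  c(w,Q) * log2( 1 + c(w,D) / (Mu * p(w|C)) )  +  |Q| * log2( Mu / (|D| + Mu) )
# where |Q| is the query length, |D| is the document length, and p(w|C) is the background
# unigram model (background_ULM_dict).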
def Dirichlet_score(Mu, background_ULM_dict, query, document):
    # tokenize the query and the document once:
    query_tokens = query.split()
    document_tokens = document.split()
    # find the set of words that are in BOTH the Query and the Document:
    common_words = set(query_tokens).intersection(set(document_tokens))
    #print(common_words)
    common_words_query_count_dict = {}     # key = word ... value = count in query, c(w,Q)
    common_words_document_count_dict = {}  # key = word ... value = count in document, c(w,D)
    # find c(w,Q) and c(w,D) for each word found in BOTH Query and Document
    for word in common_words:
        common_words_query_count_dict[word] = query_tokens.count(word)
        common_words_document_count_dict[word] = document_tokens.count(word)
    # find size of Document (number of words)
    document_size = len(document_tokens)
# find the size of Query (number of words)
query_size = len(query.split())
    # calculating the score for the given query + document pair:
    dirichlet_score = 0
    for word in common_words:
        dirichlet_score += common_words_query_count_dict[word] * math.log2(1 + (common_words_document_count_dict[word] / (Mu * background_ULM_dict[word])))
    # the document-length normalization term depends only on |Q|, |D| and Mu, so it is added
    # once per document rather than once per matching word:
    dirichlet_score += query_size * math.log2(Mu / (document_size + Mu))
    return dirichlet_score
###########################################################################################################################################################################################################
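# generate_avg_score ranks every document for each query with the supplied scoring_method,
# keeps the 5 highest-scoring documents, and averages their relevance positions from the
# qrels (relativity_scores, loaded in main below); a document with no relevance judgment
# is counted with position 5.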
def generate_avg_score(smoothing_parameter, background_ULM_dict, queries, documents, scoring_method):
    query_avg_scores = [] # the average score of the top 5 documents for the ith query
    # getting the score for each document/query pair and storing it in scores_list:
for query_dict in queries:
scores_list = []
sorted_doc_ids = []
for doc_dict in documents:
query = str(query_dict['query']).translate(str.maketrans('', '', string.punctuation))
document = str(doc_dict['body']).translate(str.maketrans('', '', string.punctuation))
score = scoring_method(smoothing_parameter, background_ULM_dict, query, document)
scores_list.append([doc_dict['id'], score])
        # sorting the docs on the score and keeping the top 5:
        top_5_sorted_scores_list = sorted(scores_list, key=lambda l: l[1], reverse=True)[:5]
        # just want the doc id ... don't need the score any more:
for pair in top_5_sorted_scores_list:
sorted_doc_ids.append(pair[0])
print(sorted_doc_ids)
pos_sum = 0 # the cumulative sum of relevance scores for the current query
avg_score = 0 # the average relevance score among the top 5 documents for the current query
        exists = False # boolean for keeping track of whether there exists a relevance value for a query/document pair
for value in sorted_doc_ids:
for rel_dict in relativity_scores:
if(query_dict['query number'] == int(rel_dict['query_num']) and int(rel_dict['id']) == value):
pos_sum += rel_dict['position']
exists = True
if(exists == False):
pos_sum += 5
exists = False
avg_score = pos_sum / 5
query_avg_scores.append(avg_score)
return query_avg_scores
###################################################################################################################################################################################################
###################################################################################################################################################################################################
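# Expected record fields, based on how they are used below: each document has 'id' and 'body',
# each query has 'query number' and 'query', and each relevance judgment in the qrels has
# 'query_num', 'id', and 'position'.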
# main
with open(r'C:\Users\JoshG\Documents\Information Retrieval HW Code\cranfield_data.json', 'r') as f1:
    # parses the JSON file into a list of document dictionaries
    documents = json.load(f1)
with open(r'C:\Users\JoshG\Documents\Information Retrieval HW Code\cran.qry.json', 'r') as f2:
    # parses the JSON file into a list of query dictionaries
    queries = json.load(f2)
with open(r'C:\Users\JoshG\Documents\Information Retrieval HW Code\cranqrel.json', 'r') as f3:
    # parses the JSON file into a list of relevance-judgment dictionaries
    relativity_scores = json.load(f3)
# building the reference language model for each word based on the entire collection of documents:
data_dict = {} # key: word ... value: count of word over total corpus
total_words = 0 # number of total count of words in our corpus
for doc_dict in documents:
    for word in doc_dict['body'].split():
word = word.translate(str.maketrans('', '', string.punctuation))
total_words += 1
data_dict[word] = data_dict.get(word, 0) + 1
background_ULM_dict = {} # key: word ... value: unigram language model value for 'word'
# generating the background language model for each word in the corpus:
for word in data_dict:
    background_ULM_dict[word] = data_dict[word] / total_words # p(w|C) = count of word in corpus / total words in corpus
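# e.g. (hypothetical numbers) a word that occurs 1,200 times in a 200,000-word corpus
# would get background_ULM_dict[word] = 1200 / 200000 = 0.006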
##########################################################################################################
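# Parameter sweep: for each smoothing parameter value, score and rank all documents for every
# query, then plot a histogram of the per-query average relevance position of the top 5 results.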
Lambda_list = [0.05,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
for Lambda in Lambda_list:
    query_avg_scores = generate_avg_score(Lambda, background_ULM_dict, queries, documents, JM_score)
    # plotting a histogram of the per-query average relevance positions for this Lambda:
    plt.hist(query_avg_scores, bins=[0, 1, 2, 3, 4, 5])
    plt.title('JM smoothing, Lambda = ' + str(Lambda))
    plt.show()
Mu_list = [100,500,1000,2000,4000,8000,10000]
for smooth_parameter in Mu_list:
    query_avg_scores = generate_avg_score(smooth_parameter, background_ULM_dict, queries, documents, Dirichlet_score)
    # plotting a histogram of the per-query average relevance positions for this Mu:
    plt.hist(query_avg_scores, bins=[0, 1, 2, 3, 4, 5])
    plt.title('Dirichlet smoothing, Mu = ' + str(smooth_parameter))
    plt.show()