## Reads queries from test_query.txt, processes them, and generates keywords for searching.
from readers_01 import getMapping, getVals, pointers_to_post, get_merges
from process_query01 import processQ
from bpetoken import make_vocab, merge_vocab
from lxml import etree
import re
import time
# Intermediate file holding the queries rewritten as well-formed XML.
intm_file = "intm_queries.xml"

def q01(queryfile, outputfile, indexfile, dictfile):
    with open(queryfile, 'r') as file:
        lines = file.readlines()
    # The query file omits closing tags, so insert them (plus a single root
    # element) to turn the file into well-formed XML that lxml can parse.
    processed_lines = ['<INIT>\n']
    for line in lines:
        if line.startswith('<title>'):
            processed_lines.append('</num>\n')
        elif line.startswith('<desc>'):
            processed_lines.append('</title>\n')
        elif line.startswith('<narr>'):
            processed_lines.append('</desc>\n')
        elif line.startswith('</top>'):
            processed_lines.append('</narr>\n')
        line = line.replace("&", "&amp;")  # escape bare ampersands for XML
        processed_lines.append(line)
    processed_lines.append('</INIT>\n')
    with open(intm_file, 'w') as pf:
        pf.writelines(processed_lines)
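    # Illustrative rewrite (the topic number is a made-up example; the tag
    # layout is the TREC-style one this script expects): a line
    # "<num> Number: 401" followed by "<title> ..." gains a "</num>" line
    # between them, and the whole file is wrapped in one <INIT> root element.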
    parser = etree.XMLParser()
    tree = etree.parse(intm_file, parser)
    query_nums_pp = tree.xpath(".//top/num/text()")
    query_titles_pp = tree.xpath(".//top/title/text()")
    query_desc_pp = tree.xpath(".//top/desc/text()")
    query_narr_pp = tree.xpath(".//top/narr/text()")  # parsed but unused below
    # Pull the numeric query ID out of each <num> field.
    query_nums = []
    for qn in query_nums_pp:
        match = re.search(r'\d+', qn)
        if match:
            query_nums.append(match.group())
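    # e.g. re.search(r'\d+', " Number: 401 ").group() -> "401"
    # (illustrative value, not taken from the data).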
    # Load the BPE merge rules recorded in the dictionary file, so queries
    # are tokenised the same way the index was.
    merges = get_merges(dictfile)
    # Normalise each query field: lowercase, split on whitespace, and strip
    # the punctuation listed in the translation table.
    translation_table = str.maketrans("", "", "(),?.:")

    def prune(istr):
        words = istr.lower().split()
        return [word.translate(translation_table) for word in words]
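    # e.g. prune("What is information retrieval?") ->
    #   ['what', 'is', 'information', 'retrieval']
    # (illustrative input; '?', ',', etc. are stripped by the table above).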
    query_titles = list(map(prune, query_titles_pp))
    query_desc = list(map(prune, query_desc_pp))
    # Postings pointers, docID mapping, and index metadata (document count
    # plus the docID / term-frequency field widths used to decode postings).
    pointers = pointers_to_post(dictfile)
    mapToDocs = getMapping(indexfile, dictfile)
    [_, numdocs, docIDsize, tf_length, _, _] = getVals(dictfile)
    for i in range(len(query_nums)):
        start_time = time.time()
        # Tokenise the combined title + description with the learned merges.
        terms_non_tokenised = query_titles[i] + query_desc[i]
        terms_split = " ".join(terms_non_tokenised)
        vocab_q = make_vocab(terms_split)
        for m in merges:
            vocab_q = merge_vocab(m, vocab_q)
        # Flatten the vocabulary back into a plain token list, repeating
        # each symbol once per occurrence.
        terms_tokenised = []
        for key, freq in vocab_q.items():
            for el in key:
                for _ in range(freq):
                    terms_tokenised.append(el)
        # Score documents for this query and append the top 100
        # "<query-num> 0 <doc-id> <score>" lines to the output file.
        top_100 = processQ(terms_tokenised, indexfile, pointers, docIDsize,
                           tf_length, numdocs, mapToDocs)
        with open(outputfile, 'a') as of_file:
            for key, val in top_100:
                of_file.write(query_nums[i] + " 0 " + key + " " + str(val) + "\n")
        end_time = time.time()
        print("Time to process query " + query_nums[i] + " is " + str(end_time - start_time))
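

# Minimal usage sketch: "test_query.txt" is named in the header comment; the
# other three file names are placeholders, not names taken from the project.
if __name__ == "__main__":
    q01("test_query.txt", "results_q01.txt", "index_file", "dict_file")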