-
Notifications
You must be signed in to change notification settings - Fork 0
/
search_engine.py
112 lines (94 loc) · 3.25 KB
/
search_engine.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
from collections import Counter

from rank import reference_rank
def search_words(trie, words, graph, hashmap):
    """Dispatch a tokenised word query to the matching search routine.

    When the query holds at least three tokens, the second token is taken as
    the connective. If it is exactly "AND", "OR" or "NOT" (upper case), every
    other token is kept (indices 0, 2, 4, ...) so only the operands are passed
    on. Any other query - fewer than three tokens, or an unrecognised second
    token - goes to ``default_search`` with the token list unchanged.

    Args:
        trie: Word index handed through to the search routines.
        words: Query tokens in the order they were typed.
        graph: Handed through to the ranked searches (OR and default).
        hashmap: Handed through to every search routine.

    Returns:
        Whatever the selected search routine returns.
    """
    connective = words[1] if len(words) >= 3 else None
    if connective in ("AND", "OR", "NOT"):
        # Drop the connective tokens, keeping only the operands.
        words = words[0::2]

    if connective == "AND":
        return and_search(trie, words, hashmap)
    if connective == "OR":
        return or_search(trie, words, graph, hashmap)
    if connective == "NOT":
        return not_search(trie, words, hashmap)
    return default_search(trie, words, graph, hashmap)
def and_search(trie, words, hashmap):
    """Return the pages that contain both of the first two query words.

    Each word is lower-cased and looked up with ``trie.count(word, hashmap)``.
    The result lists the first word's entries that also occur for the second
    word, followed by the second word's entries that also occur for the first,
    so repeated entries are kept (presumably one per occurrence, for later
    weighting - TODO confirm against the callers).

    Args:
        trie: Index object exposing ``count(word, hashmap)``.
        words: Operands of the query; only ``words[0]`` and ``words[1]`` are
            used.
        hashmap: Passed straight through to ``trie.count``.

    Returns:
        A list of pages, or ``[]`` when either word has no pages.

    Raises:
        IndexError: If ``words`` holds fewer than two items and the first
            word has pages.
    """
    first_pages = trie.count(words[0].lower(), hashmap)
    if not first_pages:
        return []
    second_pages = trie.count(words[1].lower(), hashmap)
    if not second_pages:
        # AND needs both operands: a word found nowhere rules out every page.
        return []

    # Sets give O(1) membership tests; the lists still drive order and repeats.
    in_first = set(first_pages)
    in_second = set(second_pages)
    matches = [page for page in first_pages if page in in_second]
    matches.extend(page for page in second_pages if page in in_first)
    return matches
def or_search(trie, words, graph, hashmap):
    """Rank the pages that contain either of the first two query words.

    Each of the two words is lower-cased and looked up with
    ``trie.count(word, hashmap)``. Every page found gets the score
    ``(entries for word 1 + entries for word 2) / 2`` and the scores are
    handed to ``reference_rank(graph, scores, 2)``.

    A word with no pages contributes nothing; the query only comes back
    empty when neither word is found. Operands beyond the first two are not
    looked up.

    Args:
        trie: Index object exposing ``count(word, hashmap)``.
        words: Operands of the query; only ``words[0]`` and ``words[1]`` are
            used.
        graph: Passed straight through to ``reference_rank``.
        hashmap: Passed straight through to ``trie.count``.

    Returns:
        The result of ``reference_rank``, or ``[]`` when neither word has
        any page.

    Raises:
        IndexError: If ``words`` holds fewer than two items.
    """
    # ``or ()`` lets an empty/None lookup count as "no pages" rather than
    # cancelling the whole OR query.
    first_hits = Counter(trie.count(words[0].lower(), hashmap) or ())
    second_hits = Counter(trie.count(words[1].lower(), hashmap) or ())

    # Counter addition keeps first-seen order: word 1's pages, then the pages
    # only word 2 has.
    occurrences = first_hits + second_hits
    if not occurrences:
        return []

    medium_rank = {page: total / 2 for page, total in occurrences.items()}
    return reference_rank(graph, medium_rank, 2)
def not_search(trie, words, hashmap):
    """Return the first word's pages that do not contain the second word.

    Each word is lower-cased and looked up with ``trie.count(word, hashmap)``.
    The result keeps the order and any repeated entries of the first word's
    page list.

    Args:
        trie: Index object exposing ``count(word, hashmap)``.
        words: Operands of the query; only ``words[0]`` and ``words[1]`` are
            used.
        hashmap: Passed straight through to ``trie.count``.

    Returns:
        ``[]`` when the first word has no pages; the first word's page list
        itself when the second word has none; otherwise a new filtered list.

    Raises:
        IndexError: If ``words`` holds fewer than two items and the first
            word has pages.
    """
    wanted_pages = trie.count(words[0].lower(), hashmap)
    if not wanted_pages:
        return []
    excluded_pages = trie.count(words[1].lower(), hashmap)
    if not excluded_pages:
        # Nothing to exclude, so every page with the first word qualifies.
        return wanted_pages

    # A set gives O(1) exclusion checks instead of scanning the list per page.
    excluded = set(excluded_pages)
    return [page for page in wanted_pages if page not in excluded]
def default_search(trie, words, graph, hashmap):
    """Rank pages for a plain multi-word query with no connective.

    Every word is lower-cased and looked up with
    ``trie.count(word, hashmap)``. If any word has no pages the query yields
    ``[]``. Otherwise each page found is scored as its total number of entries
    across all the words' page lists divided by the number of words, and the
    scores are handed to ``reference_rank(graph, scores, len(words))``.

    Args:
        trie: Index object exposing ``count(word, hashmap)``.
        words: Query words; a word listed twice is counted twice.
        graph: Passed straight through to ``reference_rank``.
        hashmap: Passed straight through to ``trie.count``.

    Returns:
        The result of ``reference_rank``, or ``[]`` as soon as one word has
        no pages.
    """
    # One tally over all lookups replaces repeated list.count() scans; the
    # Counter keeps pages in first-seen order.
    occurrences = Counter()
    for word in words:
        pages = trie.count(word.lower(), hashmap)
        if not pages:
            return []
        occurrences.update(pages)

    word_count = len(words)
    medium_rank = {
        page: total / word_count for page, total in occurrences.items()
    }
    return reference_rank(graph, medium_rank, word_count)
def search_phrase(phrase, hashmap):
    """Find the pages whose text contains ``phrase`` within a single line.

    The match is a case-sensitive substring test done line by line (text is
    split on "\\n"), so a phrase cannot match across a line break.

    Args:
        phrase: Exact text to look for.
        hashmap: Mapping of page to that page's full text.

    Returns:
        A list of pages in mapping order, with a page repeated once for every
        line that contains the phrase. An empty phrase returns ``[]``.
    """
    if not phrase:
        # "" is a substring of every line; without this guard an empty query
        # would return every page once per line of its text.
        return []

    result = []
    for page, text in hashmap.items():
        result.extend(page for line in text.split("\n") if phrase in line)
    return result