Skip to content

Commit 3338ad6

Browse files
committed
So what do we do when we encounter an unknown
term? We pick the most similar term from a random sample of the lexicon.
1 parent ade4767 commit 3338ad6

File tree

5 files changed

+55
-8
lines changed

5 files changed

+55
-8
lines changed

engine/scorer.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ def relevant_docs(self, query: str, k: int = 10) -> list[tuple]:
3131

3232
scores = defaultdict(int)
3333
for term in tokens:
34-
for posting, doc_freq in self.index.fetch_docs(term):
34+
for posting, doc_freq in self.index.fetch_index_record(term):
3535
_, did, freq = posting
3636
scores[did] += self.score(
3737
q_freq[term],

lib/indexing.py

+46-5
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import random
12
from lib.codec import Codec, TextCodec
23
from collections import deque
34
import os
@@ -9,6 +10,11 @@
910

1011
from lib.lexers import AbstractLexer, WikiLexer
1112

13+
from sklearn.feature_extraction.text import TfidfVectorizer
14+
15+
# maximum number of terms to compare when we encounter unknown term
16+
MAX_TERMS = int(os.getenv("MAX_TERMS", 10))
17+
1218

1319
class Index:
1420
"""
@@ -24,6 +30,7 @@ def __init__(
2430

2531
self.codec = codec
2632
self._avg_dl = None
33+
self.vectorizer = TfidfVectorizer()
2734

2835
def doc_length(self, doc_id: int):
2936
return self.doc_stats[doc_id]
@@ -43,7 +50,8 @@ def corpus_size(self):
4350
def release(self) -> None:
4451
self.posting_file.close()
4552

46-
def fetch_docs(self, term: str) -> tuple[list[list[int]], int]:
53+
def fetch_index_record(self, term: str) -> tuple[list[list[int]], int]:
54+
term = term if term in self.lexicon else self.handle_unknown_term(term)
4755
_, doc_freq, offset = self.lexicon[term]
4856

4957
self.posting_file.seek(offset)
@@ -52,6 +60,39 @@ def fetch_docs(self, term: str) -> tuple[list[list[int]], int]:
5260
self.posting_file, self.codec)
5361
yield (self.codec.decode(bytes_), doc_freq)
5462

63+
def compute_similarity(self, term0: str, term1: str) -> float:
64+
"""
65+
calculates the cosine similarity of the two terms
66+
@param term0
67+
@desc: first term
68+
69+
@param term1
70+
@desc: second term
71+
72+
@return float
73+
@desc: range [0, 1]
74+
"""
75+
try:
76+
tfidf = self.vectorizer.fit_transform([term0, term1])
77+
return (tfidf * tfidf.T).A[0, 1]
78+
except:
79+
return 0
80+
81+
def handle_unknown_term(self, term: str) -> str:
82+
best_match = None
83+
best_score = -1
84+
terms = list(self.lexicon.keys())
85+
86+
random.shuffle(terms)
87+
tslice = terms[:MAX_TERMS]
88+
for key in tslice:
89+
score = self.compute_similarity(term, key)
90+
if score > best_score:
91+
best_match = key
92+
best_score = score
93+
94+
return best_match
95+
5596

5697
class Indexer:
5798
"""
@@ -107,7 +148,7 @@ def index(self):
107148
def execute(self, filenames: list[str], block_size: int = 33554432, n: int = -1) -> Index:
108149
"""
109150
indexes the corpus in represented by @param filenames
110-
151+
111152
@param: filenames
112153
@desc: list of files to index
113154
@@ -128,7 +169,7 @@ def execute(self, filenames: list[str], block_size: int = 33554432, n: int = -1)
128169
block.append(self.lexer.lex(doc.strip()))
129170

130171
posting_filenames.appendleft((self.algo.index(block), 0))
131-
172+
132173
index_filename = self.algo.merge(posting_filenames)
133174
os.rename(index_filename, self.index_filename)
134175
index_file: IO[bytes] = open(self.index_filename, "rb")
@@ -142,12 +183,12 @@ def execute(self, filenames: list[str], block_size: int = 33554432, n: int = -1)
142183
self.codec,
143184
)
144185

145-
return self._index
186+
return self._index
146187

147188
def export_index(self):
148189
"""
149190
exports lexicon, term lexicon and document stats using the pickle protocol
150-
"""
191+
"""
151192
FilePickler.dump(self.index.lexicon, self._lexicon_filename)
152193
FilePickler.dump(self.index.doc_stats, self._doc_stat_filename)
153194
FilePickler.dump(self.algo.term_lexicon, self._terms_lexicon_filename)

lib/lexers.py

+4
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,10 @@ def lex(self, content: str) -> Document:
2727
def stem(self, tokens: list[str]) -> None:
2828
pass
2929

30+
@abstractmethod
31+
def word_tokenize(self, query: str) -> list[str]:
32+
pass
33+
3034
@property
3135
def doc_stats(self):
3236
return self._doc_stats

pyproject.toml

+2-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,8 @@ readme="README.md"
99
requires-python = ">=3.10"
1010
version = "0.0.1"
1111
dependencies = [
12-
"nltk"
12+
"nltk",
13+
"sklearn"
1314
]
1415

1516
[tool.setuptools.packages.find]

requirements.txt

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
pysimplegui
2-
nltk
2+
nltk
3+
scikit-learn

0 commit comments

Comments
 (0)