1
+ import random
1
2
from lib .codec import Codec , TextCodec
2
3
from collections import deque
3
4
import os
9
10
10
11
from lib .lexers import AbstractLexer , WikiLexer
11
12
13
+ from sklearn .feature_extraction .text import TfidfVectorizer
14
+
15
+ # maximum number of terms to compare when we encounter an unknown term
16
+ MAX_TERMS = int (os .getenv ("MAX_TERMS" , 10 ))
17
+
12
18
13
19
class Index :
14
20
"""
@@ -24,6 +30,7 @@ def __init__(
24
30
25
31
self .codec = codec
26
32
self ._avg_dl = None
33
+ self .vectorizer = TfidfVectorizer ()
27
34
28
35
def doc_length (self , doc_id : int ):
29
36
return self .doc_stats [doc_id ]
@@ -43,7 +50,8 @@ def corpus_size(self):
43
50
def release (self ) -> None :
44
51
self .posting_file .close ()
45
52
46
- def fetch_docs (self , term : str ) -> tuple [list [list [int ]], int ]:
53
+ def fetch_index_record (self , term : str ) -> tuple [list [list [int ]], int ]:
54
+ term = term if term in self .lexicon else self .handle_unknown_term (term )
47
55
_ , doc_freq , offset = self .lexicon [term ]
48
56
49
57
self .posting_file .seek (offset )
@@ -52,6 +60,39 @@ def fetch_docs(self, term: str) -> tuple[list[list[int]], int]:
52
60
self .posting_file , self .codec )
53
61
yield (self .codec .decode (bytes_ ), doc_freq )
54
62
63
+ def compute_similarity (self , term0 : str , term1 : str ) -> float :
64
+ """
65
+ calculates the cosine similarity of the two terms
66
+ @param term0
67
+ @desc: first term
68
+
69
+ @param term1
70
+ @desc: second term
71
+
72
+ @return float
73
+ @desc: range [0, 1]
74
+ """
75
+ try :
76
+ tfidf = self .vectorizer .fit_transform ([term0 , term1 ])
77
+ return (tfidf * tfidf .T ).A [0 , 1 ]
78
+ except :
79
+ return 0
80
+
81
+ def handle_unknown_term (self , term : str ) -> str :
82
+ best_match = None
83
+ best_score = - 1
84
+ terms = list (self .lexicon .keys ())
85
+
86
+ random .shuffle (terms )
87
+ tslice = terms [:MAX_TERMS ]
88
+ for key in tslice :
89
+ score = self .compute_similarity (term , key )
90
+ if score > best_score :
91
+ best_match = key
92
+ best_score = score
93
+
94
+ return best_match
95
+
55
96
56
97
class Indexer :
57
98
"""
@@ -107,7 +148,7 @@ def index(self):
107
148
def execute (self , filenames : list [str ], block_size : int = 33554432 , n : int = - 1 ) -> Index :
108
149
"""
109
150
indexes the corpus in represented by @param filenames
110
-
151
+
111
152
@param: filenames
112
153
@desc: list of files to index
113
154
@@ -128,7 +169,7 @@ def execute(self, filenames: list[str], block_size: int = 33554432, n: int = -1)
128
169
block .append (self .lexer .lex (doc .strip ()))
129
170
130
171
posting_filenames .appendleft ((self .algo .index (block ), 0 ))
131
-
172
+
132
173
index_filename = self .algo .merge (posting_filenames )
133
174
os .rename (index_filename , self .index_filename )
134
175
index_file : IO [bytes ] = open (self .index_filename , "rb" )
@@ -142,12 +183,12 @@ def execute(self, filenames: list[str], block_size: int = 33554432, n: int = -1)
142
183
self .codec ,
143
184
)
144
185
145
- return self ._index
186
+ return self ._index
146
187
147
188
def export_index (self ):
148
189
"""
149
190
exports lexicon, term lexicon and document stats using the pickle protocol
150
- """
191
+ """
151
192
FilePickler .dump (self .index .lexicon , self ._lexicon_filename )
152
193
FilePickler .dump (self .index .doc_stats , self ._doc_stat_filename )
153
194
FilePickler .dump (self .algo .term_lexicon , self ._terms_lexicon_filename )
0 commit comments