1
+ import random
1
2
from lib .codec import Codec , TextCodec
2
3
from collections import deque
3
4
import os
9
10
10
11
from lib .lexers import AbstractLexer , WikiLexer
11
12
13
+ from sklearn .feature_extraction .text import TfidfVectorizer
14
+
15
+ # maximum number of terms to compare when we encounter an unknown term
16
+ MAX_TERMS = int (os .getenv ("MAX_TERMS" , 10 ))
17
+
12
18
13
19
class Index :
14
20
"""
@@ -24,6 +30,7 @@ def __init__(
24
30
25
31
self .codec = codec
26
32
self ._avg_dl = None
33
+ self .vectorizer = TfidfVectorizer ()
27
34
28
35
def doc_length (self , doc_id : int ):
29
36
return self .doc_stats [doc_id ]
@@ -43,7 +50,8 @@ def corpus_size(self):
43
50
def release (self ) -> None :
44
51
self .posting_file .close ()
45
52
46
- def fetch_docs (self , term : str ) -> tuple [list [list [int ]], int ]:
53
+ def fetch_index_record (self , term : str ) -> tuple [list [list [int ]], int ]:
54
+ term = term if term in self .lexicon else self .handle_unknown_term (term )
47
55
_ , doc_freq , offset = self .lexicon [term ]
48
56
49
57
self .posting_file .seek (offset )
@@ -52,6 +60,39 @@ def fetch_docs(self, term: str) -> tuple[list[list[int]], int]:
52
60
self .posting_file , self .codec )
53
61
yield (self .codec .decode (bytes_ ), doc_freq )
54
62
63
+ def compute_similarity (self , term0 : str , term1 : str ) -> float :
64
+ """
65
+ calculates the cosine similarity of the two terms
66
+ @param term0
67
+ @desc: first term
68
+
69
+ @param term1
70
+ @desc: second term
71
+
72
+ @return float
73
+ @desc: range [0, 1]
74
+ """
75
+ try :
76
+ tfidf = self .vectorizer .fit_transform ([term0 , term1 ])
77
+ return (tfidf * tfidf .T ).A [0 , 1 ]
78
+ except :
79
+ return 0
80
+
81
+ def handle_unknown_term (self , term : str ) -> str :
82
+ best_match = None
83
+ best_score = - 1
84
+ terms = list (self .lexicon .keys ())
85
+
86
+ random .shuffle (terms )
87
+ tslice = terms [:MAX_TERMS ]
88
+ for key in tslice :
89
+ score = self .compute_similarity (term , key )
90
+ if score > best_score :
91
+ best_match = key
92
+ best_score = score
93
+
94
+ return best_match
95
+
55
96
56
97
class Indexer :
57
98
"""
@@ -107,7 +148,7 @@ def index(self):
107
148
def execute (self , filenames : list [str ], block_size : int = 33554432 , n : int = - 1 ) -> Index :
108
149
"""
109
150
indexes the corpus in represented by @param filenames
110
-
151
+
111
152
@param: filenames
112
153
@desc: list of files to index
113
154
@@ -128,7 +169,7 @@ def execute(self, filenames: list[str], block_size: int = 33554432, n: int = -1)
128
169
block .append (self .lexer .lex (doc .strip ()))
129
170
130
171
posting_filenames .appendleft ((self .algo .index (block ), 0 ))
131
-
172
+
132
173
index_filename = self .algo .merge (posting_filenames )
133
174
os .rename (index_filename , self .index_filename )
134
175
index_file : IO [bytes ] = open (self .index_filename , "rb" )
@@ -142,12 +183,12 @@ def execute(self, filenames: list[str], block_size: int = 33554432, n: int = -1)
142
183
self .codec ,
143
184
)
144
185
145
- return self ._index
186
+ return self ._index
146
187
147
188
def export_index (self ):
148
189
"""
149
190
exports lexicon, term lexicon and document stats using the pickle protocol
150
- """
191
+ """
151
192
FilePickler .dump (self .index .lexicon , self ._lexicon_filename )
152
193
FilePickler .dump (self .index .doc_stats , self ._doc_stat_filename )
153
194
FilePickler .dump (self .algo .term_lexicon , self ._terms_lexicon_filename )
0 commit comments