-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
56 lines (44 loc) · 1.42 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import re
import numpy as np
from gensim.corpora import MmCorpus, Dictionary
from gensim.models import TfidfModel, Word2Vec
# Tunable constants.
# `threshold` is not referenced by any code visible in this file.
threshold = 0.9
# Words dropped from a definition before its word vectors are looked up.
articles = ["a", "an", "the"]

# Artifacts read from the data/ directory.
dictionary = Dictionary.load("data/dict.dict")
corpus = MmCorpus("data/text.mm")

# Models read from the trained/ directory.
w2v = Word2Vec.load("trained/w2v/trained.w2v")
tfidf = TfidfModel.load("trained/tfidf.model")
def _mean_word_vector(definition):
    """Return the average word vector of the cleaned words in *definition*."""
    # Lower-case, turn everything that is not an ASCII letter or whitespace
    # into a space, split on whitespace, then drop the articles.
    cleaned = re.sub(r"[^a-zA-Z\s]", " ", definition.lower())
    words = [word for word in cleaned.split() if word not in articles]
    if not words:
        raise ValueError(
            "definition has no usable words: %r" % (definition,)
        )
    # Newer gensim releases keep the vectors on ``model.wv`` and no longer
    # allow indexing the model itself; fall back to the model for old ones.
    vectors = getattr(w2v, "wv", w2v)
    # axis=0 averages across words and keeps the embedding dimension.
    # Summing without an axis would collapse everything to a single scalar.
    return np.mean(vectors[words], axis=0)


def compare(def1, def2):
    """Return the cosine similarity between two definitions.

    Each definition is lower-cased, stripped of every character that is not
    an ASCII letter or whitespace, split into words, and filtered against
    the module-level ``articles`` list.  The remaining words are looked up
    in the module-level ``w2v`` model and their vectors are averaged; the
    result is the cosine of the angle between the two averaged vectors.

    NOTE: tf-idf weighting of the individual words is not applied; every
    word contributes equally to the average.

    Args:
        def1: First definition (string).
        def2: Second definition (string).

    Returns:
        ``inner(v1, v2) / (norm(v1) * norm(v2))`` where ``v1`` and ``v2``
        are the averaged word vectors of the two definitions.

    Raises:
        ValueError: If either definition has no words left after cleaning.

    Out-of-vocabulary words are not filtered out here, so whatever error
    the model lookup raises for them propagates to the caller.
    """
    vec1 = _mean_word_vector(def1)
    vec2 = _mean_word_vector(def2)
    # Cosine similarity of the two averaged vectors.
    return np.inner(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
def main():
    """Prompt for pairs of definitions and print their similarity.

    Reads two lines from standard input per round and prints the result of
    ``compare`` for them.  The loop ends when standard input is exhausted;
    any error raised while comparing is reported on standard error and the
    loop moves on to the next pair.
    """
    import sys

    while True:
        try:
            str1 = input('1 > ')
            str2 = input('2 > ')
        except EOFError:
            # No more input: swallowing this would make the loop spin
            # forever, because every later input() call fails the same way.
            break
        try:
            print(compare(str1, str2))
        except Exception as exc:
            # Keep the session alive for the next pair, but tell the user
            # what went wrong instead of failing silently.
            print("error: %r" % (exc,), file=sys.stderr)


if __name__ == "__main__":
    main()