-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsim2id.py
executable file
·34 lines (25 loc) · 1.24 KB
/
sim2id.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# function to output top n ids for the similar paragraph based on tfidf values
def sim2id(n, sections, index_list, query):
    """Return the sections most similar to *query* by TF-IDF cosine similarity.

    A ``TfidfVectorizer`` (English stop words removed) is fitted on
    ``sections``; every section and the query are projected into that space
    and the sections are ranked by cosine similarity to the query, highest
    first. Ties keep their original order in ``sections``.

    Args:
        n: Maximum number of results to return. Values larger than the
            number of sections are clamped; values <= 0 yield empty results.
        sections: Iterable of section texts to search.
        index_list: Sequence parallel to ``sections``; for each entry,
            element ``[0]`` is used as the section title and element ``[2]``
            as the section id.
        query: Text to compare against every section.

    Returns:
        A 4-tuple ``(top_n, top_sim, top_n_titles, top_n_ids)`` of equally
        long lists: positions into ``sections``, their similarity scores,
        and the matching titles and ids taken from ``index_list``.

    Raises:
        IndexError: If ``index_list`` has no entry for a selected section.
        ValueError: Propagated from scikit-learn, e.g. when the sections
            contain nothing but stop words (empty vocabulary).
    """
    # Materialise once so one-shot iterables survive and len() is available.
    documents = list(sections)

    # Clamp the request so asking for more results than there are sections
    # (or for a negative count) can no longer index past the ranking.
    limit = max(0, min(n, len(documents)))

    if limit == 0:
        top_n, top_sim = [], []
    else:
        vectorizer = TfidfVectorizer(stop_words='english')
        # Learn the vocabulary and vectorise the corpus in a single pass.
        section_vectors = vectorizer.fit_transform(documents)
        query_vector = vectorizer.transform([query])

        # cosine_similarity accepts sparse input directly; the result has
        # shape (n_sections, 1), flattened here to one score per section.
        similarities = cosine_similarity(section_vectors, query_vector).ravel()

        # Rank once. sorted() is stable, so equal scores keep input order.
        ranking = sorted(
            range(len(documents)),
            key=similarities.__getitem__,
            reverse=True,
        )
        top_n = ranking[:limit]
        top_sim = [similarities[position] for position in top_n]

    top_n_ids = [index_list[position][2] for position in top_n]
    top_n_titles = [index_list[position][0] for position in top_n]

    # Existing callers rely on this console output; kept as-is.
    print(top_n_ids)
    print(top_n_titles)

    return top_n, top_sim, top_n_titles, top_n_ids