# semantic_similarity.py
# Scores each question/answer pair of a CSV dataset with a TF-IDF similarity value.
# Used to read and write the CSV files
import csv
# Used to build the TF-IDF vectors
from sklearn.feature_extraction.text import TfidfVectorizer
# Path of the input dataset: a CSV file with a header row, read by calc_tfidf()
directory_dataset = 'data_academia.csv'
def calc_tfidf(input_path=None, output_path='data_academia_results.csv') -> None:
    """Copy a question/answer CSV, adding a TF-IDF similarity score per row.

    Every input row is written to the output file together with a ``tf_idf``
    column. The score is the off-diagonal entry of ``tfidf @ tfidf.T`` for a
    ``TfidfVectorizer`` fitted on only that row's ``body_question`` and
    ``body_answer``; with the vectorizer's default L2 normalisation this is
    the cosine similarity of the two texts.

    Args:
        input_path: CSV file to read. When omitted, the module-level
            ``directory_dataset`` is used.
        output_path: CSV file that is created or overwritten with the
            scored rows.

    Raises:
        KeyError: If an input row lacks one of the expected columns.
        OSError: If either file cannot be opened.
    """
    if input_path is None:
        input_path = directory_dataset

    # Columns copied verbatim from the input; 'tf_idf' is appended after them.
    input_fields = [
        'id_question',
        'answer_count',
        'AcceptedAnswerId',
        'body_question',
        'id_author_question',
        'name_author_question',
        'reputation_author_question',
        'id_answer',
        'body_answer',
        'id_author_answer',
        'name_author_answer',
        'reputation_author_answer',
        'score_answer',
        'id_parent',
        'accepted_answer',
    ]

    # One instance is enough: fit_transform() relearns the vocabulary and the
    # idf weights from scratch on every call.
    vectorizer = TfidfVectorizer(min_df=1)

    # newline='' lets the csv module do its own newline handling (no doubled
    # '\r' on Windows, embedded newlines in quoted bodies survive intact).
    # The input is opened first so that a missing dataset does not truncate
    # an already existing results file.
    with open(input_path, newline='') as source, \
            open(output_path, 'w', newline='') as target:
        writer = csv.DictWriter(target, fieldnames=input_fields + ['tf_idf'])
        writer.writeheader()

        for row in csv.DictReader(source):
            texts = [row['body_question'], row['body_answer']]
            try:
                tfidf = vectorizer.fit_transform(texts)
            except ValueError:
                # scikit-learn reports an empty vocabulary (neither text
                # contains a usable token) as ValueError; treat such a pair
                # as having nothing in common instead of aborting the run.
                score = 0.0
            else:
                # Row 0 is the question and row 1 the answer, so [0][1] is
                # their similarity. toarray() replaces the `.A` shorthand,
                # which recent SciPy releases no longer provide.
                score = float((tfidf @ tfidf.T).toarray()[0][1])

            scored_row = {name: row[name] for name in input_fields}
            scored_row['tf_idf'] = score
            writer.writerow(scored_row)
# Run the whole computation when this file is executed.
# NOTE(review): there is no `if __name__ == "__main__":` guard, so merely
# importing this module also triggers the run and rewrites the results file.
calc_tfidf()