This repository has been archived by the owner on Feb 28, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathretrieval_model.py
49 lines (41 loc) · 1.66 KB
/
retrieval_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import csv
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
data_file = open('replika_chat_backup.csv')
reader = csv.reader(data_file)
data = list(reader)
context = []
responses = []
speaking = 'Me'
for response in data:
if response[1] == 'Me' and speaking == 'Rep':
context.append(response[2])
speaking = 'Me'
if response[1] == 'Rep' and speaking == 'Me': #Only append her response to a question from me (not a double response)
responses.append(response[2])
speaking = 'Rep'
responses.pop(0) #Remove her first intro blurb
vectorizer= TfidfVectorizer()
X = vectorizer.fit(context)
array = X.transform(context).toarray()
def cosine_similarity(a, b):
"""Takes 2 vectors a, b and returns the cosine similarity according
to the definition of the dot product"""
dot_product = np.dot(a, b)
norm_a = np.linalg.norm(a)
norm_b = np.linalg.norm(b)
return dot_product / (norm_a * norm_b)
def get_matching_response(test_context):
"""Takes a sample phrase (test_context) & returns the response with
the highest cosine similarity"""
test_context = [test_context]
test_vector = X.transform(test_context).toarray()
response = ""
most_sim = 0
for i in range(len(context)):
if most_sim < cosine_similarity(array[i], test_vector[0]):
most_sim = cosine_similarity(array[i], test_vector[0])
answer_index = i #get the index of the current most similar context
response = responses[answer_index] #get the answer of the most similar context.
response_value = most_sim
print(f"{response} ({response_value})")