-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfunctions.py
110 lines (74 loc) · 3.63 KB
/
functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import pandas as pd
import yake
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from data import embeddings, movies, keywords
import transformers
# Sentence-embedding model ('all-mpnet-base-v2'), instantiated once when this
# module is imported and exposed as a module-level name.
sen_transformer = SentenceTransformer('all-mpnet-base-v2')
def get_len(row: str, plots: pd.DataFrame) -> int:
    """Return the average length of the values in one column of *plots*.

    Args:
        row: Label of the column to measure. Despite its name it is used as
            a column key (``plots[row]``).
        plots: DataFrame holding the texts.

    Returns:
        The mean ``len()`` of the column's values, rounded with the built-in
        ``round``; ``0`` when *plots* has no rows.

    Raises:
        KeyError: If *row* is not a column of *plots*.
    """
    column = plots[row]  # looked up first so a bad label still raises KeyError
    number_of_samples = len(plots)
    if number_of_samples == 0:
        # Nothing to average; avoids a ZeroDivisionError on an empty frame.
        return 0
    length_all = sum(len(text) for text in column)
    return round(length_all / number_of_samples)
# The function above computes the average length of the plots (in characters).
def get_keywords(movies : pd.DataFrame) -> list:
    """Extract YAKE keywords from every plot in *movies*.

    Args:
        movies: DataFrame with a ``'Plot'`` column containing the plot texts.

    Returns:
        A list with one string per row, in row order: the keywords extracted
        from that plot, lower-cased and joined with commas.
    """
    from tqdm import tqdm  # kept function-local, as in the original

    # lan - language of the text the keywords are extracted from.
    # n   - maximum n-gram length of a keyword.
    # top - number of keywords extracted per text.
    kw_extraction = yake.KeywordExtractor(lan='en', n=3, top=25)

    sentences = []
    # The progress total is the real number of plots. The previous
    # `len(movies.sample(25000))` always yielded 25000 and raised ValueError
    # for frames with fewer rows.
    for plot in tqdm(movies['Plot'], total=len(movies)):
        extracted = kw_extraction.extract_keywords(plot)
        # extract_keywords yields (keyword, score) pairs; only the text is kept.
        sentences.append(','.join(kw.lower() for kw, _ in extracted))
    return sentences
# This function takes a plot index, the embeddings and the number of similar plots you want to see, and returns the indices of the most similar movies in the dataset.
def get_similarity_score(plot_index, embeddings, num_of_similars):
    """Return the indices of the embeddings closest to one stored plot.

    Args:
        plot_index: Row of *embeddings* to use as the query vector.
        embeddings: 2-D array of plot embeddings, one row per plot. The
            vector size is taken from its second axis instead of being
            fixed at 768.
        num_of_similars: Number of nearest neighbours to return.

    Returns:
        The index array produced by ``IndexFlatL2.search`` for the single
        query (one row, *num_of_similars* columns). Because the query vector
        is itself part of the index, it is normally among the hits.
    """
    # FAISS works on contiguous float32 matrices; this is a no-op when the
    # input already has that layout.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    dim = vectors.shape[1]

    similarity = faiss.IndexFlatL2(dim)  # exact search using L2 distance
    similarity.add(vectors)

    # search() expects a batch of queries, hence the extra leading axis.
    query_vector = vectors[plot_index].reshape(1, dim)
    _, I = similarity.search(query_vector, num_of_similars)
    return I
"""
This function is similar to the function above, but this time it takes a plot or a message from the user and returns the titles of the movies
that are similar to the one the user describes.
"""
def get_similarity_from_plot(user_input, embeddings, num_of_similars, print_plot):
    """Recommend movies whose plot keywords resemble a free-text description.

    Keywords are extracted from *user_input* with YAKE, embedded with the
    module-level ``sen_transformer`` and matched against *embeddings* using
    an exact L2 search. Titles and plots are read from the module-level
    ``movies`` frame by row position.

    Args:
        user_input: Plot or message written by the user.
        embeddings: 2-D array of plot embeddings, one row per row of
            ``movies``; the vector size is taken from its second axis.
        num_of_similars: Number of movies to recommend.
        print_plot: When truthy, print the plot of one randomly chosen
            recommendation.

    Returns:
        The recommended titles joined with ``', '`` (an empty string when
        there are no hits).
    """
    # FAISS works on contiguous float32 matrices; this is a no-op when the
    # input already has that layout.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    similarity = faiss.IndexFlatL2(vectors.shape[1])
    similarity.add(vectors)

    # Extract keywords from the user input and turn them into one embedding.
    # They are lower-cased and comma-joined the same way get_keywords does.
    kw_extractor = yake.KeywordExtractor(lan='en', n=3, top=25)
    extracted = kw_extractor.extract_keywords(user_input)
    joined_keywords = ','.join(kw.lower() for kw, _ in extracted)

    # Reuse the model loaded at module level rather than re-instantiating
    # SentenceTransformer('all-mpnet-base-v2') on every call.
    input_embedding = sen_transformer.encode(joined_keywords)
    query_vector = np.asarray(input_embedding, dtype=np.float32).reshape(1, -1)

    _, I = similarity.search(query_vector, num_of_similars)
    hits = I.reshape(-1)
    # FAISS pads with -1 when it finds fewer than the requested neighbours.
    hits = hits[hits >= 0]

    # Positional lookup; same rows as reset_index(drop=True) followed by
    # .loc, without copying the whole frame.
    matches = movies.iloc[hits]
    movie_titles = matches['Title'].tolist()
    movie_plots = matches['Plot'].tolist()

    if print_plot and movie_plots:
        # Draw the random pick only when it is needed and there is a hit.
        rand_index = np.random.randint(len(movie_plots))
        print(f'Here is the plot:\n{movie_plots[rand_index]}')

    return ', '.join(movie_titles)
#print(f'This is shape : {embeddings.shape}')