-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtrain.py
86 lines (70 loc) · 2.54 KB
/
train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import pandas as pd
import numpy as np
import ast
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle
# Function definitions
def convert(text):
    """Return the ``'name'`` value of every entry in a serialized list.

    Args:
        text: String containing a Python-literal list of dicts, each of
            which has a ``'name'`` key (parsed with ``ast.literal_eval``).

    Returns:
        A list with one name per entry, in the original order.

    Raises:
        ValueError, SyntaxError: If ``text`` is not a valid Python literal.
        KeyError: If an entry has no ``'name'`` key.
    """
    return [entry['name'] for entry in ast.literal_eval(text)]
def convert3(text, limit=3):
    """Return the ``'name'`` value of the first few entries in a serialized list.

    Args:
        text: String containing a Python-literal list of dicts (parsed with
            ``ast.literal_eval``). Only the entries that are kept need a
            ``'name'`` key.
        limit: Maximum number of names to return, counted from the start of
            the list. Expected to be non-negative. Defaults to 3.

    Returns:
        A list of at most ``limit`` names, in the original order.

    Raises:
        ValueError, SyntaxError: If ``text`` is not a valid Python literal.
        KeyError: If one of the kept entries has no ``'name'`` key.
    """
    # Slicing stops after `limit` entries instead of walking the whole list
    # with a manual counter.
    return [entry['name'] for entry in ast.literal_eval(text)[:limit]]
def fetch_director(text):
    """Return the names of all entries whose ``'job'`` is ``'Director'``.

    Args:
        text: String containing a Python-literal list of dicts, each with
            ``'job'`` and ``'name'`` keys (parsed with ``ast.literal_eval``).

    Returns:
        A list of the matching names in the original order; empty when no
        entry has the job ``'Director'``.

    Raises:
        ValueError, SyntaxError: If ``text`` is not a valid Python literal.
        KeyError: If an entry has no ``'job'`` key, or a matching entry has
            no ``'name'`` key.
    """
    return [
        member['name']
        for member in ast.literal_eval(text)
        if member['job'] == 'Director'
    ]
def collapse(L):
    """Return a copy of ``L`` with every space character removed from each string.

    Removing the spaces turns a multi-word value such as ``'Sam Worthington'``
    into the single token ``'SamWorthington'``.

    Args:
        L: Iterable of strings.

    Returns:
        A new list of the same length with ``" "`` stripped out of every
        item. The input is not modified.
    """
    return [item.replace(" ", "") for item in L]
def recommend(movie, top_n=5):
    """Print the titles of the movies most similar to ``movie``.

    Reads the module-level ``new`` DataFrame (needs a ``'title'`` column) and
    the module-level ``similarity`` matrix, whose row/column ``k`` is taken to
    describe the ``k``-th row of ``new``.

    Args:
        movie: Exact title to look up in ``new['title']``. When several rows
            share the title, the first one is used.
        top_n: How many recommendations to print. Defaults to 5.

    Raises:
        IndexError: If no row of ``new`` has the given title.
    """
    # ``similarity`` is addressed by row *position*, so look the title up by
    # position. Using the index label (``.index[0]``) picks the wrong row, or
    # runs out of bounds, whenever the labels are not exactly 0..n-1 (for
    # example after rows were dropped without resetting the index).
    positions = np.flatnonzero((new['title'] == movie).to_numpy())
    if positions.size == 0:
        raise IndexError(f"movie not found in dataset: {movie!r}")
    position = int(positions[0])

    ranked = sorted(
        enumerate(similarity[position]),
        key=lambda pair: pair[1],
        reverse=True,
    )
    # Exclude the query movie explicitly rather than assuming it is ranked
    # first; ties with identical scores could otherwise push it down a slot.
    neighbours = [row for row, _ in ranked if row != position][:top_n]
    for row in neighbours:
        print(new['title'].iloc[row])
# Load datasets
movies = pd.read_csv('data/tmdb_5000_movies.csv')
credits = pd.read_csv('data/tmdb_5000_credits.csv')

# Merge datasets on the title column.
# NOTE(review): a title that occurs more than once in both files produces one
# merged row per pairing - confirm whether joining on the movie id is wanted.
movies = movies.merge(credits, on='title')

# Select relevant columns
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]

# Drop incomplete rows, then renumber the index 0..n-1. The similarity matrix
# built below is addressed by row position, so index labels must match row
# positions for any consumer that looks a movie up through ``.index``.
movies = movies.dropna().reset_index(drop=True)

# Convert JSON-like columns to lists
movies['genres'] = movies['genres'].apply(convert)
movies['keywords'] = movies['keywords'].apply(convert)
movies['cast'] = movies['cast'].apply(convert3)
movies['crew'] = movies['crew'].apply(fetch_director)

# Collapse spaces in lists so that multi-word names become single tokens
movies['cast'] = movies['cast'].apply(collapse)
movies['crew'] = movies['crew'].apply(collapse)
movies['genres'] = movies['genres'].apply(collapse)
movies['keywords'] = movies['keywords'].apply(collapse)

# Split overview into words and create tags (list concatenation per row)
movies['overview'] = movies['overview'].apply(lambda x: x.split())
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

# Create a new DataFrame with relevant columns
new = movies.drop(columns=['overview', 'genres', 'keywords', 'cast', 'crew'])
new['tags'] = new['tags'].apply(" ".join)

# Text vectorization
cv = CountVectorizer(max_features=5000, stop_words='english')
vector = cv.fit_transform(new['tags']).toarray()

# Calculate cosine similarity (row/column k corresponds to row k of `new`)
similarity = cosine_similarity(vector)

# Test the recommendation function
recommend('Gandhi')

# Save the necessary files for deployment; `with` guarantees the files are
# flushed and closed even if pickling fails part-way.
with open('movie_list.pkl', 'wb') as movie_list_file:
    pickle.dump(new, movie_list_file)
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)