diff --git a/.joblib/partial_cv_stochastic_gradient_descent_4b79a978baafb561e9bc7b0317ee92dd2080b44004885c45a98437936c7f8f01.job b/.joblib/partial_cv_stochastic_gradient_descent_4b79a978baafb561e9bc7b0317ee92dd2080b44004885c45a98437936c7f8f01.job
new file mode 100644
index 0000000..665a368
Binary files /dev/null and b/.joblib/partial_cv_stochastic_gradient_descent_4b79a978baafb561e9bc7b0317ee92dd2080b44004885c45a98437936c7f8f01.job differ
diff --git a/dataset/preprocess.py b/dataset/preprocess.py
index aa62b82..85df4c5 100644
--- a/dataset/preprocess.py
+++ b/dataset/preprocess.py
@@ -13,8 +13,8 @@ def unzip_file(zip_file_path, extract_to_path):
 
 # Unzip the ml-latest and ml-latest-small datasets
-# unzip_file(os.path.join(script_directory, "ml-latest-small.zip"), script_directory)
-# unzip_file(os.path.join(script_directory, "ml-latest.zip"), script_directory)
+unzip_file(os.path.join(script_directory, "ml-latest-small.zip"), script_directory)
+unzip_file(os.path.join(script_directory, "ml-latest.zip"), script_directory)
 
 # Helper functions for constructing the input paths
 make_path_full = lambda path: os.path.join(script_directory, "ml-latest", path)
diff --git a/src/data/Data.py b/src/data/Data.py
index 099361b..1843e01 100644
--- a/src/data/Data.py
+++ b/src/data/Data.py
@@ -82,6 +82,11 @@ def _create_interactions(
         self.ratings_test_df = pd.DataFrame()
 
         def process_user(df: DataFrame, kind: Literal["train", "test"]):
+            """
+            Given a dataframe holding a single user's ratings, iterate through
+            them, update the id/index dictionaries, and append each rating to
+            the appropriate train or test lists.
+            """
             nonlocal new_user_index
             for _, row in df.iterrows():
                 movie_id, r = row["movieId"], row["rating"]
@@ -141,6 +146,8 @@ def _load_ratings(self, test_size: float) -> tuple[coo_array, coo_array]:
         )
         df["timestamp"] = pd.to_datetime(df["timestamp"], unit="s")
         df.sort_values(by="timestamp", inplace=True)
+
+        # Shape of the interactions matrix: (n_unique_users, n_unique_movies)
         shape = (df["userId"].nunique(), self.how_many_unique_movie_ids)
 
         self.interactions_train, self.interactions_test = self._create_interactions(
@@ -155,30 +162,12 @@ def _load_ratings(self, test_size: float) -> tuple[coo_array, coo_array]:
 
         return self.interactions_train, self.interactions_test
 
-    def id_to_index(self, id: int, kind: Literal["user", "item"]) -> int:
-        """
-        Convert an ID to an index of the user-item matrix
-        """
-        if kind == "user":
-            return self.user_id_to_index[id]
-        else:
-            return self.item_id_to_index[id]
-
-    def index_to_id(self, index: int, kind: Literal["user", "item"]) -> int:
-        """
-        Convert an index of the user-item matrix to an ID
-        """
-        if kind == "user":
-            return self.user_index_to_id[index]
-        else:
-            return self.item_index_to_id[index]
-
     def _load_movies(self) -> None:
         """
-        Load the movies information as DataFrame
+        Load the movies information as a DataFrame, alongside the tags
         """
         movies_df = pd.read_csv(
-            self.data_path + "movies.csv", # "movies_filtered.csv",
+            self.data_path + "movies.csv",
             dtype={"movieId": int, "title": str, "genres": str},
         )
@@ -233,17 +222,23 @@ def get_movie_from_indices(self, movie_indices: list[int]) -> DataFrame:
         Given a movie index, return the movie information
         """
         return self.get_movies_from_ids(
-            [self.index_to_id(i, "item") for i in movie_indices]
+            [self.item_index_to_id[i] for i in movie_indices]
         )
 
     def get_user_ratings(
         self, user_id: int, dataset: Literal["train", "test"]
-    ) -> NDArray:
+    ) -> NDArray[np.float64]:
+        """
+        Return the given user's ratings
+        """
         user_index = self.user_id_to_index[user_id]
         arr = self.interactions_train if dataset == "train" else self.interactions_test
         return csr_array(arr.getrow(user_index)).toarray()[0]
 
-    def get_weighed_user_ratings(self, user_id: int):
+    def get_weighed_user_ratings(self, user_id: int) -> list[tuple[int, float, float]]:
+        """
+        Return the given user's ratings, each weighted by the age of the rating
+        """
         user_df = self.ratings_train_df[self.ratings_train_df["userId"] == user_id]
         timestamps = user_df["timestamp"]
         try:
@@ -251,7 +246,7 @@ def get_weighed_user_ratings(self, user_id: int):
         except ValueError:
             return []
         weights = [exponential_decay(timestamp, base_time) for timestamp in timestamps]
-        ratings: list[tuple[int, float]] = list(
+        ratings = list(
             user_df[["movieId", "rating"]].itertuples(index=False, name=None)
         )
         return [(tup[0], tup[1], weight) for tup, weight in zip(ratings, weights)]
@@ -259,6 +254,9 @@ def get_weighed_user_ratings(self, user_id: int):
     def get_liked_movies_indices(
         self, user_id: int, biased: bool, dataset: Literal["train", "test"]
     ) -> list[int]:
+        """
+        Return the indices of the movies liked by the given user
+        """
         user_ratings = self.get_user_ratings(user_id, dataset)
         nz = user_ratings.nonzero()
         if len(user_ratings[nz]) == 0:
@@ -277,6 +275,9 @@ def get_liked_movies_indices(
         return sorted(liked_indices, key=lambda x: user_ratings[x], reverse=True)
 
     def get_user_bias(self, user_id: int):
+        """
+        Compute the given user's rating bias
+        """
         user_index = self.user_id_to_index[user_id]
         user_ratings = self.get_user_ratings(user_id, "train")
         if np.count_nonzero(user_ratings) == 0:
@@ -288,6 +289,9 @@ def get_user_bias(self, user_id: int):
         return user_bias
 
     def get_weighed_liked_movie_indices(self, user_id: int, biased: bool):
+        """
+        Return the indices of the movies liked by the given user, weighted by rating age
+        """
         user_ratings = self.get_weighed_user_ratings(user_id)
         if len(user_ratings) == 0:
             return []
@@ -307,5 +311,8 @@ def get_weighed_liked_movie_indices(self, user_id: int, biased: bool):
         return user_likes
 
     def get_ratings_count(self, user_id: int):
+        """
+        Count how many ratings the given user has provided
+        """
         ratings = self.get_user_ratings(user_id, "train")
         return np.count_nonzero(ratings)
diff --git a/src/models/collaborative_filtering/ensemble/Ensemble.py b/src/models/collaborative_filtering/ensemble/Ensemble.py
index 8723fae..48ceb6b 100644
--- a/src/models/collaborative_filtering/ensemble/Ensemble.py
+++ b/src/models/collaborative_filtering/ensemble/Ensemble.py
@@ -1,4 +1,3 @@
-from numpy import int64
 from numpy.typing import NDArray
 from data import Data
 from ..matrix_factorization import ALS, SGD
@@ -41,7 +40,7 @@ def predict(self, u: int, i: int):
             predictions.append(nn_prediction)
         return float(np.mean(predictions))
 
-    def top_n(self, user_index: int, n: int) -> list[int] | NDArray[int64]:
+    def top_n(self, user_index: int, n: int) -> list[int] | NDArray[np.int64]:
         if not self.is_fit:
             raise RuntimeError("Untrained model, invoke fit before predicting")
         sgd_recs = self.sgd_model.top_n(user_index, n)
diff --git a/src/models/collaborative_filtering/matrix_factorization/SGD/SGD.py b/src/models/collaborative_filtering/matrix_factorization/SGD/SGD.py
index ed65d5f..7a22766 100644
--- a/src/models/collaborative_filtering/matrix_factorization/SGD/SGD.py
+++ b/src/models/collaborative_filtering/matrix_factorization/SGD/SGD.py
@@ -3,7 +3,6 @@
 from data import Data
 from ..MF_Base import MF_Base
 from utils import RandomSingleton
-from typing_extensions import Self
 from tqdm import tqdm
@@ -24,7 +23,7 @@ def fit(
         batch_size: int = 8,
         lr_decay_factor: float = 0.5,
         silent=False,
-    ) -> Self:
+    ):
         """
         Mini batch SGD training algorithm
         """
diff --git a/src/models/content_based/Content_Based.py b/src/models/content_based/Content_Based.py
index 36d915a..59c5b45 100644
--- a/src/models/content_based/Content_Based.py
+++ b/src/models/content_based/Content_Based.py
@@ -1,4 +1,3 @@
-from typing_extensions import Self
 import numpy as np
 from numpy.typing import NDArray
 from data import Data
@@ -83,7 +82,7 @@ def _get_movie_vector(self, movie_index: int) -> spmatrix:
         """
         Given a movie index compute the respective tfidf matrix
         """
-        movie_id = self.data.index_to_id(movie_index, "item")
+        movie_id = self.data.item_index_to_id[movie_index]
         movie = self.data.get_movies_from_ids([movie_id])
         movie_genres = movie["genres"].values
diff --git a/src/models/non_personalized/Highest_Rated.py b/src/models/non_personalized/Highest_Rated.py
index 2a7b192..1978f8c 100644
--- a/src/models/non_personalized/Highest_Rated.py
+++ b/src/models/non_personalized/Highest_Rated.py
@@ -18,7 +18,5 @@ def top_n(self, user_index: int, n: int):
         z = sorted(z, key=lambda x: x[1], reverse=True)
         top_n_indices = [x[0] for x in z][:n]
 
-        movie_ids = np.array(
-            [self.data.index_to_id(idx, "item") for idx in top_n_indices]
-        )
+        movie_ids = np.array([self.data.item_index_to_id[idx] for idx in top_n_indices])
         return movie_ids
diff --git a/src/models/non_personalized/Most_Popular.py b/src/models/non_personalized/Most_Popular.py
index ba8db6d..2e797de 100644
--- a/src/models/non_personalized/Most_Popular.py
+++ b/src/models/non_personalized/Most_Popular.py
@@ -28,7 +28,5 @@ def top_n(self, user_index: int, n: int):
         z = sorted(z, key=lambda x: x[1], reverse=True)
         top_n_indices = [x[0] for x in z][:n]
 
-        movie_ids = np.array(
-            [self.data.index_to_id(idx, "item") for idx in top_n_indices]
-        )
+        movie_ids = np.array([self.data.item_index_to_id[idx] for idx in top_n_indices])
         return movie_ids
diff --git a/src/utils/metrics.py b/src/utils/metrics.py
index 4267958..7d9b4ee 100644
--- a/src/utils/metrics.py
+++ b/src/utils/metrics.py
@@ -66,9 +66,6 @@ def normalized_discounted_cumulative_gain(
     relevant_items: list[int] | NDArray[np.int64],
     recommended_items: list[int] | NDArray[np.int64],
 ):
-    """
-    Compute the normalized discounted cumulative gain
-    """
     binary_relevance = [int(idx in relevant_items) for idx in recommended_items]
     ideal_relevance = sorted(binary_relevance, reverse=True)
     return ndcg_score(np.array([ideal_relevance]), np.array([binary_relevance]))
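
The Data.py refactor above drops the id_to_index/index_to_id wrapper methods in favor of direct lookups into the mapping dictionaries (user_id_to_index, item_index_to_id, and friends). A minimal sketch of how those mappings presumably relate; the construction below is invented for illustration, since the bookkeeping in _create_interactions is not part of this patch:

# Hypothetical stand-ins for the dicts Data builds while ingesting ratings.
user_ids = [901, 17, 342]  # made-up MovieLens userIds, in first-seen order
user_id_to_index = {uid: idx for idx, uid in enumerate(user_ids)}  # 901 -> 0, ...
user_index_to_id = {idx: uid for uid, idx in user_id_to_index.items()}

# Call sites change from data.index_to_id(i, "user") to data.user_index_to_id[i];
# the two dicts remain mutual inverses:
assert user_index_to_id[user_id_to_index[17]] == 17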
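
get_weighed_user_ratings pairs each rating with exponential_decay(timestamp, base_time), but that helper lives in utils and its body is not shown in this patch. A sketch of what such a weighting might look like, assuming base_time is the user's most recent rating and an invented 180-day half-life parameter:

import numpy as np
import pandas as pd

def exponential_decay(
    timestamp: pd.Timestamp, base_time: pd.Timestamp, half_life_days: float = 180.0
) -> float:
    # Weight is 1.0 at base_time and halves every half_life_days before it.
    age_days = (base_time - timestamp) / pd.Timedelta(days=1)
    return float(np.exp(-np.log(2.0) * age_days / half_life_days))

timestamps = pd.to_datetime([1_000_000_000, 1_050_000_000, 1_100_000_000], unit="s")
base_time = timestamps.max()  # newest rating anchors the decay
print([round(exponential_decay(ts, base_time), 3) for ts in timestamps])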
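
Since the metrics.py hunk drops the NDCG docstring, a usage note may help: the function scores a ranked recommendation list by binary relevance. A self-contained example mirroring its body (the item indices are made up, and the ndcg_score import from scikit-learn is an assumption, as metrics.py's import block is not shown):

import numpy as np
from sklearn.metrics import ndcg_score  # assumed source of ndcg_score

relevant_items = [10, 42, 7]         # items the user actually interacted with
recommended_items = [42, 3, 10, 99]  # a model's ranked top-n
binary_relevance = [int(idx in relevant_items) for idx in recommended_items]  # [1, 0, 1, 0]
ideal_relevance = sorted(binary_relevance, reverse=True)                      # [1, 1, 0, 0]
# 1.0 only when every hit is ranked ahead of every miss:
print(ndcg_score(np.array([ideal_relevance]), np.array([binary_relevance])))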