Commit

Refactor code and fix imports

owsky committed Dec 18, 2023
1 parent 53ff3f0 commit ac20fdb
Showing 9 changed files with 38 additions and 41 deletions.
Binary file not shown.
4 changes: 2 additions & 2 deletions dataset/preprocess.py
@@ -13,8 +13,8 @@ def unzip_file(zip_file_path, extract_to_path):


# Unzip the ml-latest and ml-latest-small datasets
-# unzip_file(os.path.join(script_directory, "ml-latest-small.zip"), script_directory)
-# unzip_file(os.path.join(script_directory, "ml-latest.zip"), script_directory)
+unzip_file(os.path.join(script_directory, "ml-latest-small.zip"), script_directory)
+unzip_file(os.path.join(script_directory, "ml-latest.zip"), script_directory)

# Helper functions for constructing the input paths
make_path_full = lambda path: os.path.join(script_directory, "ml-latest", path)
55 changes: 31 additions & 24 deletions src/data/Data.py
@@ -82,6 +82,11 @@ def _create_interactions(
self.ratings_test_df = pd.DataFrame()

def process_user(df: DataFrame, kind: Literal["train", "test"]):
"""
Given a dataframe containing a single user's collection of ratings,
iterate through all of them, update the dictionaries and add the data
to the correct lists
"""
nonlocal new_user_index
for _, row in df.iterrows():
movie_id, r = row["movieId"], row["rating"]
@@ -141,6 +146,8 @@ def _load_ratings(self, test_size: float) -> tuple[coo_array, coo_array]:
)
df["timestamp"] = pd.to_datetime(df["timestamp"], unit="s")
df.sort_values(by="timestamp", inplace=True)

+# shape of the interactions matrix (n_unique_users, n_unique_movies)
shape = (df["userId"].nunique(), self.how_many_unique_movie_ids)

self.interactions_train, self.interactions_test = self._create_interactions(
@@ -155,30 +162,12 @@

return self.interactions_train, self.interactions_test

-def id_to_index(self, id: int, kind: Literal["user", "item"]) -> int:
-"""
-Convert an ID to an index of the user-item matrix
-"""
-if kind == "user":
-return self.user_id_to_index[id]
-else:
-return self.item_id_to_index[id]
-
-def index_to_id(self, index: int, kind: Literal["user", "item"]) -> int:
-"""
-Convert an index of the user-item matrix to an ID
-"""
-if kind == "user":
-return self.user_index_to_id[index]
-else:
-return self.item_index_to_id[index]
-
def _load_movies(self) -> None:
"""
-Load the movies information as DataFrame
+Load the movies information as DataFrame alongside the tags
"""
movies_df = pd.read_csv(
self.data_path + "movies.csv", # "movies_filtered.csv",
self.data_path + "movies.csv",
dtype={"movieId": int, "title": str, "genres": str},
)

@@ -233,32 +222,41 @@ def get_movie_from_indices(self, movie_indices: list[int]) -> DataFrame:
Given a list of movie indices, return the corresponding movies' information
"""
return self.get_movies_from_ids(
[self.index_to_id(i, "item") for i in movie_indices]
[self.item_index_to_id[i] for i in movie_indices]
)

def get_user_ratings(
self, user_id: int, dataset: Literal["train", "test"]
-) -> NDArray:
+) -> NDArray[np.float64]:
+"""
+Return the given user's ratings
+"""
user_index = self.user_id_to_index[user_id]
arr = self.interactions_train if dataset == "train" else self.interactions_test
return csr_array(arr.getrow(user_index)).toarray()[0]

-def get_weighed_user_ratings(self, user_id: int):
+def get_weighed_user_ratings(self, user_id: int) -> list[tuple[int, float, float]]:
+"""
+Return the given user's ratings alongside a weight computed from the age of the rating
+"""
user_df = self.ratings_train_df[self.ratings_train_df["userId"] == user_id]
timestamps = user_df["timestamp"]
try:
base_time = timestamps.tail(1).item()
except ValueError:
return []
weights = [exponential_decay(timestamp, base_time) for timestamp in timestamps]
-ratings: list[tuple[int, float]] = list(
+ratings = list(
user_df[["movieId", "rating"]].itertuples(index=False, name=None)
)
return [(tup[0], tup[1], weight) for tup, weight in zip(ratings, weights)]

def get_liked_movies_indices(
self, user_id: int, biased: bool, dataset: Literal["train", "test"]
) -> list[int]:
"""
Return the indices of movies liked by a given user
"""
user_ratings = self.get_user_ratings(user_id, dataset)
nz = user_ratings.nonzero()
if len(user_ratings[nz]) == 0:
Expand All @@ -277,6 +275,9 @@ def get_liked_movies_indices(
return sorted(liked_indices, key=lambda x: user_ratings[x], reverse=True)

def get_user_bias(self, user_id: int):
"""
Compute a given user's rating bias
"""
user_index = self.user_id_to_index[user_id]
user_ratings = self.get_user_ratings(user_id, "train")
if np.count_nonzero(user_ratings) == 0:
@@ -288,6 +289,9 @@
return user_bias

def get_weighed_liked_movie_indices(self, user_id: int, biased: bool):
"""
Return the indices of movies liked by a given user, weighed by the age of the rating
"""
user_ratings = self.get_weighed_user_ratings(user_id)
if len(user_ratings) == 0:
return []
@@ -307,5 +311,8 @@
return user_likes

def get_ratings_count(self, user_id: int):
"""
Compute how many ratings a given user has provided
"""
ratings = self.get_user_ratings(user_id, "train")
return np.count_nonzero(ratings)
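
The get_weighed_user_ratings method above calls an exponential_decay helper whose definition is not shown in this diff. A minimal sketch of what such a helper could look like, assuming a configurable half-life in days (the parameter name and default value are assumptions, not the repository's code):

import pandas as pd

def exponential_decay(timestamp: pd.Timestamp, base_time: pd.Timestamp,
                      half_life_days: float = 180.0) -> float:
    # Hypothetical sketch: halve the weight every half_life_days between the
    # rating and the user's most recent rating (base_time), so newer ratings
    # count more. Since ratings are sorted by timestamp, age_days >= 0 and
    # the weight falls in (0, 1].
    age_days = (base_time - timestamp).days
    return 0.5 ** (age_days / half_life_days)
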
3 changes: 1 addition & 2 deletions src/models/collaborative_filtering/ensemble/Ensemble.py
@@ -1,4 +1,3 @@
-from numpy import int64
from numpy.typing import NDArray
from data import Data
from ..matrix_factorization import ALS, SGD
@@ -41,7 +40,7 @@ def predict(self, u: int, i: int):
predictions.append(nn_prediction)
return float(np.mean(predictions))

-def top_n(self, user_index: int, n: int) -> list[int] | NDArray[int64]:
+def top_n(self, user_index: int, n: int) -> list[int] | NDArray[np.int64]:
if not self.is_fit:
raise RuntimeError("Untrained model, invoke fit before predicting")
sgd_recs = self.sgd_model.top_n(user_index, n)
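
The body of Ensemble.top_n is truncated in this diff, so it never shows how the per-model recommendation lists are merged. Purely as an illustration of one plausible strategy, and not the repository's actual logic, a Borda-style rank aggregation could look like this:

def merge_top_n(rec_lists: list[list[int]], n: int) -> list[int]:
    # Hypothetical merge, not the repo's implementation: sum each item's rank
    # across the models' top-n lists, penalizing absence with rank n, and
    # keep the n items with the lowest total rank.
    candidates = set().union(*rec_lists)
    total_rank = {
        item: sum(recs.index(item) if item in recs else n for recs in rec_lists)
        for item in candidates
    }
    return sorted(candidates, key=lambda item: total_rank[item])[:n]

For example, merge_top_n([sgd_recs, als_recs], n) would favor items ranked highly by both models.
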
@@ -3,7 +3,6 @@
from data import Data
from ..MF_Base import MF_Base
from utils import RandomSingleton
-from typing_extensions import Self
from tqdm import tqdm


@@ -24,7 +23,7 @@ def fit(
batch_size: int = 8,
lr_decay_factor: float = 0.5,
silent=False,
-) -> Self:
+):
"""
Mini batch SGD training algorithm
"""
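
The fit signature above advertises mini-batch SGD with a decaying learning rate, but its body is truncated. As a rough sketch of the technique only, assuming latent-factor matrices P (n_users x k) and Q (n_items x k) and a regularization term — all names and defaults here are illustrative, not the repository's implementation:

import numpy as np

def sgd_epoch(P, Q, triples, lr=0.005, reg=0.02, batch_size=8):
    # One epoch of mini-batch SGD for matrix factorization: for each observed
    # rating r of user u on item i, nudge both latent factors along the
    # gradient of the regularized squared error.
    np.random.shuffle(triples)
    for start in range(0, len(triples), batch_size):
        for u, i, r in triples[start : start + batch_size]:
            err = r - P[u] @ Q[i]
            # Tuple assignment evaluates both right-hand sides with the old
            # values, so the two updates do not interfere.
            P[u], Q[i] = (
                P[u] + lr * (err * Q[i] - reg * P[u]),
                Q[i] + lr * (err * P[u] - reg * Q[i]),
            )
    # A caller would typically multiply lr by lr_decay_factor between epochs.
    return P, Q
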
3 changes: 1 addition & 2 deletions src/models/content_based/Content_Based.py
@@ -1,4 +1,3 @@
-from typing_extensions import Self
import numpy as np
from numpy.typing import NDArray
from data import Data
@@ -83,7 +82,7 @@ def _get_movie_vector(self, movie_index: int) -> spmatrix:
"""
Given a movie index compute the respective tfidf matrix
"""
movie_id = self.data.index_to_id(movie_index, "item")
movie_id = self.data.item_index_to_id[movie_index]
movie = self.data.get_movies_from_ids([movie_id])

movie_genres = movie["genres"].values
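
_get_movie_vector returns a row of a TF-IDF matrix built from movie metadata. A self-contained sketch of that idea, assuming the corpus is one concatenated genre/tag string per movie (the exact document format and vectorizer settings are assumptions):

from sklearn.feature_extraction.text import TfidfVectorizer

docs = [
    "Adventure Animation Children",  # one document per movie (hypothetical)
    "Comedy Romance",
    "Action Thriller",
]
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(docs)  # sparse (n_movies, n_terms) matrix
movie_vector = tfidf[[0]]               # a single movie's row, still sparse
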
4 changes: 1 addition & 3 deletions src/models/non_personalized/Highest_Rated.py
@@ -18,7 +18,5 @@ def top_n(self, user_index: int, n: int):
z = sorted(z, key=lambda x: x[1], reverse=True)
top_n_indices = [x[0] for x in z][:n]

-movie_ids = np.array(
-[self.data.index_to_id(idx, "item") for idx in top_n_indices]
-)
+movie_ids = np.array([self.data.item_index_to_id[idx] for idx in top_n_indices])
return movie_ids
4 changes: 1 addition & 3 deletions src/models/non_personalized/Most_Popular.py
@@ -28,7 +28,5 @@ def top_n(self, user_index: int, n: int):
z = sorted(z, key=lambda x: x[1], reverse=True)
top_n_indices = [x[0] for x in z][:n]

-movie_ids = np.array(
-[self.data.index_to_id(idx, "item") for idx in top_n_indices]
-)
+movie_ids = np.array([self.data.item_index_to_id[idx] for idx in top_n_indices])
return movie_ids
3 changes: 0 additions & 3 deletions src/utils/metrics.py
@@ -66,9 +66,6 @@ def normalized_discounted_cumulative_gain(
relevant_items: list[int] | NDArray[np.int64],
recommended_items: list[int] | NDArray[np.int64],
):
"""
Compute the normalized discounted cumulative gain
"""
binary_relevance = [int(idx in relevant_items) for idx in recommended_items]
ideal_relevance = sorted(binary_relevance, reverse=True)
return ndcg_score(np.array([ideal_relevance]), np.array([binary_relevance]))
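
A small worked example of the binary-relevance computation above, with hypothetical item ids: two of the four recommendations are relevant, and the score is below 1.0 because the second relevant item was ranked third instead of second.

import numpy as np
from sklearn.metrics import ndcg_score

relevant_items = [2, 5, 9]        # hypothetical ground truth
recommended_items = [5, 1, 2, 7]  # hypothetical ranked output
binary_relevance = [int(idx in relevant_items) for idx in recommended_items]  # [1, 0, 1, 0]
ideal_relevance = sorted(binary_relevance, reverse=True)                      # [1, 1, 0, 0]
print(ndcg_score(np.array([ideal_relevance]), np.array([binary_relevance]))) # ~0.92
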
