Commit

Refactor code and fix imports

owsky committed Dec 18, 2023
1 parent 53ff3f0 commit ac20fdb
Showing 9 changed files with 38 additions and 41 deletions.
Binary file not shown.
4 changes: 2 additions & 2 deletions dataset/preprocess.py
@@ -13,8 +13,8 @@ def unzip_file(zip_file_path, extract_to_path):


# Unzip the ml-latest and ml-latest-small datasets
-# unzip_file(os.path.join(script_directory, "ml-latest-small.zip"), script_directory)
-# unzip_file(os.path.join(script_directory, "ml-latest.zip"), script_directory)
+unzip_file(os.path.join(script_directory, "ml-latest-small.zip"), script_directory)
+unzip_file(os.path.join(script_directory, "ml-latest.zip"), script_directory)

# Helper functions for constructing the input paths
make_path_full = lambda path: os.path.join(script_directory, "ml-latest", path)
55 changes: 31 additions & 24 deletions src/data/Data.py
@@ -82,6 +82,11 @@ def _create_interactions(
self.ratings_test_df = pd.DataFrame()

def process_user(df: DataFrame, kind: Literal["train", "test"]):
"""
Given a dataframe containing a single user's collection of ratings,
iterate through all of them, update the dictionaries and add the data
to the correct lists
"""
nonlocal new_user_index
for _, row in df.iterrows():
movie_id, r = row["movieId"], row["rating"]
@@ -141,6 +146,8 @@ def _load_ratings(self, test_size: float) -> tuple[coo_array, coo_array]:
)
df["timestamp"] = pd.to_datetime(df["timestamp"], unit="s")
df.sort_values(by="timestamp", inplace=True)

+# shape of the interactions matrix (n_unique_users, n_unique_movies)
shape = (df["userId"].nunique(), self.how_many_unique_movie_ids)

self.interactions_train, self.interactions_test = self._create_interactions(
@@ -155,30 +162,12 @@

return self.interactions_train, self.interactions_test

-def id_to_index(self, id: int, kind: Literal["user", "item"]) -> int:
-"""
-Convert an ID to an index of the user-item matrix
-"""
-if kind == "user":
-return self.user_id_to_index[id]
-else:
-return self.item_id_to_index[id]
-
-def index_to_id(self, index: int, kind: Literal["user", "item"]) -> int:
-"""
-Convert an index of the user-item matrix to an ID
-"""
-if kind == "user":
-return self.user_index_to_id[index]
-else:
-return self.item_index_to_id[index]
-
def _load_movies(self) -> None:
"""
-Load the movies information as DataFrame
+Load the movies information as DataFrame alongside the tags
"""
movies_df = pd.read_csv(
self.data_path + "movies.csv", # "movies_filtered.csv",
self.data_path + "movies.csv",
dtype={"movieId": int, "title": str, "genres": str},
)

@@ -233,32 +222,41 @@ def get_movie_from_indices(self, movie_indices: list[int]) -> DataFrame:
Given a list of movie indices, return the corresponding movies' information
"""
return self.get_movies_from_ids(
[self.index_to_id(i, "item") for i in movie_indices]
[self.item_index_to_id[i] for i in movie_indices]
)

def get_user_ratings(
self, user_id: int, dataset: Literal["train", "test"]
-) -> NDArray:
+) -> NDArray[np.float64]:
+"""
+Return the given user's ratings
+"""
user_index = self.user_id_to_index[user_id]
arr = self.interactions_train if dataset == "train" else self.interactions_test
return csr_array(arr.getrow(user_index)).toarray()[0]

-def get_weighed_user_ratings(self, user_id: int):
+def get_weighed_user_ratings(self, user_id: int) -> list[tuple[int, float, float]]:
+"""
+Return the given user's ratings alongside a weight computed from the age of the rating
+"""
user_df = self.ratings_train_df[self.ratings_train_df["userId"] == user_id]
timestamps = user_df["timestamp"]
try:
base_time = timestamps.tail(1).item()
except ValueError:
return []
weights = [exponential_decay(timestamp, base_time) for timestamp in timestamps]
-ratings: list[tuple[int, float]] = list(
+ratings = list(
user_df[["movieId", "rating"]].itertuples(index=False, name=None)
)
return [(tup[0], tup[1], weight) for tup, weight in zip(ratings, weights)]

def get_liked_movies_indices(
self, user_id: int, biased: bool, dataset: Literal["train", "test"]
) -> list[int]:
"""
Return the indices of movies liked by a given user
"""
user_ratings = self.get_user_ratings(user_id, dataset)
nz = user_ratings.nonzero()
if len(user_ratings[nz]) == 0:
Expand All @@ -277,6 +275,9 @@ def get_liked_movies_indices(
return sorted(liked_indices, key=lambda x: user_ratings[x], reverse=True)

def get_user_bias(self, user_id: int):
"""
Compute a given user's rating bias
"""
user_index = self.user_id_to_index[user_id]
user_ratings = self.get_user_ratings(user_id, "train")
if np.count_nonzero(user_ratings) == 0:
@@ -288,6 +289,9 @@
return user_bias

def get_weighed_liked_movie_indices(self, user_id: int, biased: bool):
"""
Return the indices of movies liked by a given user, weighed by the age of the rating
"""
user_ratings = self.get_weighed_user_ratings(user_id)
if len(user_ratings) == 0:
return []
@@ -307,5 +311,8 @@
return user_likes

def get_ratings_count(self, user_id: int):
"""
Compute how many ratings a given user has provided
"""
ratings = self.get_user_ratings(user_id, "train")
return np.count_nonzero(ratings)
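
The get_weighed_user_ratings method above calls an exponential_decay helper whose definition is not shown in this diff. A minimal sketch of what such a helper could look like, assuming a configurable half-life in days (the parameter name and default value are assumptions, not the repository's code):

import pandas as pd

def exponential_decay(timestamp: pd.Timestamp, base_time: pd.Timestamp,
                      half_life_days: float = 180.0) -> float:
    # Hypothetical sketch: halve the weight every half_life_days between the
    # rating and the user's most recent rating (base_time), so newer ratings
    # count more. Since ratings are sorted by timestamp, age_days >= 0 and
    # the weight falls in (0, 1].
    age_days = (base_time - timestamp).days
    return 0.5 ** (age_days / half_life_days)
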
3 changes: 1 addition & 2 deletions src/models/collaborative_filtering/ensemble/Ensemble.py
@@ -1,4 +1,3 @@
-from numpy import int64
from numpy.typing import NDArray
from data import Data
from ..matrix_factorization import ALS, SGD
@@ -41,7 +40,7 @@ def predict(self, u: int, i: int):
predictions.append(nn_prediction)
return float(np.mean(predictions))

-def top_n(self, user_index: int, n: int) -> list[int] | NDArray[int64]:
+def top_n(self, user_index: int, n: int) -> list[int] | NDArray[np.int64]:
if not self.is_fit:
raise RuntimeError("Untrained model, invoke fit before predicting")
sgd_recs = self.sgd_model.top_n(user_index, n)
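
The body of Ensemble.top_n is truncated in this diff, so it never shows how the per-model recommendation lists are merged. Purely as an illustration of one plausible strategy, and not the repository's actual logic, a Borda-style rank aggregation could look like this:

def merge_top_n(rec_lists: list[list[int]], n: int) -> list[int]:
    # Hypothetical merge, not the repo's implementation: sum each item's rank
    # across the models' top-n lists, penalizing absence with rank n, and
    # keep the n items with the lowest total rank.
    candidates = set().union(*rec_lists)
    total_rank = {
        item: sum(recs.index(item) if item in recs else n for recs in rec_lists)
        for item in candidates
    }
    return sorted(candidates, key=lambda item: total_rank[item])[:n]

For example, merge_top_n([sgd_recs, als_recs], n) would favor items ranked highly by both models.
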
@@ -3,7 +3,6 @@
from data import Data
from ..MF_Base import MF_Base
from utils import RandomSingleton
-from typing_extensions import Self
from tqdm import tqdm


@@ -24,7 +23,7 @@ def fit(
batch_size: int = 8,
lr_decay_factor: float = 0.5,
silent=False,
-) -> Self:
+):
"""
Mini batch SGD training algorithm
"""
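
The fit signature above advertises mini-batch SGD with a decaying learning rate, but its body is truncated. As a rough sketch of the technique only, assuming latent-factor matrices P (n_users x k) and Q (n_items x k) and a regularization term — all names and defaults here are illustrative, not the repository's implementation:

import numpy as np

def sgd_epoch(P, Q, triples, lr=0.005, reg=0.02, batch_size=8):
    # One epoch of mini-batch SGD for matrix factorization: for each observed
    # rating r of user u on item i, nudge both latent factors along the
    # gradient of the regularized squared error.
    np.random.shuffle(triples)
    for start in range(0, len(triples), batch_size):
        for u, i, r in triples[start : start + batch_size]:
            err = r - P[u] @ Q[i]
            # Tuple assignment evaluates both right-hand sides with the old
            # values, so the two updates do not interfere.
            P[u], Q[i] = (
                P[u] + lr * (err * Q[i] - reg * P[u]),
                Q[i] + lr * (err * P[u] - reg * Q[i]),
            )
    # A caller would typically multiply lr by lr_decay_factor between epochs.
    return P, Q
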
3 changes: 1 addition & 2 deletions src/models/content_based/Content_Based.py
@@ -1,4 +1,3 @@
-from typing_extensions import Self
import numpy as np
from numpy.typing import NDArray
from data import Data
@@ -83,7 +82,7 @@ def _get_movie_vector(self, movie_index: int) -> spmatrix:
"""
Given a movie index compute the respective tfidf matrix
"""
movie_id = self.data.index_to_id(movie_index, "item")
movie_id = self.data.item_index_to_id[movie_index]
movie = self.data.get_movies_from_ids([movie_id])

movie_genres = movie["genres"].values
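
_get_movie_vector returns a row of a TF-IDF matrix built from movie metadata. A self-contained sketch of that idea, assuming the corpus is one concatenated genre/tag string per movie (the exact document format and vectorizer settings are assumptions):

from sklearn.feature_extraction.text import TfidfVectorizer

docs = [
    "Adventure Animation Children",  # one document per movie (hypothetical)
    "Comedy Romance",
    "Action Thriller",
]
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(docs)  # sparse (n_movies, n_terms) matrix
movie_vector = tfidf[[0]]               # a single movie's row, still sparse
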
4 changes: 1 addition & 3 deletions src/models/non_personalized/Highest_Rated.py
@@ -18,7 +18,5 @@ def top_n(self, user_index: int, n: int):
z = sorted(z, key=lambda x: x[1], reverse=True)
top_n_indices = [x[0] for x in z][:n]

-movie_ids = np.array(
-[self.data.index_to_id(idx, "item") for idx in top_n_indices]
-)
+movie_ids = np.array([self.data.item_index_to_id[idx] for idx in top_n_indices])
return movie_ids
4 changes: 1 addition & 3 deletions src/models/non_personalized/Most_Popular.py
@@ -28,7 +28,5 @@ def top_n(self, user_index: int, n: int):
z = sorted(z, key=lambda x: x[1], reverse=True)
top_n_indices = [x[0] for x in z][:n]

-movie_ids = np.array(
-[self.data.index_to_id(idx, "item") for idx in top_n_indices]
-)
+movie_ids = np.array([self.data.item_index_to_id[idx] for idx in top_n_indices])
return movie_ids
3 changes: 0 additions & 3 deletions src/utils/metrics.py
@@ -66,9 +66,6 @@ def normalized_discounted_cumulative_gain(
relevant_items: list[int] | NDArray[np.int64],
recommended_items: list[int] | NDArray[np.int64],
):
"""
Compute the normalized discounted cumulative gain
"""
binary_relevance = [int(idx in relevant_items) for idx in recommended_items]
ideal_relevance = sorted(binary_relevance, reverse=True)
return ndcg_score(np.array([ideal_relevance]), np.array([binary_relevance]))
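
A small worked example of the binary-relevance computation above, with hypothetical item ids: two of the four recommendations are relevant, and the score is below 1.0 because the second relevant item was ranked third instead of second.

import numpy as np
from sklearn.metrics import ndcg_score

relevant_items = [2, 5, 9]        # hypothetical ground truth
recommended_items = [5, 1, 2, 7]  # hypothetical ranked output
binary_relevance = [int(idx in relevant_items) for idx in recommended_items]  # [1, 0, 1, 0]
ideal_relevance = sorted(binary_relevance, reverse=True)                      # [1, 1, 0, 0]
print(ndcg_score(np.array([ideal_relevance]), np.array([binary_relevance]))) # ~0.92
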
