Commit

> add helper functions
Selefth committed Apr 27, 2024
1 parent 2876d62 commit 1eb92ca
Showing 4 changed files with 570 additions and 0 deletions.
116 changes: 116 additions & 0 deletions src/helper_functions/data_formatting.py
@@ -0,0 +1,116 @@

import pandas as pd
import numpy as np
import scipy.sparse as sp
import bottleneck as bn

def get_sparse_matrix(data, user_col, item_col, rating_col):
    """
    Creates a sparse user-item interaction matrix from a given pandas DataFrame.
    -------
    Args:
        data (pandas.DataFrame): Dataset capturing user-item interactions.
        user_col (str): Column name representing user identifiers; must be of categorical dtype.
        item_col (str): Column name representing item identifiers; must be of categorical dtype.
        rating_col (str): Column name denoting the rating or interaction strength.
    -------
    Returns:
        A sparse matrix in Compressed Sparse Row (CSR) format representing the user feedback matrix.
    """
    # .cat.codes maps each categorical value to a zero-based integer index
    row_indices = data[user_col].cat.codes
    col_indices = data[item_col].cat.codes
    ratings = data[rating_col]

    R = sp.csr_matrix((ratings, (row_indices, col_indices)), dtype=np.float64)

    return R
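
# Usage sketch (hypothetical data; assumes the ID columns have already been
# cast to pandas "category" dtype, which get_sparse_matrix relies on):
#
#   interactions = pd.DataFrame({"user": ["u1", "u1", "u2"],
#                                "item": ["i1", "i2", "i1"],
#                                "rating": [1, 1, 1]})
#   interactions["user"] = interactions["user"].astype("category")
#   interactions["item"] = interactions["item"].astype("category")
#   R = get_sparse_matrix(interactions, "user", "item", "rating")  # 2 x 2 CSR matrix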

def get_raw_item_ids(df_pp, item_ids_inner):
    """
    Convert inner item IDs to their original raw form using the DataFramePreprocessor's item_id_mapping.
    -------
    Args:
        df_pp (recpack.preprocessing.preprocessors.DataFramePreprocessor): A DataFramePreprocessor object from the RecPack library.
        item_ids_inner (numpy.ndarray): A NumPy array containing the inner item IDs.
    -------
    Returns:
        A NumPy array of raw item IDs.
    """
    item_id_mapping_df = df_pp.item_id_mapping
    iid_raw = item_id_mapping_df.columns[0]  # the first column holds the raw item IDs

    # note: .isin does not preserve the order of item_ids_inner
    item_ids_raw = item_id_mapping_df[item_id_mapping_df["iid"].isin(item_ids_inner)][iid_raw].values

    return item_ids_raw

def get_raw_user_ids(df_pp, user_ids_inner):
    """
    Convert inner user IDs to their original raw form using the DataFramePreprocessor's user_id_mapping.
    -------
    Args:
        df_pp (recpack.preprocessing.preprocessors.DataFramePreprocessor): A DataFramePreprocessor object from the RecPack library.
        user_ids_inner (numpy.ndarray): A NumPy array containing the inner user IDs.
    -------
    Returns:
        A NumPy array of raw user IDs.
    """
    user_id_mapping_df = df_pp.user_id_mapping
    uid_raw = user_id_mapping_df.columns[0]  # the first column holds the raw user IDs

    # note: .isin does not preserve the order of user_ids_inner
    user_ids_raw = user_id_mapping_df[user_id_mapping_df["uid"].isin(user_ids_inner)][uid_raw].values

    return user_ids_raw

def get_inner_item_ids(df_pp, item_ids_raw):
    """
    Convert raw item IDs to their inner form using the DataFramePreprocessor's item_id_mapping.
    -------
    Args:
        df_pp (recpack.preprocessing.preprocessors.DataFramePreprocessor): A DataFramePreprocessor object from the RecPack library.
        item_ids_raw (numpy.ndarray): A NumPy array containing the raw item IDs.
    -------
    Returns:
        A NumPy array of inner item IDs.
    """
    item_id_mapping_df = df_pp.item_id_mapping
    iid_raw = item_id_mapping_df.columns[0]  # the first column holds the raw item IDs

    # note: .isin does not preserve the order of item_ids_raw
    item_ids_inner = item_id_mapping_df[item_id_mapping_df[iid_raw].isin(item_ids_raw)]["iid"].values

    return item_ids_inner

def get_inner_user_ids(df_pp, user_ids_raw):
    """
    Convert raw user IDs to their inner form using the DataFramePreprocessor's user_id_mapping.
    -------
    Args:
        df_pp (recpack.preprocessing.preprocessors.DataFramePreprocessor): A DataFramePreprocessor object from the RecPack library.
        user_ids_raw (numpy.ndarray): A NumPy array containing the raw user IDs.
    -------
    Returns:
        A NumPy array of inner user IDs.
    """
    user_id_mapping_df = df_pp.user_id_mapping
    uid_raw = user_id_mapping_df.columns[0]  # the first column holds the raw user IDs

    # note: .isin does not preserve the order of user_ids_raw
    user_ids_inner = user_id_mapping_df[user_id_mapping_df[uid_raw].isin(user_ids_raw)]["uid"].values

    return user_ids_inner
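
# Round-trip sketch for the four converters above (assumes `df_pp` is a fitted
# recpack DataFramePreprocessor; the IDs below are illustrative):
#
#   inner_ids = get_inner_item_ids(df_pp, np.array(["i1", "i2"]))
#   raw_ids = get_raw_item_ids(df_pp, inner_ids)
#   # raw_ids contains "i1" and "i2", though not necessarily in the input order;
#   # get_inner_user_ids / get_raw_user_ids behave the same way for users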

def get_topn_indices(R_hat, n):
    """
    Helper function to get the column indices of the top-n items in each row of R_hat,
    sorted in decreasing order of predicted relevance.
    """
    users = R_hat.shape[0]

    # find the indices that partition each row so that its first n elements are the largest n elements
    idx_topn_part = bn.argpartition(-R_hat, n, axis=1)

    # keep only the largest n elements of each row of R_hat
    topn_part = R_hat[np.arange(users)[:, np.newaxis], idx_topn_part[:, :n]]

    # find the indices of the sorted top-n predicted relevance scores in R_hat
    idx_part = np.argsort(-topn_part, axis=1)
    idx_topn = idx_topn_part[np.arange(users)[:, np.newaxis], idx_part]

    return idx_topn
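
# Minimal sketch of get_topn_indices on a toy score matrix:
#
#   R_hat = np.array([[0.1, 0.9, 0.5],
#                     [0.7, 0.2, 0.4]])
#   get_topn_indices(R_hat, 2)
#   # -> array([[1, 2], [0, 2]]): per-row column indices of the two highest
#   #    scores, sorted by decreasing score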
68 changes: 68 additions & 0 deletions src/helper_functions/metrics_accuracy.py
@@ -0,0 +1,68 @@

import numpy as np
import bottleneck as bn

from src.helper_functions.data_formatting import get_topn_indices

def recall_at_n(R_hat, R_held, n):
    """
    Computes the Recall at the given value of n.
    This metric does not take the rank of recommended items into account.
    Only for usage with binary feedback data.
    -------
    Args:
        R_hat (numpy.ndarray): The estimated user feedback matrix with -np.inf marking entries used as model input.
        R_held (sparse matrix): The held-out user feedback matrix.
        n (int): The number of items to consider in the ranking.
    -------
    Returns:
        recall_mean (float): The mean Recall@N across users.
        recall_std (float): The standard deviation of Recall@N across users.
    """
    users = R_hat.shape[0]

    # find the indices that partition each row so that its first n elements are the largest n elements
    idx = bn.argpartition(-R_hat, n, axis=1)

    # binary matrix marking each user's top-n recommendations
    R_hat_binary = np.zeros_like(R_hat, dtype=bool)
    R_hat_binary[np.arange(users)[:, np.newaxis], idx[:, :n]] = True

    R_held_binary = (R_held > 0).toarray()

    # recall@N for each user; assumes every user has at least one held-out interaction
    recall = (np.logical_and(R_held_binary, R_hat_binary).sum(axis=1)).astype(np.float32) / np.minimum(n, R_held_binary.sum(axis=1))

    recall_mean = np.mean(recall)
    recall_std = np.std(recall)

    return recall_mean, recall_std
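
# Toy sketch of recall_at_n (hypothetical single-user data; R_held must be a
# scipy sparse matrix and every user needs at least one held-out item):
#
#   R_hat = np.array([[0.9, 0.8, 0.1, 0.2]])
#   R_held = sp.csr_matrix(np.array([[1, 0, 0, 1]]))  # with scipy.sparse as sp
#   recall_at_n(R_hat, R_held, 2)
#   # one of the two held-out items (item 0) is in the top-2,
#   # so recall = 1 / min(2, 2) = 0.5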

def tndcg_at_n(R_hat, R_held, n):
    """
    Computes the truncated Normalized Discounted Cumulative Gain (NDCG) at the given value of n.
    A score of 1 is achieved when dcg = idcg (ideal dcg).
    Only for usage with binary feedback data.
    -------
    Args:
        R_hat (numpy.ndarray): The estimated user feedback matrix with -np.inf marking entries used as model input.
        R_held (sparse matrix): The held-out user feedback matrix.
        n (int): The number of items to consider in the ranking.
    -------
    Returns:
        tndcg_mean (float): The mean truncated NDCG@N across users.
        tndcg_std (float): The standard deviation of truncated NDCG@N across users.
    """
    users = R_hat.shape[0]

    # find the indices of the sorted top-n predicted relevance scores in R_hat
    idx_topn = get_topn_indices(R_hat, n)

    # positional discount template: 1 / log2(rank + 1) for ranks 1..n
    tp = 1. / np.log2(np.arange(2, n + 2))
    dcg = (R_held[np.arange(users)[:, np.newaxis], idx_topn].toarray() * tp).sum(axis=1)
    # ideal DCG: a user with i held-out items can collect at most the first min(i, n) discounts
    idcg = np.array([(tp[:min(i, n)]).sum() for i in R_held.getnnz(axis=1)])
    tndcg = dcg / idcg

    tndcg_mean = np.mean(tndcg)
    tndcg_std = np.std(tndcg)

    return tndcg_mean, tndcg_std
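
# Toy sketch of tndcg_at_n on the same kind of input:
#
#   R_hat = np.array([[0.9, 0.8, 0.1, 0.2]])
#   R_held = sp.csr_matrix(np.array([[1, 0, 0, 1]]))  # with scipy.sparse as sp
#   tndcg_at_n(R_hat, R_held, 2)
#   # the top-2 ranking is [item 0, item 1] and only item 0 is held out, so
#   # dcg = 1/log2(2) = 1.0, idcg = 1/log2(2) + 1/log2(3) ≈ 1.631,
#   # and tndcg ≈ 0.613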
105 changes: 105 additions & 0 deletions src/helper_functions/metrics_coverage.py
@@ -0,0 +1,105 @@

import numpy as np

from src.helper_functions.data_formatting import get_topn_indices

def coverage(R_hat, item_ids, n):
    """
    Coverage quantifies the proportion of items that are recommended out of the specified item set.
    It indicates the system's ability to diversify its recommendations.
    A score close to 1 denotes higher diversity, suggesting that the system effectively recommends a wide range of items to users.
    -------
    Args:
        R_hat (numpy.ndarray): The estimated user feedback matrix with -np.inf marking entries used as model input.
        item_ids (list): A list of item IDs for which to compute the coverage.
        n (int): The number of items to consider in the ranking.
    -------
    Returns:
        coverage (float): The total coverage score.
    """
    # find the indices of the sorted top-n predicted relevance scores in R_hat
    idx_topn = get_topn_indices(R_hat, n)
    flat_topn = idx_topn.flatten()

    # an item is covered if it appears in at least one user's top-n list
    covered = np.isin(np.asarray(item_ids), flat_topn)
    coverage = covered.sum() / len(item_ids)

    return coverage
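
# Toy sketch of coverage with n = 1 (hypothetical scores):
#
#   R_hat = np.array([[0.9, 0.1, 0.0],
#                     [0.8, 0.2, 0.1]])
#   coverage(R_hat, [0, 1, 2], 1)
#   # both users' top-1 list is item 0, so 1 of the 3 items is ever
#   # recommended and coverage = 1/3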

def u_parity_at_n(R_hat, protected_users, items_dict, n):
    """
    Computes the u-parity@N score.
    The User-coverage Parity metric measures the average disparity between the proportions of protected
    and non-protected users receiving recommendations from each category.
    -------
    Args:
        R_hat (numpy.ndarray): The estimated user feedback matrix with -np.inf marking entries used as model input.
        protected_users (numpy.ndarray): A binary array where 1 indicates a user is part of the protected group and 0 otherwise.
        items_dict (dict): A dictionary mapping item categories to their corresponding item IDs.
        n (int): The number of items to consider in the ranking.
    -------
    Returns:
        u_parity (float): The u-parity@N score.
        std_dev (float): The standard deviation of the differences across item categories.
    """
    # get top-n indices for all users
    idx_topn = get_topn_indices(R_hat, n)

    # compute the number of protected and non-protected users once; both groups must be non-empty
    num_protected_users = sum(protected_users)
    num_non_protected_users = len(protected_users) - num_protected_users

    differences = []
    for item_ids in items_dict.values():
        count_protected_users = 0
        count_non_protected_users = 0
        for i, is_protected in enumerate(protected_users):
            # does user i receive at least one recommendation from this category?
            if np.isin(idx_topn[i], item_ids).any():
                if is_protected:
                    count_protected_users += 1
                else:
                    count_non_protected_users += 1

        # normalize the number of covered users by the total number of users in each group
        prop_protected_users = count_protected_users / num_protected_users
        prop_non_protected_users = count_non_protected_users / num_non_protected_users

        differences.append(abs(prop_non_protected_users - prop_protected_users))

    u_parity = np.mean(differences)
    std_dev = np.std(differences)

    return u_parity, std_dev
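
# Toy sketch of u_parity_at_n (hypothetical groups and categories; both
# groups must be non-empty):
#
#   R_hat = np.array([[0.9, 0.1],
#                     [0.1, 0.9]])
#   protected = np.array([1, 0])        # user 0 is protected, user 1 is not
#   categories = {"A": [0], "B": [1]}
#   u_parity_at_n(R_hat, protected, categories, 1)
#   # category A reaches 100% of protected vs 0% of non-protected users (and
#   # vice versa for B), so the mean absolute gap is 1.0 with std 0.0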

def apcr_at_n(R_hat, items_dict, n):
    """
    Computes the apcr@N score.
    This metric measures the average rate at which providers are covered in the recommendation lists for a set of users.
    -------
    Args:
        R_hat (numpy.ndarray): The estimated user feedback matrix with -np.inf marking entries used as model input.
        items_dict (dict): A dictionary mapping providers to their corresponding item IDs.
        n (int): The number of items to consider in the ranking.
    -------
    Returns:
        apcr (float): The apcr@N score.
    -------
    Reference:
        Weiwen Liu and Robin Burke.
        Personalizing Fairness-aware Re-ranking. FATREC Workshop on Responsible Recommendation, 2018.
    """
    # reverse the items_dict to map item IDs to provider IDs
    # (if an item belongs to several providers, only the last one is kept)
    providers_dict = {item: provider for provider, items in items_dict.items() for item in items}

    idx_topn = get_topn_indices(R_hat, n)

    # count the distinct providers represented in each user's top-n list,
    # ignoring items that do not belong to any provider
    count_providers = []
    for user_row in idx_topn:
        user_providers = {providers_dict[item] for item in user_row if item in providers_dict}
        count_providers.append(len(user_providers))

    apcr = np.mean(count_providers) / len(items_dict)

    return apcr
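
# Toy sketch of apcr_at_n with n = 1 (hypothetical provider catalog):
#
#   R_hat = np.array([[0.9, 0.1],
#                     [0.1, 0.9]])
#   providers = {"p1": [0], "p2": [1]}
#   apcr_at_n(R_hat, providers, 1)
#   # each user's top-1 list covers exactly 1 of the 2 providers,
#   # so apcr = mean([1, 1]) / 2 = 0.5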
