Commit

> add helper functions
Selefth committed Apr 27, 2024
1 parent 2876d62 commit 1eb92ca
Showing 4 changed files with 570 additions and 0 deletions.
116 changes: 116 additions & 0 deletions src/helper_functions/data_formatting.py
@@ -0,0 +1,116 @@

import pandas as pd
import numpy as np
import scipy.sparse as sp
import bottleneck as bn

def get_sparse_matrix(data, user_col, item_col, rating_col):
    """
    Creates a sparse user-item interaction matrix from a given pandas DataFrame.
    -------
    Args:
        data (pandas.DataFrame): Dataset capturing user-item interactions.
        user_col (str): Column name representing user identifiers; must be of categorical dtype.
        item_col (str): Column name representing item identifiers; must be of categorical dtype.
        rating_col (str): Column name denoting the rating or interaction strength.
    -------
    Returns:
        A sparse matrix in Compressed Sparse Row (CSR) format representing the user feedback matrix.
    """
    # .cat.codes maps each categorical value to a zero-based integer index
    row_indices = data[user_col].cat.codes
    col_indices = data[item_col].cat.codes
    ratings = data[rating_col]

    R = sp.csr_matrix((ratings, (row_indices, col_indices)), dtype=np.float64)

    return R
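
# Usage sketch (hypothetical data; assumes the ID columns have already been
# cast to pandas "category" dtype, which get_sparse_matrix relies on):
#
#   interactions = pd.DataFrame({"user": ["u1", "u1", "u2"],
#                                "item": ["i1", "i2", "i1"],
#                                "rating": [1, 1, 1]})
#   interactions["user"] = interactions["user"].astype("category")
#   interactions["item"] = interactions["item"].astype("category")
#   R = get_sparse_matrix(interactions, "user", "item", "rating")  # 2 x 2 CSR matrix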

def get_raw_item_ids(df_pp, item_ids_inner):
    """
    Convert inner item IDs to their original raw form using the DataFramePreprocessor's item_id_mapping.
    -------
    Args:
        df_pp (recpack.preprocessing.preprocessors.DataFramePreprocessor): A DataFramePreprocessor object from the RecPack library.
        item_ids_inner (numpy.ndarray): A NumPy array containing the inner item IDs.
    -------
    Returns:
        A NumPy array of raw item IDs.
    """
    item_id_mapping_df = df_pp.item_id_mapping
    iid_raw = item_id_mapping_df.columns[0]  # the first column holds the raw item IDs

    # note: .isin does not preserve the order of item_ids_inner
    item_ids_raw = item_id_mapping_df[item_id_mapping_df["iid"].isin(item_ids_inner)][iid_raw].values

    return item_ids_raw

def get_raw_user_ids(df_pp, user_ids_inner):
    """
    Convert inner user IDs to their original raw form using the DataFramePreprocessor's user_id_mapping.
    -------
    Args:
        df_pp (recpack.preprocessing.preprocessors.DataFramePreprocessor): A DataFramePreprocessor object from the RecPack library.
        user_ids_inner (numpy.ndarray): A NumPy array containing the inner user IDs.
    -------
    Returns:
        A NumPy array of raw user IDs.
    """
    user_id_mapping_df = df_pp.user_id_mapping
    uid_raw = user_id_mapping_df.columns[0]  # the first column holds the raw user IDs

    # note: .isin does not preserve the order of user_ids_inner
    user_ids_raw = user_id_mapping_df[user_id_mapping_df["uid"].isin(user_ids_inner)][uid_raw].values

    return user_ids_raw

def get_inner_item_ids(df_pp, item_ids_raw):
    """
    Convert raw item IDs to their inner form using the DataFramePreprocessor's item_id_mapping.
    -------
    Args:
        df_pp (recpack.preprocessing.preprocessors.DataFramePreprocessor): A DataFramePreprocessor object from the RecPack library.
        item_ids_raw (numpy.ndarray): A NumPy array containing the raw item IDs.
    -------
    Returns:
        A NumPy array of inner item IDs.
    """
    item_id_mapping_df = df_pp.item_id_mapping
    iid_raw = item_id_mapping_df.columns[0]  # the first column holds the raw item IDs

    # note: .isin does not preserve the order of item_ids_raw
    item_ids_inner = item_id_mapping_df[item_id_mapping_df[iid_raw].isin(item_ids_raw)]["iid"].values

    return item_ids_inner

def get_inner_user_ids(df_pp, user_ids_raw):
    """
    Convert raw user IDs to their inner form using the DataFramePreprocessor's user_id_mapping.
    -------
    Args:
        df_pp (recpack.preprocessing.preprocessors.DataFramePreprocessor): A DataFramePreprocessor object from the RecPack library.
        user_ids_raw (numpy.ndarray): A NumPy array containing the raw user IDs.
    -------
    Returns:
        A NumPy array of inner user IDs.
    """
    user_id_mapping_df = df_pp.user_id_mapping
    uid_raw = user_id_mapping_df.columns[0]  # the first column holds the raw user IDs

    # note: .isin does not preserve the order of user_ids_raw
    user_ids_inner = user_id_mapping_df[user_id_mapping_df[uid_raw].isin(user_ids_raw)]["uid"].values

    return user_ids_inner
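
# Round-trip sketch for the four converters above (assumes `df_pp` is a fitted
# recpack DataFramePreprocessor; the IDs below are illustrative):
#
#   inner_ids = get_inner_item_ids(df_pp, np.array(["i1", "i2"]))
#   raw_ids = get_raw_item_ids(df_pp, inner_ids)
#   # raw_ids contains "i1" and "i2", though not necessarily in the input order;
#   # get_inner_user_ids / get_raw_user_ids behave the same way for users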

def get_topn_indices(R_hat, n):
    """
    Helper function to get the column indices of the top-n items in each row of R_hat,
    sorted in decreasing order of predicted relevance.
    """
    users = R_hat.shape[0]

    # find the indices that partition each row so that its first n elements are the largest n elements
    idx_topn_part = bn.argpartition(-R_hat, n, axis=1)

    # keep only the largest n elements of each row of R_hat
    topn_part = R_hat[np.arange(users)[:, np.newaxis], idx_topn_part[:, :n]]

    # find the indices of the sorted top-n predicted relevance scores in R_hat
    idx_part = np.argsort(-topn_part, axis=1)
    idx_topn = idx_topn_part[np.arange(users)[:, np.newaxis], idx_part]

    return idx_topn
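
# Minimal sketch of get_topn_indices on a toy score matrix:
#
#   R_hat = np.array([[0.1, 0.9, 0.5],
#                     [0.7, 0.2, 0.4]])
#   get_topn_indices(R_hat, 2)
#   # -> array([[1, 2], [0, 2]]): per-row column indices of the two highest
#   #    scores, sorted by decreasing score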
68 changes: 68 additions & 0 deletions src/helper_functions/metrics_accuracy.py
@@ -0,0 +1,68 @@

import numpy as np
import bottleneck as bn

from src.helper_functions.data_formatting import get_topn_indices

def recall_at_n(R_hat, R_held, n):
    """
    Computes the Recall at the given value of n.
    This metric does not take the rank of recommended items into account.
    Only for usage with binary feedback data.
    -------
    Args:
        R_hat (numpy.ndarray): The estimated user feedback matrix with -np.inf marking entries used as model input.
        R_held (sparse matrix): The held-out user feedback matrix.
        n (int): The number of items to consider in the ranking.
    -------
    Returns:
        recall_mean (float): The mean Recall@N across users.
        recall_std (float): The standard deviation of Recall@N across users.
    """
    users = R_hat.shape[0]

    # find the indices that partition each row so that its first n elements are the largest n elements
    idx = bn.argpartition(-R_hat, n, axis=1)

    # binary matrix marking each user's top-n recommendations
    R_hat_binary = np.zeros_like(R_hat, dtype=bool)
    R_hat_binary[np.arange(users)[:, np.newaxis], idx[:, :n]] = True

    R_held_binary = (R_held > 0).toarray()

    # recall@N for each user; assumes every user has at least one held-out interaction
    recall = (np.logical_and(R_held_binary, R_hat_binary).sum(axis=1)).astype(np.float32) / np.minimum(n, R_held_binary.sum(axis=1))

    recall_mean = np.mean(recall)
    recall_std = np.std(recall)

    return recall_mean, recall_std
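
# Toy sketch of recall_at_n (hypothetical single-user data; R_held must be a
# scipy sparse matrix and every user needs at least one held-out item):
#
#   R_hat = np.array([[0.9, 0.8, 0.1, 0.2]])
#   R_held = sp.csr_matrix(np.array([[1, 0, 0, 1]]))  # with scipy.sparse as sp
#   recall_at_n(R_hat, R_held, 2)
#   # one of the two held-out items (item 0) is in the top-2,
#   # so recall = 1 / min(2, 2) = 0.5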

def tndcg_at_n(R_hat, R_held, n):
    """
    Computes the truncated Normalized Discounted Cumulative Gain (NDCG) at the given value of n.
    A score of 1 is achieved when dcg = idcg (ideal dcg).
    Only for usage with binary feedback data.
    -------
    Args:
        R_hat (numpy.ndarray): The estimated user feedback matrix with -np.inf marking entries used as model input.
        R_held (sparse matrix): The held-out user feedback matrix.
        n (int): The number of items to consider in the ranking.
    -------
    Returns:
        tndcg_mean (float): The mean truncated NDCG@N across users.
        tndcg_std (float): The standard deviation of truncated NDCG@N across users.
    """
    users = R_hat.shape[0]

    # find the indices of the sorted top-n predicted relevance scores in R_hat
    idx_topn = get_topn_indices(R_hat, n)

    # positional discount template: 1 / log2(rank + 1) for ranks 1..n
    tp = 1. / np.log2(np.arange(2, n + 2))
    dcg = (R_held[np.arange(users)[:, np.newaxis], idx_topn].toarray() * tp).sum(axis=1)
    # ideal DCG: a user with i held-out items can collect at most the first min(i, n) discounts
    idcg = np.array([(tp[:min(i, n)]).sum() for i in R_held.getnnz(axis=1)])
    tndcg = dcg / idcg

    tndcg_mean = np.mean(tndcg)
    tndcg_std = np.std(tndcg)

    return tndcg_mean, tndcg_std
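
# Toy sketch of tndcg_at_n on the same kind of input:
#
#   R_hat = np.array([[0.9, 0.8, 0.1, 0.2]])
#   R_held = sp.csr_matrix(np.array([[1, 0, 0, 1]]))  # with scipy.sparse as sp
#   tndcg_at_n(R_hat, R_held, 2)
#   # the top-2 ranking is [item 0, item 1] and only item 0 is held out, so
#   # dcg = 1/log2(2) = 1.0, idcg = 1/log2(2) + 1/log2(3) ≈ 1.631,
#   # and tndcg ≈ 0.613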
105 changes: 105 additions & 0 deletions src/helper_functions/metrics_coverage.py
@@ -0,0 +1,105 @@

import numpy as np

from src.helper_functions.data_formatting import get_topn_indices

def coverage(R_hat, item_ids, n):
    """
    Coverage quantifies the proportion of items that are recommended out of the specified item set.
    It indicates the system's ability to diversify its recommendations.
    A score close to 1 denotes higher diversity, suggesting that the system effectively recommends a wide range of items to users.
    -------
    Args:
        R_hat (numpy.ndarray): The estimated user feedback matrix with -np.inf marking entries used as model input.
        item_ids (list): A list of item IDs for which to compute the coverage.
        n (int): The number of items to consider in the ranking.
    -------
    Returns:
        coverage (float): The total coverage score.
    """
    # find the indices of the sorted top-n predicted relevance scores in R_hat
    idx_topn = get_topn_indices(R_hat, n)
    flat_topn = idx_topn.flatten()

    # an item is covered if it appears in at least one user's top-n list
    covered = np.isin(np.asarray(item_ids), flat_topn)
    coverage = covered.sum() / len(item_ids)

    return coverage
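
# Toy sketch of coverage with n = 1 (hypothetical scores):
#
#   R_hat = np.array([[0.9, 0.1, 0.0],
#                     [0.8, 0.2, 0.1]])
#   coverage(R_hat, [0, 1, 2], 1)
#   # both users' top-1 list is item 0, so 1 of the 3 items is ever
#   # recommended and coverage = 1/3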

def u_parity_at_n(R_hat, protected_users, items_dict, n):
    """
    Computes the u-parity@N score.
    The User-coverage Parity metric measures the average disparity between the proportions of protected
    and non-protected users receiving recommendations from each category.
    -------
    Args:
        R_hat (numpy.ndarray): The estimated user feedback matrix with -np.inf marking entries used as model input.
        protected_users (numpy.ndarray): A binary array where 1 indicates a user is part of the protected group and 0 otherwise.
        items_dict (dict): A dictionary mapping item categories to their corresponding item IDs.
        n (int): The number of items to consider in the ranking.
    -------
    Returns:
        u_parity (float): The u-parity@N score.
        std_dev (float): The standard deviation of the differences across item categories.
    """
    # get top-n indices for all users
    idx_topn = get_topn_indices(R_hat, n)

    # compute the number of protected and non-protected users once; both groups must be non-empty
    num_protected_users = sum(protected_users)
    num_non_protected_users = len(protected_users) - num_protected_users

    differences = []
    for item_ids in items_dict.values():
        count_protected_users = 0
        count_non_protected_users = 0
        for i, is_protected in enumerate(protected_users):
            # does user i receive at least one recommendation from this category?
            if np.isin(idx_topn[i], item_ids).any():
                if is_protected:
                    count_protected_users += 1
                else:
                    count_non_protected_users += 1

        # normalize the number of covered users by the total number of users in each group
        prop_protected_users = count_protected_users / num_protected_users
        prop_non_protected_users = count_non_protected_users / num_non_protected_users

        differences.append(abs(prop_non_protected_users - prop_protected_users))

    u_parity = np.mean(differences)
    std_dev = np.std(differences)

    return u_parity, std_dev
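
# Toy sketch of u_parity_at_n (hypothetical groups and categories; both
# groups must be non-empty):
#
#   R_hat = np.array([[0.9, 0.1],
#                     [0.1, 0.9]])
#   protected = np.array([1, 0])        # user 0 is protected, user 1 is not
#   categories = {"A": [0], "B": [1]}
#   u_parity_at_n(R_hat, protected, categories, 1)
#   # category A reaches 100% of protected vs 0% of non-protected users (and
#   # vice versa for B), so the mean absolute gap is 1.0 with std 0.0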

def apcr_at_n(R_hat, items_dict, n):
    """
    Computes the apcr@N score.
    This metric measures the average rate at which providers are covered in the recommendation lists for a set of users.
    -------
    Args:
        R_hat (numpy.ndarray): The estimated user feedback matrix with -np.inf marking entries used as model input.
        items_dict (dict): A dictionary mapping providers to their corresponding item IDs.
        n (int): The number of items to consider in the ranking.
    -------
    Returns:
        apcr (float): The apcr@N score.
    -------
    Reference:
        Weiwen Liu and Robin Burke.
        Personalizing Fairness-aware Re-ranking. FATREC Workshop on Responsible Recommendation, 2018.
    """
    # reverse the items_dict to map item IDs to provider IDs
    # (if an item belongs to several providers, only the last one is kept)
    providers_dict = {item: provider for provider, items in items_dict.items() for item in items}

    idx_topn = get_topn_indices(R_hat, n)

    # count the distinct providers represented in each user's top-n list,
    # ignoring items that do not belong to any provider
    count_providers = []
    for user_row in idx_topn:
        user_providers = {providers_dict[item] for item in user_row if item in providers_dict}
        count_providers.append(len(user_providers))

    apcr = np.mean(count_providers) / len(items_dict)

    return apcr
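
# Toy sketch of apcr_at_n with n = 1 (hypothetical provider catalog):
#
#   R_hat = np.array([[0.9, 0.1],
#                     [0.1, 0.9]])
#   providers = {"p1": [0], "p2": [1]}
#   apcr_at_n(R_hat, providers, 1)
#   # each user's top-1 list covers exactly 1 of the 2 providers,
#   # so apcr = mean([1, 1]) / 2 = 0.5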
