ndcg refactor (#1481)
## Description

This PR:

- Adds NDCG value expectations to the tests, not just counts
- Changes convert_non_numeric=True to be used for string columns
(integers, floats in scores, and bools should use the default
convert_non_numeric=False)
- Adds score_column support: when a score is available, as in [this
example](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.ndcg_score.html),
one should use score_column and target_column, not prediction_column
(see the last two tests, which follow the sklearn examples)
- Changes the logic that generates target column values to make it
compatible with all scenarios
- Fixes the prediction and ideal relevance calculations for the
non-numeric case
- Handles the division-by-zero edge case when IDCG=0 (NDCG is set to 1
if no relevant documents exist; see the formula sketch after this list)
- If k is not passed, metrics are calculated over the prediction
column's length, and named accordingly (k is no longer omitted from the
metric names when k is None)

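For reference, a standard formulation of the quantity behind that edge case (written out here for clarity, not copied from the codebase), with rel_i the relevance of the item at rank i:

```latex
\mathrm{DCG}@k = \sum_{i=1}^{k} \frac{rel_i}{\log_2(i+1)},
\qquad
\mathrm{NDCG}@k = \frac{\mathrm{DCG}@k}{\mathrm{IDCG}@k}
```

where IDCG@k is the DCG@k of the ideal (relevance-sorted) ordering. If no relevant documents exist, every rel_i is 0, so IDCG@k = 0 and the ratio is undefined; this PR defines NDCG as 1 in that case.
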
For the Numeric case:
- If prediction+target or score+target columns are provided, the lists
in each pair must have the same length.
- If prediction_column is provided together with target_column, the
prediction column contains the ranks of the suggested items, starting at 1.
- If only the prediction column is provided, the order is assumed to
encode the ranks of the recommendations (the first item in the list is
the first recommendation), and the values in the list encode the
relevance scores. A usage sketch for these cases follows below.

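A minimal usage sketch of the calling patterns above, adapted from the tests in this diff (the first three DataFrames come from those tests; the string example and the import path shown are assumptions for illustration):

```python
import pandas as pd

from whylogs.experimental.api.logger import log_batch_ranking_metrics

# Ranks + ground truth: prediction_column holds 1-based ranks per item.
ranked = pd.DataFrame({"targets": [[1, 0, 2, 3, 3, 2, 2, 3]], "predictions": [[5, 4, 2, 1, 7, 8, 6, 3]]})
log_batch_ranking_metrics(data=ranked, prediction_column="predictions", target_column="targets", k=6)

# Scores + ground truth: pass score_column (not prediction_column);
# scores are converted to 1-based ranks internally.
scored = pd.DataFrame({"scores": [[0.1, 0.2, 0.3, 4, 70]], "true_relevance": [[10, 0, 0, 1, 5]]})
log_batch_ranking_metrics(data=scored, score_column="scores", target_column="true_relevance", k=4)

# Predictions only: list order encodes rank, values encode relevance.
ordered = pd.DataFrame({"raw_predictions": [[3, 2, 3, 0, 1, 2, 3, 2]]})
log_batch_ranking_metrics(data=ordered, prediction_column="raw_predictions", k=3)

# String items (hypothetical data): convert_non_numeric=True is now meant
# for string columns only.
labeled = pd.DataFrame({"raw_predictions": [["cat", "dog", "fish"]], "targets": [["dog"]]})
log_batch_ranking_metrics(
    data=labeled, prediction_column="raw_predictions", target_column="targets", convert_non_numeric=True
)
```
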
- [x] I have reviewed the [Guidelines for Contributing](CONTRIBUTING.md)
and the [Code of Conduct](CODE_OF_CONDUCT.md).

---------

Co-authored-by: felipe207 <felipe@whylabs.ai>
FelipeAdachi and felipe207 authored Mar 14, 2024
1 parent 0c72856 commit 164985c
Showing 2 changed files with 88 additions and 64 deletions.
58 changes: 30 additions & 28 deletions python/tests/experimental/api/test_logger.py
@@ -22,37 +22,37 @@ def test_log_batch_ranking_metrics_single_simple():
pandas_summary = result.view().to_pandas()

column_names = [
"mean_average_precision",
"accuracy",
"mean_average_precision_k_3",
"accuracy_k_3",
"mean_reciprocal_rank",
"precision",
"recall",
"precision_k_3",
"recall_k_3",
"top_rank",
"average_precision",
"norm_dis_cumul_gain",
"average_precision_k_3",
"norm_dis_cumul_gain_k_3",
]
for col in column_names:
assert col in pandas_summary.index
assert pandas_summary.loc["mean_average_precision", "counts/n"] == 1
assert pandas_summary.loc["accuracy", "counts/n"] == 1
assert pandas_summary.loc["mean_average_precision_k_3", "counts/n"] == 1
assert pandas_summary.loc["accuracy_k_3", "counts/n"] == 1
assert pandas_summary.loc["mean_reciprocal_rank", "counts/n"] == 1
assert pandas_summary.loc["precision", "counts/n"] == 4
assert pandas_summary.loc["recall", "counts/n"] == 4
assert pandas_summary.loc["precision_k_3", "counts/n"] == 4
assert pandas_summary.loc["recall_k_3", "counts/n"] == 4
assert pandas_summary.loc["top_rank", "counts/n"] == 4
assert pandas_summary.loc["average_precision", "counts/n"] == 4
assert pandas_summary.loc["norm_dis_cumul_gain", "counts/n"] == 1
assert pandas_summary.loc["average_precision", "counts/n"] == 4
assert pandas_summary.loc["norm_dis_cumul_gain", "counts/n"] == 1
assert pandas_summary.loc["average_precision_k_3", "counts/n"] == 4
assert pandas_summary.loc["norm_dis_cumul_gain_k_3", "counts/n"] == 1
assert pandas_summary.loc["average_precision_k_3", "counts/n"] == 4
assert pandas_summary.loc["norm_dis_cumul_gain_k_3", "counts/n"] == 1
# ndcg = [1, 0, 0.63, 0.5]
assert isclose(pandas_summary.loc["norm_dis_cumul_gain_k_3", "distribution/mean"], 0.53273, abs_tol=0.00001)


def test_log_batch_ranking_metrics_binary_simple():
binary_df = pd.DataFrame(
{"raw_predictions": [[True, False, True], [False, False, False], [True, True, False], [False, True, False]]}
)

result = log_batch_ranking_metrics(
data=binary_df, prediction_column="raw_predictions", k=2, convert_non_numeric=True
)
result = log_batch_ranking_metrics(data=binary_df, prediction_column="raw_predictions", k=2)
pandas_summary = result.view().to_pandas()

k = 2
@@ -76,6 +76,8 @@ def test_log_batch_ranking_metrics_binary_simple():
assert pandas_summary.loc["top_rank", "counts/n"] == 4
assert pandas_summary.loc["average_precision_k_" + str(k), "counts/n"] == 4
assert pandas_summary.loc["norm_dis_cumul_gain_k_" + str(k), "counts/n"] == 1
# ndcg@2 = [0.613147, 1.0, 1.0, 0.63093]
assert isclose(pandas_summary.loc["norm_dis_cumul_gain_k_" + str(k), "distribution/mean"], 0.81101, abs_tol=0.00001)


def test_log_batch_ranking_metrics_multiple_simple():
@@ -121,16 +123,14 @@ def test_log_batch_ranking_metrics_multiple_simple():
assert pandas_summary.loc["top_rank", "counts/n"] == 4
assert pandas_summary.loc["average_precision_k_" + str(k), "counts/n"] == 4
assert pandas_summary.loc["norm_dis_cumul_gain_k_" + str(k), "counts/n"] == 1

assert isclose(pandas_summary.loc[f"norm_dis_cumul_gain_k_{k}", "distribution/median"], 0.76244, abs_tol=0.00001)
# ndcg@4 = [0.9197, 0.0, 1.0, 0.386853]
assert isclose(pandas_summary.loc[f"norm_dis_cumul_gain_k_{k}", "distribution/median"], 0.57664, abs_tol=0.00001)


def test_log_batch_ranking_metrics_default_target():
multiple_df = pd.DataFrame({"raw_predictions": [[3, 2, 3, 0, 1, 2, 3, 2]]})

result = log_batch_ranking_metrics(
data=multiple_df, prediction_column="raw_predictions", k=3, convert_non_numeric=True
)
result = log_batch_ranking_metrics(data=multiple_df, prediction_column="raw_predictions", k=3)
pandas_summary = result.view().to_pandas()

k = 3
@@ -154,11 +154,13 @@ def test_log_batch_ranking_metrics_default_target():
assert pandas_summary.loc["top_rank", "counts/n"] == 1
assert pandas_summary.loc["average_precision_k_" + str(k), "counts/n"] == 1
assert pandas_summary.loc["norm_dis_cumul_gain_k_" + str(k), "counts/n"] == 1
# ndcg@3 = [0.9013]
assert isclose(pandas_summary.loc[f"norm_dis_cumul_gain_k_{k}", "distribution/median"], 0.90130, abs_tol=0.00001)


def test_log_batch_ranking_metrics_ranking_ndcg_wikipedia():
# From https://en.wikipedia.org/wiki/Discounted_cumulative_gain#Example
ranking_df = pd.DataFrame({"targets": [[3, 2, 3, 0, 1, 2, 3, 2]], "predictions": [[7, 6, 5, 4, 3, 2, 1, 0]]})
ranking_df = pd.DataFrame({"targets": [[1, 0, 2, 3, 3, 2, 2, 3]], "predictions": [[5, 4, 2, 1, 7, 8, 6, 3]]})

result = log_batch_ranking_metrics(data=ranking_df, prediction_column="predictions", target_column="targets", k=6)
pandas_summary = result.view().to_pandas()
@@ -168,19 +170,19 @@

def test_log_batch_ranking_metrics_ranking_ndcg_sklearn():
# From https://scikit-learn.org/stable/modules/generated/sklearn.metrics.ndcg_score.html
ranking_df = pd.DataFrame({"predictions": [[0.1, 0.2, 0.3, 4, 70]], "targets": [[10, 0, 0, 1, 5]]})
ranking_df = pd.DataFrame({"scores": [[0.1, 0.2, 0.3, 4, 70]], "true_relevance": [[10, 0, 0, 1, 5]]})

result = log_batch_ranking_metrics(data=ranking_df, prediction_column="predictions", target_column="targets")
result = log_batch_ranking_metrics(data=ranking_df, score_column="scores", target_column="true_relevance")
pandas_summary = result.view().to_pandas()

assert isclose(pandas_summary.loc["norm_dis_cumul_gain", "distribution/median"], 0.69569, abs_tol=0.00001)
assert isclose(pandas_summary.loc["norm_dis_cumul_gain_k_5", "distribution/median"], 0.69569, abs_tol=0.00001)


def test_log_batch_ranking_metrics_ranking_ndcg_withk_sklearn():
# From https://scikit-learn.org/stable/modules/generated/sklearn.metrics.ndcg_score.html
ranking_df = pd.DataFrame({"predictions": [[0.05, 1.1, 1.0, 0.5, 0.0]], "targets": [[10, 0, 0, 1, 5]]})
ranking_df = pd.DataFrame({"scores": [[0.05, 1.1, 1.0, 0.5, 0.0]], "true_relevance": [[10, 0, 0, 1, 5]]})

result = log_batch_ranking_metrics(data=ranking_df, prediction_column="predictions", target_column="targets", k=4)
result = log_batch_ranking_metrics(data=ranking_df, score_column="scores", target_column="true_relevance", k=4)
pandas_summary = result.view().to_pandas()

assert isclose(pandas_summary.loc["norm_dis_cumul_gain_k_4", "distribution/median"], 0.35202, abs_tol=0.00001)
94 changes: 58 additions & 36 deletions python/whylogs/experimental/api/logger/__init__.py
@@ -10,9 +10,16 @@
diagnostic_logger = logging.getLogger(__name__)


def _convert_to_int_if_bool(data: pd.core.frame.DataFrame, *columns: str) -> pd.core.frame.DataFrame:
for col in columns:
if all(isinstance(x, bool) for x in data[col]):
data[col] = data[col].apply(lambda x: 1 if x else 0)
return data


def log_batch_ranking_metrics(
data: pd.core.frame.DataFrame,
prediction_column: str,
prediction_column: Optional[str] = None,
target_column: Optional[str] = None,
score_column: Optional[str] = None,
k: Optional[int] = None,
@@ -22,26 +29,42 @@ def log_batch_ranking_metrics(
) -> ViewResultSet:
formatted_data = data.copy(deep=True) # TODO: does this have to be deep?

if prediction_column is None:
if score_column is not None and target_column is not None:
prediction_column = "__predictions"

# Ties are not being handled here
formatted_data[prediction_column] = formatted_data[score_column].apply(
lambda row: list(np.argsort(np.argsort(-np.array(row))) + 1)
)
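# (explanatory example) for scores [0.1, 0.2, 70], argsort(argsort(-scores)) + 1
# yields [3, 2, 1]: the highest score gets rank 1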
else:
raise ValueError("Either prediction_column or score+target columns must be specified")

relevant_cols = [prediction_column]

if target_column is None:
formatted_data = _convert_to_int_if_bool(formatted_data, prediction_column)
target_column = "__targets"
formatted_data[target_column] = formatted_data[prediction_column].apply(lambda x: list(range(len(x)))[::-1])
# the relevances in predictions are moved to targets, and predictions contain the indices into the target list
formatted_data[target_column] = formatted_data[prediction_column]
formatted_data[prediction_column] = formatted_data[target_column].apply(
lambda row: list(range(1, len(row) + 1))
)

relevant_cols.append(target_column)
if score_column is not None:
relevant_cols.append(score_column)

for col in relevant_cols:
if not formatted_data[col].apply(lambda x: type(x) == list).all():
# wrapping in lists because at least one isn't a list
# TODO: more error checking
formatted_data[col] = formatted_data[col].apply(lambda x: [x])

_max_k = formatted_data[prediction_column].apply(len).max()

if not k:
k = _max_k
formatted_data["count_at_k"] = formatted_data[relevant_cols].apply(
lambda row: sum([1 if pred_val in row[target_column] else 0 for pred_val in row[prediction_column][:k]]), axis=1
)

formatted_data["count_all"] = formatted_data[relevant_cols].apply(
lambda row: sum([1 if pred_val in row[target_column] else 0 for pred_val in row[prediction_column]]), axis=1
)
@@ -54,12 +77,10 @@ def get_top_rank(row):
return matches[0]

formatted_data["top_rank"] = formatted_data[relevant_cols].apply(get_top_rank, axis=1)

output_data = (formatted_data["count_at_k"] / (k if k else 1)).to_frame()
output_data.columns = ["precision" + ("_k_" + str(k) if k else "")]
output_data["recall" + ("_k_" + str(k) if k else "")] = formatted_data["count_at_k"] / formatted_data["count_all"]
output_data["top_rank"] = formatted_data["top_rank"]

ki_dict: pd.DataFrame = None
for ki in range(1, (k if k else _max_k) + 1):
ki_result = (
@@ -76,41 +97,44 @@ def get_top_rank(row):
ki_dict.columns = ["p@" + str(ki)]
else:
ki_dict["p@" + str(ki)] = ki_result

output_data["average_precision" + ("_k_" + str(k) if k else "")] = ki_dict.mean(axis=1)

def _convert_non_numeric(row_dict):
return (
[
row_dict[target_column].index(pred_val) if pred_val in row_dict[target_column] else -1
for pred_val in row_dict[prediction_column]
],
list(range(len(row_dict[prediction_column])))[::-1],
)

if convert_non_numeric:
formatted_data[[prediction_column, target_column]] = formatted_data.apply(
_convert_non_numeric, result_type="expand", axis=1
)
def _calc_non_numeric_relevance(row_dict):
prediction_relevance = []
ideal_relevance = []
for target_val in row_dict[prediction_column]:
ideal_relevance.append(1 if target_val in row_dict[target_column] else 0)
prediction_relevance.append(1 if target_val in row_dict[target_column] else 0)
for target_val in row_dict[target_column]:
if target_val not in row_dict[prediction_column]:
ideal_relevance.append(1)
return (prediction_relevance, sorted(ideal_relevance, reverse=True))

def _calculate_row_ndcg(row_dict, k):
predicted_order = np.array(row_dict[prediction_column]).argsort()[::-1]
target_order = np.array(row_dict[target_column]).argsort()[::-1]
dcg_vals = [
(rel / math.log(i + 2, 2)) for i, rel in enumerate(np.array(row_dict[target_column])[predicted_order][:k])
]
idcg_vals = [
(rel / math.log(i + 2, 2)) for i, rel in enumerate(np.array(row_dict[target_column])[target_order][:k])
]
if not convert_non_numeric:
dcg_vals = [
rel / math.log2(pos + 1)
for rel, pos in zip(row_dict[target_column], row_dict[prediction_column])
if pos <= k
]
idcg_vals = [
rel / math.log2(pos + 2) for pos, rel in enumerate(sorted(row_dict[target_column], reverse=True)[:k])
]
else:
predicted_relevances, ideal_relevances = _calc_non_numeric_relevance(row_dict)
dcg_vals = [(rel / math.log(i + 2, 2)) for i, rel in enumerate(predicted_relevances[:k])]
idcg_vals = [(rel / math.log(i + 2, 2)) for i, rel in enumerate(ideal_relevances[:k])]
if sum(idcg_vals) == 0:
return 1 # if there is no relevant data, not much the recommender can do
return sum(dcg_vals) / sum(idcg_vals)

formatted_data["norm_dis_cumul_gain_k_" + str(k)] = formatted_data.apply(_calculate_row_ndcg, args=(k,), axis=1)

formatted_data["norm_dis_cumul_gain" + ("_k_" + str(k) if k else "")] = formatted_data.apply(
_calculate_row_ndcg, args=(k,), axis=1
)
mAP_at_k = ki_dict.mean()
hit_ratio = formatted_data["count_at_k"].apply(lambda x: bool(x)).sum() / len(formatted_data)
mrr = (1 / formatted_data["top_rank"]).replace([np.inf], np.nan).mean()
ndcg = formatted_data["norm_dis_cumul_gain_k_" + str(k)].mean()

ndcg = formatted_data["norm_dis_cumul_gain" + ("_k_" + str(k) if k else "")].mean()
result = log(pandas=output_data, schema=schema)
result = result.merge(
log(
@@ -123,8 +147,6 @@ def _calculate_row_ndcg(row_dict, k):
schema=schema,
)
)

if log_full_data:
result = result.merge(log(pandas=data, schema=schema))

return result
