1 change: 1 addition & 0 deletions pyproject.toml
@@ -96,6 +96,7 @@ ignore = [
"D104", # Ignore package level docstrings requirement
"D205", # 1 blank line required between summary line and description
"D212", # Multi-line docstring summary should start at the first line
"D301", # r-strings for docstrings with backslashes
Review comment (collaborator, author): This needs to be brought in if we're going to have LaTeX in our docstrings.
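
For context, a minimal hypothetical sketch (not code from this PR) of the kind of docstring that trips D301 when the rule is active:

def scaled_distance(p, q):
    """
    Compute $$\\frac{1}{\\sqrt{2}} \\cdot \\Vert \\sqrt{p} - \\sqrt{q} \\Vert_2$$.

    The escaped backslashes in the LaTeX above are what D301 flags; with the
    rule active, it would ask for a raw (r-prefixed) docstring instead.
    """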

"PLR2004", # Replace magic number with named constant
"PLR0913", # Too many arguments
"COM812", # Missing trailing comma
5 changes: 4 additions & 1 deletion src/midst_toolkit/evaluation/metrics_base.py
@@ -1,5 +1,5 @@
from abc import ABC, abstractmethod
from logging import INFO
from logging import INFO, WARNING
from typing import overload

import pandas as pd
@@ -54,6 +54,9 @@ def __init__(
self.numerical_columns = numerical_columns
self.do_preprocess = do_preprocess

if len(self.categorical_columns) == 0 and len(self.numerical_columns) == 0:
log(WARNING, "Both lists of column names are empty. This will result in unexpected metric behavior.")

if do_preprocess:
log(INFO, "Default preprocessing will be performed during computation.")

146 changes: 146 additions & 0 deletions src/midst_toolkit/evaluation/quality/mean_hellinger_distance.py
@@ -0,0 +1,146 @@
from logging import WARNING

import numpy as np
import pandas as pd

from midst_toolkit.common.logger import log
from midst_toolkit.evaluation.metrics_base import SynthEvalQualityMetric


def hellinger_distance(discrete_distribution_1: np.ndarray, discrete_distribution_2: np.ndarray) -> float:
"""
Compute the empirical Hellinger distance between two discrete probability distributions. Hellinger distance for
discrete probability distributions $p$ and $q$ is expressed as
$$\\frac{1}{\\sqrt{2}} \\cdot \\Vert \\sqrt{p} - \\sqrt{q} \\Vert_2$$.

Args:
discrete_distribution_1: First discrete distribution for distance computation
discrete_distribution_2: Second discrete distribution for distance computation

Returns:
Empirical Hellinger distance between the two distributions.
"""
sum_1 = np.sum(discrete_distribution_1)
sum_2 = np.sum(discrete_distribution_2)
assert np.isclose(sum_1, 1.0, atol=1e-4), f"Distribution 1 is not a probability distribution: Sum is {sum_1}"
assert np.isclose(sum_2, 1.0, atol=1e-4), f"Distribution 2 is not a probability distribution: Sum is {sum_2}"

sqrt_pdf_1 = np.sqrt(discrete_distribution_1)
sqrt_pdf_2 = np.sqrt(discrete_distribution_2)
difference = sqrt_pdf_1 - sqrt_pdf_2
return 1 / np.sqrt(2) * np.linalg.norm(difference)
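
# A quick numeric sanity check of the formula above (a hedged sketch, not part of the PR):
# for p = (1, 0) and q = (0.5, 0.5), sqrt(q) ≈ (0.7071, 0.7071), so
# hellinger_distance(np.array([1.0, 0.0]), np.array([0.5, 0.5]))
# = (1 / sqrt(2)) * sqrt((1 - 0.7071)**2 + (0 - 0.7071)**2) ≈ 0.5412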


class MeanHellingerDistance(SynthEvalQualityMetric):
def __init__(
self,
categorical_columns: list[str],
numerical_columns: list[str],
do_preprocess: bool = False,
include_numerical_columns: bool = True,
):
"""
This class computes the DISCRETE Hellinger Distance between corresponding columns of real and synthetic
dataframes.

NOTE: The implementation here is inspired by the SynthEval implementation of the Mean Hellinger Distance
but fixes a crucial issue: SynthEval's way of computing bins for the discrete histograms of numerical values
is flawed. Here, we make use of the 'auto' binning scheme in numpy to do a better job of binning such values
into histograms.

- For a categorical column, the number of bins for the discrete distributions is established by counting
the unique values in the column for the REAL DATA. This can have some side effects when the encodings of
the categorical values are not contiguous ([1, 2, 10], for example) or the synthetic dataframe contains
different values.
- For numerical columns, binning is determined by the numpy ``histogram_bin_edges`` function and takes into
account values from BOTH dataframes.

The final score is the average of the distances computed across columns. Lower is better.

NOTE: The categorical columns MUST BE PREPROCESSED into numerical values; otherwise the evaluation will fail.
This function will NOT WORK WITH ONE-HOT ENCODINGS. The preprocessing can be done separately before calling
compute or by setting ``do_preprocess`` to True.

Args:
categorical_columns: Column names corresponding to the categorical variables of any provided dataframe.
numerical_columns: Column names corresponding to the numerical variables of any provided dataframe.
do_preprocess: Whether or not to preprocess the dataframes with the default pipeline used by SynthEval.
Defaults to False.
include_numerical_columns: Whether to include any provided numerical columns in the Hellinger distance
computation. Numerical column values are binned to create discrete distributions, which may or may not
be something you want to do. Defaults to True.
"""
super().__init__(categorical_columns, numerical_columns, do_preprocess)

self.include_numerical_columns = include_numerical_columns

if len(self.categorical_columns) == 0 and not self.include_numerical_columns:
log(
WARNING,
"No categorical columns provided and include_numerical_columns is False. This will result in a NaN "
"for the Hellinger distance.",
)

def compute(self, real_data: pd.DataFrame, synthetic_data: pd.DataFrame) -> dict[str, float]:
"""
Computes the DISCRETE Hellinger Distance between corresponding columns of real and synthetic dataframes. For a
categorical column, the range of values for the discrete distributions is established by computing the unique
values in the column for the REAL DATA. For numerical columns, a binning procedure based on numpy's
``histogram_bin_edges`` with binning strategy set to 'auto' is used.

The final score is the average of the distances computed across columns. Lower is better.

NOTE: The categorical columns MUST BE PREPROCESSED into numerical values; otherwise the evaluation will fail.
This function will NOT WORK WITH ONE-HOT ENCODINGS. The preprocessing can be done separately before calling
compute or by setting ``do_preprocess`` to True.

Args:
real_data: Real data to which the synthetic data may be compared. In many cases this will be data used
to TRAIN the model that generated the synthetic data, but not always.
synthetic_data: Synthetically generated data whose quality is to be assessed.

Returns:
The mean of the individual Hellinger distances between each of the corresponding columns of the real and
synthetic dataframes. This mean is keyed by 'mean_hellinger_distance' and is reported along with the
"standard error" associated with that mean keyed under 'hellinger_standard_error'.
"""
if self.do_preprocess:
real_data, synthetic_data = self.preprocess(real_data, synthetic_data)

hellinger_distances = []

for category_column in self.categorical_columns:
class_num = len(np.unique(real_data[category_column]))

real_discrete_counts = np.histogram(real_data[category_column], bins=class_num)[0]
synthetic_discrete_counts = np.histogram(synthetic_data[category_column], bins=class_num)[0]

real_discrete_pdf = real_discrete_counts / sum(real_discrete_counts)
synthetic_discrete_pdf = synthetic_discrete_counts / sum(synthetic_discrete_counts)

distance = hellinger_distance(real_discrete_pdf, synthetic_discrete_pdf)
hellinger_distances.append(distance)

if self.include_numerical_columns:
for numeric_column in self.numerical_columns:
combined_data = np.concatenate((real_data[numeric_column], synthetic_data[numeric_column]))
bin_edges = np.histogram_bin_edges(combined_data, bins="auto")

real_discrete_counts = np.histogram(real_data[numeric_column], bins=bin_edges)[0]
synthetic_discrete_counts = np.histogram(synthetic_data[numeric_column], bins=bin_edges)[0]

real_discrete_pdf = real_discrete_counts / sum(real_discrete_counts)
synthetic_discrete_pdf = synthetic_discrete_counts / sum(synthetic_discrete_counts)

distance = hellinger_distance(real_discrete_pdf, synthetic_discrete_pdf)
hellinger_distances.append(distance)

mean_hellinger_distance = np.mean(hellinger_distances).item()
hellinger_distance_standard_error = np.std(hellinger_distances, ddof=1).item() / np.sqrt(
len(hellinger_distances)
)

return {
"mean_hellinger_distance": mean_hellinger_distance,
"hellinger_standard_error": hellinger_distance_standard_error,
}
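
For orientation, a minimal usage sketch of the new metric (hypothetical column names and toy data, not part of the PR):

import pandas as pd

from midst_toolkit.evaluation.quality.mean_hellinger_distance import MeanHellingerDistance

# Toy frames: 'color' is an already-encoded categorical column, 'height' is numerical.
real = pd.DataFrame({"color": [0, 1, 0, 2, 1], "height": [1.2, 3.4, 2.2, 0.9, 2.8]})
synthetic = pd.DataFrame({"color": [0, 1, 1, 2, 2], "height": [1.0, 3.1, 2.5, 1.1, 2.6]})

metric = MeanHellingerDistance(
    categorical_columns=["color"],
    numerical_columns=["height"],
    do_preprocess=False,  # the toy data is already numerically encoded
)
scores = metric.compute(real, synthetic)
# scores["mean_hellinger_distance"]: average per-column distance in [0, 1]; lower is better.
# scores["hellinger_standard_error"]: standard error of that mean (NaN if only one column contributes).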
106 changes: 106 additions & 0 deletions src/midst_toolkit/evaluation/quality/mean_propensity_mse.py
@@ -0,0 +1,106 @@
import pandas as pd
from syntheval.metrics.utility.metric_propensity_mse import PropensityMeanSquaredError

from midst_toolkit.evaluation.metrics_base import SynthEvalQualityMetric


class MeanPropensityMeanSquaredError(SynthEvalQualityMetric):
def __init__(
self,
categorical_columns: list[str],
numerical_columns: list[str],
do_preprocess: bool = False,
folds: int = 5,
max_iterations: int = 100,
solver: str = "liblinear",
):
"""
This class measures how well a ``LogisticRegression`` model from sklearn (as implemented in SynthEval) can
distinguish between real and synthetic data. The classification model is trained on a subset of the two data
sources and then applied to a validation split of the mixed data, created through cross-validation folds. The
average pMSE for synthetic vs. real predictions and macro F1 scores across the folds are reported along with
the standard error of these mean values.

Computation of pMSE is based on the formula in:

Woo, M., Reiter, J.P., Oganian, A., Karr, A.F.: Global measures of data utility for microdata masked for
disclosure limitation. J. Priv. Confidentiality 1(1) (2009) https://doi.org/10.29012/jpc.v1i1.568
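
In the usual notation (a sketch of the standard form, not quoted verbatim from the paper): with $N$ combined
records, $\\hat{p}_i$ the model's predicted propensity for record $i$'s class, and $c$ that class's share of
the combined data,
$$\\text{pMSE} = \\frac{1}{N} \\sum_{i=1}^{N} (\\hat{p}_i - c)^2$$.
With balanced data, $c = 0.5$, which yields the 0.25 worst case noted below.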

NOTE: Categorical variables need to be encoded before training the classifier. This can be accomplished by
preprocessing before calling ``compute`` or by setting ``do_preprocess`` to True. Note that if
``do_preprocess`` is True, the default SynthEval pipeline is used, which performs ``OrdinalEncoding`` for
categorical columns and ``MinMaxScaling`` for numerical columns.

- A smaller pMSE is better. In cases where the two datasets are balanced in size, 0.25 is worst case.
- Higher Macro F1 is better.

Args:
categorical_columns: Column names corresponding to the categorical variables of any provided dataframe.
numerical_columns: Column names corresponding to the numerical variables of any provided dataframe.
do_preprocess: Whether or not to preprocess the dataframes with the default pipeline used by SynthEval.
Defaults to False.
folds: Number of cross-validation folds for training/evaluating the LogisticRegression classifier used to
establish a stable estimate of the pMSE. Defaults to 5.
max_iterations: Maximum number of iterations for the regression fitting. Defaults to 100.
solver: Kind of solver used to fit the ``LogisticRegression`` model. Options coincide with those of the
sklearn ``LogisticRegression`` implementation. Defaults to 'liblinear'.
"""
super().__init__(categorical_columns, numerical_columns, do_preprocess)
self.all_columns = categorical_columns + numerical_columns
self.folds = folds
self.max_iterations = max_iterations
self.solver = solver

def compute(self, real_data: pd.DataFrame, synthetic_data: pd.DataFrame) -> dict[str, float]:
"""
Computes how well a LogisticRegression model from sklearn (as implemented in SynthEval) can distinguish between
real and synthetic data. The classification model is trained on a subset of the two data sources and then
applied to a validation split of the mixed data, created through cross-validation on the combination of the
two datasets. The average pMSE of the predictions (0 = synthetic, 1 = real) and macro F1 scores across the
folds are reported along with the standard error of these mean values.

NOTE: Categorical variables need to be encoded before training the classifier. This can be accomplished by
preprocessing before calling ``compute`` or by setting ``do_preprocess`` to True. Note that if
``do_preprocess`` is True, the default SynthEval pipeline is used, which performs ``OrdinalEncoding`` for
categorical columns and ``MinMaxScaling`` for numerical columns.

Args:
real_data: Real data to which the synthetic data may be compared. In many cases this will be data used
to TRAIN the model that generated the synthetic data, but not always.
synthetic_data: Synthetically generated data whose quality is to be assessed.

Returns:
The mean pMSE and macro F1 scores for a LogisticRegression model. These values are keyed by 'avg_pmse' and
'avg_macro_f1_score' respectively. The standard errors associated with these mean values are reported under
the keys 'pmse_standard_error' and 'macro_f1_standard_error' as well.
"""
if self.do_preprocess:
real_data, synthetic_data = self.preprocess(real_data, synthetic_data)

# NOTE: The SynthEval PropensityMeanSquaredError class ignores column specifications by default. However, for
# other classes (correlation_matrix_difference for example), specifying fewer than all of the columns restricts
# the score computation to just those columns. To make this consistent we do that here, before passing to the
# SynthEval class.
filtered_real_data = real_data[self.all_columns]
filtered_synthetic_data = synthetic_data[self.all_columns]

# SynthEval also ASSUMES that neither provided dataframe has a column called 'real', because it will
# attach another column with that name, so we throw an error here if the column already exists.
assert "real" not in filtered_real_data.columns, "A column called 'real' already exists in the dataframe."
assert "real" not in filtered_synthetic_data.columns, "A column called 'real' already exists in the dataframe."

self.syntheval_metric = PropensityMeanSquaredError(
real_data=filtered_real_data,
synt_data=filtered_synthetic_data,
hout_data=None,
cat_cols=self.categorical_columns,
num_cols=self.numerical_columns,
do_preprocessing=False,
verbose=False,
)
result = self.syntheval_metric.evaluate(self.folds, self.max_iterations, self.solver)
result["avg_pmse"] = result.pop("avg pMSE")
result["pmse_standard_error"] = result.pop("pMSE err")
result["avg_macro_f1_score"] = result.pop("avg acc")
result["macro_f1_standard_error"] = result.pop("acc err")
return result
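
Again for orientation, a hedged usage sketch (toy data, not part of the PR; folds kept small because the frames are tiny):

import pandas as pd

from midst_toolkit.evaluation.quality.mean_propensity_mse import MeanPropensityMeanSquaredError

real = pd.DataFrame(
    {"color": ["red", "blue", "red", "green", "blue", "red"], "height": [1.2, 3.4, 2.2, 0.9, 2.8, 1.7]}
)
synthetic = pd.DataFrame(
    {"color": ["red", "red", "blue", "green", "blue", "green"], "height": [1.0, 3.1, 2.5, 1.1, 2.6, 1.9]}
)

metric = MeanPropensityMeanSquaredError(
    categorical_columns=["color"],
    numerical_columns=["height"],
    do_preprocess=True,  # ordinal-encode categoricals, MinMax-scale numericals via SynthEval's pipeline
    folds=2,  # tiny toy data; real use would keep the default of 5
)
result = metric.compute(real, synthetic)
# result keys: "avg_pmse", "pmse_standard_error", "avg_macro_f1_score", "macro_f1_standard_error"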
115 changes: 115 additions & 0 deletions tests/unit/evaluation/quality/test_mean_hellinger_distance.py
@@ -0,0 +1,115 @@
import math

import numpy as np
import pandas as pd
import pytest

from midst_toolkit.evaluation.quality.mean_hellinger_distance import MeanHellingerDistance


REAL_DATA = pd.DataFrame(
{
"column_a": [1, 2, 3, 4, 5],
"column_b": [4, 5, 6, 7, 8],
"column_c": ["horse", "dog", "horse", "cat", "cat"],
"column_d": [-1, -2, -3, -2, -5],
}
)
SYNTHETIC_DATA = pd.DataFrame(
{
"column_a": [1, 2, 3, 4, 5],
"column_b": [4, 6, 6, -1, 1],
"column_c": ["cat", "dog", "horse", "cat", "cat"],
"column_d": [-1, -2, -3, -2, -50],
}
)

REAL_DATA_ENCODED = pd.DataFrame({"column_c": [1, 2, 1, 3, 3]})

SYNTHETIC_DATA_ENCODED = pd.DataFrame({"column_c": [3, 2, 1, 4, 3]})


def test_mean_hellinger_distance_with_no_preprocess() -> None:
metric = MeanHellingerDistance(
categorical_columns=["column_c"],
numerical_columns=[],
do_preprocess=False,
)

discrete_real = np.array([2 / 5, 1 / 5, 2 / 5])
# 4 gets collapsed into the last bin
discrete_synthetic = np.array([1 / 5, 1 / 5, 3 / 5])
target = (1.0 / math.sqrt(2)) * np.linalg.norm(np.sqrt(discrete_real) - np.sqrt(discrete_synthetic))
score = metric.compute(REAL_DATA_ENCODED, SYNTHETIC_DATA_ENCODED)
assert pytest.approx(target, abs=1e-8) == score["mean_hellinger_distance"]
assert np.isnan(score["hellinger_standard_error"])


def test_mean_hellinger_distance_with_preprocess_categorical() -> None:
metric = MeanHellingerDistance(
categorical_columns=["column_c"],
numerical_columns=[],
do_preprocess=True,
)

# Should match test_mean_hellinger_distance_with_no_preprocess once preprocessing has run
discrete_real = np.array([2 / 5, 1 / 5, 2 / 5])
# 4 gets collapsed into the last bin
discrete_synthetic = np.array([1 / 5, 1 / 5, 3 / 5])
target = (1.0 / math.sqrt(2)) * np.linalg.norm(np.sqrt(discrete_real) - np.sqrt(discrete_synthetic))
score = metric.compute(REAL_DATA, SYNTHETIC_DATA)
assert pytest.approx(target, abs=1e-8) == score["mean_hellinger_distance"]
assert np.isnan(score["hellinger_standard_error"])


def test_mean_hellinger_distance_with_preprocess() -> None:
metric = MeanHellingerDistance(
categorical_columns=[],
numerical_columns=["column_a", "column_b", "column_d"],
do_preprocess=True,
)
# Preprocessing only MinMax-scales the numerical columns, which should leave the binned distributions unchanged
score = metric.compute(REAL_DATA, SYNTHETIC_DATA)
assert pytest.approx(0.3598897091778779, abs=1e-8) == score["mean_hellinger_distance"]
assert pytest.approx(0.18772239774180174, abs=1e-8) == score["hellinger_standard_error"]


def test_one_column_left_off() -> None:
metric = MeanHellingerDistance(
categorical_columns=["column_c"],
numerical_columns=["column_a", "column_b"],
do_preprocess=True,
)

# Make sure the computation excludes column_d, which was left out of the column lists.
target = 1 / 3 * (0.16510402468972515 + 0.0 + 0.6324555320336758)
score = metric.compute(REAL_DATA, SYNTHETIC_DATA)
assert pytest.approx(target, abs=1e-8) == score["mean_hellinger_distance"]


def test_mean_hellinger_distance_no_numericals() -> None:
metric = MeanHellingerDistance(
categorical_columns=["column_b", "column_c"],
numerical_columns=[],
do_preprocess=True,
)

# Everything should still work with an empty numerical list
target = 1 / 2 * (0.3422824674525135 + 0.16510402468972515)
score = metric.compute(REAL_DATA, SYNTHETIC_DATA)
assert pytest.approx(target, abs=1e-8) == score["mean_hellinger_distance"]


def test_mean_hellinger_distance_do_not_include_numericals() -> None:
metric = MeanHellingerDistance(
categorical_columns=["column_b", "column_c"],
numerical_columns=["column_a", "column_d"],
do_preprocess=True,
include_numerical_columns=False,
)

# Should be the same as test_mean_hellinger_distance_no_numericals since we're saying we do not want to include
# numerical columns in the computations.
target = 1 / 2 * (0.3422824674525135 + 0.16510402468972515)
score = metric.compute(REAL_DATA, SYNTHETIC_DATA)
assert pytest.approx(target, abs=1e-8) == score["mean_hellinger_distance"]