1 change: 1 addition & 0 deletions pyproject.toml
@@ -96,6 +96,7 @@ ignore = [
"D104", # Ignore package level docstrings requirement
"D205", # 1 blank line required between summary line and description
"D212", # Multi-line docstring summary should start at the first line
"D301", # r-strings for docstrings with backslashes
Review comment (collaborator, author): This needs to be brought in if we're going to have LaTeX in our docstrings.
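
For context, a minimal hypothetical sketch (not code from this PR) of the kind of docstring that trips D301 when the rule is active:

def scaled_distance(p, q):
    """
    Compute $$\\frac{1}{\\sqrt{2}} \\cdot \\Vert \\sqrt{p} - \\sqrt{q} \\Vert_2$$.

    The escaped backslashes in the LaTeX above are what D301 flags; with the
    rule active, it would ask for a raw (r-prefixed) docstring instead.
    """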

"PLR2004", # Replace magic number with named constant
"PLR0913", # Too many arguments
"COM812", # Missing trailing comma
5 changes: 4 additions & 1 deletion src/midst_toolkit/evaluation/metrics_base.py
@@ -1,5 +1,5 @@
from abc import ABC, abstractmethod
from logging import INFO
from logging import INFO, WARNING
from typing import overload

import pandas as pd
@@ -54,6 +54,9 @@ def __init__(
self.numerical_columns = numerical_columns
self.do_preprocess = do_preprocess

if len(self.categorical_columns) == 0 and len(self.numerical_columns) == 0:
log(WARNING, "Both lists of column names are empty. This will result in unexpected metric behavior.")

if do_preprocess:
log(INFO, "Default preprocessing will be performed during computation.")

146 changes: 146 additions & 0 deletions src/midst_toolkit/evaluation/quality/mean_hellinger_distance.py
@@ -0,0 +1,146 @@
from logging import WARNING

import numpy as np
import pandas as pd

from midst_toolkit.common.logger import log
from midst_toolkit.evaluation.metrics_base import SynthEvalQualityMetric


def hellinger_distance(discrete_distribution_1: np.ndarray, discrete_distribution_2: np.ndarray) -> float:
"""
Compute the empirical Hellinger distance between two discrete probability distributions. Hellinger distance for
discrete probability distributions $p$ and $q$ is expressed as
$$\\frac{1}{\\sqrt{2}} \\cdot \\Vert \\sqrt{p} - \\sqrt{q} \\Vert_2$$.

Args:
discrete_distribution_1: First discrete distribution for distance computation
discrete_distribution_2: Second discrete distribution for distance computation

Returns:
Empirical Hellinger distance between the two distributions.
"""
sum_1 = np.sum(discrete_distribution_1)
sum_2 = np.sum(discrete_distribution_2)
assert np.isclose(sum_1, 1.0, atol=1e-4), f"Distribution 1 is not a probability distribution: Sum is {sum_1}"
assert np.isclose(sum_2, 1.0, atol=1e-4), f"Distribution 2 is not a probability distribution: Sum is {sum_2}"

sqrt_pdf_1 = np.sqrt(discrete_distribution_1)
sqrt_pdf_2 = np.sqrt(discrete_distribution_2)
difference = sqrt_pdf_1 - sqrt_pdf_2
return 1 / np.sqrt(2) * np.linalg.norm(difference)
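
# A quick numeric sanity check of the formula above (a hedged sketch, not part of the PR):
# for p = (1, 0) and q = (0.5, 0.5), sqrt(q) ≈ (0.7071, 0.7071), so
# hellinger_distance(np.array([1.0, 0.0]), np.array([0.5, 0.5]))
# = (1 / sqrt(2)) * sqrt((1 - 0.7071)**2 + (0 - 0.7071)**2) ≈ 0.5412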


class MeanHellingerDistance(SynthEvalQualityMetric):
def __init__(
self,
categorical_columns: list[str],
numerical_columns: list[str],
do_preprocess: bool = False,
include_numerical_columns: bool = True,
):
"""
This class computes the DISCRETE Hellinger Distance between corresponding columns of real and synthetic
dataframes.

NOTE: The implementation here is inspired by the SynthEval implementation of the Mean Hellinger Distance
but fixes a crucial issue: SynthEval's way of computing bins for the discrete histograms of numerical values
is flawed. Here, we make use of the 'auto' binning scheme in numpy to do a better job of binning such values
into histograms.

- For a categorical column, the number of bins for the discrete distributions is established by counting
the unique values in the column for the REAL DATA. This can have some side effects when the encodings of
the categorical values are not contiguous ([1, 2, 10], for example) or the synthetic dataframe contains
different values.
- For numerical columns, binning is determined by the numpy ``histogram_bin_edges`` function and takes into
account values from BOTH dataframes.

The final score is the average of the distances computed across columns. Lower is better.

NOTE: The categorical columns MUST BE PREPROCESSED into numerical values; otherwise the evaluation will fail.
This function will NOT WORK WITH ONE-HOT ENCODINGS. The preprocessing can be done separately before calling
compute or by setting ``do_preprocess`` to True.

Args:
categorical_columns: Column names corresponding to the categorical variables of any provided dataframe.
numerical_columns: Column names corresponding to the numerical variables of any provided dataframe.
do_preprocess: Whether or not to preprocess the dataframes with the default pipeline used by SynthEval.
Defaults to False.
include_numerical_columns: Whether to include any provided numerical columns in the Hellinger distance
computation. Numerical column values are binned to create discrete distributions, which may or may not
be something you want to do. Defaults to True.
"""
super().__init__(categorical_columns, numerical_columns, do_preprocess)

self.include_numerical_columns = include_numerical_columns

if len(self.categorical_columns) == 0 and not self.include_numerical_columns:
log(
WARNING,
"No categorical columns provided and include_numerical_columns is False. This will result in a NaN "
"for the Hellinger distance.",
)

def compute(self, real_data: pd.DataFrame, synthetic_data: pd.DataFrame) -> dict[str, float]:
"""
Computes the DISCRETE Hellinger Distance between corresponding columns of real and synthetic dataframes. For a
categorical column, the range of values for the discrete distributions is established by computing the unique
values in the column for the REAL DATA. For numerical columns, a binning procedure based on numpy's
``histogram_bin_edges`` with binning strategy set to 'auto' is used.

The final score is the average of the distances computed across columns. Lower is better.

NOTE: The categorical columns MUST BE PREPROCESSED into numerical values; otherwise the evaluation will fail.
This function will NOT WORK WITH ONE-HOT ENCODINGS. The preprocessing can be done separately before calling
compute or by setting ``do_preprocess`` to True.

Args:
real_data: Real data to which the synthetic data may be compared. In many cases this will be data used
to TRAIN the model that generated the synthetic data, but not always.
synthetic_data: Synthetically generated data whose quality is to be assessed.

Returns:
The mean of the individual Hellinger distances between each of the corresponding columns of the real and
synthetic dataframes. This mean is keyed by 'mean_hellinger_distance' and is reported along with the
"standard error" associated with that mean keyed under 'hellinger_standard_error'.
"""
if self.do_preprocess:
real_data, synthetic_data = self.preprocess(real_data, synthetic_data)

hellinger_distances = []

for category_column in self.categorical_columns:
class_num = len(np.unique(real_data[category_column]))

real_discrete_counts = np.histogram(real_data[category_column], bins=class_num)[0]
synthetic_discrete_counts = np.histogram(synthetic_data[category_column], bins=class_num)[0]

real_discrete_pdf = real_discrete_counts / sum(real_discrete_counts)
synthetic_discrete_pdf = synthetic_discrete_counts / sum(synthetic_discrete_counts)

distance = hellinger_distance(real_discrete_pdf, synthetic_discrete_pdf)
hellinger_distances.append(distance)

if self.include_numerical_columns:
for numeric_column in self.numerical_columns:
combined_data = np.concatenate((real_data[numeric_column], synthetic_data[numeric_column]))
bin_edges = np.histogram_bin_edges(combined_data, bins="auto")

real_discrete_counts = np.histogram(real_data[numeric_column], bins=bin_edges)[0]
synthetic_discrete_counts = np.histogram(synthetic_data[numeric_column], bins=bin_edges)[0]

real_discrete_pdf = real_discrete_counts / sum(real_discrete_counts)
synthetic_discrete_pdf = synthetic_discrete_counts / sum(synthetic_discrete_counts)

distance = hellinger_distance(real_discrete_pdf, synthetic_discrete_pdf)
hellinger_distances.append(distance)

mean_hellinger_distance = np.mean(hellinger_distances).item()
hellinger_distance_standard_error = np.std(hellinger_distances, ddof=1).item() / np.sqrt(
len(hellinger_distances)
)

return {
"mean_hellinger_distance": mean_hellinger_distance,
"hellinger_standard_error": hellinger_distance_standard_error,
}
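
For orientation, a minimal usage sketch of the new metric (hypothetical column names and toy data, not part of the PR):

import pandas as pd

from midst_toolkit.evaluation.quality.mean_hellinger_distance import MeanHellingerDistance

# Toy frames: 'color' is an already-encoded categorical column, 'height' is numerical.
real = pd.DataFrame({"color": [0, 1, 0, 2, 1], "height": [1.2, 3.4, 2.2, 0.9, 2.8]})
synthetic = pd.DataFrame({"color": [0, 1, 1, 2, 2], "height": [1.0, 3.1, 2.5, 1.1, 2.6]})

metric = MeanHellingerDistance(
    categorical_columns=["color"],
    numerical_columns=["height"],
    do_preprocess=False,  # the toy data is already numerically encoded
)
scores = metric.compute(real, synthetic)
# scores["mean_hellinger_distance"]: average per-column distance in [0, 1]; lower is better.
# scores["hellinger_standard_error"]: standard error of that mean (NaN if only one column contributes).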
106 changes: 106 additions & 0 deletions src/midst_toolkit/evaluation/quality/mean_propensity_mse.py
@@ -0,0 +1,106 @@
import pandas as pd
from syntheval.metrics.utility.metric_propensity_mse import PropensityMeanSquaredError

from midst_toolkit.evaluation.metrics_base import SynthEvalQualityMetric


class MeanPropensityMeanSquaredError(SynthEvalQualityMetric):
def __init__(
self,
categorical_columns: list[str],
numerical_columns: list[str],
do_preprocess: bool = False,
folds: int = 5,
max_iterations: int = 100,
solver: str = "liblinear",
):
"""
This class measures how well a ``LogisticRegression`` model from sklearn (as implemented in SynthEval) can
distinguish between real and synthetic data. The classification model is trained on a subset of the two data
sources and then applied to a validation split of the mixed data, created through cross-validation folds. The
average pMSE for synthetic vs. real predictions and macro F1 scores across the folds are reported along with
the standard error of these mean values.

Computation of pMSE is based on the formula in:

Woo, M., Reiter, J.P., Oganian, A., Karr, A.F.: Global measures of data utility for microdata masked for
disclosure limitation. J. Priv. Confidentiality 1(1) (2009) https://doi.org/10.29012/jpc.v1i1.568
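
In the usual notation (a sketch of the standard form, not quoted verbatim from the paper): with $N$ combined
records, $\\hat{p}_i$ the model's predicted propensity for record $i$'s class, and $c$ that class's share of
the combined data,
$$\\text{pMSE} = \\frac{1}{N} \\sum_{i=1}^{N} (\\hat{p}_i - c)^2$$.
With balanced data, $c = 0.5$, which yields the 0.25 worst case noted below.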

NOTE: Categorical variables need to be encoded before training the classifier. This can be accomplished by
preprocessing before calling ``compute`` or by setting ``do_preprocess`` to True. Note that if
``do_preprocess`` is True, the default SynthEval pipeline is used, which performs ``OrdinalEncoding`` for
categorical columns and ``MinMaxScaling`` for numerical columns.

- A smaller pMSE is better. In cases where the two datasets are balanced in size, 0.25 is worst case.
- Higher Macro F1 is better.

Args:
categorical_columns: Column names corresponding to the categorical variables of any provided dataframe.
numerical_columns: Column names corresponding to the numerical variables of any provided dataframe.
do_preprocess: Whether or not to preprocess the dataframes with the default pipeline used by SynthEval.
Defaults to False.
folds: Number of cross-validation folds for training/evaluating the LogisticRegression classifier used to
establish a stable estimate of the pMSE. Defaults to 5.
max_iterations: Maximum number of iterations for the regression fitting. Defaults to 100.
solver: Kind of solver used to fit the ``LogisticRegression`` model. Options coincide with those of the
sklearn ``LogisticRegression`` implementation. Defaults to 'liblinear'.
"""
super().__init__(categorical_columns, numerical_columns, do_preprocess)
self.all_columns = categorical_columns + numerical_columns
self.folds = folds
self.max_iterations = max_iterations
self.solver = solver

def compute(self, real_data: pd.DataFrame, synthetic_data: pd.DataFrame) -> dict[str, float]:
"""
Computes how well a LogisticRegression model from sklearn (as implemented in SynthEval) can distinguish between
real and synthetic data. The classification model is trained on a subset of the two data sources and then
applied to a validation split of the mixed data, created through cross-validation on the combination of the
two datasets. The average pMSE of the predictions (0 = synthetic, 1 = real) and macro F1 scores across the
folds are reported along with the standard error of these mean values.

NOTE: Categorical variables need to be encoded before training the classifier. This can be accomplished by
preprocessing before calling ``compute`` or by setting ``do_preprocess`` to True. Note that if
``do_preprocess`` is True, the default SynthEval pipeline is used, which performs ``OrdinalEncoding`` for
categorical columns and ``MinMaxScaling`` for numerical columns.

Args:
real_data: Real data to which the synthetic data may be compared. In many cases this will be data used
to TRAIN the model that generated the synthetic data, but not always.
synthetic_data: Synthetically generated data whose quality is to be assessed.

Returns:
The mean pMSE and macro F1 scores for a LogisticRegression model. These values are keyed by 'avg_pmse' and
'avg_macro_f1_score' respectively. The standard errors associated with these mean values are reported under
the keys 'pmse_standard_error' and 'macro_f1_standard_error' as well.
"""
if self.do_preprocess:
real_data, synthetic_data = self.preprocess(real_data, synthetic_data)

# NOTE: The SynthEval PropensityMeanSquaredError class ignores column specifications by default. However, for
# other classes (correlation_matrix_difference for example), specifying fewer than all of the columns restricts
# the score computation to just those columns. To make this consistent we do that here, before passing to the
# SynthEval class.
filtered_real_data = real_data[self.all_columns]
filtered_synthetic_data = synthetic_data[self.all_columns]

# SynthEval also ASSUMES that neither provided dataframe has a column called 'real', because it will
# attach another column with that name, so we throw an error here if the column already exists.
assert "real" not in filtered_real_data.columns, "A column called 'real' already exists in the dataframe."
assert "real" not in filtered_synthetic_data.columns, "A column called 'real' already exists in the dataframe."

self.syntheval_metric = PropensityMeanSquaredError(
real_data=filtered_real_data,
synt_data=filtered_synthetic_data,
hout_data=None,
cat_cols=self.categorical_columns,
num_cols=self.numerical_columns,
do_preprocessing=False,
verbose=False,
)
result = self.syntheval_metric.evaluate(self.folds, self.max_iterations, self.solver)
result["avg_pmse"] = result.pop("avg pMSE")
result["pmse_standard_error"] = result.pop("pMSE err")
result["avg_macro_f1_score"] = result.pop("avg acc")
result["macro_f1_standard_error"] = result.pop("acc err")
return result
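
Again for orientation, a hedged usage sketch (toy data, not part of the PR; folds kept small because the frames are tiny):

import pandas as pd

from midst_toolkit.evaluation.quality.mean_propensity_mse import MeanPropensityMeanSquaredError

real = pd.DataFrame(
    {"color": ["red", "blue", "red", "green", "blue", "red"], "height": [1.2, 3.4, 2.2, 0.9, 2.8, 1.7]}
)
synthetic = pd.DataFrame(
    {"color": ["red", "red", "blue", "green", "blue", "green"], "height": [1.0, 3.1, 2.5, 1.1, 2.6, 1.9]}
)

metric = MeanPropensityMeanSquaredError(
    categorical_columns=["color"],
    numerical_columns=["height"],
    do_preprocess=True,  # ordinal-encode categoricals, MinMax-scale numericals via SynthEval's pipeline
    folds=2,  # tiny toy data; real use would keep the default of 5
)
result = metric.compute(real, synthetic)
# result keys: "avg_pmse", "pmse_standard_error", "avg_macro_f1_score", "macro_f1_standard_error"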
115 changes: 115 additions & 0 deletions tests/unit/evaluation/quality/test_mean_hellinger_distance.py
@@ -0,0 +1,115 @@
import math

import numpy as np
import pandas as pd
import pytest

from midst_toolkit.evaluation.quality.mean_hellinger_distance import MeanHellingerDistance


REAL_DATA = pd.DataFrame(
{
"column_a": [1, 2, 3, 4, 5],
"column_b": [4, 5, 6, 7, 8],
"column_c": ["horse", "dog", "horse", "cat", "cat"],
"column_d": [-1, -2, -3, -2, -5],
}
)
SYNTHETIC_DATA = pd.DataFrame(
{
"column_a": [1, 2, 3, 4, 5],
"column_b": [4, 6, 6, -1, 1],
"column_c": ["cat", "dog", "horse", "cat", "cat"],
"column_d": [-1, -2, -3, -2, -50],
}
)

REAL_DATA_ENCODED = pd.DataFrame({"column_c": [1, 2, 1, 3, 3]})

SYNTHETIC_DATA_ENCODED = pd.DataFrame({"column_c": [3, 2, 1, 4, 3]})


def test_mean_hellinger_distance_with_no_preprocess() -> None:
metric = MeanHellingerDistance(
categorical_columns=["column_c"],
numerical_columns=[],
do_preprocess=False,
)

discrete_real = np.array([2 / 5, 1 / 5, 2 / 5])
# 4 gets collapsed into the last bin
discrete_synthetic = np.array([1 / 5, 1 / 5, 3 / 5])
target = (1.0 / math.sqrt(2)) * np.linalg.norm(np.sqrt(discrete_real) - np.sqrt(discrete_synthetic))
score = metric.compute(REAL_DATA_ENCODED, SYNTHETIC_DATA_ENCODED)
assert pytest.approx(target, abs=1e-8) == score["mean_hellinger_distance"]
assert np.isnan(score["hellinger_standard_error"])


def test_mean_hellinger_distance_with_preprocess_categorical() -> None:
metric = MeanHellingerDistance(
categorical_columns=["column_c"],
numerical_columns=[],
do_preprocess=True,
)

# Should match test_mean_hellinger_distance_with_no_preprocess once preprocessing has run
discrete_real = np.array([2 / 5, 1 / 5, 2 / 5])
# 4 gets collapsed into the last bin
discrete_synthetic = np.array([1 / 5, 1 / 5, 3 / 5])
target = (1.0 / math.sqrt(2)) * np.linalg.norm(np.sqrt(discrete_real) - np.sqrt(discrete_synthetic))
score = metric.compute(REAL_DATA, SYNTHETIC_DATA)
assert pytest.approx(target, abs=1e-8) == score["mean_hellinger_distance"]
assert np.isnan(score["hellinger_standard_error"])


def test_mean_hellinger_distance_with_preprocess() -> None:
metric = MeanHellingerDistance(
categorical_columns=[],
numerical_columns=["column_a", "column_b", "column_d"],
do_preprocess=True,
)
# Preprocessing only MinMax-scales the numerical columns, which should leave the binned distributions unchanged
score = metric.compute(REAL_DATA, SYNTHETIC_DATA)
assert pytest.approx(0.3598897091778779, abs=1e-8) == score["mean_hellinger_distance"]
assert pytest.approx(0.18772239774180174, abs=1e-8) == score["hellinger_standard_error"]


def test_one_column_left_off() -> None:
metric = MeanHellingerDistance(
categorical_columns=["column_c"],
numerical_columns=["column_a", "column_b"],
do_preprocess=True,
)

# Make sure the computation excludes column_d, which was left out of the column lists.
target = 1 / 3 * (0.16510402468972515 + 0.0 + 0.6324555320336758)
score = metric.compute(REAL_DATA, SYNTHETIC_DATA)
assert pytest.approx(target, abs=1e-8) == score["mean_hellinger_distance"]


def test_mean_hellinger_distance_no_numericals() -> None:
metric = MeanHellingerDistance(
categorical_columns=["column_b", "column_c"],
numerical_columns=[],
do_preprocess=True,
)

# Everything should still work with an empty numerical list
target = 1 / 2 * (0.3422824674525135 + 0.16510402468972515)
score = metric.compute(REAL_DATA, SYNTHETIC_DATA)
assert pytest.approx(target, abs=1e-8) == score["mean_hellinger_distance"]


def test_mean_hellinger_distance_do_not_include_numericals() -> None:
metric = MeanHellingerDistance(
categorical_columns=["column_b", "column_c"],
numerical_columns=["column_a", "column_d"],
do_preprocess=True,
include_numerical_columns=False,
)

# Should be the same as test_mean_hellinger_distance_no_numericals since we're saying we do not want to include
# numerical columns in the computations.
target = 1 / 2 * (0.3422824674525135 + 0.16510402468972515)
score = metric.compute(REAL_DATA, SYNTHETIC_DATA)
assert pytest.approx(target, abs=1e-8) == score["mean_hellinger_distance"]