From adae16e96d9edfe44cdf792ecefab11e79956ccd Mon Sep 17 00:00:00 2001 From: David Emerson <43939939+emersodb@users.noreply.github.com> Date: Wed, 17 Sep 2025 09:27:45 -0400 Subject: [PATCH 1/5] First checkin of hellinger and pmse implementations --- pyproject.toml | 1 + .../quality/mean_hellinger_distance.py | 131 ++++++++++++++++++ .../evaluation/quality/mean_propensity_mse.py | 105 ++++++++++++++ .../quality/test_mean_hellinger_distance.py | 115 +++++++++++++++ .../quality/test_mean_propensity_mse.py | 82 +++++++++++ 5 files changed, 434 insertions(+) create mode 100644 src/midst_toolkit/evaluation/quality/mean_hellinger_distance.py create mode 100644 src/midst_toolkit/evaluation/quality/mean_propensity_mse.py create mode 100644 tests/unit/evaluation/quality/test_mean_hellinger_distance.py create mode 100644 tests/unit/evaluation/quality/test_mean_propensity_mse.py diff --git a/pyproject.toml b/pyproject.toml index bf8af1b6..9df19160 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -122,6 +122,7 @@ ignore = [ "D104", # Ignore package level docstrings requirement "D205", # 1 blank line required between summary line and description "D212", # Multi-line docstring summary should start at the first line + "D301", # r-strings for docstrings with backslashes "PLR2004", # Replace magic number with named constant "PLR0913", # Too many arguments "COM812", # Missing trailing comma diff --git a/src/midst_toolkit/evaluation/quality/mean_hellinger_distance.py b/src/midst_toolkit/evaluation/quality/mean_hellinger_distance.py new file mode 100644 index 00000000..374761d7 --- /dev/null +++ b/src/midst_toolkit/evaluation/quality/mean_hellinger_distance.py @@ -0,0 +1,131 @@ +import numpy as np +import pandas as pd + +from midst_toolkit.evaluation.metrics_base import SynthEvalQualityMetric + + +def hellinger_distance(discrete_distribution_1: np.ndarray, discrete_distribution_2: np.ndarray) -> float: + """ + Compute the empirical Hellinger distance between two discrete probability distributions. Hellinger distance for + discrete probability distributions $p$ and $q$ is expressed as + $$\\frac{1}{2} \\cdot \\Vert \\sqrt{p} - \\sqrt{q} \\Vert_2$$. + + Args: + discrete_distribution_1: First discrete distribution for distance computation + discrete_distribution_2: Second discrete distribution for distance computation + + Returns: + Empirical Hellinger distance between the two distributions. + """ + sum_1 = np.sum(discrete_distribution_1) + sum_2 = np.sum(discrete_distribution_2) + assert np.isclose(sum_1, 1.0, atol=1e-4), f"Distribution 1 is not a probability distribution: Sum is {sum_1}" + assert np.isclose(sum_2, 1.0, atol=1e-4), f"Distribution 2 is not a probability distribution: Sum is {sum_2}" + + sqrt_pdf_1 = np.sqrt(discrete_distribution_1) + sqrt_pdf_2 = np.sqrt(discrete_distribution_2) + difference = sqrt_pdf_1 - sqrt_pdf_2 + return 1 / np.sqrt(2) * np.linalg.norm(difference) + + +class MeanHellingerDistance(SynthEvalQualityMetric): + def __init__( + self, + categorical_columns: list[str], + numerical_columns: list[str], + do_preprocess: bool = False, + include_numerical_columns: bool = True, + ): + """ + This class computes the DISCRETE Hellinger Distance between corresponding columns of real and synthetic + dataframes. + + NOTE: The implementation here is inspired by the SynthEval implementation of the Mean Hellinger Distance + but fixes a crucial issue. Their way of computing bins for the discrete histograms of numerical values is + flawed. 
Here, we make use of the 'auto' binning schemes in numpy to do a better job binning such values into + histograms + + - For a categorical column, the number of bins for the discrete distributions is established by computing + the unique values in the column for the REAL DATA. This can have some side effects when the encodings of + the categorical values is not contiguous ([1, 2, 10]) or there are different values in the synthetic + dataframe. + - For numerical columns, binning is determined by the numpy ``histogram_bin_edges`` function and takes into + account values from BOTH dataframes. + + The final score is the average of the distances computed across columns. Lower is better. + + NOTE: The categorical columns MUST BE PREPROCESSED into numerical values otherwise the evaluation will fail. + This function will NOT WORK WITH ONE-HOT ENCODINGS. This can be achieved by separately preprocessing the + dataframes before calling compute or by setting ``do_preprocess`` to True + + Args: + categorical_columns: Column names corresponding to the categorical variables of any provided dataframe. + numerical_columns: Column names corresponding to the numerical variables of any provided dataframe. + do_preprocess: Whether or not to preprocess the dataframes with the default pipeline used by SynthEval. + Defaults to False. + include_numerical_columns: Whether to include any provided numerical columns in the Hellinger distance + computation. Numerical column values are binned to create discrete distributions, which may or may not + be something you want to do. + """ + super().__init__(categorical_columns, numerical_columns, do_preprocess) + + self.include_numerical_columns = include_numerical_columns + + def compute(self, real_data: pd.DataFrame, synthetic_data: pd.DataFrame) -> dict[str, float]: + """ + Computes the DISCRETE Hellinger Distance between corresponding columns of real and synthetic dataframes. For a + categorical column, the range of values for the discrete distributions is established by computing the unique + values in the column for the REAL DATA. For numerical columns, a binning procedure based on numpy's + ``histogram_bin_edges`` with binning strategy set to 'auto' is used. + + The final score is the average of the distances computed across columns. Lower is better. + + NOTE: The categorical columns MUST BE PREPROCESSED into numerical values otherwise the evaluation will fail. + This function will NOT WORK WITH ONE-HOT ENCODINGS. This can be achieved by separately preprocessing the + dataframes before calling compute or by setting ``do_preprocess`` to True + + Args: + real_data: Real data to which the synthetic data may be compared. In many cases this will be data used + to TRAIN the model that generated the synthetic data, but not always. + synthetic_data: Synthetically generated data whose quality is to be assessed. + + Returns: + The mean of the individual Hellinger distances between each of the corresponding columns of the real and + synthetic dataframes. This mean is keyed by 'mean_hellinger_distance' and is reported along with the + "standard error" associated with that mean keyed under 'hellinger_standard_error'. 
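+ For example, with per-column discrete distributions p = [0.4, 0.2, 0.4] and q = [0.2, 0.2, 0.6], the per-column distance is (1 / sqrt(2)) * ||sqrt(p) - sqrt(q)||_2, approximately 0.1651 (the 'column_c' value exercised in the unit tests).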
+ """ + if self.do_preprocess: + real_data, synthetic_data = self.preprocess(real_data, synthetic_data) + + hellinger_distances = [] + + for category_column in self.categorical_columns: + class_num = len(np.unique(real_data[category_column])) + + real_discrete_counts = np.histogram(real_data[category_column], bins=class_num)[0] + synthetic_discrete_counts = np.histogram(synthetic_data[category_column], bins=class_num)[0] + + real_discrete_pdf = real_discrete_counts / sum(real_discrete_counts) + synthetic_discrete_pdf = synthetic_discrete_counts / sum(synthetic_discrete_counts) + + distance = hellinger_distance(real_discrete_pdf, synthetic_discrete_pdf) + hellinger_distances.append(distance) + + if self.include_numerical_columns: + for numeric_column in self.numerical_columns: + combined_data = np.concatenate((real_data[numeric_column], synthetic_data[numeric_column])) + bin_edges = np.histogram_bin_edges(combined_data, bins="auto") + + real_discrete_counts = np.histogram(real_data[numeric_column], bins=bin_edges)[0] + synthetic_discrete_counts = np.histogram(synthetic_data[numeric_column], bins=bin_edges)[0] + + real_discrete_pdf = real_discrete_counts / sum(real_discrete_counts) + synthetic_discrete_pdf = synthetic_discrete_counts / sum(synthetic_discrete_counts) + + distance = hellinger_distance(real_discrete_pdf, synthetic_discrete_pdf) + hellinger_distances.append(distance) + + return { + "mean_hellinger_distance": np.mean(hellinger_distances), + "hellinger_standard_error": np.std(hellinger_distances, ddof=1) / np.sqrt(len(hellinger_distances)), + } diff --git a/src/midst_toolkit/evaluation/quality/mean_propensity_mse.py b/src/midst_toolkit/evaluation/quality/mean_propensity_mse.py new file mode 100644 index 00000000..26544179 --- /dev/null +++ b/src/midst_toolkit/evaluation/quality/mean_propensity_mse.py @@ -0,0 +1,105 @@ +import pandas as pd +from syntheval.metrics.utility.metric_propensity_mse import PropensityMeanSquaredError + +from midst_toolkit.evaluation.metrics_base import SynthEvalQualityMetric + + +class MeanPropensityMeanSquaredError(SynthEvalQualityMetric): + def __init__( + self, + categorical_columns: list[str], + numerical_columns: list[str], + do_preprocess: bool = False, + folds: int = 5, + max_iterations: int = 100, + solver: str = "liblinear", + ): + """ + This class measures how well a ``LogisticRegression`` model from sklearn (as implemented in SynthEval) can + distinguish between real and synthetic data. The classification model is trained on a subset of the two data + sources and then applied to a heldout portion of the mixed data. The average pMSE for synthetic vs. real + predictions and macro F1 scores across the folds are reported along with the standard error of these mean + values. + + Computation of pMSE is based on the formula in: + + Woo, M., Reiter, J.P., Oganian, A., Karr, A.F.: Global measures of data utility for microdata masked for + disclosure limitation. J. Priv. Confidentiality 1(1) (2009) https://doi.org/10.29012/jpc.v1i1.568 + + NOTE: Categorical variables need to be encoded before training the classifier. This can be accomplished by + preprocessing before calling ``compute`` or by setting ``do_preprocess`` to True. Note that if + ``do_preprocess`` is True, the default Syntheval pipeline is used, which does NOT one-hot encode the + categoricals. + + - A smaller pMSE is better. In cases where the two datasets are balanced in size, 0.25 is worst case. + - Higher Macro F1 is better. 
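+ Concretely, the Woo et al. formula referenced above computes pMSE = (1/N) * sum_i (p_i - c)^2 over the N combined records, where p_i is the predicted propensity of record i and c is the fraction of synthetic records in the combined data; a perfectly separating classifier on balanced data (c = 0.5) therefore yields the worst case of 0.25.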
+ + Args: + categorical_columns: Column names corresponding to the categorical variables of any provided dataframe. + numerical_columns: Column names corresponding to the numerical variables of any provided dataframe. + do_preprocess: Whether or not to preprocess the dataframes with the default pipeline used by SynthEval. + Defaults to False. + folds: Number of cross-validation folds for training/evaluating the LogisticRegression classifier used to + establish a stable estimate of the pMSE. Defaults to 5. + max_iterations: Maximum number of iterations for the regression fitting. Defaults to 100. + solver: Kind of solver used to fit the ``LogisticRegression`` model. Options coincide with those of the + sklearn ``LogisticRegression`` implementation. Defaults to 'liblinear'. + """ + super().__init__(categorical_columns, numerical_columns, do_preprocess) + self.all_columns = categorical_columns + numerical_columns + self.folds = folds + self.max_iterations = max_iterations + self.solver = solver + + def compute(self, real_data: pd.DataFrame, synthetic_data: pd.DataFrame) -> dict[str, float]: + """ + Computes how well a LogisticRegression model from sklearn (as implemented in SynthEval) can distinguish between + real and synthetic data. The classification model is trained on a subset of the two data sources and then + applied to a heldout portion of the mixed data. The average pMSE of the 0 - synthetic, 1 - real predictions and + macro F1 scores across the folds are reported along with the standard error of these mean values. + + NOTE: Categorical variables need to be encoded before training the classifier. This can be accomplished by + preprocessing before calling ``compute`` or by setting ``do_preprocess`` to True. Note that if + ``do_preprocess`` is True, the default Syntheval pipeline is used, which does NOT one-hot encode the + categoricals. + + Args: + real_data: Real data to which the synthetic data may be compared. In many cases this will be data used + to TRAIN the model that generated the synthetic data, but not always. + synthetic_data: Synthetically generated data whose quality is to be assessed. + + Returns: + The mean pMSE and macro F1 scores for a LogisticRegression model. These values are keyed by 'avg_pmse' and + 'avg_macro_f1_score' respectively. The standard errors associated with these mean values are reported under + the keys 'pmse_standard_error' and 'macro_f1_standard_error' as well. + """ + if self.do_preprocess: + real_data, synthetic_data = self.preprocess(real_data, synthetic_data) + + # NOTE: The SynthEval MutualInformation class ignores column specifications by default. However, for + # other classes (correlation_matrix_difference for example), specifying less than all of the columns restricts + # the score computation to just those columns. To make this consistent we do that here, before passing to the + # SynthEval class. + filtered_real_data = real_data[self.all_columns] + filtered_synthetic_data = synthetic_data[self.all_columns] + + # Syntheval also ASSUMES you don't have a column in both provided dataframes called 'real' because it will + # attach another column with the same name, so we throw an error here if the column already exists. + assert "real" not in filtered_real_data.columns, "A column called 'real' already exists in the dataframe." + assert "real" not in filtered_synthetic_data.columns, "A column called 'real' already exists in the dataframe." 
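+ # The SynthEval metric constructed below runs its own cross-validation over self.folds folds when evaluate()
+ # is called; do_preprocessing is disabled here because any requested preprocessing was already applied above,
+ # and the SynthEval result keys are renamed to this toolkit's naming convention after evaluation.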
+ + self.syntheval_metric = PropensityMeanSquaredError( + real_data=filtered_real_data, + synt_data=filtered_synthetic_data, + hout_data=None, + cat_cols=self.categorical_columns, + num_cols=self.numerical_columns, + do_preprocessing=False, + verbose=False, + ) + result = self.syntheval_metric.evaluate(self.folds, self.max_iterations, self.solver) + result["avg_pmse"] = result.pop("avg pMSE") + result["pmse_standard_error"] = result.pop("pMSE err") + result["avg_macro_f1_score"] = result.pop("avg acc") + result["macro_f1_standard_error"] = result.pop("acc err") + return result diff --git a/tests/unit/evaluation/quality/test_mean_hellinger_distance.py b/tests/unit/evaluation/quality/test_mean_hellinger_distance.py new file mode 100644 index 00000000..1524ea07 --- /dev/null +++ b/tests/unit/evaluation/quality/test_mean_hellinger_distance.py @@ -0,0 +1,115 @@ +import math + +import numpy as np +import pandas as pd +import pytest + +from midst_toolkit.evaluation.quality.mean_hellinger_distance import MeanHellingerDistance + + +REAL_DATA = pd.DataFrame( + { + "column_a": [1, 2, 3, 4, 5], + "column_b": [4, 5, 6, 7, 8], + "column_c": ["horse", "dog", "horse", "cat", "cat"], + "column_d": [-1, -2, -3, -2, -5], + } +) +SYNTHETIC_DATA = pd.DataFrame( + { + "column_a": [1, 2, 3, 4, 5], + "column_b": [4, 6, 6, -1, 1], + "column_c": ["cat", "dog", "horse", "cat", "cat"], + "column_d": [-1, -2, -3, -2, -50], + } +) + +REAL_DATA_ENCODED = pd.DataFrame({"column_c": [1, 2, 1, 3, 3]}) + +SYNTHETIC_DATA_ENCODED = pd.DataFrame({"column_c": [3, 2, 1, 4, 3]}) + + +def test_mean_hellinger_distance_with_no_preprocess() -> None: + metric = MeanHellingerDistance( + categorical_columns=["column_c"], + numerical_columns=[], + do_preprocess=False, + ) + + discrete_real = np.array([2 / 5, 1 / 5, 2 / 5]) + # 4 gets collapsed into the last bin + synthetic_real = np.array([1 / 5, 1 / 5, 3 / 5]) + target = (1.0 / math.sqrt(2)) * np.linalg.norm(np.sqrt(discrete_real) - np.sqrt(synthetic_real)) + score = metric.compute(REAL_DATA_ENCODED, SYNTHETIC_DATA_ENCODED) + assert pytest.approx(target, abs=1e-8) == score["mean_hellinger_distance"] + assert np.isnan(score["hellinger_standard_error"]) + + +def test_mean_hellinger_distance_with_preprocess_categorical() -> None: + metric = MeanHellingerDistance( + categorical_columns=["column_c"], + numerical_columns=[], + do_preprocess=True, + ) + + # Should be the same as after test_mean_hellinger_distance_with_no_preprocess running preprocessing + discrete_real = np.array([2 / 5, 1 / 5, 2 / 5]) + # 4 gets collapsed into the last bin + synthetic_real = np.array([1 / 5, 1 / 5, 3 / 5]) + target = (1.0 / math.sqrt(2)) * np.linalg.norm(np.sqrt(discrete_real) - np.sqrt(synthetic_real)) + score = metric.compute(REAL_DATA, SYNTHETIC_DATA) + assert pytest.approx(target, abs=1e-8) == score["mean_hellinger_distance"] + assert np.isnan(score["hellinger_standard_error"]) + + +def test_mean_hellinger_distance_with_preprocess() -> None: + metric = MeanHellingerDistance( + categorical_columns=[], + numerical_columns=["column_a", "column_b", "column_d"], + do_preprocess=True, + ) + # Should be the same, as preprocessing doesn't change the categorical MI + score = metric.compute(REAL_DATA, SYNTHETIC_DATA) + assert pytest.approx(0.3598897091778779, abs=1e-8) == score["mean_hellinger_distance"] + assert pytest.approx(0.18772239774180174, abs=1e-8) == score["hellinger_standard_error"] + + +def test_one_column_left_off() -> None: + metric = MeanHellingerDistance( + categorical_columns=["column_c"], + 
numerical_columns=["column_a", "column_b"], + do_preprocess=True, + ) + + # Make sure computation doesn't include the column that was not included. + target = 1 / 3 * (0.16510402468972515 + 0.0 + 0.6324555320336758) + score = metric.compute(REAL_DATA, SYNTHETIC_DATA) + assert pytest.approx(target, abs=1e-8) == score["mean_hellinger_distance"] + + +def test_mean_hellinger_distance_no_numericals() -> None: + metric = MeanHellingerDistance( + categorical_columns=["column_b", "column_c"], + numerical_columns=[], + do_preprocess=True, + ) + + # Everything should still work with an empty numerical list + target = 1 / 2 * (0.3422824674525135 + 0.16510402468972515) + score = metric.compute(REAL_DATA, SYNTHETIC_DATA) + assert pytest.approx(target, abs=1e-8) == score["mean_hellinger_distance"] + + +def test_mean_hellinger_distance_do_not_include_numericals() -> None: + metric = MeanHellingerDistance( + categorical_columns=["column_b", "column_c"], + numerical_columns=["column_a", "column_d"], + do_preprocess=True, + include_numerical_columns=False, + ) + + # Should be the same as test_mean_hellinger_distance_no_numericals since we're saying we do not want to include + # numerical columns in the computations. + target = 1 / 2 * (0.3422824674525135 + 0.16510402468972515) + score = metric.compute(REAL_DATA, SYNTHETIC_DATA) + assert pytest.approx(target, abs=1e-8) == score["mean_hellinger_distance"] diff --git a/tests/unit/evaluation/quality/test_mean_propensity_mse.py b/tests/unit/evaluation/quality/test_mean_propensity_mse.py new file mode 100644 index 00000000..8e06fc8c --- /dev/null +++ b/tests/unit/evaluation/quality/test_mean_propensity_mse.py @@ -0,0 +1,82 @@ +from random import choices + +import numpy as np +import pandas as pd +import pytest + +from midst_toolkit.common.random import set_all_random_seeds, unset_all_random_seeds +from midst_toolkit.evaluation.quality.mean_propensity_mse import MeanPropensityMeanSquaredError + + +def get_data() -> tuple[pd.DataFrame, pd.DataFrame]: + real_data = pd.DataFrame( + { + "column_a": 1.2 * np.random.randn(2500) + 1, + "column_b": 2.5 * np.random.randn(2500) - 1, + "column_c": choices(["cat", "horse", "dog"], weights=[0.25, 0.5, 0.25], k=2500), + "column_d": 1.5 * np.random.randn(2500) + 12, + } + ) + synthetic_data = pd.DataFrame( + { + "column_a": 1.1 * np.random.randn(2500) + 0.5, + "column_b": 2.2 * np.random.randn(2500) - 1, + "column_c": choices(["cat", "horse", "dog"], weights=[0.35, 0.35, 0.3], k=2500), + "column_d": 1.5 * np.random.randn(2500) + 10, + } + ) + return real_data, synthetic_data + + +def test_mean_propensity_mse_with_preprocess() -> None: + set_all_random_seeds(42) + + real_data, synthetic_data = get_data() + + metric = MeanPropensityMeanSquaredError( + categorical_columns=["column_c"], + numerical_columns=["column_a", "column_b", "column_d"], + do_preprocess=True, + ) + + score = metric.compute(real_data, synthetic_data) + assert pytest.approx(0.08442776888821232, abs=1e-8) == score["avg_pmse"] + assert pytest.approx(0.7566743656985974, abs=1e-8) == score["avg_macro_f1_score"] + unset_all_random_seeds() + + +def test_mean_propensity_mse_with_no_categorical() -> None: + set_all_random_seeds(42) + + real_data, synthetic_data = get_data() + + metric = MeanPropensityMeanSquaredError( + categorical_columns=[], + numerical_columns=["column_a", "column_b", "column_d"], + do_preprocess=True, + ) + + score = metric.compute(real_data, synthetic_data) + assert pytest.approx(0.08000946858157684, abs=1e-8) == score["avg_pmse"] + assert 
pytest.approx(0.7485073080661124, abs=1e-8) == score["avg_macro_f1_score"] + unset_all_random_seeds() + + +def test_mean_propensity_mse_with_no_numerical_and_shortcut() -> None: + set_all_random_seeds(42) + + real_data, synthetic_data = get_data() + real_data["column_e"] = 1 + synthetic_data["column_e"] = 0 + + metric = MeanPropensityMeanSquaredError( + categorical_columns=["column_c", "column_e"], + numerical_columns=[], + do_preprocess=True, + ) + + score = metric.compute(real_data, synthetic_data) + # pMSE should be close to 0.25 and F1 should be essentially 1 due to the shortcut. + assert pytest.approx(0.24374514497992789, abs=1e-8) == score["avg_pmse"] + assert pytest.approx(1.0, abs=1e-8) == score["avg_macro_f1_score"] + unset_all_random_seeds() From a790991e217002aec71b64eb1cb30225cff93d6c Mon Sep 17 00:00:00 2001 From: David Emerson <43939939+emersodb@users.noreply.github.com> Date: Wed, 17 Sep 2025 09:43:03 -0400 Subject: [PATCH 2/5] Fix typing issue --- .../evaluation/quality/mean_hellinger_distance.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/midst_toolkit/evaluation/quality/mean_hellinger_distance.py b/src/midst_toolkit/evaluation/quality/mean_hellinger_distance.py index 374761d7..41aecb3a 100644 --- a/src/midst_toolkit/evaluation/quality/mean_hellinger_distance.py +++ b/src/midst_toolkit/evaluation/quality/mean_hellinger_distance.py @@ -125,7 +125,12 @@ def compute(self, real_data: pd.DataFrame, synthetic_data: pd.DataFrame) -> dict distance = hellinger_distance(real_discrete_pdf, synthetic_discrete_pdf) hellinger_distances.append(distance) + mean_hellinger_distance = np.mean(hellinger_distances).item() + hellinger_distance_standard_error = np.std(hellinger_distances, ddof=1).item() / np.sqrt( + len(hellinger_distances) + ) + return { - "mean_hellinger_distance": np.mean(hellinger_distances), - "hellinger_standard_error": np.std(hellinger_distances, ddof=1) / np.sqrt(len(hellinger_distances)), + "mean_hellinger_distance": mean_hellinger_distance, + "hellinger_standard_error": hellinger_distance_standard_error, } From 59ea7f4fb3ee69c89b02a82880c622efebee4f3e Mon Sep 17 00:00:00 2001 From: David Emerson <43939939+emersodb@users.noreply.github.com> Date: Wed, 24 Sep 2025 11:09:41 -0400 Subject: [PATCH 3/5] New mypy flow and fixes to typing issues that were discovered --- .pre-commit-config.yaml | 11 ++--- mypy.ini | 38 +++++++++++++++ pyproject.toml | 26 ----------- run_mypy.sh | 3 ++ .../attacks/ensemble/process_split_data.py | 4 +- src/midst_toolkit/core/data_loaders.py | 29 ++++++++++-- src/midst_toolkit/core/logger.py | 18 ++++---- src/midst_toolkit/data_processing/utils.py | 4 +- .../privacy/distance_closest_record.py | 3 ++ .../quality/synthcity/dataloader.py | 19 ++++---- .../quality/synthcity/feature_encoder.py | 10 ++-- .../quality/synthcity/statistical_eval.py | 16 +++---- src/midst_toolkit/evaluation/utils.py | 2 +- .../models/clavaddpm/diffusion_utils.py | 20 ++++---- .../gaussian_multinomial_diffusion.py | 20 +++++--- src/midst_toolkit/models/clavaddpm/model.py | 46 +++++++++++-------- src/midst_toolkit/models/clavaddpm/sampler.py | 6 +-- src/midst_toolkit/models/clavaddpm/train.py | 4 +- 18 files changed, 167 insertions(+), 112 deletions(-) create mode 100644 mypy.ini create mode 100755 run_mypy.sh diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3c46dd45..376e0bfc 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -34,13 +34,12 @@ repos: types_or: [python, jupyter] - repo: 
https://github.com/pre-commit/mirrors-mypy - rev: v1.16.0 + rev: v1.18.2 hooks: - - id: mypy - entry: python3 -m mypy --config-file pyproject.toml - language: system - types: [python] - exclude: "tests" + - id: mypy + name: mypy + entry: ./run_mypy.sh + language: system - repo: local hooks: diff --git a/mypy.ini b/mypy.ini new file mode 100644 index 00000000..0a7ee8a1 --- /dev/null +++ b/mypy.ini @@ -0,0 +1,38 @@ +[mypy] +mypy_path=src +follow_imports = normal +ignore_missing_imports = False +install_types = True +pretty = True +non_interactive = True +disallow_untyped_defs = True +no_implicit_optional = True +check_untyped_defs = True +allow_untyped_decorators = False +allow_incomplete_defs = False +warn_redundant_casts = True +warn_unused_ignores = True +implicit_reexport = False +strict_equality = True +extra_checks = True +warn_unused_configs = True +allow_subclassing_any = False +exclude = (venv|examples/tutorial/*|tests) + +[mypy-sklearn.*] +ignore_missing_imports = True + +[mypy-syntheval.*] +ignore_missing_imports = True + +[mypy-opacus.*] +ignore_missing_imports = True + +[mypy-nltk.*] +ignore_missing_imports = True + +[mypy-scipy.*] +ignore_missing_imports = True + +[mypy-category_encoders.*] +ignore_missing_imports = True diff --git a/pyproject.toml b/pyproject.toml index bf8af1b6..d44996dd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -53,32 +53,6 @@ docs = [ [tool.uv] default-groups = ["dev", "docs"] -[tool.mypy] -follow_imports = "normal" -ignore_missing_imports = false -install_types = true -pretty = true -non_interactive = true -disallow_untyped_defs = false -no_implicit_optional = true -check_untyped_defs = true -namespace_packages = true -explicit_package_bases = true -warn_unused_configs = true -allow_subclassing_any = false -allow_untyped_calls = true -allow_incomplete_defs = false -allow_untyped_decorators = false -warn_redundant_casts = true -warn_unused_ignores = true -implicit_reexport = false -strict_equality = true -extra_checks = true -mypy_path = "src" -files = ["src", "examples"] -exclude = [ - "examples/tutorial/.*" -] [tool.ruff] include = ["*.py", "pyproject.toml"] diff --git a/run_mypy.sh b/run_mypy.sh new file mode 100755 index 00000000..112c97da --- /dev/null +++ b/run_mypy.sh @@ -0,0 +1,3 @@ +#!/bin/sh + +mypy --config-file ./mypy.ini . 
diff --git a/src/midst_toolkit/attacks/ensemble/process_split_data.py b/src/midst_toolkit/attacks/ensemble/process_split_data.py index dee59279..1e9b5e98 100644 --- a/src/midst_toolkit/attacks/ensemble/process_split_data.py +++ b/src/midst_toolkit/attacks/ensemble/process_split_data.py @@ -110,7 +110,7 @@ def generate_train_test_challenge_splits( # Shuffle data df_val = df_val.sample(frac=1, random_state=random_seed).reset_index(drop=True) - y_val = df_val["is_train"].values + y_val = df_val["is_train"].to_numpy() df_val = df_val.drop(columns=["is_train"]) # Test set @@ -139,7 +139,7 @@ def generate_train_test_challenge_splits( df_test = df_test.sample(frac=1, random_state=random_seed).reset_index(drop=True) - y_test = df_test["is_train"].values + y_test = df_test["is_train"].to_numpy() df_test = df_test.drop(columns=["is_train"]) return df_val, y_val, df_test, y_test diff --git a/src/midst_toolkit/core/data_loaders.py b/src/midst_toolkit/core/data_loaders.py index 81513f7b..cabbf9b2 100644 --- a/src/midst_toolkit/core/data_loaders.py +++ b/src/midst_toolkit/core/data_loaders.py @@ -6,7 +6,9 @@ import pandas as pd -def load_multi_table(data_dir, verbose=True): +def load_multi_table( + data_dir: str, verbose: bool = True +) -> tuple[dict[str, Any], list[tuple[str, str]], dict[str, Any]]: dataset_meta = json.load(open(os.path.join(data_dir, "dataset_meta.json"), "r")) relation_order = dataset_meta["relation_order"] @@ -71,7 +73,7 @@ def pipeline_process_data( ratio: float = 0.9, save: bool = False, verbose: bool = True, -) -> tuple[dict[str, Any], dict[str, Any]]: +) -> tuple[dict[str, dict[str, Any]], dict[str, Any]]: # ruff: noqa: D103 num_data = data_df.shape[0] @@ -93,15 +95,18 @@ def pipeline_process_data( num_train = int(num_data * ratio) num_test = num_data - num_train + test_df: pd.DataFrame | None = None + if ratio < 1: train_df, test_df, seed = train_val_test_split(data_df, cat_columns, num_train, num_test) else: train_df = data_df.copy() - train_df.columns = range(len(train_df.columns)) + train_df.columns = list(range(len(train_df.columns))) if ratio < 1: - test_df.columns = range(len(test_df.columns)) + assert test_df is not None + test_df.columns = list(range(len(test_df.columns))) col_info: dict[Any, Any] = {} @@ -131,6 +136,7 @@ def pipeline_process_data( train_df.rename(columns=idx_name_mapping, inplace=True) if ratio < 1: + assert test_df is not None test_df.rename(columns=idx_name_mapping, inplace=True) for col in num_columns: @@ -139,6 +145,7 @@ def pipeline_process_data( train_df.loc[train_df[col] == "?", col] = "nan" if ratio < 1: + assert test_df is not None for col in num_columns: test_df.loc[test_df[col] == "?", col] = np.nan for col in cat_columns: @@ -148,7 +155,12 @@ def pipeline_process_data( X_cat_train = train_df[cat_columns].to_numpy() y_train = train_df[target_columns].to_numpy() + X_num_test: np.ndarray | None = None + X_cat_test: np.ndarray | None = None + y_test: np.ndarray | None = None + if ratio < 1: + assert test_df is not None X_num_test = test_df[num_columns].to_numpy().astype(np.float32) X_cat_test = test_df[cat_columns].to_numpy() y_test = test_df[target_columns].to_numpy() @@ -160,6 +172,7 @@ def pipeline_process_data( np.save(f"{save_dir}/y_train.npy", y_train) if ratio < 1: + assert X_num_test is not None and X_cat_test is not None and y_test is not None np.save(f"{save_dir}/X_num_test.npy", X_num_test) np.save(f"{save_dir}/X_cat_test.npy", X_cat_test) np.save(f"{save_dir}/y_test.npy", y_test) @@ -167,12 +180,14 @@ def pipeline_process_data( 
train_df[num_columns] = train_df[num_columns].astype(np.float32) if ratio < 1: + assert test_df is not None test_df[num_columns] = test_df[num_columns].astype(np.float32) if save: train_df.to_csv(f"{save_dir}/train.csv", index=False) if ratio < 1: + assert test_df is not None test_df.to_csv(f"{save_dir}/test.csv", index=False) if not os.path.exists(f"synthetic/{name}"): @@ -181,12 +196,14 @@ def pipeline_process_data( train_df.to_csv(f"synthetic/{name}/real.csv", index=False) if ratio < 1: + assert test_df is not None test_df.to_csv(f"synthetic/{name}/test.csv", index=False) info["column_names"] = column_names info["train_num"] = train_df.shape[0] if ratio < 1: + assert test_df is not None info["test_num"] = test_df.shape[0] info["idx_mapping"] = idx_mapping @@ -227,6 +244,7 @@ def pipeline_process_data( if verbose: if ratio < 1: + assert test_df is not None str_shape = "Train dataframe shape: {}, Test dataframe shape: {}, Total dataframe shape: {}".format( train_df.shape, test_df.shape, data_df.shape ) @@ -251,7 +269,7 @@ def pipeline_process_data( # print('Num', num) # print('Cat', cat) - data = { + data: dict[str, dict[str, Any]] = { "df": {"train": train_df}, "numpy": { "X_num_train": X_num_train, @@ -261,6 +279,7 @@ def pipeline_process_data( } if ratio < 1: + assert test_df is not None and X_num_test is not None and X_cat_test is not None and y_test is not None data["df"]["test"] = test_df data["numpy"]["X_num_test"] = X_num_test data["numpy"]["X_cat_test"] = X_cat_test diff --git a/src/midst_toolkit/core/logger.py b/src/midst_toolkit/core/logger.py index b76ce8b7..1775163a 100644 --- a/src/midst_toolkit/core/logger.py +++ b/src/midst_toolkit/core/logger.py @@ -15,7 +15,7 @@ import time import warnings from collections import defaultdict -from collections.abc import Generator, Iterable +from collections.abc import Callable, Generator, Iterable from contextlib import contextmanager from typing import IO, Any @@ -55,7 +55,7 @@ def __init__(self, filename_or_file: str | IO[str]): self.file = filename_or_file # type: ignore[assignment] self.own_file = False - def writekvs(self, kvs): + def writekvs(self, kvs: dict[str, Any]) -> None: # Create strings for printing key2str = {} for key, val in sorted(kvs.items()): @@ -84,7 +84,7 @@ def _truncate(self, s: str) -> str: maxlen = 30 return s[: maxlen - 3] + "..." if len(s) > maxlen else s - def writeseq(self, seq): + def writeseq(self, seq: Iterable[str]) -> None: seq = list(seq) for i, elem in enumerate(seq): self.file.write(elem) @@ -103,7 +103,7 @@ def __init__(self, filename: str): self.file = open(filename, "wt") # ruff: noqa: SIM115 - def writekvs(self, kvs): + def writekvs(self, kvs: dict[str, Any]) -> None: for k, v in sorted(kvs.items()): if hasattr(v, "dtype"): kvs[k] = float(v) @@ -121,7 +121,7 @@ def __init__(self, filename: str): self.keys: list[str] = [] self.sep = "," - def writekvs(self, kvs): + def writekvs(self, kvs: dict[str, Any]) -> None: # Add our current row to the history extra_keys = list(kvs.keys() - self.keys) extra_keys.sort() @@ -297,7 +297,7 @@ def profile_kv(scopename: str) -> Generator[None, None, None]: get_current().name2val[logkey] += time.time() - tstart -def profile(n): +def profile(n: str) -> Callable: """ Usage. 
@@ -305,8 +305,8 @@ def profile(n): def my_func(): code """ - def decorator_with_name(func): - def func_wrapper(*args, **kwargs): + def decorator_with_name(func): # type: ignore + def func_wrapper(*args, **kwargs): # type: ignore with profile_kv(n): return func(*args, **kwargs) @@ -483,7 +483,7 @@ def reset() -> None: @contextmanager -def scoped_configure(dir=None, format_strs=None, comm=None): +def scoped_configure(dir=None, format_strs=None, comm=None): # type: ignore # ruff: noqa: D103 prevlogger = Logger.CURRENT configure(dir=dir, format_strs=format_strs, comm=comm) diff --git a/src/midst_toolkit/data_processing/utils.py b/src/midst_toolkit/data_processing/utils.py index f46b8b9d..9a940c64 100644 --- a/src/midst_toolkit/data_processing/utils.py +++ b/src/midst_toolkit/data_processing/utils.py @@ -43,7 +43,7 @@ def __init__( synthetic_data: pd.DataFrame, categorical_columns: list[str] | None, numerical_columns: list[str] | None, - holdout_data: pd.DataFrame = None, + holdout_data: pd.DataFrame | None = None, ) -> None: """ A class responsible for fitting encoders and scalers for categorical and numerical columns of dataframes, @@ -214,6 +214,6 @@ def is_column_type_numerical(dataframe: pd.DataFrame, column_name: str) -> bool: Returns: True if the column contains numerical values. False otherwise. """ - column_dtype = dataframe[column_name].dtype + column_dtype = dataframe[column_name].to_numpy().dtype return np.issubdtype(column_dtype, np.integer) or np.issubdtype(column_dtype, np.floating) diff --git a/src/midst_toolkit/evaluation/privacy/distance_closest_record.py b/src/midst_toolkit/evaluation/privacy/distance_closest_record.py index 38141a87..5f0afa96 100644 --- a/src/midst_toolkit/evaluation/privacy/distance_closest_record.py +++ b/src/midst_toolkit/evaluation/privacy/distance_closest_record.py @@ -120,6 +120,9 @@ def preprocess( if real_data_test is None: return (processed_synthetic_data, processed_real_data_train) + + assert num_real_data_test_np is not None + assert cat_real_data_test_oh is not None return ( processed_synthetic_data, processed_real_data_train, diff --git a/src/midst_toolkit/evaluation/quality/synthcity/dataloader.py b/src/midst_toolkit/evaluation/quality/synthcity/dataloader.py index 0eb11ffe..0d26cea9 100644 --- a/src/midst_toolkit/evaluation/quality/synthcity/dataloader.py +++ b/src/midst_toolkit/evaluation/quality/synthcity/dataloader.py @@ -5,7 +5,6 @@ import numpy as np import pandas as pd -import torch from sklearn.model_selection import train_test_split @@ -83,19 +82,19 @@ def columns(self) -> list: ... @abstractmethod - def dataframe(self) -> pd.DataFrame: + def to_dataframe(self) -> pd.DataFrame: """A method that returns the pandas dataframe that contains all features and samples.""" ... @abstractmethod - def numpy(self) -> np.ndarray: + def to_numpy(self) -> np.ndarray: """A method that returns the numpy array that contains all features and samples.""" ... 
@property def values(self) -> np.ndarray: """Pass through to the numpy method.""" - return self.numpy() + return self.to_numpy() @abstractmethod def info(self) -> dict: @@ -145,11 +144,11 @@ def test(self) -> DataLoader: def __repr__(self, *args: Any, **kwargs: Any) -> str: """Return a string representation.""" - return self.dataframe().__repr__(*args, **kwargs) + return self.to_dataframe().__repr__(*args, **kwargs) def _repr_html_(self, *args: Any, **kwargs: Any) -> Any: """Return a string representation in html format.""" - return self.dataframe()._repr_html_(*args, **kwargs) + return self.to_dataframe().to_html(*args, **kwargs) @abstractmethod def fillna(self, value: Any) -> DataLoader: @@ -266,13 +265,13 @@ def unpack(self, as_numpy: bool = False, pad: bool = False) -> Any: return np.asarray(x), np.asarray(y) return x, y - def dataframe(self) -> pd.DataFrame: + def to_dataframe(self) -> pd.DataFrame: """A method that returns the pandas dataframe that contains all features and samples.""" return self.data - def numpy(self) -> np.ndarray: + def to_numpy(self) -> np.ndarray: """A method that returns the numpy array that contains all features and samples.""" - return self.dataframe().values + return self.to_dataframe().to_numpy() def info(self) -> dict: """A method that returns a dictionary of DataLoader information.""" @@ -376,7 +375,7 @@ def is_tabular(self) -> bool: return True -def create_from_info(data: pd.DataFrame | torch.utils.data.Dataset, info: dict) -> DataLoader: +def create_from_info(data: pd.DataFrame, info: dict) -> DataLoader: """Helper for creating a DataLoader from existing information.""" if info["data_type"] == "generic": return GenericDataLoader.from_info(data, info) diff --git a/src/midst_toolkit/evaluation/quality/synthcity/feature_encoder.py b/src/midst_toolkit/evaluation/quality/synthcity/feature_encoder.py index 0435f65a..9f12ea45 100644 --- a/src/midst_toolkit/evaluation/quality/synthcity/feature_encoder.py +++ b/src/midst_toolkit/evaluation/quality/synthcity/feature_encoder.py @@ -74,9 +74,9 @@ def fit(self, x: pd.Series, y: Any = None, **kwargs: Any) -> FeatureEncoder: Returns: The fitted FeatureEncoder object. """ - self.feature_name_in = x.name + self.feature_name_in = str(x.name) self.feature_type_in = self._get_feature_type(x) - input = validate_shape(x.values, self.n_dim_in) + input = validate_shape(x.to_numpy(), self.n_dim_in) output = self._fit(input, **kwargs)._transform(input) self._out_shape = (-1, *output.shape[1:]) # for inverse_transform output = validate_shape(output, self.n_dim_out) @@ -101,7 +101,7 @@ def transform(self, x: pd.Series) -> pd.DataFrame | pd.Series: Returns: The transformed input. 
""" - data = validate_shape(x.values, self.n_dim_in) + data = validate_shape(x.to_numpy(), self.n_dim_in) out = self._transform(data) out = validate_shape(out, self.n_dim_out) if self.n_dim_out == 1: @@ -135,7 +135,7 @@ def _get_feature_type(self, x: Any) -> str: def inverse_transform(self, df: pd.DataFrame | pd.Series) -> pd.Series: """Reverse the encoder mapping.""" - y = df.values.reshape(self._out_shape) + y = df.to_numpy().reshape(self._out_shape) x = self._inverse_transform(y) x = validate_shape(x, 1) return pd.Series(x, name=self.feature_name_in) @@ -197,4 +197,4 @@ def _transform(self, x: np.ndarray) -> np.ndarray: return pd.to_numeric(x).astype(float) def _inverse_transform(self, data: np.ndarray) -> np.ndarray: - return pd.to_datetime(data) + return pd.to_datetime(data).to_numpy() diff --git a/src/midst_toolkit/evaluation/quality/synthcity/statistical_eval.py b/src/midst_toolkit/evaluation/quality/synthcity/statistical_eval.py index 1bc38050..a084ca16 100644 --- a/src/midst_toolkit/evaluation/quality/synthcity/statistical_eval.py +++ b/src/midst_toolkit/evaluation/quality/synthcity/statistical_eval.py @@ -191,8 +191,8 @@ def _normalize_covariates( Returns: Tuple[pd.DataFrame, pd.DataFrame]: normalized version of the datasets """ - x_gt_norm = x.dataframe().copy() - x_syn_norm = x_syn.dataframe().copy() + x_gt_norm = x.to_dataframe().copy() + x_syn_norm = x_syn.to_dataframe().copy() if self._task_type != "survival_analysis": if hasattr(x, "target_column"): x_gt_norm = x_gt_norm.drop(columns=[x.target_column]) @@ -202,18 +202,18 @@ def _normalize_covariates( if hasattr(x, "target_column"): x_gt_norm_df = pd.DataFrame( scaler.transform(x_gt_norm), - columns=[col for col in x.train().dataframe().columns if col != x.target_column], + columns=[col for col in x.train().to_dataframe().columns if col != x.target_column], ) else: - x_gt_norm_df = pd.DataFrame(scaler.transform(x_gt_norm), columns=x.train().dataframe().columns) + x_gt_norm_df = pd.DataFrame(scaler.transform(x_gt_norm), columns=x.train().to_dataframe().columns) if hasattr(x_syn, "target_column"): x_syn_norm_df = pd.DataFrame( scaler.transform(x_syn_norm), - columns=[col for col in x_syn.dataframe().columns if col != x_syn.target_column], + columns=[col for col in x_syn.to_dataframe().columns if col != x_syn.target_column], ) else: - x_syn_norm_df = pd.DataFrame(scaler.transform(x_syn_norm), columns=x_syn.dataframe().columns) + x_syn_norm_df = pd.DataFrame(scaler.transform(x_syn_norm), columns=x_syn.to_dataframe().columns) return x_gt_norm_df, x_syn_norm_df @@ -234,8 +234,8 @@ def _evaluate( """ results = {} - x_ = x.numpy().reshape(len(x), -1) - x_syn_ = x_syn.numpy().reshape(len(x_syn), -1) + x_ = x.to_numpy().reshape(len(x), -1) + x_syn_ = x_syn.to_numpy().reshape(len(x_syn), -1) # OneClass representation emb = "_OC" diff --git a/src/midst_toolkit/evaluation/utils.py b/src/midst_toolkit/evaluation/utils.py index f850ef78..fb6eb390 100644 --- a/src/midst_toolkit/evaluation/utils.py +++ b/src/midst_toolkit/evaluation/utils.py @@ -64,7 +64,7 @@ def extract_columns_based_on_meta_info( # Training the diffusion generators. 
# Enumerate columns and replace column name with index - data.columns = range(len(data.columns)) + data.columns = list(range(len(data.columns))) # Get numerical and categorical column indices from meta info # NOTE: numerical and categorical columns are the only admissible/generate-able types" diff --git a/src/midst_toolkit/models/clavaddpm/diffusion_utils.py b/src/midst_toolkit/models/clavaddpm/diffusion_utils.py index de4be1e2..76c72df2 100644 --- a/src/midst_toolkit/models/clavaddpm/diffusion_utils.py +++ b/src/midst_toolkit/models/clavaddpm/diffusion_utils.py @@ -1,5 +1,6 @@ """PLACEHOLDER.""" +from collections.abc import Callable from inspect import isfunction from typing import Any @@ -18,7 +19,7 @@ def normal_kl( logvar2: Tensor | float, ) -> Tensor: """ - Compute the KL divergence between two gaussians. + Compute the KL divergence between two Gaussians. Shapes are automatically broadcasted, so batches can be compared to scalars, among other use cases. @@ -133,11 +134,14 @@ def extract(a: Tensor, t: Tensor, x_shape: tuple[int, ...]) -> Tensor: return out.expand(x_shape) -def default(val, d): +def default(val: Tensor, d: Callable[[], Tensor] | Tensor) -> Tensor: # ruff: noqa: D103 if exists(val): return val - return d() if isfunction(d) else d + if isfunction(d): + return d() + assert isinstance(d, Tensor) + return d def log_categorical(log_x_start: Tensor, log_prob: Tensor) -> Tensor: @@ -155,7 +159,7 @@ def index_to_log_onehot(x: Tensor, num_classes: Tensor) -> Tensor: return torch.log(x_onehot.float().clamp(min=1e-30)) -def log_sum_exp_by_classes(x, slices): +def log_sum_exp_by_classes(x: Tensor, slices: Tensor) -> Tensor: # ruff: noqa: D103 res = torch.zeros_like(x) for ixs in slices: @@ -167,14 +171,14 @@ def log_sum_exp_by_classes(x, slices): @torch.jit.script -def log_sub_exp(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor: +def log_sub_exp(a: Tensor, b: Tensor) -> Tensor: # ruff: noqa: D103 m = torch.maximum(a, b) return torch.log(torch.exp(a - m) - torch.exp(b - m)) + m @torch.jit.script -def sliced_logsumexp(x, slices): +def sliced_logsumexp(x: Tensor, slices: Tensor) -> Tensor: # ruff: noqa: D103 lse = torch.logcumsumexp(torch.nn.functional.pad(x, [1, 0, 0, 0], value=-float("inf")), dim=-1) @@ -185,7 +189,7 @@ def sliced_logsumexp(x, slices): return torch.repeat_interleave(slice_lse, slice_ends - slice_starts, dim=-1) -def log_onehot_to_index(log_x): +def log_onehot_to_index(log_x: Tensor) -> Tensor: # ruff: noqa: D103 return log_x.argmax(1) @@ -193,6 +197,6 @@ def log_onehot_to_index(log_x): class FoundNANsError(BaseException): """Found NANs during sampling.""" - def __init__(self, message="Found NANs during sampling."): + def __init__(self, message: str = "Found NANs during sampling.") -> None: # ruff: noqa: D107 super(FoundNANsError, self).__init__(message) diff --git a/src/midst_toolkit/models/clavaddpm/gaussian_multinomial_diffusion.py b/src/midst_toolkit/models/clavaddpm/gaussian_multinomial_diffusion.py index bce925a8..2240e2a4 100644 --- a/src/midst_toolkit/models/clavaddpm/gaussian_multinomial_diffusion.py +++ b/src/midst_toolkit/models/clavaddpm/gaussian_multinomial_diffusion.py @@ -751,7 +751,7 @@ def mixed_loss(self, x: Tensor, out_dict: dict[str, Tensor]) -> tuple[Tensor, Te return loss_multi.mean(), loss_gauss.mean() @torch.no_grad() - def mixed_elbo(self, x0, out_dict): + def mixed_elbo(self, x0: Tensor, out_dict: dict[str, Tensor]) -> dict[str, Tensor]: b = x0.size(0) device = x0.device @@ -875,7 +875,15 @@ def gaussian_ddim_step( return mean_pred + 
nonzero_mask * sigma * noise @torch.no_grad() - def gaussian_ddim_sample(self, noise, T, out_dict, eta=0.0, model_kwargs=None, cond_fn=None): + def gaussian_ddim_sample( + self, + noise: Tensor, + T: int, + out_dict: dict[str, Tensor], + eta: float = 0.0, + model_kwargs: Any | None = None, + cond_fn: Callable | None = None, + ) -> Tensor: # ruff: noqa: D102, N803 x = noise b = x.shape[0] @@ -918,11 +926,11 @@ def gaussian_ddim_reverse_step( @torch.no_grad() def gaussian_ddim_reverse_sample( self, - x, - T, + x: Tensor, + T: int, # ruff: noqa: N803 - out_dict, - ): + out_dict: dict[str, Tensor], + ) -> Tensor: # ruff: noqa: D102 b = x.shape[0] device = x.device diff --git a/src/midst_toolkit/models/clavaddpm/model.py b/src/midst_toolkit/models/clavaddpm/model.py index c88fbdbc..17cab4b6 100644 --- a/src/midst_toolkit/models/clavaddpm/model.py +++ b/src/midst_toolkit/models/clavaddpm/model.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import hashlib import json import math @@ -255,7 +257,7 @@ def __init__( # Create a Sequential model from the list of layers self.model = nn.Sequential(*layers) - def forward(self, x, timesteps): + def forward(self, x: Tensor, timesteps: Tensor) -> Tensor: emb = self.time_embed(timestep_embedding(timesteps, self.dim_t)) x = self.proj(x) + emb # x = self.transformer_layer(x, x) @@ -310,10 +312,10 @@ def make_dataset_from_df( df: pd.DataFrame, T: Transformations, is_y_cond: str, - df_info: pd.DataFrame, + df_info: dict[str, Any], ratios: list[float] | None = None, std: float = 0, -) -> tuple[Dataset, dict[int, LabelEncoder], list[int]]: +) -> tuple[Dataset, dict[int, LabelEncoder], list[str]]: """ The order of the generated dataset: (y, X_num, X_cat). @@ -357,7 +359,7 @@ def make_dataset_from_df( X_num: dict[str, np.ndarray] | None = {} if df_info["num_cols"] is not None else None y = {} - cat_cols_with_y = [] + cat_cols_with_y: list[str] = [] if df_info["cat_cols"] is not None: cat_cols_with_y += df_info["cat_cols"] if is_y_cond == "concat": @@ -385,31 +387,33 @@ def make_dataset_from_df( X_num = {} if df_info["num_cols"] is not None or is_y_cond == "concat" else None y = {} - num_cols_with_y = [] + num_cols_with_y: list[str] = [] if df_info["num_cols"] is not None: num_cols_with_y += df_info["num_cols"] if is_y_cond == "concat": num_cols_with_y = [df_info["y_col"]] + num_cols_with_y if len(num_cols_with_y) > 0: - X_num["train"] = train_df[num_cols_with_y].values.astype(np.float32) # type: ignore[index] - X_num["val"] = val_df[num_cols_with_y].values.astype(np.float32) # type: ignore[index] - X_num["test"] = test_df[num_cols_with_y].values.astype(np.float32) # type: ignore[index] + assert X_num is not None + X_num["train"] = train_df[num_cols_with_y].values.astype(np.float32) + X_num["val"] = val_df[num_cols_with_y].values.astype(np.float32) + X_num["test"] = test_df[num_cols_with_y].values.astype(np.float32) y["train"] = train_df[df_info["y_col"]].values.astype(np.float32) y["val"] = val_df[df_info["y_col"]].values.astype(np.float32) y["test"] = test_df[df_info["y_col"]].values.astype(np.float32) if df_info["cat_cols"] is not None: - X_cat["train"] = train_df[df_info["cat_cols"]].to_numpy(dtype=np.str_) # type: ignore[index] - X_cat["val"] = val_df[df_info["cat_cols"]].to_numpy(dtype=np.str_) # type: ignore[index] - X_cat["test"] = test_df[df_info["cat_cols"]].to_numpy(dtype=np.str_) # type: ignore[index] + assert X_cat is not None + X_cat["train"] = train_df[df_info["cat_cols"]].to_numpy(dtype=np.str_) + X_cat["val"] = 
val_df[df_info["cat_cols"]].to_numpy(dtype=np.str_) + X_cat["test"] = test_df[df_info["cat_cols"]].to_numpy(dtype=np.str_) cat_column_orders = [column_to_index[col] for col in df_info["cat_cols"]] num_column_orders = [column_to_index[col] for col in num_cols_with_y] - column_orders = num_column_orders + cat_column_orders - column_orders = [index_to_column[index] for index in column_orders] + column_orders_indices = num_column_orders + cat_column_orders + column_orders = [index_to_column[index] for index in column_orders_indices] label_encoders = {} if X_cat is not None and len(df_info["cat_cols"]) > 0: @@ -434,6 +438,7 @@ def make_dataset_from_df( X_cat["test"] = X_cat_converted[train_num + val_num :, :] # type: ignore[call-overload] if X_num and len(X_num) > 0: + assert X_num is not None X_num["train"] = np.concatenate((X_num["train"], X_cat["train"]), axis=1) X_num["val"] = np.concatenate((X_num["val"], X_cat["val"]), axis=1) X_num["test"] = np.concatenate((X_num["test"], X_cat["test"]), axis=1) @@ -441,6 +446,9 @@ def make_dataset_from_df( X_num = X_cat X_cat = None + n_classes = df_info["n_classes"] + assert isinstance(n_classes, int) + D = Dataset( # ruff: noqa: N806 X_num, @@ -448,7 +456,7 @@ def make_dataset_from_df( y, y_info={}, task_type=TaskType(df_info["task_type"]), - n_classes=df_info["n_classes"], + n_classes=n_classes, ) return transform_dataset(D, T, None), label_encoders, column_orders @@ -778,7 +786,7 @@ def __init__(self, *tensors: Tensor, batch_size: int = 32, shuffle: bool = False n_batches += 1 self.n_batches = n_batches - def __iter__(self): + def __iter__(self) -> FastTensorDataLoader: # ruff: noqa: D105 if self.shuffle: r = torch.randperm(self.dataset_len) @@ -786,7 +794,7 @@ def __iter__(self): self.i = 0 return self - def __next__(self): + def __next__(self) -> tuple[Tensor, ...]: # ruff: noqa: D105 if self.i >= self.dataset_len: raise StopIteration @@ -794,7 +802,7 @@ def __next__(self): self.i += self.batch_size return batch - def __len__(self): + def __len__(self) -> int: # ruff: noqa: D105 return self.n_batches @@ -1162,7 +1170,7 @@ def __init__( self.proj = nn.Linear(d_in, dim_t) self.time_embed = nn.Sequential(nn.Linear(dim_t, dim_t), nn.SiLU(), nn.Linear(dim_t, dim_t)) - def forward(self, x, timesteps, y=None): + def forward(self, x: Tensor, timesteps: Tensor, y: Tensor | None = None) -> Tensor: emb = self.time_embed(timestep_embedding(timesteps, self.dim_t)) if self.is_y_cond == "embedding" and y is not None: y = y.squeeze() if self.num_classes > 0 else y.resize_(y.size(0), 1).float() @@ -1198,7 +1206,7 @@ def __init__( self.time_embed = nn.Sequential(nn.Linear(dim_t, dim_t), nn.SiLU(), nn.Linear(dim_t, dim_t)) - def forward(self, x, timesteps, y=None): + def forward(self, x: Tensor, timesteps: Tensor, y: Tensor | None = None) -> Tensor: # ruff: noqa: D102 emb = self.time_embed(timestep_embedding(timesteps, self.dim_t)) if y is not None and self.num_classes > 0: diff --git a/src/midst_toolkit/models/clavaddpm/sampler.py b/src/midst_toolkit/models/clavaddpm/sampler.py index 3edb30bc..d9e54404 100644 --- a/src/midst_toolkit/models/clavaddpm/sampler.py +++ b/src/midst_toolkit/models/clavaddpm/sampler.py @@ -143,19 +143,19 @@ def __init__( self._loss_history = np.zeros([diffusion.num_timesteps, history_per_term], dtype=np.float64) self._loss_counts = np.zeros([diffusion.num_timesteps], dtype=np.uint) - def weights(self): + def weights(self) -> Tensor: """ Return the weights. Warms up the sampler if it's not warmed up. 
""" if not self._warmed_up(): - return np.ones([self.diffusion.num_timesteps], dtype=np.float64) + return torch.from_numpy(np.ones([self.diffusion.num_timesteps], dtype=np.float64)) weights = np.sqrt(np.mean(self._loss_history**2, axis=-1)) weights /= np.sum(weights) weights *= 1 - self.uniform_prob weights += self.uniform_prob / len(weights) - return weights + return torch.from_numpy(weights) def update_with_all_losses(self, ts: list[int], losses: list[float]) -> None: """ diff --git a/src/midst_toolkit/models/clavaddpm/train.py b/src/midst_toolkit/models/clavaddpm/train.py index a570c75e..84f70c3d 100644 --- a/src/midst_toolkit/models/clavaddpm/train.py +++ b/src/midst_toolkit/models/clavaddpm/train.py @@ -240,7 +240,7 @@ def child_training( def train_model( data_frame: pd.DataFrame, - data_frame_info: pd.DataFrame, + data_frame_info: dict[str, Any], model_params: dict[str, Any], transformations_dict: dict[str, Any], steps: int, @@ -349,7 +349,7 @@ def train_model( def train_classifier( data_frame: pd.DataFrame, - data_frame_info: pd.DataFrame, + data_frame_info: dict[str, Any], model_params: dict[str, Any], transformations_dict: dict[str, Any], classifier_steps: int, From 3defbf21c9827d2cc5d8f1960e2a1b785476802b Mon Sep 17 00:00:00 2001 From: David Emerson <43939939+emersodb@users.noreply.github.com> Date: Fri, 26 Sep 2025 16:39:15 -0400 Subject: [PATCH 4/5] Addressing some PR comments --- .../evaluation/quality/mean_hellinger_distance.py | 4 ++-- .../evaluation/quality/mean_propensity_mse.py | 11 ++++++----- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/src/midst_toolkit/evaluation/quality/mean_hellinger_distance.py b/src/midst_toolkit/evaluation/quality/mean_hellinger_distance.py index 41aecb3a..0eb739fe 100644 --- a/src/midst_toolkit/evaluation/quality/mean_hellinger_distance.py +++ b/src/midst_toolkit/evaluation/quality/mean_hellinger_distance.py @@ -56,7 +56,7 @@ def __init__( NOTE: The categorical columns MUST BE PREPROCESSED into numerical values otherwise the evaluation will fail. This function will NOT WORK WITH ONE-HOT ENCODINGS. This can be achieved by separately preprocessing the - dataframes before calling compute or by setting ``do_preprocess`` to True + dataframes before calling compute or by setting ``do_preprocess`` to True. Args: categorical_columns: Column names corresponding to the categorical variables of any provided dataframe. @@ -82,7 +82,7 @@ def compute(self, real_data: pd.DataFrame, synthetic_data: pd.DataFrame) -> dict NOTE: The categorical columns MUST BE PREPROCESSED into numerical values otherwise the evaluation will fail. This function will NOT WORK WITH ONE-HOT ENCODINGS. This can be achieved by separately preprocessing the - dataframes before calling compute or by setting ``do_preprocess`` to True + dataframes before calling compute or by setting ``do_preprocess`` to True. Args: real_data: Real data to which the synthetic data may be compared. In many cases this will be data used diff --git a/src/midst_toolkit/evaluation/quality/mean_propensity_mse.py b/src/midst_toolkit/evaluation/quality/mean_propensity_mse.py index 26544179..c0cd30f2 100644 --- a/src/midst_toolkit/evaluation/quality/mean_propensity_mse.py +++ b/src/midst_toolkit/evaluation/quality/mean_propensity_mse.py @@ -17,9 +17,9 @@ def __init__( """ This class measures how well a ``LogisticRegression`` model from sklearn (as implemented in SynthEval) can distinguish between real and synthetic data. 
The classification model is trained on a subset of the two data - sources and then applied to a heldout portion of the mixed data. The average pMSE for synthetic vs. real - predictions and macro F1 scores across the folds are reported along with the standard error of these mean - values. + sources and then applied to a validation split of the mixed data, created through cross-validation folds. The + average pMSE for synthetic vs. real predictions and macro F1 scores across the folds are reported along with + the standard error of these mean values. Computation of pMSE is based on the formula in: @@ -55,8 +55,9 @@ def compute(self, real_data: pd.DataFrame, synthetic_data: pd.DataFrame) -> dict """ Computes how well a LogisticRegression model from sklearn (as implemented in SynthEval) can distinguish between real and synthetic data. The classification model is trained on a subset of the two data sources and then - applied to a heldout portion of the mixed data. The average pMSE of the 0 - synthetic, 1 - real predictions and - macro F1 scores across the folds are reported along with the standard error of these mean values. + applied to a validation split of the mixed data, created through cross-fold validation on the combination of + the two datasets. The average pMSE of the 0 = synthetic, 1 = real predictions and macro F1 scores across the + folds are reported along with the standard error of these mean values. NOTE: Categorical variables need to be encoded before training the classifier. This can be accomplished by preprocessing before calling ``compute`` or by setting ``do_preprocess`` to True. Note that if From 91fd2cabc49605f20c5f6603fbc95fe8717d4eaf Mon Sep 17 00:00:00 2001 From: David Emerson <43939939+emersodb@users.noreply.github.com> Date: Tue, 30 Sep 2025 09:07:47 -0400 Subject: [PATCH 5/5] changes based on Fatemeh's PR review --- src/midst_toolkit/evaluation/metrics_base.py | 5 ++++- .../evaluation/quality/mean_hellinger_distance.py | 12 +++++++++++- .../evaluation/quality/mean_propensity_mse.py | 8 ++++---- 3 files changed, 19 insertions(+), 6 deletions(-) diff --git a/src/midst_toolkit/evaluation/metrics_base.py b/src/midst_toolkit/evaluation/metrics_base.py index 94d00e44..94746217 100644 --- a/src/midst_toolkit/evaluation/metrics_base.py +++ b/src/midst_toolkit/evaluation/metrics_base.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from logging import INFO +from logging import INFO, WARNING from typing import overload import pandas as pd @@ -54,6 +54,9 @@ def __init__( self.numerical_columns = numerical_columns self.do_preprocess = do_preprocess + if len(self.categorical_columns) == 0 and len(self.numerical_columns) == 0: + log(WARNING, "Both lists of column names are empty. 
This will result in unexpected metric behavior.") + if do_preprocess: log(INFO, "Default preprocessing will be performed during computation.") diff --git a/src/midst_toolkit/evaluation/quality/mean_hellinger_distance.py b/src/midst_toolkit/evaluation/quality/mean_hellinger_distance.py index 0eb739fe..268ca969 100644 --- a/src/midst_toolkit/evaluation/quality/mean_hellinger_distance.py +++ b/src/midst_toolkit/evaluation/quality/mean_hellinger_distance.py @@ -1,6 +1,9 @@ +from logging import WARNING + import numpy as np import pandas as pd +from midst_toolkit.common.logger import log from midst_toolkit.evaluation.metrics_base import SynthEvalQualityMetric @@ -8,7 +11,7 @@ def hellinger_distance(discrete_distribution_1: np.ndarray, discrete_distributio """ Compute the empirical Hellinger distance between two discrete probability distributions. Hellinger distance for discrete probability distributions $p$ and $q$ is expressed as - $$\\frac{1}{2} \\cdot \\Vert \\sqrt{p} - \\sqrt{q} \\Vert_2$$. + $$\\frac{1}{\\sqrt{2}} \\cdot \\Vert \\sqrt{p} - \\sqrt{q} \\Vert_2$$. Args: discrete_distribution_1: First discrete distribution for distance computation @@ -71,6 +74,13 @@ def __init__( self.include_numerical_columns = include_numerical_columns + if len(self.categorical_columns) == 0 and not self.include_numerical_columns: + log( + WARNING, + "No categorical columns provided and include_numerical_columns is False. This will result in a NaN " + "for the Hellinger distance.", + ) + def compute(self, real_data: pd.DataFrame, synthetic_data: pd.DataFrame) -> dict[str, float]: """ Computes the DISCRETE Hellinger Distance between corresponding columns of real and synthetic dataframes. For a diff --git a/src/midst_toolkit/evaluation/quality/mean_propensity_mse.py b/src/midst_toolkit/evaluation/quality/mean_propensity_mse.py index c0cd30f2..fca5fbd3 100644 --- a/src/midst_toolkit/evaluation/quality/mean_propensity_mse.py +++ b/src/midst_toolkit/evaluation/quality/mean_propensity_mse.py @@ -28,8 +28,8 @@ def __init__( NOTE: Categorical variables need to be encoded before training the classifier. This can be accomplished by preprocessing before calling ``compute`` or by setting ``do_preprocess`` to True. Note that if - ``do_preprocess`` is True, the default Syntheval pipeline is used, which does NOT one-hot encode the - categoricals. + ``do_preprocess`` is True, the default Syntheval pipeline is used, which performs ``OrdinalEncoding`` for + categorical columns and ``MinMaxScaling`` for numerical columns. - A smaller pMSE is better. In cases where the two datasets are balanced in size, 0.25 is worst case. - Higher Macro F1 is better. @@ -61,8 +61,8 @@ def compute(self, real_data: pd.DataFrame, synthetic_data: pd.DataFrame) -> dict NOTE: Categorical variables need to be encoded before training the classifier. This can be accomplished by preprocessing before calling ``compute`` or by setting ``do_preprocess`` to True. Note that if - ``do_preprocess`` is True, the default Syntheval pipeline is used, which does NOT one-hot encode the - categoricals. + ``do_preprocess`` is True, the default Syntheval pipeline is used, which performs ``OrdinalEncoding`` for + categorical columns and ``MinMaxScaling`` for numerical columns. Args: real_data: Real data to which the synthetic data may be compared. In many cases this will be data used
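As a rough usage sketch of the two metrics introduced in the first patch: the dataframes, column names, and random data below are illustrative placeholders rather than anything from the toolkit or its tests, and the trailing comments only indicate the keys returned by compute().

import numpy as np
import pandas as pd

from midst_toolkit.evaluation.quality.mean_hellinger_distance import MeanHellingerDistance
from midst_toolkit.evaluation.quality.mean_propensity_mse import MeanPropensityMeanSquaredError

rng = np.random.default_rng(42)

# Illustrative real and synthetic tables with one categorical and two numerical columns.
real_df = pd.DataFrame(
    {
        "category": rng.choice(["cat", "dog", "horse"], size=500, p=[0.25, 0.25, 0.5]),
        "feature_1": 1.2 * rng.standard_normal(500) + 1.0,
        "feature_2": 2.5 * rng.standard_normal(500) - 1.0,
    }
)
synthetic_df = pd.DataFrame(
    {
        "category": rng.choice(["cat", "dog", "horse"], size=500, p=[0.35, 0.3, 0.35]),
        "feature_1": 1.1 * rng.standard_normal(500) + 0.5,
        "feature_2": 2.2 * rng.standard_normal(500) - 1.0,
    }
)

# do_preprocess=True applies the default SynthEval pipeline, which ordinal-encodes the categorical column and
# min-max scales the numerical columns before the metrics are computed.
hellinger = MeanHellingerDistance(
    categorical_columns=["category"],
    numerical_columns=["feature_1", "feature_2"],
    do_preprocess=True,
)
print(hellinger.compute(real_df, synthetic_df))
# {'mean_hellinger_distance': ..., 'hellinger_standard_error': ...}

pmse = MeanPropensityMeanSquaredError(
    categorical_columns=["category"],
    numerical_columns=["feature_1", "feature_2"],
    do_preprocess=True,
    folds=5,
)
print(pmse.compute(real_df, synthetic_df))
# {'avg_pmse': ..., 'pmse_standard_error': ..., 'avg_macro_f1_score': ..., 'macro_f1_standard_error': ...}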