From adae16e96d9edfe44cdf792ecefab11e79956ccd Mon Sep 17 00:00:00 2001 From: David Emerson <43939939+emersodb@users.noreply.github.com> Date: Wed, 17 Sep 2025 09:27:45 -0400 Subject: [PATCH 1/5] First checkin of hellinger and pmse implementations --- pyproject.toml | 1 + .../quality/mean_hellinger_distance.py | 131 ++++++++++++++++++ .../evaluation/quality/mean_propensity_mse.py | 105 ++++++++++++++ .../quality/test_mean_hellinger_distance.py | 115 +++++++++++++++ .../quality/test_mean_propensity_mse.py | 82 +++++++++++ 5 files changed, 434 insertions(+) create mode 100644 src/midst_toolkit/evaluation/quality/mean_hellinger_distance.py create mode 100644 src/midst_toolkit/evaluation/quality/mean_propensity_mse.py create mode 100644 tests/unit/evaluation/quality/test_mean_hellinger_distance.py create mode 100644 tests/unit/evaluation/quality/test_mean_propensity_mse.py diff --git a/pyproject.toml b/pyproject.toml index bf8af1b6..9df19160 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -122,6 +122,7 @@ ignore = [ "D104", # Ignore package level docstrings requirement "D205", # 1 blank line required between summary line and description "D212", # Multi-line docstring summary should start at the first line + "D301", # r-strings for docstrings with backslashes "PLR2004", # Replace magic number with named constant "PLR0913", # Too many arguments "COM812", # Missing trailing comma diff --git a/src/midst_toolkit/evaluation/quality/mean_hellinger_distance.py b/src/midst_toolkit/evaluation/quality/mean_hellinger_distance.py new file mode 100644 index 00000000..374761d7 --- /dev/null +++ b/src/midst_toolkit/evaluation/quality/mean_hellinger_distance.py @@ -0,0 +1,131 @@ +import numpy as np +import pandas as pd + +from midst_toolkit.evaluation.metrics_base import SynthEvalQualityMetric + + +def hellinger_distance(discrete_distribution_1: np.ndarray, discrete_distribution_2: np.ndarray) -> float: + """ + Compute the empirical Hellinger distance between two discrete probability distributions. Hellinger distance for + discrete probability distributions $p$ and $q$ is expressed as + $$\\frac{1}{2} \\cdot \\Vert \\sqrt{p} - \\sqrt{q} \\Vert_2$$. + + Args: + discrete_distribution_1: First discrete distribution for distance computation + discrete_distribution_2: Second discrete distribution for distance computation + + Returns: + Empirical Hellinger distance between the two distributions. + """ + sum_1 = np.sum(discrete_distribution_1) + sum_2 = np.sum(discrete_distribution_2) + assert np.isclose(sum_1, 1.0, atol=1e-4), f"Distribution 1 is not a probability distribution: Sum is {sum_1}" + assert np.isclose(sum_2, 1.0, atol=1e-4), f"Distribution 2 is not a probability distribution: Sum is {sum_2}" + + sqrt_pdf_1 = np.sqrt(discrete_distribution_1) + sqrt_pdf_2 = np.sqrt(discrete_distribution_2) + difference = sqrt_pdf_1 - sqrt_pdf_2 + return 1 / np.sqrt(2) * np.linalg.norm(difference) + + +class MeanHellingerDistance(SynthEvalQualityMetric): + def __init__( + self, + categorical_columns: list[str], + numerical_columns: list[str], + do_preprocess: bool = False, + include_numerical_columns: bool = True, + ): + """ + This class computes the DISCRETE Hellinger Distance between corresponding columns of real and synthetic + dataframes. + + NOTE: The implementation here is inspired by the SynthEval implementation of the Mean Hellinger Distance + but fixes a crucial issue. Their way of computing bins for the discrete histograms of numerical values is + flawed. 
Here, we make use of the 'auto' binning schemes in numpy to do a better job binning such values into + histograms + + - For a categorical column, the number of bins for the discrete distributions is established by computing + the unique values in the column for the REAL DATA. This can have some side effects when the encodings of + the categorical values is not contiguous ([1, 2, 10]) or there are different values in the synthetic + dataframe. + - For numerical columns, binning is determined by the numpy ``histogram_bin_edges`` function and takes into + account values from BOTH dataframes. + + The final score is the average of the distances computed across columns. Lower is better. + + NOTE: The categorical columns MUST BE PREPROCESSED into numerical values otherwise the evaluation will fail. + This function will NOT WORK WITH ONE-HOT ENCODINGS. This can be achieved by separately preprocessing the + dataframes before calling compute or by setting ``do_preprocess`` to True + + Args: + categorical_columns: Column names corresponding to the categorical variables of any provided dataframe. + numerical_columns: Column names corresponding to the numerical variables of any provided dataframe. + do_preprocess: Whether or not to preprocess the dataframes with the default pipeline used by SynthEval. + Defaults to False. + include_numerical_columns: Whether to include any provided numerical columns in the Hellinger distance + computation. Numerical column values are binned to create discrete distributions, which may or may not + be something you want to do. + """ + super().__init__(categorical_columns, numerical_columns, do_preprocess) + + self.include_numerical_columns = include_numerical_columns + + def compute(self, real_data: pd.DataFrame, synthetic_data: pd.DataFrame) -> dict[str, float]: + """ + Computes the DISCRETE Hellinger Distance between corresponding columns of real and synthetic dataframes. For a + categorical column, the range of values for the discrete distributions is established by computing the unique + values in the column for the REAL DATA. For numerical columns, a binning procedure based on numpy's + ``histogram_bin_edges`` with binning strategy set to 'auto' is used. + + The final score is the average of the distances computed across columns. Lower is better. + + NOTE: The categorical columns MUST BE PREPROCESSED into numerical values otherwise the evaluation will fail. + This function will NOT WORK WITH ONE-HOT ENCODINGS. This can be achieved by separately preprocessing the + dataframes before calling compute or by setting ``do_preprocess`` to True + + Args: + real_data: Real data to which the synthetic data may be compared. In many cases this will be data used + to TRAIN the model that generated the synthetic data, but not always. + synthetic_data: Synthetically generated data whose quality is to be assessed. + + Returns: + The mean of the individual Hellinger distances between each of the corresponding columns of the real and + synthetic dataframes. This mean is keyed by 'mean_hellinger_distance' and is reported along with the + "standard error" associated with that mean keyed under 'hellinger_standard_error'. 
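+ For example, with per-column discrete distributions p = [0.4, 0.2, 0.4] and q = [0.2, 0.2, 0.6], the per-column distance is (1 / sqrt(2)) * ||sqrt(p) - sqrt(q)||_2, approximately 0.1651 (the 'column_c' value exercised in the unit tests).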
+ """ + if self.do_preprocess: + real_data, synthetic_data = self.preprocess(real_data, synthetic_data) + + hellinger_distances = [] + + for category_column in self.categorical_columns: + class_num = len(np.unique(real_data[category_column])) + + real_discrete_counts = np.histogram(real_data[category_column], bins=class_num)[0] + synthetic_discrete_counts = np.histogram(synthetic_data[category_column], bins=class_num)[0] + + real_discrete_pdf = real_discrete_counts / sum(real_discrete_counts) + synthetic_discrete_pdf = synthetic_discrete_counts / sum(synthetic_discrete_counts) + + distance = hellinger_distance(real_discrete_pdf, synthetic_discrete_pdf) + hellinger_distances.append(distance) + + if self.include_numerical_columns: + for numeric_column in self.numerical_columns: + combined_data = np.concatenate((real_data[numeric_column], synthetic_data[numeric_column])) + bin_edges = np.histogram_bin_edges(combined_data, bins="auto") + + real_discrete_counts = np.histogram(real_data[numeric_column], bins=bin_edges)[0] + synthetic_discrete_counts = np.histogram(synthetic_data[numeric_column], bins=bin_edges)[0] + + real_discrete_pdf = real_discrete_counts / sum(real_discrete_counts) + synthetic_discrete_pdf = synthetic_discrete_counts / sum(synthetic_discrete_counts) + + distance = hellinger_distance(real_discrete_pdf, synthetic_discrete_pdf) + hellinger_distances.append(distance) + + return { + "mean_hellinger_distance": np.mean(hellinger_distances), + "hellinger_standard_error": np.std(hellinger_distances, ddof=1) / np.sqrt(len(hellinger_distances)), + } diff --git a/src/midst_toolkit/evaluation/quality/mean_propensity_mse.py b/src/midst_toolkit/evaluation/quality/mean_propensity_mse.py new file mode 100644 index 00000000..26544179 --- /dev/null +++ b/src/midst_toolkit/evaluation/quality/mean_propensity_mse.py @@ -0,0 +1,105 @@ +import pandas as pd +from syntheval.metrics.utility.metric_propensity_mse import PropensityMeanSquaredError + +from midst_toolkit.evaluation.metrics_base import SynthEvalQualityMetric + + +class MeanPropensityMeanSquaredError(SynthEvalQualityMetric): + def __init__( + self, + categorical_columns: list[str], + numerical_columns: list[str], + do_preprocess: bool = False, + folds: int = 5, + max_iterations: int = 100, + solver: str = "liblinear", + ): + """ + This class measures how well a ``LogisticRegression`` model from sklearn (as implemented in SynthEval) can + distinguish between real and synthetic data. The classification model is trained on a subset of the two data + sources and then applied to a heldout portion of the mixed data. The average pMSE for synthetic vs. real + predictions and macro F1 scores across the folds are reported along with the standard error of these mean + values. + + Computation of pMSE is based on the formula in: + + Woo, M., Reiter, J.P., Oganian, A., Karr, A.F.: Global measures of data utility for microdata masked for + disclosure limitation. J. Priv. Confidentiality 1(1) (2009) https://doi.org/10.29012/jpc.v1i1.568 + + NOTE: Categorical variables need to be encoded before training the classifier. This can be accomplished by + preprocessing before calling ``compute`` or by setting ``do_preprocess`` to True. Note that if + ``do_preprocess`` is True, the default Syntheval pipeline is used, which does NOT one-hot encode the + categoricals. + + - A smaller pMSE is better. In cases where the two datasets are balanced in size, 0.25 is worst case. + - Higher Macro F1 is better. 
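+ Concretely, the Woo et al. formula referenced above computes pMSE = (1/N) * sum_i (p_i - c)^2 over the N combined records, where p_i is the predicted propensity of record i and c is the fraction of synthetic records in the combined data; a perfectly separating classifier on balanced data (c = 0.5) therefore yields the worst case of 0.25.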
+ + Args: + categorical_columns: Column names corresponding to the categorical variables of any provided dataframe. + numerical_columns: Column names corresponding to the numerical variables of any provided dataframe. + do_preprocess: Whether or not to preprocess the dataframes with the default pipeline used by SynthEval. + Defaults to False. + folds: Number of cross-validation folds for training/evaluating the LogisticRegression classifier used to + establish a stable estimate of the pMSE. Defaults to 5. + max_iterations: Maximum number of iterations for the regression fitting. Defaults to 100. + solver: Kind of solver used to fit the ``LogisticRegression`` model. Options coincide with those of the + sklearn ``LogisticRegression`` implementation. Defaults to 'liblinear'. + """ + super().__init__(categorical_columns, numerical_columns, do_preprocess) + self.all_columns = categorical_columns + numerical_columns + self.folds = folds + self.max_iterations = max_iterations + self.solver = solver + + def compute(self, real_data: pd.DataFrame, synthetic_data: pd.DataFrame) -> dict[str, float]: + """ + Computes how well a LogisticRegression model from sklearn (as implemented in SynthEval) can distinguish between + real and synthetic data. The classification model is trained on a subset of the two data sources and then + applied to a heldout portion of the mixed data. The average pMSE of the 0 - synthetic, 1 - real predictions and + macro F1 scores across the folds are reported along with the standard error of these mean values. + + NOTE: Categorical variables need to be encoded before training the classifier. This can be accomplished by + preprocessing before calling ``compute`` or by setting ``do_preprocess`` to True. Note that if + ``do_preprocess`` is True, the default Syntheval pipeline is used, which does NOT one-hot encode the + categoricals. + + Args: + real_data: Real data to which the synthetic data may be compared. In many cases this will be data used + to TRAIN the model that generated the synthetic data, but not always. + synthetic_data: Synthetically generated data whose quality is to be assessed. + + Returns: + The mean pMSE and macro F1 scores for a LogisticRegression model. These values are keyed by 'avg_pmse' and + 'avg_macro_f1_score' respectively. The standard errors associated with these mean values are reported under + the keys 'pmse_standard_error' and 'macro_f1_standard_error' as well. + """ + if self.do_preprocess: + real_data, synthetic_data = self.preprocess(real_data, synthetic_data) + + # NOTE: The SynthEval MutualInformation class ignores column specifications by default. However, for + # other classes (correlation_matrix_difference for example), specifying less than all of the columns restricts + # the score computation to just those columns. To make this consistent we do that here, before passing to the + # SynthEval class. + filtered_real_data = real_data[self.all_columns] + filtered_synthetic_data = synthetic_data[self.all_columns] + + # Syntheval also ASSUMES you don't have a column in both provided dataframes called 'real' because it will + # attach another column with the same name, so we throw an error here if the column already exists. + assert "real" not in filtered_real_data.columns, "A column called 'real' already exists in the dataframe." + assert "real" not in filtered_synthetic_data.columns, "A column called 'real' already exists in the dataframe." 
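+ # The SynthEval metric constructed below runs its own cross-validation over self.folds folds when evaluate()
+ # is called; do_preprocessing is disabled here because any requested preprocessing was already applied above,
+ # and the SynthEval result keys are renamed to this toolkit's naming convention after evaluation.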
+ + self.syntheval_metric = PropensityMeanSquaredError( + real_data=filtered_real_data, + synt_data=filtered_synthetic_data, + hout_data=None, + cat_cols=self.categorical_columns, + num_cols=self.numerical_columns, + do_preprocessing=False, + verbose=False, + ) + result = self.syntheval_metric.evaluate(self.folds, self.max_iterations, self.solver) + result["avg_pmse"] = result.pop("avg pMSE") + result["pmse_standard_error"] = result.pop("pMSE err") + result["avg_macro_f1_score"] = result.pop("avg acc") + result["macro_f1_standard_error"] = result.pop("acc err") + return result diff --git a/tests/unit/evaluation/quality/test_mean_hellinger_distance.py b/tests/unit/evaluation/quality/test_mean_hellinger_distance.py new file mode 100644 index 00000000..1524ea07 --- /dev/null +++ b/tests/unit/evaluation/quality/test_mean_hellinger_distance.py @@ -0,0 +1,115 @@ +import math + +import numpy as np +import pandas as pd +import pytest + +from midst_toolkit.evaluation.quality.mean_hellinger_distance import MeanHellingerDistance + + +REAL_DATA = pd.DataFrame( + { + "column_a": [1, 2, 3, 4, 5], + "column_b": [4, 5, 6, 7, 8], + "column_c": ["horse", "dog", "horse", "cat", "cat"], + "column_d": [-1, -2, -3, -2, -5], + } +) +SYNTHETIC_DATA = pd.DataFrame( + { + "column_a": [1, 2, 3, 4, 5], + "column_b": [4, 6, 6, -1, 1], + "column_c": ["cat", "dog", "horse", "cat", "cat"], + "column_d": [-1, -2, -3, -2, -50], + } +) + +REAL_DATA_ENCODED = pd.DataFrame({"column_c": [1, 2, 1, 3, 3]}) + +SYNTHETIC_DATA_ENCODED = pd.DataFrame({"column_c": [3, 2, 1, 4, 3]}) + + +def test_mean_hellinger_distance_with_no_preprocess() -> None: + metric = MeanHellingerDistance( + categorical_columns=["column_c"], + numerical_columns=[], + do_preprocess=False, + ) + + discrete_real = np.array([2 / 5, 1 / 5, 2 / 5]) + # 4 gets collapsed into the last bin + synthetic_real = np.array([1 / 5, 1 / 5, 3 / 5]) + target = (1.0 / math.sqrt(2)) * np.linalg.norm(np.sqrt(discrete_real) - np.sqrt(synthetic_real)) + score = metric.compute(REAL_DATA_ENCODED, SYNTHETIC_DATA_ENCODED) + assert pytest.approx(target, abs=1e-8) == score["mean_hellinger_distance"] + assert np.isnan(score["hellinger_standard_error"]) + + +def test_mean_hellinger_distance_with_preprocess_categorical() -> None: + metric = MeanHellingerDistance( + categorical_columns=["column_c"], + numerical_columns=[], + do_preprocess=True, + ) + + # Should be the same as after test_mean_hellinger_distance_with_no_preprocess running preprocessing + discrete_real = np.array([2 / 5, 1 / 5, 2 / 5]) + # 4 gets collapsed into the last bin + synthetic_real = np.array([1 / 5, 1 / 5, 3 / 5]) + target = (1.0 / math.sqrt(2)) * np.linalg.norm(np.sqrt(discrete_real) - np.sqrt(synthetic_real)) + score = metric.compute(REAL_DATA, SYNTHETIC_DATA) + assert pytest.approx(target, abs=1e-8) == score["mean_hellinger_distance"] + assert np.isnan(score["hellinger_standard_error"]) + + +def test_mean_hellinger_distance_with_preprocess() -> None: + metric = MeanHellingerDistance( + categorical_columns=[], + numerical_columns=["column_a", "column_b", "column_d"], + do_preprocess=True, + ) + # Should be the same, as preprocessing doesn't change the categorical MI + score = metric.compute(REAL_DATA, SYNTHETIC_DATA) + assert pytest.approx(0.3598897091778779, abs=1e-8) == score["mean_hellinger_distance"] + assert pytest.approx(0.18772239774180174, abs=1e-8) == score["hellinger_standard_error"] + + +def test_one_column_left_off() -> None: + metric = MeanHellingerDistance( + categorical_columns=["column_c"], + 
numerical_columns=["column_a", "column_b"], + do_preprocess=True, + ) + + # Make sure computation doesn't include the column that was not included. + target = 1 / 3 * (0.16510402468972515 + 0.0 + 0.6324555320336758) + score = metric.compute(REAL_DATA, SYNTHETIC_DATA) + assert pytest.approx(target, abs=1e-8) == score["mean_hellinger_distance"] + + +def test_mean_hellinger_distance_no_numericals() -> None: + metric = MeanHellingerDistance( + categorical_columns=["column_b", "column_c"], + numerical_columns=[], + do_preprocess=True, + ) + + # Everything should still work with an empty numerical list + target = 1 / 2 * (0.3422824674525135 + 0.16510402468972515) + score = metric.compute(REAL_DATA, SYNTHETIC_DATA) + assert pytest.approx(target, abs=1e-8) == score["mean_hellinger_distance"] + + +def test_mean_hellinger_distance_do_not_include_numericals() -> None: + metric = MeanHellingerDistance( + categorical_columns=["column_b", "column_c"], + numerical_columns=["column_a", "column_d"], + do_preprocess=True, + include_numerical_columns=False, + ) + + # Should be the same as test_mean_hellinger_distance_no_numericals since we're saying we do not want to include + # numerical columns in the computations. + target = 1 / 2 * (0.3422824674525135 + 0.16510402468972515) + score = metric.compute(REAL_DATA, SYNTHETIC_DATA) + assert pytest.approx(target, abs=1e-8) == score["mean_hellinger_distance"] diff --git a/tests/unit/evaluation/quality/test_mean_propensity_mse.py b/tests/unit/evaluation/quality/test_mean_propensity_mse.py new file mode 100644 index 00000000..8e06fc8c --- /dev/null +++ b/tests/unit/evaluation/quality/test_mean_propensity_mse.py @@ -0,0 +1,82 @@ +from random import choices + +import numpy as np +import pandas as pd +import pytest + +from midst_toolkit.common.random import set_all_random_seeds, unset_all_random_seeds +from midst_toolkit.evaluation.quality.mean_propensity_mse import MeanPropensityMeanSquaredError + + +def get_data() -> tuple[pd.DataFrame, pd.DataFrame]: + real_data = pd.DataFrame( + { + "column_a": 1.2 * np.random.randn(2500) + 1, + "column_b": 2.5 * np.random.randn(2500) - 1, + "column_c": choices(["cat", "horse", "dog"], weights=[0.25, 0.5, 0.25], k=2500), + "column_d": 1.5 * np.random.randn(2500) + 12, + } + ) + synthetic_data = pd.DataFrame( + { + "column_a": 1.1 * np.random.randn(2500) + 0.5, + "column_b": 2.2 * np.random.randn(2500) - 1, + "column_c": choices(["cat", "horse", "dog"], weights=[0.35, 0.35, 0.3], k=2500), + "column_d": 1.5 * np.random.randn(2500) + 10, + } + ) + return real_data, synthetic_data + + +def test_mean_propensity_mse_with_preprocess() -> None: + set_all_random_seeds(42) + + real_data, synthetic_data = get_data() + + metric = MeanPropensityMeanSquaredError( + categorical_columns=["column_c"], + numerical_columns=["column_a", "column_b", "column_d"], + do_preprocess=True, + ) + + score = metric.compute(real_data, synthetic_data) + assert pytest.approx(0.08442776888821232, abs=1e-8) == score["avg_pmse"] + assert pytest.approx(0.7566743656985974, abs=1e-8) == score["avg_macro_f1_score"] + unset_all_random_seeds() + + +def test_mean_propensity_mse_with_no_categorical() -> None: + set_all_random_seeds(42) + + real_data, synthetic_data = get_data() + + metric = MeanPropensityMeanSquaredError( + categorical_columns=[], + numerical_columns=["column_a", "column_b", "column_d"], + do_preprocess=True, + ) + + score = metric.compute(real_data, synthetic_data) + assert pytest.approx(0.08000946858157684, abs=1e-8) == score["avg_pmse"] + assert 
pytest.approx(0.7485073080661124, abs=1e-8) == score["avg_macro_f1_score"] + unset_all_random_seeds() + + +def test_mean_propensity_mse_with_no_numerical_and_shortcut() -> None: + set_all_random_seeds(42) + + real_data, synthetic_data = get_data() + real_data["column_e"] = 1 + synthetic_data["column_e"] = 0 + + metric = MeanPropensityMeanSquaredError( + categorical_columns=["column_c", "column_e"], + numerical_columns=[], + do_preprocess=True, + ) + + score = metric.compute(real_data, synthetic_data) + # pMSE should be close to 0.25 and F1 should be essentially 1 due to the shortcut. + assert pytest.approx(0.24374514497992789, abs=1e-8) == score["avg_pmse"] + assert pytest.approx(1.0, abs=1e-8) == score["avg_macro_f1_score"] + unset_all_random_seeds() From a790991e217002aec71b64eb1cb30225cff93d6c Mon Sep 17 00:00:00 2001 From: David Emerson <43939939+emersodb@users.noreply.github.com> Date: Wed, 17 Sep 2025 09:43:03 -0400 Subject: [PATCH 2/5] Fix typing issue --- .../evaluation/quality/mean_hellinger_distance.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/midst_toolkit/evaluation/quality/mean_hellinger_distance.py b/src/midst_toolkit/evaluation/quality/mean_hellinger_distance.py index 374761d7..41aecb3a 100644 --- a/src/midst_toolkit/evaluation/quality/mean_hellinger_distance.py +++ b/src/midst_toolkit/evaluation/quality/mean_hellinger_distance.py @@ -125,7 +125,12 @@ def compute(self, real_data: pd.DataFrame, synthetic_data: pd.DataFrame) -> dict distance = hellinger_distance(real_discrete_pdf, synthetic_discrete_pdf) hellinger_distances.append(distance) + mean_hellinger_distance = np.mean(hellinger_distances).item() + hellinger_distance_standard_error = np.std(hellinger_distances, ddof=1).item() / np.sqrt( + len(hellinger_distances) + ) + return { - "mean_hellinger_distance": np.mean(hellinger_distances), - "hellinger_standard_error": np.std(hellinger_distances, ddof=1) / np.sqrt(len(hellinger_distances)), + "mean_hellinger_distance": mean_hellinger_distance, + "hellinger_standard_error": hellinger_distance_standard_error, } From 59ea7f4fb3ee69c89b02a82880c622efebee4f3e Mon Sep 17 00:00:00 2001 From: David Emerson <43939939+emersodb@users.noreply.github.com> Date: Wed, 24 Sep 2025 11:09:41 -0400 Subject: [PATCH 3/5] New mypy flow and fixes to typing issues that were discovered --- .pre-commit-config.yaml | 11 ++--- mypy.ini | 38 +++++++++++++++ pyproject.toml | 26 ----------- run_mypy.sh | 3 ++ .../attacks/ensemble/process_split_data.py | 4 +- src/midst_toolkit/core/data_loaders.py | 29 ++++++++++-- src/midst_toolkit/core/logger.py | 18 ++++---- src/midst_toolkit/data_processing/utils.py | 4 +- .../privacy/distance_closest_record.py | 3 ++ .../quality/synthcity/dataloader.py | 19 ++++---- .../quality/synthcity/feature_encoder.py | 10 ++-- .../quality/synthcity/statistical_eval.py | 16 +++---- src/midst_toolkit/evaluation/utils.py | 2 +- .../models/clavaddpm/diffusion_utils.py | 20 ++++---- .../gaussian_multinomial_diffusion.py | 20 +++++--- src/midst_toolkit/models/clavaddpm/model.py | 46 +++++++++++-------- src/midst_toolkit/models/clavaddpm/sampler.py | 6 +-- src/midst_toolkit/models/clavaddpm/train.py | 4 +- 18 files changed, 167 insertions(+), 112 deletions(-) create mode 100644 mypy.ini create mode 100755 run_mypy.sh diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3c46dd45..376e0bfc 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -34,13 +34,12 @@ repos: types_or: [python, jupyter] - repo: 
https://github.com/pre-commit/mirrors-mypy - rev: v1.16.0 + rev: v1.18.2 hooks: - - id: mypy - entry: python3 -m mypy --config-file pyproject.toml - language: system - types: [python] - exclude: "tests" + - id: mypy + name: mypy + entry: ./run_mypy.sh + language: system - repo: local hooks: diff --git a/mypy.ini b/mypy.ini new file mode 100644 index 00000000..0a7ee8a1 --- /dev/null +++ b/mypy.ini @@ -0,0 +1,38 @@ +[mypy] +mypy_path=src +follow_imports = normal +ignore_missing_imports = False +install_types = True +pretty = True +non_interactive = True +disallow_untyped_defs = True +no_implicit_optional = True +check_untyped_defs = True +allow_untyped_decorators = False +allow_incomplete_defs = False +warn_redundant_casts = True +warn_unused_ignores = True +implicit_reexport = False +strict_equality = True +extra_checks = True +warn_unused_configs = True +allow_subclassing_any = False +exclude = (venv|examples/tutorial/*|tests) + +[mypy-sklearn.*] +ignore_missing_imports = True + +[mypy-syntheval.*] +ignore_missing_imports = True + +[mypy-opacus.*] +ignore_missing_imports = True + +[mypy-nltk.*] +ignore_missing_imports = True + +[mypy-scipy.*] +ignore_missing_imports = True + +[mypy-category_encoders.*] +ignore_missing_imports = True diff --git a/pyproject.toml b/pyproject.toml index bf8af1b6..d44996dd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -53,32 +53,6 @@ docs = [ [tool.uv] default-groups = ["dev", "docs"] -[tool.mypy] -follow_imports = "normal" -ignore_missing_imports = false -install_types = true -pretty = true -non_interactive = true -disallow_untyped_defs = false -no_implicit_optional = true -check_untyped_defs = true -namespace_packages = true -explicit_package_bases = true -warn_unused_configs = true -allow_subclassing_any = false -allow_untyped_calls = true -allow_incomplete_defs = false -allow_untyped_decorators = false -warn_redundant_casts = true -warn_unused_ignores = true -implicit_reexport = false -strict_equality = true -extra_checks = true -mypy_path = "src" -files = ["src", "examples"] -exclude = [ - "examples/tutorial/.*" -] [tool.ruff] include = ["*.py", "pyproject.toml"] diff --git a/run_mypy.sh b/run_mypy.sh new file mode 100755 index 00000000..112c97da --- /dev/null +++ b/run_mypy.sh @@ -0,0 +1,3 @@ +#!/bin/sh + +mypy --config-file ./mypy.ini . 
diff --git a/src/midst_toolkit/attacks/ensemble/process_split_data.py b/src/midst_toolkit/attacks/ensemble/process_split_data.py index dee59279..1e9b5e98 100644 --- a/src/midst_toolkit/attacks/ensemble/process_split_data.py +++ b/src/midst_toolkit/attacks/ensemble/process_split_data.py @@ -110,7 +110,7 @@ def generate_train_test_challenge_splits( # Shuffle data df_val = df_val.sample(frac=1, random_state=random_seed).reset_index(drop=True) - y_val = df_val["is_train"].values + y_val = df_val["is_train"].to_numpy() df_val = df_val.drop(columns=["is_train"]) # Test set @@ -139,7 +139,7 @@ def generate_train_test_challenge_splits( df_test = df_test.sample(frac=1, random_state=random_seed).reset_index(drop=True) - y_test = df_test["is_train"].values + y_test = df_test["is_train"].to_numpy() df_test = df_test.drop(columns=["is_train"]) return df_val, y_val, df_test, y_test diff --git a/src/midst_toolkit/core/data_loaders.py b/src/midst_toolkit/core/data_loaders.py index 81513f7b..cabbf9b2 100644 --- a/src/midst_toolkit/core/data_loaders.py +++ b/src/midst_toolkit/core/data_loaders.py @@ -6,7 +6,9 @@ import pandas as pd -def load_multi_table(data_dir, verbose=True): +def load_multi_table( + data_dir: str, verbose: bool = True +) -> tuple[dict[str, Any], list[tuple[str, str]], dict[str, Any]]: dataset_meta = json.load(open(os.path.join(data_dir, "dataset_meta.json"), "r")) relation_order = dataset_meta["relation_order"] @@ -71,7 +73,7 @@ def pipeline_process_data( ratio: float = 0.9, save: bool = False, verbose: bool = True, -) -> tuple[dict[str, Any], dict[str, Any]]: +) -> tuple[dict[str, dict[str, Any]], dict[str, Any]]: # ruff: noqa: D103 num_data = data_df.shape[0] @@ -93,15 +95,18 @@ def pipeline_process_data( num_train = int(num_data * ratio) num_test = num_data - num_train + test_df: pd.DataFrame | None = None + if ratio < 1: train_df, test_df, seed = train_val_test_split(data_df, cat_columns, num_train, num_test) else: train_df = data_df.copy() - train_df.columns = range(len(train_df.columns)) + train_df.columns = list(range(len(train_df.columns))) if ratio < 1: - test_df.columns = range(len(test_df.columns)) + assert test_df is not None + test_df.columns = list(range(len(test_df.columns))) col_info: dict[Any, Any] = {} @@ -131,6 +136,7 @@ def pipeline_process_data( train_df.rename(columns=idx_name_mapping, inplace=True) if ratio < 1: + assert test_df is not None test_df.rename(columns=idx_name_mapping, inplace=True) for col in num_columns: @@ -139,6 +145,7 @@ def pipeline_process_data( train_df.loc[train_df[col] == "?", col] = "nan" if ratio < 1: + assert test_df is not None for col in num_columns: test_df.loc[test_df[col] == "?", col] = np.nan for col in cat_columns: @@ -148,7 +155,12 @@ def pipeline_process_data( X_cat_train = train_df[cat_columns].to_numpy() y_train = train_df[target_columns].to_numpy() + X_num_test: np.ndarray | None = None + X_cat_test: np.ndarray | None = None + y_test: np.ndarray | None = None + if ratio < 1: + assert test_df is not None X_num_test = test_df[num_columns].to_numpy().astype(np.float32) X_cat_test = test_df[cat_columns].to_numpy() y_test = test_df[target_columns].to_numpy() @@ -160,6 +172,7 @@ def pipeline_process_data( np.save(f"{save_dir}/y_train.npy", y_train) if ratio < 1: + assert X_num_test is not None and X_cat_test is not None and y_test is not None np.save(f"{save_dir}/X_num_test.npy", X_num_test) np.save(f"{save_dir}/X_cat_test.npy", X_cat_test) np.save(f"{save_dir}/y_test.npy", y_test) @@ -167,12 +180,14 @@ def pipeline_process_data( 
train_df[num_columns] = train_df[num_columns].astype(np.float32) if ratio < 1: + assert test_df is not None test_df[num_columns] = test_df[num_columns].astype(np.float32) if save: train_df.to_csv(f"{save_dir}/train.csv", index=False) if ratio < 1: + assert test_df is not None test_df.to_csv(f"{save_dir}/test.csv", index=False) if not os.path.exists(f"synthetic/{name}"): @@ -181,12 +196,14 @@ def pipeline_process_data( train_df.to_csv(f"synthetic/{name}/real.csv", index=False) if ratio < 1: + assert test_df is not None test_df.to_csv(f"synthetic/{name}/test.csv", index=False) info["column_names"] = column_names info["train_num"] = train_df.shape[0] if ratio < 1: + assert test_df is not None info["test_num"] = test_df.shape[0] info["idx_mapping"] = idx_mapping @@ -227,6 +244,7 @@ def pipeline_process_data( if verbose: if ratio < 1: + assert test_df is not None str_shape = "Train dataframe shape: {}, Test dataframe shape: {}, Total dataframe shape: {}".format( train_df.shape, test_df.shape, data_df.shape ) @@ -251,7 +269,7 @@ def pipeline_process_data( # print('Num', num) # print('Cat', cat) - data = { + data: dict[str, dict[str, Any]] = { "df": {"train": train_df}, "numpy": { "X_num_train": X_num_train, @@ -261,6 +279,7 @@ def pipeline_process_data( } if ratio < 1: + assert test_df is not None and X_num_test is not None and X_cat_test is not None and y_test is not None data["df"]["test"] = test_df data["numpy"]["X_num_test"] = X_num_test data["numpy"]["X_cat_test"] = X_cat_test diff --git a/src/midst_toolkit/core/logger.py b/src/midst_toolkit/core/logger.py index b76ce8b7..1775163a 100644 --- a/src/midst_toolkit/core/logger.py +++ b/src/midst_toolkit/core/logger.py @@ -15,7 +15,7 @@ import time import warnings from collections import defaultdict -from collections.abc import Generator, Iterable +from collections.abc import Callable, Generator, Iterable from contextlib import contextmanager from typing import IO, Any @@ -55,7 +55,7 @@ def __init__(self, filename_or_file: str | IO[str]): self.file = filename_or_file # type: ignore[assignment] self.own_file = False - def writekvs(self, kvs): + def writekvs(self, kvs: dict[str, Any]) -> None: # Create strings for printing key2str = {} for key, val in sorted(kvs.items()): @@ -84,7 +84,7 @@ def _truncate(self, s: str) -> str: maxlen = 30 return s[: maxlen - 3] + "..." if len(s) > maxlen else s - def writeseq(self, seq): + def writeseq(self, seq: Iterable[str]) -> None: seq = list(seq) for i, elem in enumerate(seq): self.file.write(elem) @@ -103,7 +103,7 @@ def __init__(self, filename: str): self.file = open(filename, "wt") # ruff: noqa: SIM115 - def writekvs(self, kvs): + def writekvs(self, kvs: dict[str, Any]) -> None: for k, v in sorted(kvs.items()): if hasattr(v, "dtype"): kvs[k] = float(v) @@ -121,7 +121,7 @@ def __init__(self, filename: str): self.keys: list[str] = [] self.sep = "," - def writekvs(self, kvs): + def writekvs(self, kvs: dict[str, Any]) -> None: # Add our current row to the history extra_keys = list(kvs.keys() - self.keys) extra_keys.sort() @@ -297,7 +297,7 @@ def profile_kv(scopename: str) -> Generator[None, None, None]: get_current().name2val[logkey] += time.time() - tstart -def profile(n): +def profile(n: str) -> Callable: """ Usage. 
@@ -305,8 +305,8 @@ def profile(n): def my_func(): code """ - def decorator_with_name(func): - def func_wrapper(*args, **kwargs): + def decorator_with_name(func): # type: ignore + def func_wrapper(*args, **kwargs): # type: ignore with profile_kv(n): return func(*args, **kwargs) @@ -483,7 +483,7 @@ def reset() -> None: @contextmanager -def scoped_configure(dir=None, format_strs=None, comm=None): +def scoped_configure(dir=None, format_strs=None, comm=None): # type: ignore # ruff: noqa: D103 prevlogger = Logger.CURRENT configure(dir=dir, format_strs=format_strs, comm=comm) diff --git a/src/midst_toolkit/data_processing/utils.py b/src/midst_toolkit/data_processing/utils.py index f46b8b9d..9a940c64 100644 --- a/src/midst_toolkit/data_processing/utils.py +++ b/src/midst_toolkit/data_processing/utils.py @@ -43,7 +43,7 @@ def __init__( synthetic_data: pd.DataFrame, categorical_columns: list[str] | None, numerical_columns: list[str] | None, - holdout_data: pd.DataFrame = None, + holdout_data: pd.DataFrame | None = None, ) -> None: """ A class responsible for fitting encoders and scalers for categorical and numerical columns of dataframes, @@ -214,6 +214,6 @@ def is_column_type_numerical(dataframe: pd.DataFrame, column_name: str) -> bool: Returns: True if the column contains numerical values. False otherwise. """ - column_dtype = dataframe[column_name].dtype + column_dtype = dataframe[column_name].to_numpy().dtype return np.issubdtype(column_dtype, np.integer) or np.issubdtype(column_dtype, np.floating) diff --git a/src/midst_toolkit/evaluation/privacy/distance_closest_record.py b/src/midst_toolkit/evaluation/privacy/distance_closest_record.py index 38141a87..5f0afa96 100644 --- a/src/midst_toolkit/evaluation/privacy/distance_closest_record.py +++ b/src/midst_toolkit/evaluation/privacy/distance_closest_record.py @@ -120,6 +120,9 @@ def preprocess( if real_data_test is None: return (processed_synthetic_data, processed_real_data_train) + + assert num_real_data_test_np is not None + assert cat_real_data_test_oh is not None return ( processed_synthetic_data, processed_real_data_train, diff --git a/src/midst_toolkit/evaluation/quality/synthcity/dataloader.py b/src/midst_toolkit/evaluation/quality/synthcity/dataloader.py index 0eb11ffe..0d26cea9 100644 --- a/src/midst_toolkit/evaluation/quality/synthcity/dataloader.py +++ b/src/midst_toolkit/evaluation/quality/synthcity/dataloader.py @@ -5,7 +5,6 @@ import numpy as np import pandas as pd -import torch from sklearn.model_selection import train_test_split @@ -83,19 +82,19 @@ def columns(self) -> list: ... @abstractmethod - def dataframe(self) -> pd.DataFrame: + def to_dataframe(self) -> pd.DataFrame: """A method that returns the pandas dataframe that contains all features and samples.""" ... @abstractmethod - def numpy(self) -> np.ndarray: + def to_numpy(self) -> np.ndarray: """A method that returns the numpy array that contains all features and samples.""" ... 
@property def values(self) -> np.ndarray: """Pass through to the numpy method.""" - return self.numpy() + return self.to_numpy() @abstractmethod def info(self) -> dict: @@ -145,11 +144,11 @@ def test(self) -> DataLoader: def __repr__(self, *args: Any, **kwargs: Any) -> str: """Return a string representation.""" - return self.dataframe().__repr__(*args, **kwargs) + return self.to_dataframe().__repr__(*args, **kwargs) def _repr_html_(self, *args: Any, **kwargs: Any) -> Any: """Return a string representation in html format.""" - return self.dataframe()._repr_html_(*args, **kwargs) + return self.to_dataframe().to_html(*args, **kwargs) @abstractmethod def fillna(self, value: Any) -> DataLoader: @@ -266,13 +265,13 @@ def unpack(self, as_numpy: bool = False, pad: bool = False) -> Any: return np.asarray(x), np.asarray(y) return x, y - def dataframe(self) -> pd.DataFrame: + def to_dataframe(self) -> pd.DataFrame: """A method that returns the pandas dataframe that contains all features and samples.""" return self.data - def numpy(self) -> np.ndarray: + def to_numpy(self) -> np.ndarray: """A method that returns the numpy array that contains all features and samples.""" - return self.dataframe().values + return self.to_dataframe().to_numpy() def info(self) -> dict: """A method that returns a dictionary of DataLoader information.""" @@ -376,7 +375,7 @@ def is_tabular(self) -> bool: return True -def create_from_info(data: pd.DataFrame | torch.utils.data.Dataset, info: dict) -> DataLoader: +def create_from_info(data: pd.DataFrame, info: dict) -> DataLoader: """Helper for creating a DataLoader from existing information.""" if info["data_type"] == "generic": return GenericDataLoader.from_info(data, info) diff --git a/src/midst_toolkit/evaluation/quality/synthcity/feature_encoder.py b/src/midst_toolkit/evaluation/quality/synthcity/feature_encoder.py index 0435f65a..9f12ea45 100644 --- a/src/midst_toolkit/evaluation/quality/synthcity/feature_encoder.py +++ b/src/midst_toolkit/evaluation/quality/synthcity/feature_encoder.py @@ -74,9 +74,9 @@ def fit(self, x: pd.Series, y: Any = None, **kwargs: Any) -> FeatureEncoder: Returns: The fitted FeatureEncoder object. """ - self.feature_name_in = x.name + self.feature_name_in = str(x.name) self.feature_type_in = self._get_feature_type(x) - input = validate_shape(x.values, self.n_dim_in) + input = validate_shape(x.to_numpy(), self.n_dim_in) output = self._fit(input, **kwargs)._transform(input) self._out_shape = (-1, *output.shape[1:]) # for inverse_transform output = validate_shape(output, self.n_dim_out) @@ -101,7 +101,7 @@ def transform(self, x: pd.Series) -> pd.DataFrame | pd.Series: Returns: The transformed input. 
""" - data = validate_shape(x.values, self.n_dim_in) + data = validate_shape(x.to_numpy(), self.n_dim_in) out = self._transform(data) out = validate_shape(out, self.n_dim_out) if self.n_dim_out == 1: @@ -135,7 +135,7 @@ def _get_feature_type(self, x: Any) -> str: def inverse_transform(self, df: pd.DataFrame | pd.Series) -> pd.Series: """Reverse the encoder mapping.""" - y = df.values.reshape(self._out_shape) + y = df.to_numpy().reshape(self._out_shape) x = self._inverse_transform(y) x = validate_shape(x, 1) return pd.Series(x, name=self.feature_name_in) @@ -197,4 +197,4 @@ def _transform(self, x: np.ndarray) -> np.ndarray: return pd.to_numeric(x).astype(float) def _inverse_transform(self, data: np.ndarray) -> np.ndarray: - return pd.to_datetime(data) + return pd.to_datetime(data).to_numpy() diff --git a/src/midst_toolkit/evaluation/quality/synthcity/statistical_eval.py b/src/midst_toolkit/evaluation/quality/synthcity/statistical_eval.py index 1bc38050..a084ca16 100644 --- a/src/midst_toolkit/evaluation/quality/synthcity/statistical_eval.py +++ b/src/midst_toolkit/evaluation/quality/synthcity/statistical_eval.py @@ -191,8 +191,8 @@ def _normalize_covariates( Returns: Tuple[pd.DataFrame, pd.DataFrame]: normalized version of the datasets """ - x_gt_norm = x.dataframe().copy() - x_syn_norm = x_syn.dataframe().copy() + x_gt_norm = x.to_dataframe().copy() + x_syn_norm = x_syn.to_dataframe().copy() if self._task_type != "survival_analysis": if hasattr(x, "target_column"): x_gt_norm = x_gt_norm.drop(columns=[x.target_column]) @@ -202,18 +202,18 @@ def _normalize_covariates( if hasattr(x, "target_column"): x_gt_norm_df = pd.DataFrame( scaler.transform(x_gt_norm), - columns=[col for col in x.train().dataframe().columns if col != x.target_column], + columns=[col for col in x.train().to_dataframe().columns if col != x.target_column], ) else: - x_gt_norm_df = pd.DataFrame(scaler.transform(x_gt_norm), columns=x.train().dataframe().columns) + x_gt_norm_df = pd.DataFrame(scaler.transform(x_gt_norm), columns=x.train().to_dataframe().columns) if hasattr(x_syn, "target_column"): x_syn_norm_df = pd.DataFrame( scaler.transform(x_syn_norm), - columns=[col for col in x_syn.dataframe().columns if col != x_syn.target_column], + columns=[col for col in x_syn.to_dataframe().columns if col != x_syn.target_column], ) else: - x_syn_norm_df = pd.DataFrame(scaler.transform(x_syn_norm), columns=x_syn.dataframe().columns) + x_syn_norm_df = pd.DataFrame(scaler.transform(x_syn_norm), columns=x_syn.to_dataframe().columns) return x_gt_norm_df, x_syn_norm_df @@ -234,8 +234,8 @@ def _evaluate( """ results = {} - x_ = x.numpy().reshape(len(x), -1) - x_syn_ = x_syn.numpy().reshape(len(x_syn), -1) + x_ = x.to_numpy().reshape(len(x), -1) + x_syn_ = x_syn.to_numpy().reshape(len(x_syn), -1) # OneClass representation emb = "_OC" diff --git a/src/midst_toolkit/evaluation/utils.py b/src/midst_toolkit/evaluation/utils.py index f850ef78..fb6eb390 100644 --- a/src/midst_toolkit/evaluation/utils.py +++ b/src/midst_toolkit/evaluation/utils.py @@ -64,7 +64,7 @@ def extract_columns_based_on_meta_info( # Training the diffusion generators. 
# Enumerate columns and replace column name with index - data.columns = range(len(data.columns)) + data.columns = list(range(len(data.columns))) # Get numerical and categorical column indices from meta info # NOTE: numerical and categorical columns are the only admissible/generate-able types" diff --git a/src/midst_toolkit/models/clavaddpm/diffusion_utils.py b/src/midst_toolkit/models/clavaddpm/diffusion_utils.py index de4be1e2..76c72df2 100644 --- a/src/midst_toolkit/models/clavaddpm/diffusion_utils.py +++ b/src/midst_toolkit/models/clavaddpm/diffusion_utils.py @@ -1,5 +1,6 @@ """PLACEHOLDER.""" +from collections.abc import Callable from inspect import isfunction from typing import Any @@ -18,7 +19,7 @@ def normal_kl( logvar2: Tensor | float, ) -> Tensor: """ - Compute the KL divergence between two gaussians. + Compute the KL divergence between two Gaussians. Shapes are automatically broadcasted, so batches can be compared to scalars, among other use cases. @@ -133,11 +134,14 @@ def extract(a: Tensor, t: Tensor, x_shape: tuple[int, ...]) -> Tensor: return out.expand(x_shape) -def default(val, d): +def default(val: Tensor, d: Callable[[], Tensor] | Tensor) -> Tensor: # ruff: noqa: D103 if exists(val): return val - return d() if isfunction(d) else d + if isfunction(d): + return d() + assert isinstance(d, Tensor) + return d def log_categorical(log_x_start: Tensor, log_prob: Tensor) -> Tensor: @@ -155,7 +159,7 @@ def index_to_log_onehot(x: Tensor, num_classes: Tensor) -> Tensor: return torch.log(x_onehot.float().clamp(min=1e-30)) -def log_sum_exp_by_classes(x, slices): +def log_sum_exp_by_classes(x: Tensor, slices: Tensor) -> Tensor: # ruff: noqa: D103 res = torch.zeros_like(x) for ixs in slices: @@ -167,14 +171,14 @@ def log_sum_exp_by_classes(x, slices): @torch.jit.script -def log_sub_exp(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor: +def log_sub_exp(a: Tensor, b: Tensor) -> Tensor: # ruff: noqa: D103 m = torch.maximum(a, b) return torch.log(torch.exp(a - m) - torch.exp(b - m)) + m @torch.jit.script -def sliced_logsumexp(x, slices): +def sliced_logsumexp(x: Tensor, slices: Tensor) -> Tensor: # ruff: noqa: D103 lse = torch.logcumsumexp(torch.nn.functional.pad(x, [1, 0, 0, 0], value=-float("inf")), dim=-1) @@ -185,7 +189,7 @@ def sliced_logsumexp(x, slices): return torch.repeat_interleave(slice_lse, slice_ends - slice_starts, dim=-1) -def log_onehot_to_index(log_x): +def log_onehot_to_index(log_x: Tensor) -> Tensor: # ruff: noqa: D103 return log_x.argmax(1) @@ -193,6 +197,6 @@ def log_onehot_to_index(log_x): class FoundNANsError(BaseException): """Found NANs during sampling.""" - def __init__(self, message="Found NANs during sampling."): + def __init__(self, message: str = "Found NANs during sampling.") -> None: # ruff: noqa: D107 super(FoundNANsError, self).__init__(message) diff --git a/src/midst_toolkit/models/clavaddpm/gaussian_multinomial_diffusion.py b/src/midst_toolkit/models/clavaddpm/gaussian_multinomial_diffusion.py index bce925a8..2240e2a4 100644 --- a/src/midst_toolkit/models/clavaddpm/gaussian_multinomial_diffusion.py +++ b/src/midst_toolkit/models/clavaddpm/gaussian_multinomial_diffusion.py @@ -751,7 +751,7 @@ def mixed_loss(self, x: Tensor, out_dict: dict[str, Tensor]) -> tuple[Tensor, Te return loss_multi.mean(), loss_gauss.mean() @torch.no_grad() - def mixed_elbo(self, x0, out_dict): + def mixed_elbo(self, x0: Tensor, out_dict: dict[str, Tensor]) -> dict[str, Tensor]: b = x0.size(0) device = x0.device @@ -875,7 +875,15 @@ def gaussian_ddim_step( return mean_pred + 
nonzero_mask * sigma * noise @torch.no_grad() - def gaussian_ddim_sample(self, noise, T, out_dict, eta=0.0, model_kwargs=None, cond_fn=None): + def gaussian_ddim_sample( + self, + noise: Tensor, + T: int, + out_dict: dict[str, Tensor], + eta: float = 0.0, + model_kwargs: Any | None = None, + cond_fn: Callable | None = None, + ) -> Tensor: # ruff: noqa: D102, N803 x = noise b = x.shape[0] @@ -918,11 +926,11 @@ def gaussian_ddim_reverse_step( @torch.no_grad() def gaussian_ddim_reverse_sample( self, - x, - T, + x: Tensor, + T: int, # ruff: noqa: N803 - out_dict, - ): + out_dict: dict[str, Tensor], + ) -> Tensor: # ruff: noqa: D102 b = x.shape[0] device = x.device diff --git a/src/midst_toolkit/models/clavaddpm/model.py b/src/midst_toolkit/models/clavaddpm/model.py index c88fbdbc..17cab4b6 100644 --- a/src/midst_toolkit/models/clavaddpm/model.py +++ b/src/midst_toolkit/models/clavaddpm/model.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import hashlib import json import math @@ -255,7 +257,7 @@ def __init__( # Create a Sequential model from the list of layers self.model = nn.Sequential(*layers) - def forward(self, x, timesteps): + def forward(self, x: Tensor, timesteps: Tensor) -> Tensor: emb = self.time_embed(timestep_embedding(timesteps, self.dim_t)) x = self.proj(x) + emb # x = self.transformer_layer(x, x) @@ -310,10 +312,10 @@ def make_dataset_from_df( df: pd.DataFrame, T: Transformations, is_y_cond: str, - df_info: pd.DataFrame, + df_info: dict[str, Any], ratios: list[float] | None = None, std: float = 0, -) -> tuple[Dataset, dict[int, LabelEncoder], list[int]]: +) -> tuple[Dataset, dict[int, LabelEncoder], list[str]]: """ The order of the generated dataset: (y, X_num, X_cat). @@ -357,7 +359,7 @@ def make_dataset_from_df( X_num: dict[str, np.ndarray] | None = {} if df_info["num_cols"] is not None else None y = {} - cat_cols_with_y = [] + cat_cols_with_y: list[str] = [] if df_info["cat_cols"] is not None: cat_cols_with_y += df_info["cat_cols"] if is_y_cond == "concat": @@ -385,31 +387,33 @@ def make_dataset_from_df( X_num = {} if df_info["num_cols"] is not None or is_y_cond == "concat" else None y = {} - num_cols_with_y = [] + num_cols_with_y: list[str] = [] if df_info["num_cols"] is not None: num_cols_with_y += df_info["num_cols"] if is_y_cond == "concat": num_cols_with_y = [df_info["y_col"]] + num_cols_with_y if len(num_cols_with_y) > 0: - X_num["train"] = train_df[num_cols_with_y].values.astype(np.float32) # type: ignore[index] - X_num["val"] = val_df[num_cols_with_y].values.astype(np.float32) # type: ignore[index] - X_num["test"] = test_df[num_cols_with_y].values.astype(np.float32) # type: ignore[index] + assert X_num is not None + X_num["train"] = train_df[num_cols_with_y].values.astype(np.float32) + X_num["val"] = val_df[num_cols_with_y].values.astype(np.float32) + X_num["test"] = test_df[num_cols_with_y].values.astype(np.float32) y["train"] = train_df[df_info["y_col"]].values.astype(np.float32) y["val"] = val_df[df_info["y_col"]].values.astype(np.float32) y["test"] = test_df[df_info["y_col"]].values.astype(np.float32) if df_info["cat_cols"] is not None: - X_cat["train"] = train_df[df_info["cat_cols"]].to_numpy(dtype=np.str_) # type: ignore[index] - X_cat["val"] = val_df[df_info["cat_cols"]].to_numpy(dtype=np.str_) # type: ignore[index] - X_cat["test"] = test_df[df_info["cat_cols"]].to_numpy(dtype=np.str_) # type: ignore[index] + assert X_cat is not None + X_cat["train"] = train_df[df_info["cat_cols"]].to_numpy(dtype=np.str_) + X_cat["val"] = 
val_df[df_info["cat_cols"]].to_numpy(dtype=np.str_) + X_cat["test"] = test_df[df_info["cat_cols"]].to_numpy(dtype=np.str_) cat_column_orders = [column_to_index[col] for col in df_info["cat_cols"]] num_column_orders = [column_to_index[col] for col in num_cols_with_y] - column_orders = num_column_orders + cat_column_orders - column_orders = [index_to_column[index] for index in column_orders] + column_orders_indices = num_column_orders + cat_column_orders + column_orders = [index_to_column[index] for index in column_orders_indices] label_encoders = {} if X_cat is not None and len(df_info["cat_cols"]) > 0: @@ -434,6 +438,7 @@ def make_dataset_from_df( X_cat["test"] = X_cat_converted[train_num + val_num :, :] # type: ignore[call-overload] if X_num and len(X_num) > 0: + assert X_num is not None X_num["train"] = np.concatenate((X_num["train"], X_cat["train"]), axis=1) X_num["val"] = np.concatenate((X_num["val"], X_cat["val"]), axis=1) X_num["test"] = np.concatenate((X_num["test"], X_cat["test"]), axis=1) @@ -441,6 +446,9 @@ def make_dataset_from_df( X_num = X_cat X_cat = None + n_classes = df_info["n_classes"] + assert isinstance(n_classes, int) + D = Dataset( # ruff: noqa: N806 X_num, @@ -448,7 +456,7 @@ def make_dataset_from_df( y, y_info={}, task_type=TaskType(df_info["task_type"]), - n_classes=df_info["n_classes"], + n_classes=n_classes, ) return transform_dataset(D, T, None), label_encoders, column_orders @@ -778,7 +786,7 @@ def __init__(self, *tensors: Tensor, batch_size: int = 32, shuffle: bool = False n_batches += 1 self.n_batches = n_batches - def __iter__(self): + def __iter__(self) -> FastTensorDataLoader: # ruff: noqa: D105 if self.shuffle: r = torch.randperm(self.dataset_len) @@ -786,7 +794,7 @@ def __iter__(self): self.i = 0 return self - def __next__(self): + def __next__(self) -> tuple[Tensor, ...]: # ruff: noqa: D105 if self.i >= self.dataset_len: raise StopIteration @@ -794,7 +802,7 @@ def __next__(self): self.i += self.batch_size return batch - def __len__(self): + def __len__(self) -> int: # ruff: noqa: D105 return self.n_batches @@ -1162,7 +1170,7 @@ def __init__( self.proj = nn.Linear(d_in, dim_t) self.time_embed = nn.Sequential(nn.Linear(dim_t, dim_t), nn.SiLU(), nn.Linear(dim_t, dim_t)) - def forward(self, x, timesteps, y=None): + def forward(self, x: Tensor, timesteps: Tensor, y: Tensor | None = None) -> Tensor: emb = self.time_embed(timestep_embedding(timesteps, self.dim_t)) if self.is_y_cond == "embedding" and y is not None: y = y.squeeze() if self.num_classes > 0 else y.resize_(y.size(0), 1).float() @@ -1198,7 +1206,7 @@ def __init__( self.time_embed = nn.Sequential(nn.Linear(dim_t, dim_t), nn.SiLU(), nn.Linear(dim_t, dim_t)) - def forward(self, x, timesteps, y=None): + def forward(self, x: Tensor, timesteps: Tensor, y: Tensor | None = None) -> Tensor: # ruff: noqa: D102 emb = self.time_embed(timestep_embedding(timesteps, self.dim_t)) if y is not None and self.num_classes > 0: diff --git a/src/midst_toolkit/models/clavaddpm/sampler.py b/src/midst_toolkit/models/clavaddpm/sampler.py index 3edb30bc..d9e54404 100644 --- a/src/midst_toolkit/models/clavaddpm/sampler.py +++ b/src/midst_toolkit/models/clavaddpm/sampler.py @@ -143,19 +143,19 @@ def __init__( self._loss_history = np.zeros([diffusion.num_timesteps, history_per_term], dtype=np.float64) self._loss_counts = np.zeros([diffusion.num_timesteps], dtype=np.uint) - def weights(self): + def weights(self) -> Tensor: """ Return the weights. Warms up the sampler if it's not warmed up. 
""" if not self._warmed_up(): - return np.ones([self.diffusion.num_timesteps], dtype=np.float64) + return torch.from_numpy(np.ones([self.diffusion.num_timesteps], dtype=np.float64)) weights = np.sqrt(np.mean(self._loss_history**2, axis=-1)) weights /= np.sum(weights) weights *= 1 - self.uniform_prob weights += self.uniform_prob / len(weights) - return weights + return torch.from_numpy(weights) def update_with_all_losses(self, ts: list[int], losses: list[float]) -> None: """ diff --git a/src/midst_toolkit/models/clavaddpm/train.py b/src/midst_toolkit/models/clavaddpm/train.py index a570c75e..84f70c3d 100644 --- a/src/midst_toolkit/models/clavaddpm/train.py +++ b/src/midst_toolkit/models/clavaddpm/train.py @@ -240,7 +240,7 @@ def child_training( def train_model( data_frame: pd.DataFrame, - data_frame_info: pd.DataFrame, + data_frame_info: dict[str, Any], model_params: dict[str, Any], transformations_dict: dict[str, Any], steps: int, @@ -349,7 +349,7 @@ def train_model( def train_classifier( data_frame: pd.DataFrame, - data_frame_info: pd.DataFrame, + data_frame_info: dict[str, Any], model_params: dict[str, Any], transformations_dict: dict[str, Any], classifier_steps: int, From 3defbf21c9827d2cc5d8f1960e2a1b785476802b Mon Sep 17 00:00:00 2001 From: David Emerson <43939939+emersodb@users.noreply.github.com> Date: Fri, 26 Sep 2025 16:39:15 -0400 Subject: [PATCH 4/5] Addressing some PR comments --- .../evaluation/quality/mean_hellinger_distance.py | 4 ++-- .../evaluation/quality/mean_propensity_mse.py | 11 ++++++----- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/src/midst_toolkit/evaluation/quality/mean_hellinger_distance.py b/src/midst_toolkit/evaluation/quality/mean_hellinger_distance.py index 41aecb3a..0eb739fe 100644 --- a/src/midst_toolkit/evaluation/quality/mean_hellinger_distance.py +++ b/src/midst_toolkit/evaluation/quality/mean_hellinger_distance.py @@ -56,7 +56,7 @@ def __init__( NOTE: The categorical columns MUST BE PREPROCESSED into numerical values otherwise the evaluation will fail. This function will NOT WORK WITH ONE-HOT ENCODINGS. This can be achieved by separately preprocessing the - dataframes before calling compute or by setting ``do_preprocess`` to True + dataframes before calling compute or by setting ``do_preprocess`` to True. Args: categorical_columns: Column names corresponding to the categorical variables of any provided dataframe. @@ -82,7 +82,7 @@ def compute(self, real_data: pd.DataFrame, synthetic_data: pd.DataFrame) -> dict NOTE: The categorical columns MUST BE PREPROCESSED into numerical values otherwise the evaluation will fail. This function will NOT WORK WITH ONE-HOT ENCODINGS. This can be achieved by separately preprocessing the - dataframes before calling compute or by setting ``do_preprocess`` to True + dataframes before calling compute or by setting ``do_preprocess`` to True. Args: real_data: Real data to which the synthetic data may be compared. In many cases this will be data used diff --git a/src/midst_toolkit/evaluation/quality/mean_propensity_mse.py b/src/midst_toolkit/evaluation/quality/mean_propensity_mse.py index 26544179..c0cd30f2 100644 --- a/src/midst_toolkit/evaluation/quality/mean_propensity_mse.py +++ b/src/midst_toolkit/evaluation/quality/mean_propensity_mse.py @@ -17,9 +17,9 @@ def __init__( """ This class measures how well a ``LogisticRegression`` model from sklearn (as implemented in SynthEval) can distinguish between real and synthetic data. 
The classification model is trained on a subset of the two data - sources and then applied to a heldout portion of the mixed data. The average pMSE for synthetic vs. real - predictions and macro F1 scores across the folds are reported along with the standard error of these mean - values. + sources and then applied to a validation split of the mixed data, created through cross-validation folds. The + average pMSE for synthetic vs. real predictions and macro F1 scores across the folds are reported along with + the standard error of these mean values. Computation of pMSE is based on the formula in: @@ -55,8 +55,9 @@ def compute(self, real_data: pd.DataFrame, synthetic_data: pd.DataFrame) -> dict """ Computes how well a LogisticRegression model from sklearn (as implemented in SynthEval) can distinguish between real and synthetic data. The classification model is trained on a subset of the two data sources and then - applied to a heldout portion of the mixed data. The average pMSE of the 0 - synthetic, 1 - real predictions and - macro F1 scores across the folds are reported along with the standard error of these mean values. + applied to a validation split of the mixed data, created through cross-fold validation on the combination of + the two datasets. The average pMSE of the 0 = synthetic, 1 = real predictions and macro F1 scores across the + folds are reported along with the standard error of these mean values. NOTE: Categorical variables need to be encoded before training the classifier. This can be accomplished by preprocessing before calling ``compute`` or by setting ``do_preprocess`` to True. Note that if From 91fd2cabc49605f20c5f6603fbc95fe8717d4eaf Mon Sep 17 00:00:00 2001 From: David Emerson <43939939+emersodb@users.noreply.github.com> Date: Tue, 30 Sep 2025 09:07:47 -0400 Subject: [PATCH 5/5] changes based on Fatemeh's PR review --- src/midst_toolkit/evaluation/metrics_base.py | 5 ++++- .../evaluation/quality/mean_hellinger_distance.py | 12 +++++++++++- .../evaluation/quality/mean_propensity_mse.py | 8 ++++---- 3 files changed, 19 insertions(+), 6 deletions(-) diff --git a/src/midst_toolkit/evaluation/metrics_base.py b/src/midst_toolkit/evaluation/metrics_base.py index 94d00e44..94746217 100644 --- a/src/midst_toolkit/evaluation/metrics_base.py +++ b/src/midst_toolkit/evaluation/metrics_base.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from logging import INFO +from logging import INFO, WARNING from typing import overload import pandas as pd @@ -54,6 +54,9 @@ def __init__( self.numerical_columns = numerical_columns self.do_preprocess = do_preprocess + if len(self.categorical_columns) == 0 and len(self.numerical_columns) == 0: + log(WARNING, "Both lists of column names are empty. 
This will result in unexpected metric behavior.") + if do_preprocess: log(INFO, "Default preprocessing will be performed during computation.") diff --git a/src/midst_toolkit/evaluation/quality/mean_hellinger_distance.py b/src/midst_toolkit/evaluation/quality/mean_hellinger_distance.py index 0eb739fe..268ca969 100644 --- a/src/midst_toolkit/evaluation/quality/mean_hellinger_distance.py +++ b/src/midst_toolkit/evaluation/quality/mean_hellinger_distance.py @@ -1,6 +1,9 @@ +from logging import WARNING + import numpy as np import pandas as pd +from midst_toolkit.common.logger import log from midst_toolkit.evaluation.metrics_base import SynthEvalQualityMetric @@ -8,7 +11,7 @@ def hellinger_distance(discrete_distribution_1: np.ndarray, discrete_distributio """ Compute the empirical Hellinger distance between two discrete probability distributions. Hellinger distance for discrete probability distributions $p$ and $q$ is expressed as - $$\\frac{1}{2} \\cdot \\Vert \\sqrt{p} - \\sqrt{q} \\Vert_2$$. + $$\\frac{1}{\\sqrt{2}} \\cdot \\Vert \\sqrt{p} - \\sqrt{q} \\Vert_2$$. Args: discrete_distribution_1: First discrete distribution for distance computation @@ -71,6 +74,13 @@ def __init__( self.include_numerical_columns = include_numerical_columns + if len(self.categorical_columns) == 0 and not self.include_numerical_columns: + log( + WARNING, + "No categorical columns provided and include_numerical_columns is False. This will result in a NaN " + "for the Hellinger distance.", + ) + def compute(self, real_data: pd.DataFrame, synthetic_data: pd.DataFrame) -> dict[str, float]: """ Computes the DISCRETE Hellinger Distance between corresponding columns of real and synthetic dataframes. For a diff --git a/src/midst_toolkit/evaluation/quality/mean_propensity_mse.py b/src/midst_toolkit/evaluation/quality/mean_propensity_mse.py index c0cd30f2..fca5fbd3 100644 --- a/src/midst_toolkit/evaluation/quality/mean_propensity_mse.py +++ b/src/midst_toolkit/evaluation/quality/mean_propensity_mse.py @@ -28,8 +28,8 @@ def __init__( NOTE: Categorical variables need to be encoded before training the classifier. This can be accomplished by preprocessing before calling ``compute`` or by setting ``do_preprocess`` to True. Note that if - ``do_preprocess`` is True, the default Syntheval pipeline is used, which does NOT one-hot encode the - categoricals. + ``do_preprocess`` is True, the default Syntheval pipeline is used, which performs ``OrdinalEncoding`` for + categorical columns and ``MinMaxScaling`` for numerical columns. - A smaller pMSE is better. In cases where the two datasets are balanced in size, 0.25 is worst case. - Higher Macro F1 is better. @@ -61,8 +61,8 @@ def compute(self, real_data: pd.DataFrame, synthetic_data: pd.DataFrame) -> dict NOTE: Categorical variables need to be encoded before training the classifier. This can be accomplished by preprocessing before calling ``compute`` or by setting ``do_preprocess`` to True. Note that if - ``do_preprocess`` is True, the default Syntheval pipeline is used, which does NOT one-hot encode the - categoricals. + ``do_preprocess`` is True, the default Syntheval pipeline is used, which performs ``OrdinalEncoding`` for + categorical columns and ``MinMaxScaling`` for numerical columns. Args: real_data: Real data to which the synthetic data may be compared. In many cases this will be data used
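As a rough usage sketch of the two metrics introduced in the first patch: the dataframes, column names, and random data below are illustrative placeholders rather than anything from the toolkit or its tests, and the trailing comments only indicate the keys returned by compute().

import numpy as np
import pandas as pd

from midst_toolkit.evaluation.quality.mean_hellinger_distance import MeanHellingerDistance
from midst_toolkit.evaluation.quality.mean_propensity_mse import MeanPropensityMeanSquaredError

rng = np.random.default_rng(42)

# Illustrative real and synthetic tables with one categorical and two numerical columns.
real_df = pd.DataFrame(
    {
        "category": rng.choice(["cat", "dog", "horse"], size=500, p=[0.25, 0.25, 0.5]),
        "feature_1": 1.2 * rng.standard_normal(500) + 1.0,
        "feature_2": 2.5 * rng.standard_normal(500) - 1.0,
    }
)
synthetic_df = pd.DataFrame(
    {
        "category": rng.choice(["cat", "dog", "horse"], size=500, p=[0.35, 0.3, 0.35]),
        "feature_1": 1.1 * rng.standard_normal(500) + 0.5,
        "feature_2": 2.2 * rng.standard_normal(500) - 1.0,
    }
)

# do_preprocess=True applies the default SynthEval pipeline, which ordinal-encodes the categorical column and
# min-max scales the numerical columns before the metrics are computed.
hellinger = MeanHellingerDistance(
    categorical_columns=["category"],
    numerical_columns=["feature_1", "feature_2"],
    do_preprocess=True,
)
print(hellinger.compute(real_df, synthetic_df))
# {'mean_hellinger_distance': ..., 'hellinger_standard_error': ...}

pmse = MeanPropensityMeanSquaredError(
    categorical_columns=["category"],
    numerical_columns=["feature_1", "feature_2"],
    do_preprocess=True,
    folds=5,
)
print(pmse.compute(real_df, synthetic_df))
# {'avg_pmse': ..., 'pmse_standard_error': ..., 'avg_macro_f1_score': ..., 'macro_f1_standard_error': ...}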