Merged

34 commits
adae16e
First checkin of hellinger and pmse implementations
emersodb Sep 17, 2025
a790991
Fix typing issue
emersodb Sep 17, 2025
43834a4
Adding in Hitting Rate and Mean F1 Difference implementations. Also f…
emersodb Sep 17, 2025
bec6418
Removing hard coding
emersodb Sep 17, 2025
fb3c48a
Some CR fixes from Marcelo's review
emersodb Sep 17, 2025
1705d50
Merge branch 'main' into dbe/add_hellinger_pmse
emersodb Sep 17, 2025
c286906
Train code split, Part 2: moving some of the model.py code into clust…
lotif Sep 17, 2025
072a09a
Merge branch 'dbe/add_hellinger_pmse' into dbe/add_f1_dff_hitting_rate
emersodb Sep 17, 2025
81a40ba
NNDR module and tests
emersodb Sep 22, 2025
c5a0682
Fixing small bug
emersodb Sep 22, 2025
704e48f
Adding in the epsilon identifiability risk metric
emersodb Sep 23, 2025
884c582
Small code fixes and documentation improvements
emersodb Sep 23, 2025
59ea7f4
New mypy flow and fixes to typing issues that were discovered
emersodb Sep 24, 2025
660729f
Merge branch 'dbe/fixing_mypy' into dbe/add_hellinger_pmse
emersodb Sep 24, 2025
4023d21
Merge branch 'dbe/add_hellinger_pmse' into dbe/add_f1_dff_hitting_rate
emersodb Sep 24, 2025
b985246
Merge branch 'dbe/add_f1_dff_hitting_rate' into dbe/add_nndr_and_eir
emersodb Sep 24, 2025
1b51d23
Some small updates
emersodb Sep 24, 2025
5bd360c
Merge branch 'main' into dbe/add_hellinger_pmse
emersodb Sep 24, 2025
810f3d6
Adding in a bit more revealing testing for the categorical column det…
emersodb Sep 24, 2025
8328547
Merge branch 'main' into dbe/add_hellinger_pmse
emersodb Sep 25, 2025
2ccdfa6
Merge branch 'dbe/add_hellinger_pmse' into dbe/add_f1_dff_hitting_rate
emersodb Sep 26, 2025
0147eb1
Merge branch 'dbe/add_f1_dff_hitting_rate' into dbe/add_nndr_and_eir
emersodb Sep 26, 2025
2de4692
Merge branch 'main' into dbe/add_hellinger_pmse
emersodb Sep 26, 2025
c78973f
Merge branch 'dbe/add_hellinger_pmse' into dbe/add_f1_dff_hitting_rate
emersodb Sep 26, 2025
1aa4e5a
Merge branch 'dbe/add_f1_dff_hitting_rate' into dbe/add_nndr_and_eir
emersodb Sep 26, 2025
8918c35
Merge branch 'main' into dbe/add_hellinger_pmse
emersodb Sep 26, 2025
b2c5de5
Merge branch 'main' into dbe/add_hellinger_pmse
emersodb Sep 26, 2025
3defbf2
Addressing some PR comments
emersodb Sep 26, 2025
217c7a7
Merge branch 'dbe/add_hellinger_pmse' into dbe/add_f1_dff_hitting_rate
emersodb Sep 26, 2025
789dd7b
Merge branch 'dbe/add_f1_dff_hitting_rate' into dbe/add_nndr_and_eir
emersodb Sep 26, 2025
500ce2b
Addressing some PR comments.
emersodb Sep 29, 2025
231265c
PR Comment changes
emersodb Sep 29, 2025
e41bcc7
Merge branch 'main' into dbe/add_nndr_and_eir
emersodb Sep 30, 2025
6b9d639
Dropping unused variable
emersodb Sep 30, 2025
2 changes: 1 addition & 1 deletion CONTRIBUTING.md
@@ -49,7 +49,7 @@ settings should be prescribed in your VS Code settings JSON:
```json
{
"autoDocstring.customTemplatePath": "",
"autoDocstring.docstringFormat": "google",
"autoDocstring.docstringFormat": "google-notypes",
"autoDocstring.generateDocstringOnEnter": true,
"autoDocstring.guessTypes": true,
"autoDocstring.includeExtendedSummary": false,
4 changes: 4 additions & 0 deletions src/midst_toolkit/common/variables.py
@@ -0,0 +1,4 @@
import torch


DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
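
For illustration, a minimal usage sketch of this shared constant; the call site below is hypothetical:

```python
import torch

from midst_toolkit.common.variables import DEVICE

# Tensors are placed on the GPU when CUDA is available and on the CPU otherwise.
synthetic_batch = torch.randn(1000, 16).to(DEVICE)
```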
8 changes: 4 additions & 4 deletions src/midst_toolkit/data_processing/utils.py
@@ -176,6 +176,8 @@ def get_categorical_columns(dataframe: pd.DataFrame, threshold: int) -> list[str
it is deemed a categorical column. For example, a hurricane might be rated from 1 to 5 in an integer-based column.
With a threshold of 10, this column would be added to the set of categorical columns.

NOTE: A failure case is DateTimes, which will not be detected as categorical, but are not exactly numerical either.

Args:
dataframe: Dataframe from which to extract column names corresponding to categorical variables.
threshold: Threshold below which a column with numerical values (integer or float for example) is deemed to
@@ -190,10 +192,8 @@ def get_categorical_columns(dataframe: pd.DataFrame, threshold: int) -> list[str

for column_name in dataframe.columns:
# If dtype is an object (as str columns are), assume categorical
if (
dataframe[column_name].dtype == "object"
or is_column_type_numerical(dataframe, column_name)
and dataframe[column_name].nunique() <= threshold
if dataframe[column_name].dtype == "object" or (
is_column_type_numerical(dataframe, column_name) and dataframe[column_name].nunique() <= threshold
):
categorical_variables.append(column_name)

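
To make the detection rule concrete, here is a small, hedged example; the dataframe, column names, and threshold are illustrative rather than taken from the repository's tests:

```python
import pandas as pd

from midst_toolkit.data_processing.utils import get_categorical_columns

df = pd.DataFrame(
    {
        "storm_name": ["Ana", "Bill", "Cara", "Dean"] * 3,  # object dtype, so always flagged as categorical
        "hurricane_rating": [1, 5, 3, 2] * 3,  # 4 unique integers, at or below the threshold
        "wind_speed": [98.3 + i * 2.7 for i in range(12)],  # 12 unique floats, above the threshold
    }
)

# Expected: ["storm_name", "hurricane_rating"]; wind_speed has more than 10 unique values.
print(get_categorical_columns(df, threshold=10))
```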
148 changes: 17 additions & 131 deletions src/midst_toolkit/evaluation/privacy/distance_closest_record.py
@@ -1,133 +1,15 @@
from logging import INFO
from typing import Any, overload
from typing import Any

import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import OneHotEncoder
from tqdm import tqdm

from midst_toolkit.common.logger import log
from midst_toolkit.common.variables import DEVICE
from midst_toolkit.evaluation.metrics_base import MetricBase
from midst_toolkit.evaluation.privacy.distance_preprocess import preprocess_for_distance_computation
from midst_toolkit.evaluation.privacy.distance_utils import NormType, minimum_distances
from midst_toolkit.evaluation.utils import extract_columns_based_on_meta_info


DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


@overload
Collaborator Author comment: Moving this to its own module, as it is useful for computing NNDR as well.

def preprocess(
meta_info: dict[str, Any], synthetic_data: pd.DataFrame, real_data_train: pd.DataFrame
) -> tuple[pd.DataFrame, pd.DataFrame]: ...


@overload
def preprocess(
meta_info: dict[str, Any],
synthetic_data: pd.DataFrame,
real_data_train: pd.DataFrame,
real_data_test: pd.DataFrame,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: ...


def preprocess(
meta_info: dict[str, Any],
synthetic_data: pd.DataFrame,
real_data_train: pd.DataFrame,
real_data_test: pd.DataFrame | None = None,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame] | tuple[pd.DataFrame, pd.DataFrame]:
"""
This function performs preprocessing on Pandas dataframes to prepare for computation of the distance to closest
record score. Specifically, this function filters the provided raw dataframes to the appropriate numerical and
categorical columns based on the information of the ``meta_info`` JSON. For the numerical columns, it normalizes
values by the distance between the largest and smallest value of each column of the ``real_data_train`` numerical
values. The categorical columns are processed into one-hot encoding columns, where the transformation is fitted
on the concatenation of columns from each dataset.

Args:
meta_info: JSON with meta information about the columns and their corresponding types that should be
considered.
synthetic_data: Dataframe containing all synthetically generated data.
real_data_train: Dataframe containing the real training data associated with the model that generated the
``synthetic_data``.
real_data_test: Dataframe containing the real test data. It's important that this data was not seen by the
model that generated ``synthetic_data`` during training. If None, then it will, of course, not be
preprocessed. Defaults to None.

Returns:
Processed Pandas dataframes with the synthetic data, real data for training, real data for testing if it was
provided.
"""
numerical_synthetic_data, categorical_synthetic_data = extract_columns_based_on_meta_info(
synthetic_data, meta_info
)
numerical_real_data_train, categorical_real_data_train = extract_columns_based_on_meta_info(
real_data_train, meta_info
)

numerical_ranges = [
numerical_real_data_train[index].max() - numerical_real_data_train[index].min()
for index in numerical_real_data_train.columns
]
numerical_ranges_np = np.array(numerical_ranges)

num_synthetic_data_np = numerical_synthetic_data.to_numpy()
num_real_data_train_np = numerical_real_data_train.to_numpy()

# Normalize the values of the numerical columns of the different datasets by the ranges of the train set.
num_synthetic_data_np = num_synthetic_data_np / numerical_ranges_np
num_real_data_train_np = num_real_data_train_np / numerical_ranges_np

cat_synthetic_data_np = categorical_synthetic_data.to_numpy().astype("str")
cat_real_data_train_np = categorical_real_data_train.to_numpy().astype("str")

if real_data_test is not None:
numerical_real_data_test, categorical_real_data_test = extract_columns_based_on_meta_info(
real_data_test, meta_info
)
num_real_data_test_np = numerical_real_data_test.to_numpy()
# Normalize the values of the numerical columns of the different datasets by the ranges of the train set.
num_real_data_test_np = num_real_data_test_np / numerical_ranges_np
cat_real_data_test_np = categorical_real_data_test.to_numpy().astype("str")
else:
num_real_data_test_np, cat_real_data_test_np = None, None

if categorical_real_data_train.shape[1] > 0:
encoder = OneHotEncoder()
if cat_real_data_test_np is not None:
encoder.fit(np.concatenate((cat_synthetic_data_np, cat_real_data_train_np, cat_real_data_test_np), axis=0))
else:
encoder.fit(np.concatenate((cat_synthetic_data_np, cat_real_data_train_np), axis=0))

cat_synthetic_data_oh = encoder.transform(cat_synthetic_data_np).toarray()
cat_real_data_train_oh = encoder.transform(cat_real_data_train_np).toarray()
if cat_real_data_test_np is not None:
cat_real_data_test_oh = encoder.transform(cat_real_data_test_np).toarray()

else:
cat_synthetic_data_oh = np.empty((categorical_synthetic_data.shape[0], 0))
cat_real_data_train_oh = np.empty((categorical_real_data_train.shape[0], 0))
if categorical_real_data_test is not None:
cat_real_data_test_oh = np.empty((categorical_real_data_test.shape[0], 0))

processed_real_data_train = pd.DataFrame(
np.concatenate((num_real_data_train_np, cat_real_data_train_oh), axis=1)
).astype(float)
processed_synthetic_data = pd.DataFrame(
np.concatenate((num_synthetic_data_np, cat_synthetic_data_oh), axis=1)
).astype(float)

if real_data_test is None:
return (processed_synthetic_data, processed_real_data_train)

assert num_real_data_test_np is not None
assert cat_real_data_test_oh is not None
return (
processed_synthetic_data,
processed_real_data_train,
pd.DataFrame(np.concatenate((num_real_data_test_np, cat_real_data_test_oh), axis=1)).astype(float),
)


class DistanceToClosestRecordScore(MetricBase):
@@ -159,13 +41,15 @@ def __init__(
Args:
norm: Determines what norm the distances are computed in. Defaults to NormType.L1.
batch_size: Batch size used to compute the DCR iteratively. Just needed to manage memory. Defaults to 1000.
device: What device the tensors should be sent to in order to perform the calculations. Defaults to DEVICE.
device: What device the tensors should be sent to in order to perform the calculations. Defaults to
"cuda" if CUDA is available, "cpu" otherwise.
meta_info: This is only required/used if ``do_preprocess`` is True. JSON with meta information about the
columns and their corresponding types that should be considered. At minimum, it should have the keys
'num_col_idx', 'cat_col_idx', 'target_col_idx', and 'task_type'. If None, then no preprocessing is
expected to be done. Defaults to None.
do_preprocess: Whether or not to preprocess the dataframes before performing the DCR computations.
Preprocessing is performed with the ``preprocess`` function Defaults to False.
Preprocessing is performed with the ``preprocess`` function. Note, ``meta_info`` must be provided in
order to perform the appropriate preprocessing steps. Defaults to False.
"""
self.norm = norm
self.batch_size = batch_size
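
A hypothetical construction call, sketched from the signature above; the ``meta_info`` values are placeholders and ``"binclass"`` is an assumed ``task_type``:

```python
from midst_toolkit.evaluation.privacy.distance_closest_record import DistanceToClosestRecordScore
from midst_toolkit.evaluation.privacy.distance_utils import NormType

# Placeholder meta information; the docstring requires at least these four keys.
meta_info = {"num_col_idx": [0], "cat_col_idx": [1], "target_col_idx": [2], "task_type": "binclass"}

metric = DistanceToClosestRecordScore(norm=NormType.L1, batch_size=1000, meta_info=meta_info, do_preprocess=True)
# scores = metric.compute(real_data, synthetic_data, holdout_data)  # dataframes supplied by the caller
```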
@@ -190,8 +74,8 @@ def compute(

NOTE: The dataframes provided need to be pre-processed into numerical values for each column in some way. That
is, for example, the categorical variables should be one-hot encoded and the numerical values normalized in
some way. This can be done via the ``preprocess`` function beforehand or it can be done within compute if
``do_preprocess`` is True and ``meta_info`` has been provided.
some way. This can be done via the ``preprocess`` function in ``distance_preprocess.py`` beforehand or it can
be done within ``compute`` if ``do_preprocess`` is True and ``meta_info`` has been provided.

Args:
real_data: Real data that was used to train the model that generated the ``synthetic_data``.
@@ -205,7 +89,7 @@
assert holdout_data is not None, "For DCR score calculations, a holdout dataset is required"

if self.do_preprocess:
synthetic_data, real_data, holdout_data = preprocess(
synthetic_data, real_data, holdout_data = preprocess_for_distance_computation(
self.meta_info, synthetic_data, real_data, holdout_data
)

@@ -221,7 +105,7 @@
end_index = min(start_index + self.batch_size, synthetic_data_tensor.size(0))
synthetic_data_batch = synthetic_data_tensor[start_index:end_index]

# Calculate distances for real and test data in smaller batches
# Calculate distances from synthetic data points to real and test data in smaller batches
dcr_train_batch = minimum_distances(
synthetic_data_batch, real_data_train_tensor, self.batch_size, self.norm
)
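
For context, a minimal sketch of the batched minimum-distance idea used here; this is a stand-in for the toolkit's ``minimum_distances`` helper, not its actual implementation, and the L1 distance is taken from the ``NormType.L1`` default:

```python
import torch

def minimum_distances_sketch(queries: torch.Tensor, references: torch.Tensor, batch_size: int = 1000) -> torch.Tensor:
    """For each query row, the smallest L1 distance to any reference row, computed over reference batches."""
    minimums = torch.full((queries.size(0),), float("inf"), dtype=queries.dtype)
    for start in range(0, references.size(0), batch_size):
        batch = references[start : start + batch_size]
        pairwise = torch.cdist(queries, batch, p=1)  # (num_queries, batch) L1 distances
        minimums = torch.minimum(minimums, pairwise.min(dim=1).values)
    return minimums
```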
@@ -260,13 +144,15 @@ def __init__(
Args:
norm: Determines what norm the distances are computed in. Defaults to NormType.L1.
batch_size: Batch size used to compute the DCR iteratively. Just needed to manage memory. Defaults to 1000.
device: What device the tensors should be sent to in order to perform the calculations. Defaults to DEVICE.
device: What device the tensors should be sent to in order to perform the calculations. Defaults to
"cuda" if CUDA is available, "cpu" otherwise.
meta_info: This is only required/used if ``do_preprocess`` is True. JSON with meta information about the
columns and their corresponding types that should be considered. At minimum, it should have the keys
'num_col_idx', 'cat_col_idx', 'target_col_idx', and 'task_type'. If None, then no preprocessing is
expected to be done. Defaults to None.
do_preprocess: Whether or not to preprocess the dataframes before performing the DCR computations.
Preprocessing is performed with the ``preprocess`` function Defaults to False.
Preprocessing is performed with the ``preprocess`` function. Note, ``meta_info`` must be provided in
order to perform the appropriate preprocessing steps. Defaults to False.
"""
self.norm = norm
self.batch_size = batch_size
@@ -287,7 +173,7 @@ def compute(self, real_data: pd.DataFrame, synthetic_data: pd.DataFrame) -> dict

NOTE: The dataframes provided need to be pre-processed into numerical values for each column in some way. That
is, for example, the categorical variables should be one-hot encoded and the numerical values normalized in
some way. This can be done via the ``preprocess`` function beforehand or it can be done within compute if
some way. This can be done via the ``preprocess`` function beforehand or it can be done within ``compute`` if
``do_preprocess`` is True and ``meta_info`` has been provided.

Args:
@@ -301,7 +187,7 @@ def compute(self, real_data: pd.DataFrame, synthetic_data: pd.DataFrame) -> dict
Example: { "median_dcr_score": 0.79 }
"""
if self.do_preprocess:
synthetic_data, real_data = preprocess(self.meta_info, synthetic_data, real_data)
synthetic_data, real_data = preprocess_for_distance_computation(self.meta_info, synthetic_data, real_data)

real_data_tensor = torch.tensor(real_data.to_numpy()).to(self.device)
synthetic_data_tensor = torch.tensor(synthetic_data.to_numpy()).to(self.device)
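
A condensed sketch of the median-DCR computation described above, assuming inputs that are already preprocessed to numerical columns; it mirrors the docstring's example output, not the toolkit's exact internals (which batch the computation):

```python
import pandas as pd
import torch

def median_dcr_sketch(real_data: pd.DataFrame, synthetic_data: pd.DataFrame) -> dict[str, float]:
    real = torch.tensor(real_data.to_numpy())
    synthetic = torch.tensor(synthetic_data.to_numpy())
    pairwise = torch.cdist(synthetic, real, p=1)  # L1 distances, matching the NormType.L1 default
    dcr = pairwise.min(dim=1).values  # distance from each synthetic row to its closest real record
    return {"median_dcr_score": dcr.median().item()}
```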
122 changes: 122 additions & 0 deletions src/midst_toolkit/evaluation/privacy/distance_preprocess.py
@@ -0,0 +1,122 @@
from typing import Any, overload

import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

from midst_toolkit.evaluation.utils import extract_columns_based_on_meta_info


@overload
def preprocess_for_distance_computation(
meta_info: dict[str, Any], synthetic_data: pd.DataFrame, real_data_train: pd.DataFrame
) -> tuple[pd.DataFrame, pd.DataFrame]: ...


@overload
def preprocess_for_distance_computation(
meta_info: dict[str, Any],
synthetic_data: pd.DataFrame,
real_data_train: pd.DataFrame,
real_data_test: pd.DataFrame,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: ...


def preprocess_for_distance_computation(
meta_info: dict[str, Any],
synthetic_data: pd.DataFrame,
real_data_train: pd.DataFrame,
real_data_test: pd.DataFrame | None = None,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame] | tuple[pd.DataFrame, pd.DataFrame]:
"""
This function performs preprocessing on Pandas dataframes to prepare for computation of various record-to-record
distances. This is used for computations like distance to closest record scores. Specifically, this function
filters the provided raw dataframes to the appropriate numerical and categorical columns based on the information
of the ``meta_info`` JSON. For the numerical columns, it normalizes values by the distance between the largest
and smallest value of each column of the ``real_data_train`` numerical values. The categorical columns are
processed into one-hot encoding columns, where the transformation is fitted on the concatenation of columns from
each dataset.

Args:
meta_info: JSON with meta information about the columns and their corresponding types that should be
considered.
synthetic_data: Dataframe containing all synthetically generated data.
real_data_train: Dataframe containing the real training data associated with the model that generated the
``synthetic_data``.
real_data_test: Dataframe containing the real test data. It's important that this data was not seen by the
model that generated ``synthetic_data`` during training. If None, then it will, of course, not be
preprocessed. Defaults to None.

Returns:
Processed Pandas dataframes with the synthetic data, real data for training, real data for testing if it was
provided.
"""
numerical_synthetic_data, categorical_synthetic_data = extract_columns_based_on_meta_info(
synthetic_data, meta_info
)
numerical_real_data_train, categorical_real_data_train = extract_columns_based_on_meta_info(
real_data_train, meta_info
)

numerical_ranges = [
numerical_real_data_train[index].max() - numerical_real_data_train[index].min()
for index in numerical_real_data_train.columns
]
numerical_ranges_np = np.array(numerical_ranges)

num_synthetic_data_np = numerical_synthetic_data.to_numpy()
num_real_data_train_np = numerical_real_data_train.to_numpy()

# Normalize the values of the numerical columns of the different datasets by the ranges of the train set.
num_synthetic_data_np = num_synthetic_data_np / numerical_ranges_np
num_real_data_train_np = num_real_data_train_np / numerical_ranges_np

cat_synthetic_data_np = categorical_synthetic_data.to_numpy().astype("str")
cat_real_data_train_np = categorical_real_data_train.to_numpy().astype("str")

if real_data_test is not None:
numerical_real_data_test, categorical_real_data_test = extract_columns_based_on_meta_info(
real_data_test, meta_info
)
num_real_data_test_np = numerical_real_data_test.to_numpy()
# Normalize the values of the numerical columns of the different datasets by the ranges of the train set.
num_real_data_test_np = num_real_data_test_np / numerical_ranges_np
cat_real_data_test_np = categorical_real_data_test.to_numpy().astype("str")
else:
num_real_data_test_np, cat_real_data_test_np = None, None

if categorical_real_data_train.shape[1] > 0:
encoder = OneHotEncoder()
if cat_real_data_test_np is not None:
encoder.fit(np.concatenate((cat_synthetic_data_np, cat_real_data_train_np, cat_real_data_test_np), axis=0))
else:
encoder.fit(np.concatenate((cat_synthetic_data_np, cat_real_data_train_np), axis=0))

cat_synthetic_data_oh = encoder.transform(cat_synthetic_data_np).toarray()
cat_real_data_train_oh = encoder.transform(cat_real_data_train_np).toarray()
if cat_real_data_test_np is not None:
cat_real_data_test_oh = encoder.transform(cat_real_data_test_np).toarray()

else:
cat_synthetic_data_oh = np.empty((categorical_synthetic_data.shape[0], 0))
cat_real_data_train_oh = np.empty((categorical_real_data_train.shape[0], 0))
# Guard on ``real_data_test``; ``categorical_real_data_test`` is unbound when no test set was provided.
if real_data_test is not None:
cat_real_data_test_oh = np.empty((categorical_real_data_test.shape[0], 0))

processed_real_data_train = pd.DataFrame(
np.concatenate((num_real_data_train_np, cat_real_data_train_oh), axis=1)
).astype(float)
processed_synthetic_data = pd.DataFrame(
np.concatenate((num_synthetic_data_np, cat_synthetic_data_oh), axis=1)
).astype(float)

if real_data_test is None:
return (processed_synthetic_data, processed_real_data_train)

assert num_real_data_test_np is not None
assert cat_real_data_test_oh is not None
return (
processed_synthetic_data,
processed_real_data_train,
pd.DataFrame(np.concatenate((num_real_data_test_np, cat_real_data_test_oh), axis=1)).astype(float),
)
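
As a standalone illustration of the two transformations this module applies, train-range scaling for numerical columns and a jointly fitted one-hot encoding for categorical ones, here is a minimal sketch; the column names and values are made up:

```python
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

train = pd.DataFrame({"age": [20, 40, 60], "city": ["a", "b", "a"]})
synthetic = pd.DataFrame({"age": [30, 50, 70], "city": ["b", "b", "c"]})

# Numerical columns are scaled by the train-set range (60 - 20 = 40).
age_range = train["age"].max() - train["age"].min()
train_num = train[["age"]].to_numpy() / age_range
synthetic_num = synthetic[["age"]].to_numpy() / age_range

# One encoder is fitted on the union so both datasets share the same category set.
encoder = OneHotEncoder()
encoder.fit(np.concatenate((synthetic[["city"]].to_numpy(), train[["city"]].to_numpy()), axis=0))
train_oh = encoder.transform(train[["city"]].to_numpy()).toarray()
synthetic_oh = encoder.transform(synthetic[["city"]].to_numpy()).toarray()

processed_train = pd.DataFrame(np.concatenate((train_num, train_oh), axis=1)).astype(float)
processed_synthetic = pd.DataFrame(np.concatenate((synthetic_num, synthetic_oh), axis=1)).astype(float)
```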