Merged

34 commits
adae16e
First checkin of hellinger and pmse implementations
emersodb Sep 17, 2025
a790991
Fix typing issue
emersodb Sep 17, 2025
43834a4
Adding in Hitting Rate and Mean F1 Difference implementations. Also f…
emersodb Sep 17, 2025
bec6418
Removing hard coding
emersodb Sep 17, 2025
fb3c48a
Some CR fixes from Marcelo's review
emersodb Sep 17, 2025
1705d50
Merge branch 'main' into dbe/add_hellinger_pmse
emersodb Sep 17, 2025
c286906
Train code split, Part 2: moving some of the model.py code into clust…
lotif Sep 17, 2025
072a09a
Merge branch 'dbe/add_hellinger_pmse' into dbe/add_f1_dff_hitting_rate
emersodb Sep 17, 2025
81a40ba
NNDR module and tests
emersodb Sep 22, 2025
c5a0682
Fixing small bug
emersodb Sep 22, 2025
704e48f
Adding in the epsilon identifiability risk metric
emersodb Sep 23, 2025
884c582
Small code fixes and documentation improvements
emersodb Sep 23, 2025
59ea7f4
New mypy flow and fixes to typing issues that were discovered
emersodb Sep 24, 2025
660729f
Merge branch 'dbe/fixing_mypy' into dbe/add_hellinger_pmse
emersodb Sep 24, 2025
4023d21
Merge branch 'dbe/add_hellinger_pmse' into dbe/add_f1_dff_hitting_rate
emersodb Sep 24, 2025
b985246
Merge branch 'dbe/add_f1_dff_hitting_rate' into dbe/add_nndr_and_eir
emersodb Sep 24, 2025
1b51d23
Some small updates
emersodb Sep 24, 2025
5bd360c
Merge branch 'main' into dbe/add_hellinger_pmse
emersodb Sep 24, 2025
810f3d6
Adding in a bit more revealing testing for the categorical column det…
emersodb Sep 24, 2025
8328547
Merge branch 'main' into dbe/add_hellinger_pmse
emersodb Sep 25, 2025
2ccdfa6
Merge branch 'dbe/add_hellinger_pmse' into dbe/add_f1_dff_hitting_rate
emersodb Sep 26, 2025
0147eb1
Merge branch 'dbe/add_f1_dff_hitting_rate' into dbe/add_nndr_and_eir
emersodb Sep 26, 2025
2de4692
Merge branch 'main' into dbe/add_hellinger_pmse
emersodb Sep 26, 2025
c78973f
Merge branch 'dbe/add_hellinger_pmse' into dbe/add_f1_dff_hitting_rate
emersodb Sep 26, 2025
1aa4e5a
Merge branch 'dbe/add_f1_dff_hitting_rate' into dbe/add_nndr_and_eir
emersodb Sep 26, 2025
8918c35
Merge branch 'main' into dbe/add_hellinger_pmse
emersodb Sep 26, 2025
b2c5de5
Merge branch 'main' into dbe/add_hellinger_pmse
emersodb Sep 26, 2025
3defbf2
Addressing some PR comments
emersodb Sep 26, 2025
217c7a7
Merge branch 'dbe/add_hellinger_pmse' into dbe/add_f1_dff_hitting_rate
emersodb Sep 26, 2025
789dd7b
Merge branch 'dbe/add_f1_dff_hitting_rate' into dbe/add_nndr_and_eir
emersodb Sep 26, 2025
500ce2b
Addressing some PR comments.
emersodb Sep 29, 2025
231265c
PR Comment changes
emersodb Sep 29, 2025
e41bcc7
Merge branch 'main' into dbe/add_nndr_and_eir
emersodb Sep 30, 2025
6b9d639
Dropping unused variable
emersodb Sep 30, 2025
2 changes: 1 addition & 1 deletion CONTRIBUTING.md
@@ -49,7 +49,7 @@ settings should be prescribed in your VS Code settings JSON:
```json
{
"autoDocstring.customTemplatePath": "",
"autoDocstring.docstringFormat": "google",
"autoDocstring.docstringFormat": "google-notypes",
"autoDocstring.generateDocstringOnEnter": true,
"autoDocstring.guessTypes": true,
"autoDocstring.includeExtendedSummary": false,
4 changes: 4 additions & 0 deletions src/midst_toolkit/common/variables.py
@@ -0,0 +1,4 @@
import torch


DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
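
For illustration, a minimal usage sketch of this shared constant; the call site below is hypothetical:

```python
import torch

from midst_toolkit.common.variables import DEVICE

# Tensors are placed on the GPU when CUDA is available and on the CPU otherwise.
synthetic_batch = torch.randn(1000, 16).to(DEVICE)
```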
8 changes: 4 additions & 4 deletions src/midst_toolkit/data_processing/utils.py
@@ -176,6 +176,8 @@ def get_categorical_columns(dataframe: pd.DataFrame, threshold: int) -> list[str
it is deemed a categorical column. For example, a hurricane might be rated from 1 to 5 in an integer-based column.
With a threshold of 10, this column would be added to the set of categorical columns.

NOTE: A failure case is DateTimes, which will not be detected as categorical, but are not exactly numerical either.

Args:
dataframe: Dataframe from which to extract column names corresponding to categorical variables.
threshold: Threshold below which a column with numerical values (integer or float for example) is deemed to
@@ -190,10 +192,8 @@ def get_categorical_columns(dataframe: pd.DataFrame, threshold: int) -> list[str

for column_name in dataframe.columns:
# If dtype is an object (as str columns are), assume categorical
if (
dataframe[column_name].dtype == "object"
or is_column_type_numerical(dataframe, column_name)
and dataframe[column_name].nunique() <= threshold
if dataframe[column_name].dtype == "object" or (
is_column_type_numerical(dataframe, column_name) and dataframe[column_name].nunique() <= threshold
):
categorical_variables.append(column_name)

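
To make the detection rule concrete, here is a small, hedged example; the dataframe, column names, and threshold are illustrative rather than taken from the repository's tests:

```python
import pandas as pd

from midst_toolkit.data_processing.utils import get_categorical_columns

df = pd.DataFrame(
    {
        "storm_name": ["Ana", "Bill", "Cara", "Dean"] * 3,  # object dtype, so always flagged as categorical
        "hurricane_rating": [1, 5, 3, 2] * 3,  # 4 unique integers, at or below the threshold
        "wind_speed": [98.3 + i * 2.7 for i in range(12)],  # 12 unique floats, above the threshold
    }
)

# Expected: ["storm_name", "hurricane_rating"]; wind_speed has more than 10 unique values.
print(get_categorical_columns(df, threshold=10))
```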
148 changes: 17 additions & 131 deletions src/midst_toolkit/evaluation/privacy/distance_closest_record.py
@@ -1,133 +1,15 @@
from logging import INFO
from typing import Any, overload
from typing import Any

import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import OneHotEncoder
from tqdm import tqdm

from midst_toolkit.common.logger import log
from midst_toolkit.common.variables import DEVICE
from midst_toolkit.evaluation.metrics_base import MetricBase
from midst_toolkit.evaluation.privacy.distance_preprocess import preprocess_for_distance_computation
from midst_toolkit.evaluation.privacy.distance_utils import NormType, minimum_distances
from midst_toolkit.evaluation.utils import extract_columns_based_on_meta_info


DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


@overload
Collaborator Author comment: Moving this to its own module, as it is useful for computing NNDR as well.

def preprocess(
meta_info: dict[str, Any], synthetic_data: pd.DataFrame, real_data_train: pd.DataFrame
) -> tuple[pd.DataFrame, pd.DataFrame]: ...


@overload
def preprocess(
meta_info: dict[str, Any],
synthetic_data: pd.DataFrame,
real_data_train: pd.DataFrame,
real_data_test: pd.DataFrame,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: ...


def preprocess(
meta_info: dict[str, Any],
synthetic_data: pd.DataFrame,
real_data_train: pd.DataFrame,
real_data_test: pd.DataFrame | None = None,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame] | tuple[pd.DataFrame, pd.DataFrame]:
"""
This function performs preprocessing on Pandas dataframes to prepare for computation of the distance to closest
record score. Specifically, this function filters the provided raw dataframes to the appropriate numerical and
categorical columns based on the information of the ``meta_info`` JSON. For the numerical columns, it normalizes
values by the distance between the largest and smallest value of each column of the ``real_data_train`` numerical
values. The categorical columns are processed into one-hot encoding columns, where the transformation is fitted
on the concatenation of columns from each dataset.

Args:
meta_info: JSON with meta information about the columns and their corresponding types that should be
considered.
synthetic_data: Dataframe containing all synthetically generated data.
real_data_train: Dataframe containing the real training data associated with the model that generated the
``synthetic_data``.
real_data_test: Dataframe containing the real test data. It's important that this data was not seen by the
model that generated ``synthetic_data`` during training. If None, then it will, of course, not be
preprocessed. Defaults to None.

Returns:
Processed Pandas dataframes with the synthetic data, real data for training, real data for testing if it was
provided.
"""
numerical_synthetic_data, categorical_synthetic_data = extract_columns_based_on_meta_info(
synthetic_data, meta_info
)
numerical_real_data_train, categorical_real_data_train = extract_columns_based_on_meta_info(
real_data_train, meta_info
)

numerical_ranges = [
numerical_real_data_train[index].max() - numerical_real_data_train[index].min()
for index in numerical_real_data_train.columns
]
numerical_ranges_np = np.array(numerical_ranges)

num_synthetic_data_np = numerical_synthetic_data.to_numpy()
num_real_data_train_np = numerical_real_data_train.to_numpy()

# Normalize the values of the numerical columns of the different datasets by the ranges of the train set.
num_synthetic_data_np = num_synthetic_data_np / numerical_ranges_np
num_real_data_train_np = num_real_data_train_np / numerical_ranges_np

cat_synthetic_data_np = categorical_synthetic_data.to_numpy().astype("str")
cat_real_data_train_np = categorical_real_data_train.to_numpy().astype("str")

if real_data_test is not None:
numerical_real_data_test, categorical_real_data_test = extract_columns_based_on_meta_info(
real_data_test, meta_info
)
num_real_data_test_np = numerical_real_data_test.to_numpy()
# Normalize the values of the numerical columns of the different datasets by the ranges of the train set.
num_real_data_test_np = num_real_data_test_np / numerical_ranges_np
cat_real_data_test_np = categorical_real_data_test.to_numpy().astype("str")
else:
num_real_data_test_np, cat_real_data_test_np = None, None

if categorical_real_data_train.shape[1] > 0:
encoder = OneHotEncoder()
if cat_real_data_test_np is not None:
encoder.fit(np.concatenate((cat_synthetic_data_np, cat_real_data_train_np, cat_real_data_test_np), axis=0))
else:
encoder.fit(np.concatenate((cat_synthetic_data_np, cat_real_data_train_np), axis=0))

cat_synthetic_data_oh = encoder.transform(cat_synthetic_data_np).toarray()
cat_real_data_train_oh = encoder.transform(cat_real_data_train_np).toarray()
if cat_real_data_test_np is not None:
cat_real_data_test_oh = encoder.transform(cat_real_data_test_np).toarray()

else:
cat_synthetic_data_oh = np.empty((categorical_synthetic_data.shape[0], 0))
cat_real_data_train_oh = np.empty((categorical_real_data_train.shape[0], 0))
if categorical_real_data_test is not None:
cat_real_data_test_oh = np.empty((categorical_real_data_test.shape[0], 0))

processed_real_data_train = pd.DataFrame(
np.concatenate((num_real_data_train_np, cat_real_data_train_oh), axis=1)
).astype(float)
processed_synthetic_data = pd.DataFrame(
np.concatenate((num_synthetic_data_np, cat_synthetic_data_oh), axis=1)
).astype(float)

if real_data_test is None:
return (processed_synthetic_data, processed_real_data_train)

assert num_real_data_test_np is not None
assert cat_real_data_test_oh is not None
return (
processed_synthetic_data,
processed_real_data_train,
pd.DataFrame(np.concatenate((num_real_data_test_np, cat_real_data_test_oh), axis=1)).astype(float),
)


class DistanceToClosestRecordScore(MetricBase):
@@ -159,13 +41,15 @@ def __init__(
Args:
norm: Determines what norm the distances are computed in. Defaults to NormType.L1.
batch_size: Batch size used to compute the DCR iteratively. Just needed to manage memory. Defaults to 1000.
device: What device the tensors should be sent to in order to perform the calculations. Defaults to DEVICE.
device: What device the tensors should be sent to in order to perform the calculations. Defaults to
"cuda" if CUDA is available, "cpu" otherwise.
meta_info: This is only required/used if ``do_preprocess`` is True. JSON with meta information about the
columns and their corresponding types that should be considered. At minimum, it should have the keys
'num_col_idx', 'cat_col_idx', 'target_col_idx', and 'task_type'. If None, then no preprocessing is
expected to be done. Defaults to None.
do_preprocess: Whether or not to preprocess the dataframes before performing the DCR computations.
Preprocessing is performed with the ``preprocess`` function Defaults to False.
Preprocessing is performed with the ``preprocess`` function. Note, ``meta_info`` must be provided in
order to perform the appropriate preprocessing steps. Defaults to False.
"""
self.norm = norm
self.batch_size = batch_size
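
A hypothetical construction call, sketched from the signature above; the ``meta_info`` values are placeholders and ``"binclass"`` is an assumed ``task_type``:

```python
from midst_toolkit.evaluation.privacy.distance_closest_record import DistanceToClosestRecordScore
from midst_toolkit.evaluation.privacy.distance_utils import NormType

# Placeholder meta information; the docstring requires at least these four keys.
meta_info = {"num_col_idx": [0], "cat_col_idx": [1], "target_col_idx": [2], "task_type": "binclass"}

metric = DistanceToClosestRecordScore(norm=NormType.L1, batch_size=1000, meta_info=meta_info, do_preprocess=True)
# scores = metric.compute(real_data, synthetic_data, holdout_data)  # dataframes supplied by the caller
```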
@@ -190,8 +74,8 @@ def compute(

NOTE: The dataframes provided need to be pre-processed into numerical values for each column in some way. That
is, for example, the categorical variables should be one-hot encoded and the numerical values normalized in
some way. This can be done via the ``preprocess`` function beforehand or it can be done within compute if
``do_preprocess`` is True and ``meta_info`` has been provided.
some way. This can be done via the ``preprocess`` function in ``distance_preprocess.py`` beforehand or it can
be done within ``compute`` if ``do_preprocess`` is True and ``meta_info`` has been provided.

Args:
real_data: Real data that was used to train the model that generated the ``synthetic_data``.
@@ -205,7 +89,7 @@
assert holdout_data is not None, "For DCR score calculations, a holdout dataset is required"

if self.do_preprocess:
synthetic_data, real_data, holdout_data = preprocess(
synthetic_data, real_data, holdout_data = preprocess_for_distance_computation(
self.meta_info, synthetic_data, real_data, holdout_data
)

@@ -221,7 +105,7 @@
end_index = min(start_index + self.batch_size, synthetic_data_tensor.size(0))
synthetic_data_batch = synthetic_data_tensor[start_index:end_index]

# Calculate distances for real and test data in smaller batches
# Calculate distances from synthetic data points to real and test data in smaller batches
dcr_train_batch = minimum_distances(
synthetic_data_batch, real_data_train_tensor, self.batch_size, self.norm
)
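
For context, a minimal sketch of the batched minimum-distance idea used here; this is a stand-in for the toolkit's ``minimum_distances`` helper, not its actual implementation, and the L1 distance is taken from the ``NormType.L1`` default:

```python
import torch

def minimum_distances_sketch(queries: torch.Tensor, references: torch.Tensor, batch_size: int = 1000) -> torch.Tensor:
    """For each query row, the smallest L1 distance to any reference row, computed over reference batches."""
    minimums = torch.full((queries.size(0),), float("inf"), dtype=queries.dtype)
    for start in range(0, references.size(0), batch_size):
        batch = references[start : start + batch_size]
        pairwise = torch.cdist(queries, batch, p=1)  # (num_queries, batch) L1 distances
        minimums = torch.minimum(minimums, pairwise.min(dim=1).values)
    return minimums
```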
@@ -260,13 +144,15 @@ def __init__(
Args:
norm: Determines what norm the distances are computed in. Defaults to NormType.L1.
batch_size: Batch size used to compute the DCR iteratively. Just needed to manage memory. Defaults to 1000.
device: What device the tensors should be sent to in order to perform the calculations. Defaults to DEVICE.
device: What device the tensors should be sent to in order to perform the calculations. Defaults to
"cuda" if CUDA is available, "cpu" otherwise.
meta_info: This is only required/used if ``do_preprocess`` is True. JSON with meta information about the
columns and their corresponding types that should be considered. At minimum, it should have the keys
'num_col_idx', 'cat_col_idx', 'target_col_idx', and 'task_type'. If None, then no preprocessing is
expected to be done. Defaults to None.
do_preprocess: Whether or not to preprocess the dataframes before performing the DCR computations.
Preprocessing is performed with the ``preprocess`` function Defaults to False.
Preprocessing is performed with the ``preprocess`` function. Note, ``meta_info`` must be provided in
order to perform the appropriate preprocessing steps. Defaults to False.
"""
self.norm = norm
self.batch_size = batch_size
@@ -287,7 +173,7 @@ def compute(self, real_data: pd.DataFrame, synthetic_data: pd.DataFrame) -> dict

NOTE: The dataframes provided need to be pre-processed into numerical values for each column in some way. That
is, for example, the categorical variables should be one-hot encoded and the numerical values normalized in
some way. This can be done via the ``preprocess`` function beforehand or it can be done within compute if
some way. This can be done via the ``preprocess`` function beforehand or it can be done within ``compute`` if
``do_preprocess`` is True and ``meta_info`` has been provided.

Args:
@@ -301,7 +187,7 @@ def compute(self, real_data: pd.DataFrame, synthetic_data: pd.DataFrame) -> dict
Example: { "median_dcr_score": 0.79 }
"""
if self.do_preprocess:
synthetic_data, real_data = preprocess(self.meta_info, synthetic_data, real_data)
synthetic_data, real_data = preprocess_for_distance_computation(self.meta_info, synthetic_data, real_data)

real_data_tensor = torch.tensor(real_data.to_numpy()).to(self.device)
synthetic_data_tensor = torch.tensor(synthetic_data.to_numpy()).to(self.device)
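
A condensed sketch of the median-DCR computation described above, assuming inputs that are already preprocessed to numerical columns; it mirrors the docstring's example output, not the toolkit's exact internals (which batch the computation):

```python
import pandas as pd
import torch

def median_dcr_sketch(real_data: pd.DataFrame, synthetic_data: pd.DataFrame) -> dict[str, float]:
    real = torch.tensor(real_data.to_numpy())
    synthetic = torch.tensor(synthetic_data.to_numpy())
    pairwise = torch.cdist(synthetic, real, p=1)  # L1 distances, matching the NormType.L1 default
    dcr = pairwise.min(dim=1).values  # distance from each synthetic row to its closest real record
    return {"median_dcr_score": dcr.median().item()}
```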
122 changes: 122 additions & 0 deletions src/midst_toolkit/evaluation/privacy/distance_preprocess.py
@@ -0,0 +1,122 @@
from typing import Any, overload

import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

from midst_toolkit.evaluation.utils import extract_columns_based_on_meta_info


@overload
def preprocess_for_distance_computation(
meta_info: dict[str, Any], synthetic_data: pd.DataFrame, real_data_train: pd.DataFrame
) -> tuple[pd.DataFrame, pd.DataFrame]: ...


@overload
def preprocess_for_distance_computation(
meta_info: dict[str, Any],
synthetic_data: pd.DataFrame,
real_data_train: pd.DataFrame,
real_data_test: pd.DataFrame,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: ...


def preprocess_for_distance_computation(
meta_info: dict[str, Any],
synthetic_data: pd.DataFrame,
real_data_train: pd.DataFrame,
real_data_test: pd.DataFrame | None = None,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame] | tuple[pd.DataFrame, pd.DataFrame]:
"""
This function performs preprocessing on Pandas dataframes to prepare for computation of various record-to-record
distances. This is used for computations like distance to closest record scores. Specifically, this function
filters the provided raw dataframes to the appropriate numerical and categorical columns based on the information
of the ``meta_info`` JSON. For the numerical columns, it normalizes values by the distance between the largest
and smallest value of each column of the ``real_data_train`` numerical values. The categorical columns are
processed into one-hot encoding columns, where the transformation is fitted on the concatenation of columns from
each dataset.

Args:
meta_info: JSON with meta information about the columns and their corresponding types that should be
considered.
synthetic_data: Dataframe containing all synthetically generated data.
real_data_train: Dataframe containing the real training data associated with the model that generated the
``synthetic_data``.
real_data_test: Dataframe containing the real test data. It's important that this data was not seen by the
model that generated ``synthetic_data`` during training. If None, then it will, of course, not be
preprocessed. Defaults to None.

Returns:
Processed Pandas dataframes with the synthetic data, real data for training, real data for testing if it was
provided.
"""
numerical_synthetic_data, categorical_synthetic_data = extract_columns_based_on_meta_info(
synthetic_data, meta_info
)
numerical_real_data_train, categorical_real_data_train = extract_columns_based_on_meta_info(
real_data_train, meta_info
)

numerical_ranges = [
numerical_real_data_train[index].max() - numerical_real_data_train[index].min()
for index in numerical_real_data_train.columns
]
numerical_ranges_np = np.array(numerical_ranges)

num_synthetic_data_np = numerical_synthetic_data.to_numpy()
num_real_data_train_np = numerical_real_data_train.to_numpy()

# Normalize the values of the numerical columns of the different datasets by the ranges of the train set.
num_synthetic_data_np = num_synthetic_data_np / numerical_ranges_np
num_real_data_train_np = num_real_data_train_np / numerical_ranges_np

cat_synthetic_data_np = categorical_synthetic_data.to_numpy().astype("str")
cat_real_data_train_np = categorical_real_data_train.to_numpy().astype("str")

if real_data_test is not None:
numerical_real_data_test, categorical_real_data_test = extract_columns_based_on_meta_info(
real_data_test, meta_info
)
num_real_data_test_np = numerical_real_data_test.to_numpy()
# Normalize the values of the numerical columns of the different datasets by the ranges of the train set.
num_real_data_test_np = num_real_data_test_np / numerical_ranges_np
cat_real_data_test_np = categorical_real_data_test.to_numpy().astype("str")
else:
num_real_data_test_np, cat_real_data_test_np = None, None

if categorical_real_data_train.shape[1] > 0:
encoder = OneHotEncoder()
if cat_real_data_test_np is not None:
encoder.fit(np.concatenate((cat_synthetic_data_np, cat_real_data_train_np, cat_real_data_test_np), axis=0))
else:
encoder.fit(np.concatenate((cat_synthetic_data_np, cat_real_data_train_np), axis=0))

cat_synthetic_data_oh = encoder.transform(cat_synthetic_data_np).toarray()
cat_real_data_train_oh = encoder.transform(cat_real_data_train_np).toarray()
if cat_real_data_test_np is not None:
cat_real_data_test_oh = encoder.transform(cat_real_data_test_np).toarray()

else:
cat_synthetic_data_oh = np.empty((categorical_synthetic_data.shape[0], 0))
cat_real_data_train_oh = np.empty((categorical_real_data_train.shape[0], 0))
# Guard on ``real_data_test``; ``categorical_real_data_test`` is unbound when no test set was provided.
if real_data_test is not None:
cat_real_data_test_oh = np.empty((categorical_real_data_test.shape[0], 0))

processed_real_data_train = pd.DataFrame(
np.concatenate((num_real_data_train_np, cat_real_data_train_oh), axis=1)
).astype(float)
processed_synthetic_data = pd.DataFrame(
np.concatenate((num_synthetic_data_np, cat_synthetic_data_oh), axis=1)
).astype(float)

if real_data_test is None:
return (processed_synthetic_data, processed_real_data_train)

assert num_real_data_test_np is not None
assert cat_real_data_test_oh is not None
return (
processed_synthetic_data,
processed_real_data_train,
pd.DataFrame(np.concatenate((num_real_data_test_np, cat_real_data_test_oh), axis=1)).astype(float),
)
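
As a standalone illustration of the two transformations this module applies, train-range scaling for numerical columns and a jointly fitted one-hot encoding for categorical ones, here is a minimal sketch; the column names and values are made up:

```python
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

train = pd.DataFrame({"age": [20, 40, 60], "city": ["a", "b", "a"]})
synthetic = pd.DataFrame({"age": [30, 50, 70], "city": ["b", "b", "c"]})

# Numerical columns are scaled by the train-set range (60 - 20 = 40).
age_range = train["age"].max() - train["age"].min()
train_num = train[["age"]].to_numpy() / age_range
synthetic_num = synthetic[["age"]].to_numpy() / age_range

# One encoder is fitted on the union so both datasets share the same category set.
encoder = OneHotEncoder()
encoder.fit(np.concatenate((synthetic[["city"]].to_numpy(), train[["city"]].to_numpy()), axis=0))
train_oh = encoder.transform(train[["city"]].to_numpy()).toarray()
synthetic_oh = encoder.transform(synthetic[["city"]].to_numpy()).toarray()

processed_train = pd.DataFrame(np.concatenate((train_num, train_oh), axis=1)).astype(float)
processed_synthetic = pd.DataFrame(np.concatenate((synthetic_num, synthetic_oh), axis=1)).astype(float)
```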