VectorInstitute · sarakodeiri · Oct 6, 2025 · Aug 21, 2025 · Aug 21, 2025 · Aug 21, 2025
diff --git a/.gitignore b/.gitignore
@@ -32,6 +32,10 @@ wheels/
 # Dataset files
 examples/**/data/
 
+# Trained metaclassifiers and attack results
+examples/ensemble_attack/trained_models/
+examples/ensemble_attack/attack_results/
+
 # hydra output
 outputs/
 

diff --git a/examples/ensemble_attack/config.yaml b/examples/ensemble_attack/config.yaml
@@ -8,21 +8,27 @@ data_paths:
   midst_data_path: ${base_data_dir}/midst_data_all_attacks # Used only for reading the data
   population_path: ${base_data_dir}/population_data  # Path where the population data should be stored
   processed_attack_data_path: ${base_data_dir}/attack_data # Path where the processed attack real train and evaluation data is stored
+  attack_results_path: ${base_example_dir}/attack_results # Path where the attack results will be stored
+
+model_paths:
+  shadow_models_path: ${base_example_dir}/shadow_models # Path where the shadow models are stored
+  metaclassifier_model_path: ${base_example_dir}/trained_models # Path where the trained metaclassifier model will be saved
 
 # Pipeline control
 pipeline:
-  run_data_processing: true
+  run_data_processing: false
+  run_metaclassifier_training: true
 
 # Dataset specific information used for processing in this example
 data_processing_config:
   collect_attack_data_types:
-          [
-            "tabddpm_black_box",
-            "tabsyn_black_box",
-            "tabddpm_white_box",
-            "tabsyn_white_box",
-            "clavaddpm_black_box",
-            "clavaddpm_white_box",
+        [
+          "tabddpm_black_box",
+          "tabsyn_black_box",
+          "tabddpm_white_box",
+          "tabsyn_white_box",
+          "clavaddpm_black_box",
+          "clavaddpm_white_box",
         ]
   # The column name in the data to be used for stratified splitting.
   column_to_stratify: "trans_type"  # Attention: This value is not documented in the original codebase.
@@ -34,18 +40,27 @@ data_processing_config:
   single_table_train_data_file_name: "train_with_id.csv"
   multi_table_train_data_file_name: "trans.csv"
   challenge_data_file_name: "challenge_with_id.csv"
+
   # Data Config files path
   trans_domain_file_path: ${base_example_dir}/data_configs/trans_domain.json
   dataset_meta_file_path: ${base_example_dir}/data_configs/dataset_meta.json
   trans_json_file_path: ${base_example_dir}/data_configs/trans.json
+  data_types_file_path: ${base_example_dir}/data_configs/data_types.json
   population_sample_size: 40000
 
-# Training settings (placeholder)
+# Training settings (temporary, numbers subject to change)
 shadow_training:
   epochs: 10
   learning_rate: 0.001
   batch_size: 64
   model_type: "tabddpm"
 
+# Metaclassifier settings
+metaclassifier:
+  model_type: "xgb"
+  use_gpu: true
+  # Temporary: the epochs parameter may be removed.
+  epochs: 1
+
 # General settings
 random_seed: 42
diff --git a/examples/ensemble_attack/data_configs/data_types.json b/examples/ensemble_attack/data_configs/data_types.json
@@ -0,0 +1,7 @@
+{
+    "numerical": ["trans_date", "amount", "balance", "account"],
+    "categorical": ["trans_type", "operation", "k_symbol", "bank"],
+    "variable_to_predict": "trans_type"
+}
+
+
diff --git a/examples/ensemble_attack/run.sh b/examples/ensemble_attack/run.sh
@@ -6,8 +6,8 @@ source .venv/bin/activate
 echo "Active Environment:"
 which python
 
-echo Experiments Launched
+echo "Experiments Launched"
 
 python -m examples.ensemble_attack.run_attack
 
-echo Experiments Completed
+echo "Experiments Completed"
diff --git a/examples/ensemble_attack/run_attack.py b/examples/ensemble_attack/run_attack.py
@@ -3,44 +3,147 @@
 provided resources and data.
 """
 
+import pickle
+from datetime import datetime
 from logging import INFO
 from pathlib import Path
 
 import hydra
+import numpy as np
 from omegaconf import DictConfig
 
 from examples.ensemble_attack.real_data_collection import collect_population_data_ensemble
+from midst_toolkit.attacks.ensemble.blending import BlendingPlusPlus, MetaClassifierType
+from midst_toolkit.attacks.ensemble.data_utils import load_dataframe
 from midst_toolkit.attacks.ensemble.process_split_data import process_split_data
 from midst_toolkit.common.logger import log
 
 
+def run_data_processing(config: DictConfig) -> None:
+    """
+    Collect the population data and save the processed attack data splits.
+
+    Args:
+        config: Configuration object set in config.yaml.
+    """
+    log(INFO, "Running data processing pipeline...")
+    data_paths = config.data_paths
+    processing_config = config.data_processing_config
+
+    # Step 1: gather the real data from the MIDST challenge resources.
+    midst_input_dir = Path(data_paths.midst_data_path)
+    population_save_dir = Path(data_paths.population_path)
+    all_population = collect_population_data_ensemble(
+        midst_data_input_dir=midst_input_dir,
+        data_processing_config=processing_config,
+        save_dir=population_save_dir,
+    )
+
+    # Step 2: the call below saves the required dataframe splits under processed_attack_data_path.
+    processed_dir = Path(data_paths.processed_attack_data_path)
+    process_split_data(
+        all_population_data=all_population,
+        processed_attack_data_path=processed_dir,
+        # TODO: column_to_stratify value is not documented in the original codebase.
+        column_to_stratify=processing_config.column_to_stratify,
+        num_total_samples=processing_config.population_sample_size,
+        random_seed=config.random_seed,
+    )
+    log(INFO, "Data processing pipeline finished.")
+
+
+def run_metaclassifier_training(config: DictConfig) -> None:
+    """
+    Function to run the metaclassifier training and evaluation.
+
+    Loads the meta train/test splits and their labels from ``processed_attack_data_path``, fits a
+    ``BlendingPlusPlus`` attacker, pickles its trained model under ``metaclassifier_model_path`` and
+    saves the test-set prediction probabilities under ``attack_results_path``. Both output
+    directories are created if they are missing, and both output files share one timestamp prefix.
+
+    Args:
+        config: Configuration object set in config.yaml.
+    """
+    log(INFO, "Running metaclassifier training...")
+    processed_data_dir = Path(config.data_paths.processed_attack_data_path)
+    model_dir = Path(config.model_paths.metaclassifier_model_path)
+    results_dir = Path(config.data_paths.attack_results_path)
+
+    # The output directories may not exist yet (e.g. on a fresh checkout). Create them up front so a
+    # missing directory cannot fail the run after the training has already completed.
+    model_dir.mkdir(parents=True, exist_ok=True)
+    results_dir.mkdir(parents=True, exist_ok=True)
+
+    # Load the processed data splits.
+    df_meta_train = load_dataframe(processed_data_dir, "master_challenge_train.csv")
+    y_meta_train = np.load(processed_data_dir / "master_challenge_train_labels.npy")
+    df_meta_test = load_dataframe(processed_data_dir, "master_challenge_test.csv")
+    y_meta_test = np.load(processed_data_dir / "master_challenge_test_labels.npy")
+
+    # Synthetic data borrowed from the attack implementation repository.
+    # From (https://github.com/CRCHUM-CITADEL/ensemble-mia/tree/main/input/tabddpm_black_box/meta_classifier)
+    # TODO: Change this file path to the path where the synthetic data is stored.
+    df_synthetic = load_dataframe(processed_data_dir, "synth.csv")
+
+    df_reference = load_dataframe(
+        Path(config.data_paths.population_path),
+        "population_all_with_challenge_no_id.csv",
+    )
+
+    # Fit the metaclassifier.
+    meta_classifier_enum = MetaClassifierType(config.metaclassifier.model_type)
+
+    # 1. Initialize the attacker
+    blending_attacker = BlendingPlusPlus(
+        config=config, meta_classifier_type=meta_classifier_enum, random_seed=config.random_seed
+    )
+    log(INFO, f"{meta_classifier_enum} created with random seed {config.random_seed}, starting training...")
+
+    # 2. Train the attacker on the meta-train set
+    blending_attacker.fit(
+        df_train=df_meta_train,
+        y_train=y_meta_train,
+        df_synthetic=df_synthetic,
+        df_reference=df_reference,
+        use_gpu=config.metaclassifier.use_gpu,
+        epochs=config.metaclassifier.epochs,
+    )
+
+    log(INFO, "Metaclassifier training finished.")
+
+    # One timestamp is shared by the model file and the predictions file of this run.
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    model_filename = f"{timestamp}_{config.metaclassifier.model_type}_trained_metaclassifier.pkl"
+    with open(model_dir / model_filename, "wb") as f:
+        pickle.dump(blending_attacker.trained_model, f)
+
+    log(INFO, "Metaclassifier model saved, starting evaluation...")
+
+    # 3. Get predictions on the test set
+    probabilities, pred_score = blending_attacker.predict(
+        df_test=df_meta_test,
+        df_synthetic=df_synthetic,
+        df_reference=df_reference,
+        y_test=y_meta_test,
+    )
+
+    # Save the prediction probabilities
+    np.save(
+        results_dir / f"{timestamp}_{config.metaclassifier.model_type}_test_pred_proba.npy",
+        probabilities,
+    )
+    log(INFO, "Test set prediction probabilities saved.")
+
+    if pred_score is not None:
+        log(INFO, f"TPR at FPR=0.1: {pred_score:.4f}")
+
+
 @hydra.main(config_path=".", config_name="config", version_base=None)
-def main(cfg: DictConfig) -> None:
+def main(config: DictConfig) -> None:
     """
     Run the Ensemble Attack example pipeline.
     As the first step, data processing is done.
+    Metaclassifier training then runs as the second step. Each step is executed only when
+    its ``pipeline.run_*`` flag is set in the config.
 
     Args:
-        cfg: Attack OmegaConf DictConfig object.
+        config: Attack configuration as an OmegaConf DictConfig object.
     """
-    if cfg.pipeline.run_data_processing:
-        log(INFO, "Running data processing pipeline...")
-        # Collect the real data from the MIDST challenge resources.
-        population_data = collect_population_data_ensemble(
-            midst_data_input_dir=Path(cfg.data_paths.midst_data_path),
-            data_processing_config=cfg.data_processing_config,
-            save_dir=Path(cfg.data_paths.population_path),
-        )
-        # The following function saves the required dataframe splits in the specified processed_attack_data_path path.
-        process_split_data(
-            all_population_data=population_data,
-            processed_attack_data_path=Path(cfg.data_paths.processed_attack_data_path),
-            # TODO: column_to_stratify value is not documented in the original codebase.
-            column_to_stratify=cfg.data_processing_config.column_to_stratify,
-            num_total_samples=cfg.data_processing_config.population_sample_size,
-            random_seed=cfg.random_seed,
-        )
-        log(INFO, "Data processing pipeline finished.")
+    if config.pipeline.run_data_processing:
+        run_data_processing(config)
+    if config.pipeline.run_metaclassifier_training:
+        run_metaclassifier_training(config)
 
 
 if __name__ == "__main__":

diff --git a/mypy.ini b/mypy.ini
@@ -36,3 +36,6 @@ ignore_missing_imports = True
 
 [mypy-category_encoders.*]
 ignore_missing_imports = True
+
+[mypy-gower.*]
+ignore_missing_imports = True
diff --git a/pyproject.toml b/pyproject.toml
@@ -7,7 +7,8 @@ authors = [ {name = "Vector AI Engineering", email = "ai_engineering@vectorinsti
 license = "MIT"
 repository = "https://github.com/VectorInstitute/midst-toolkit"
 requires-python = ">=3.12"
-dependencies = []
+dependencies = [
+]
 
 [build-system]
 requires = ["hatchling"]
@@ -37,7 +38,10 @@ dev = [
     "opacus<=1.4.0",
     "syntheval>=1.6.2",
     "hydra-core>=1.3.2",
-    "omegaconf>=2.3.0"
+    "omegaconf>=2.3.0",
+    "gower>=0.1.2",
+    "optuna",
+    "xgboost"
 ]
 docs = [
     "jinja2>=3.1.6", # Pinning version to address vulnerability GHSA-cpwx-vrp4-4pq7