From 61cbba76b7840d8864df1ecbf3a0a4d317ca07b4 Mon Sep 17 00:00:00 2001
From: Edon Gashi <egashi@mpi-sws.org>
Date: Mon, 31 Jul 2023 11:12:55 +0200
Subject: [PATCH 1/2] Changelog for 1.1.0

---
 CHANGELOG.md | 9 +++++++++
 1 file changed, 9 insertions(+)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index e1a9187..be784af 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,14 @@
 ## Changelog
 
+### Version 1.1.0
+
+- Added `syndiffix.py` python wrapper for ML feature selection.
+- Lowered default thresholds for range and singularity nodes to 15 and 5.
+- Improved clustering algorithm for main column.
+- Added `--output` (`-o`) CLI argument to directly save the CSV file to disk.
+- Added `--clustering-mainfeatures <features>` CLI argument to specify main column's ML features.
+- Added `--clusters <clusters>` CLI argument which allows defining clusters manually.
+
 ### Version 1.0.2
 
 - Fixed a bug in the computation of low-count/range/singularity thresholds' mean.

From a604b2c8f38bf41740f1b4c355d2d710e19aa57f Mon Sep 17 00:00:00 2001
From: Edon Gashi <egashi@mpi-sws.org>
Date: Mon, 31 Jul 2023 17:32:17 +0200
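Subject: [PATCH 2/2] Support AID columns in Python wrapper

Add an `aid_columns` parameter to `main()` in `syndiffix.py`. It accepts a
single column name or a list/tuple of names, which are forwarded to
syndiffix via `--aidcolumns`.

Append the ML clustering arguments to `extra_args` instead of reassigning
it, so the AID arguments added earlier are not discarded when `ml_target`
is set.

Also reword the 1.1.0 changelog entries for the Python wrapper and the
changed defaults.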
---
 CHANGELOG.md |  4 ++--
 syndiffix.py | 20 +++++++++++++++++++-
 2 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index be784af..a7691af 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,8 +2,8 @@
 
 ### Version 1.1.0
 
-- Added `syndiffix.py` python wrapper for ML feature selection.
-- Lowered default thresholds for range and singularity nodes to 15 and 5.
+- Added Python wrapper for auto-detecting column types and main features for ML.
+- Lowered default thresholds for range and singularity nodes and raised default tree depth limit.
 - Improved clustering algorithm for main column.
 - Added `--output` (`-o`) CLI argument to directly save the CSV file to disk.
 - Added `--clustering-mainfeatures <features>` CLI argument to specify main column's ML features.
diff --git a/syndiffix.py b/syndiffix.py
index 7ca6b62..fc323ed 100644
--- a/syndiffix.py
+++ b/syndiffix.py
@@ -320,9 +320,21 @@ def columns_metadata(df):
     return columns
 
 
+def process_aid_columns(arg):
+    if isinstance(arg, list):
+        return arg
+    elif isinstance(arg, tuple):
+        return list(arg)
+    elif isinstance(arg, str):
+        return [arg]
+    else:
+        return []
+
+
 def main(
         input_path: str,
         output_path: str,
+        aid_columns: list[str] = [],
         ml_target: str = None,
         ml_features_only: bool = False,
         syndiffix_args: str = '',
@@ -333,6 +345,7 @@ def main(
     Parameters:
         input_path: Path of input CSV file.
         output_path: Path of output CSV file.
+        aid_columns: Entity identifier column name(s). If not specified, assumes one row per entity.
         ml_target: If specified, focuses on this column for better ML prediction.
         ml_features_only: If set, limits columns to only ML features of ml_target.
         syndiffix_args: Extra arguments to pass to syndiffix.
@@ -347,6 +360,11 @@ def main(
 
     extra_args = []
 
+    aid_columns = process_aid_columns(aid_columns)
+    if len(aid_columns) > 0:
+        print(f'AID Columns: {aid_columns}')
+        extra_args += ['--aidcolumns', *aid_columns]
+
     if ml_target:
         print('ML Target: ' + ml_target)
 
@@ -354,7 +372,7 @@ def main(
         features = select_features_ml(df, ml_target)['kFeatures']
         print('ML Features: ' + (', '.join(features)))
 
-        extra_args = [
+        extra_args += [
             '--clustering-maincolumn', ml_target,
             '--clustering-mainfeatures', *features
         ]