From 61cbba76b7840d8864df1ecbf3a0a4d317ca07b4 Mon Sep 17 00:00:00 2001 From: Edon Gashi Date: Mon, 31 Jul 2023 11:12:55 +0200 Subject: [PATCH 1/2] Changelog for 1.1.0 --- CHANGELOG.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index e1a9187..be784af 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,14 @@ ## Changelog +### Version 1.1.0 + +- Added `syndiffix.py` python wrapper for ML feature selection. +- Lowered default thresholds for range and singularity nodes to 15 and 5. +- Improved clustering algorithm for main column. +- Added `--output` (`-o`) CLI argument to directly save the CSV file to disk. +- Added `--clustering-mainfeatures ` CLI argument to specify main column's ML features. +- Added `--clusters ` CLI argument which allows defining clusters manually. + ### Version 1.0.2 - Fixed a bug in the computation of low-count/range/singularity thresholds' mean. From a604b2c8f38bf41740f1b4c355d2d710e19aa57f Mon Sep 17 00:00:00 2001 From: Edon Gashi Date: Mon, 31 Jul 2023 17:32:17 +0200 Subject: [PATCH 2/2] Support AID columns in Python wrapper --- CHANGELOG.md | 4 ++-- syndiffix.py | 20 +++++++++++++++++++- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index be784af..a7691af 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,8 +2,8 @@ ### Version 1.1.0 -- Added `syndiffix.py` python wrapper for ML feature selection. -- Lowered default thresholds for range and singularity nodes to 15 and 5. +- Added Python wrapper for auto-detecting column types and main features for ML. +- Lowered default thresholds for range and singularity nodes and raised default tree depth limit. - Improved clustering algorithm for main column. - Added `--output` (`-o`) CLI argument to directly save the CSV file to disk. - Added `--clustering-mainfeatures ` CLI argument to specify main column's ML features. diff --git a/syndiffix.py b/syndiffix.py index 7ca6b62..fc323ed 100644 --- a/syndiffix.py +++ b/syndiffix.py @@ -320,9 +320,21 @@ def columns_metadata(df): return columns +def process_aid_columns(arg): + if isinstance(arg, list): + return arg + elif isinstance(arg, tuple): + return list(arg) + elif isinstance(arg, str): + return [arg] + else: + return [] + + def main( input_path: str, output_path: str, + aid_columns: list[str] = [], ml_target: str = None, ml_features_only: bool = False, syndiffix_args: str = '', @@ -333,6 +345,7 @@ def main( Parameters: input_path: Path of input CSV file. output_path: Path of output CSV file. + aid_columns: Entity identifier columns. If not specified, assumes one row per entity. ml_target: If specified, focuses on this column for better ML prediction. ml_features_only: If set, limits columns to only ML features of ml_target. syndiffix_args: Extra arguments to pass to syndiffix. @@ -347,6 +360,11 @@ def main( extra_args = [] + aid_columns = process_aid_columns(aid_columns) + if len(aid_columns) > 0: + print(f'AID Columns: {aid_columns}') + extra_args += ['--aidcolumns', *aid_columns] + if ml_target: print('ML Target: ' + ml_target) @@ -354,7 +372,7 @@ def main( features = select_features_ml(df, ml_target)['kFeatures'] print('ML Features: ' + (', '.join(features))) - extra_args = [ + extra_args += [ '--clustering-maincolumn', ml_target, '--clustering-mainfeatures', *features ]