From 14c529839cb29efe09e77157d9516d154732e661 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Wed, 11 Oct 2023 10:56:53 -0400 Subject: [PATCH 01/92] faster method for dropping pandas columns --- setup.cfg | 2 +- src/Caribou_reduce_features.py | 10 ++----- src/data/reduction/chi2_selection.py | 17 ++++++----- src/data/reduction/occurence_exclusion.py | 36 ++++++++++------------- 4 files changed, 29 insertions(+), 36 deletions(-) diff --git a/setup.cfg b/setup.cfg index 917c168..6e6c672 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = Caribou -version = 1.3.1 +version = 1.4.1 url = https://github.com/bioinfoUQAM/Caribou/wiki author = Nicolas de Montigny author_email = de_montigny.nicolas@courrier.uqam.ca diff --git a/src/Caribou_reduce_features.py b/src/Caribou_reduce_features.py index 2ca05fc..3e735c7 100644 --- a/src/Caribou_reduce_features.py +++ b/src/Caribou_reduce_features.py @@ -38,11 +38,6 @@ def features_reduction(opt): if opt['model_type'] is None: opt['model_type'] = 'cnn' """ - - # Validate training parameters - verify_positive_int(opt['batch_size'], 'batch_size') - verify_positive_int(opt['training_epochs'], 'number of training iterations') - outdirs = define_create_outdirs(opt['outdir']) # Initialize cluster @@ -70,6 +65,9 @@ def features_reduction(opt): # Save reduced dataset data['profile'] = f"{data['profile']}_reduced" ds.write_parquet(data['profile']) + # Save reduced K-mers + with open(os.path.join(outdirs["data_dir"],'kmers_list.txt'),'w') as handle: + handle.writelines("%s\n" % item for item in data['kmers']) # Save reduced data path, ext = os.path.splitext(opt['dataset']) data_file = f'{path}_reduced{ext}' @@ -111,8 +109,6 @@ def chi2selection(ds, kmers): parser.add_argument('-dt','--dataset_name', default='dataset', help='Name of the dataset used to name files') parser.add_argument('-l','--kmers_list', default=None, type=Path, help='PATH to a file containing a list of k-mers that will be reduced') # Parameters - parser.add_argument('-bs','--batch_size', default=32, type=int, help='Size of the batch size to use, defaults to 32') - parser.add_argument('-e','--training_epochs', default=100, type=int, help='The number of training iterations for the neural networks models if one ise chosen, defaults to 100') parser.add_argument('-o','--outdir', required=True, type=Path, help='PATH to a directory on file where outputs will be saved') parser.add_argument('-wd','--workdir', default='/tmp/spill', type=Path, help='Optional. 
Path to a working directory where tuning data will be spilled') args = parser.parse_args() diff --git a/src/data/reduction/chi2_selection.py b/src/data/reduction/chi2_selection.py index ee1dff9..f0b9897 100644 --- a/src/data/reduction/chi2_selection.py +++ b/src/data/reduction/chi2_selection.py @@ -27,7 +27,7 @@ def __init__(self, features: List[str], threshold: float = 0.05): def _fit(self, ds: Dataset) -> Preprocessor: mean_chi = [] cols_keep = [] - cols_drop = [] + # cols_drop = [] # Compute chi2 over batches for batch in ds.iter_batches(batch_size = 5, batch_format = 'pandas'): X = batch[TENSOR_COLUMN_NAME].to_numpy() @@ -49,22 +49,23 @@ def _fit(self, ds: Dataset) -> Preprocessor: cols_keep = self.features warn('No values were found to have a chi2 p-value under the threshold, all features will be kept.\ You can try running this feature selector again with a different threshold to reduce the number of features') - else: - cols_drop = list(set(self.features).difference(set(cols_keep))) + # else: + # cols_drop = list(set(self.features).difference(set(cols_keep))) + # self.stats_ = {'cols_keep' : cols_keep, 'cols_drop' : cols_drop} + self.stats_ = {'cols_keep' : cols_keep} - self.stats_ = {'cols_keep' : cols_keep, 'cols_drop' : cols_drop} return self def _transform_pandas(self, df: pd.DataFrame) -> pd.DataFrame: - _validate_df(df, TENSOR_COLUMN_NAME, self._nb_features) - cols_drop = self.stats_['cols_drop'] + # _validate_df(df, TENSOR_COLUMN_NAME, self._nb_features) + cols_keep = self.stats_['cols_keep'] tensor_col = df[TENSOR_COLUMN_NAME] tensor_col = _unwrap_ndarray_object_type_if_needed(tensor_col) tensor_col = pd.DataFrame(tensor_col, columns = self.features) - tensor_col = tensor_col.drop(cols_drop, axis = 1) - tensor_col = tensor_col.to_numpy() + tensor_col = tensor_col[cols_keep].to_numpy() + # tensor_col = tensor_col.drop(cols_keep, axis = 1).to_numpy() df[TENSOR_COLUMN_NAME] = pd.Series(list(tensor_col)) diff --git a/src/data/reduction/occurence_exclusion.py b/src/data/reduction/occurence_exclusion.py index 88a80bf..2e87273 100644 --- a/src/data/reduction/occurence_exclusion.py +++ b/src/data/reduction/occurence_exclusion.py @@ -29,28 +29,29 @@ def _fit(self, ds: Dataset) -> Preprocessor: occurences += np.count_nonzero(batch, axis = 0) # Include / Exclude by sorted position - cols_drop = [] + # cols_drop = [] cols_keep = pd.Series(occurences, index = self.features) cols_keep = cols_keep.sort_values(ascending = True) # Long operation - cols_drop.extend(cols_keep.iloc[0 : self.num_features].index) - cols_drop.extend(cols_keep.iloc[(self._nb_features - self.num_features) : self._nb_features].index) + # cols_drop.extend(cols_keep.iloc[0 : self.num_features].index) + # cols_drop.extend(cols_keep.iloc[(self._nb_features - self.num_features) : self._nb_features].index) cols_keep = cols_keep.iloc[self.num_features : (self._nb_features - self.num_features)] cols_keep = list(cols_keep.index) - self.stats_ = {'cols_keep' : cols_keep, 'cols_drop' : cols_drop} + # self.stats_ = {'cols_keep' : cols_keep, 'cols_drop' : cols_drop} + self.stats_ = {'cols_keep' : cols_keep} return self def _transform_pandas(self, df: pd.DataFrame) -> pd.DataFrame: - _validate_df(df, TENSOR_COLUMN_NAME, self._nb_features) - cols_drop = self.stats_['cols_drop'] + # _validate_df(df, TENSOR_COLUMN_NAME, self._nb_features) + cols_keep = self.stats_['cols_keep'] tensor_col = df[TENSOR_COLUMN_NAME] tensor_col = _unwrap_ndarray_object_type_if_needed(tensor_col) tensor_col = pd.DataFrame(tensor_col, columns = 
self.features) - tensor_col = tensor_col.drop(cols_drop, axis = 1) - tensor_col = tensor_col.to_numpy() + tensor_col = tensor_col[cols_keep].to_numpy() + # tensor_col = tensor_col.drop(cols_keep, axis = 1).to_numpy() df[TENSOR_COLUMN_NAME] = pd.Series(list(tensor_col)) @@ -85,29 +86,24 @@ def _fit(self, ds: Dataset) -> Preprocessor: cols_keep = pd.Series(occurences, index = self.features) cols_keep = cols_keep[cols_keep.between(low_treshold, high_treshold, inclusive = 'neither')] cols_keep = list(cols_keep.index) - cols_drop = list(set(self.features).difference(set(cols_keep))) - # cols_drop = [] - # occurences = pd.Series(occurences, index = self.features) - # cols_drop.extend(occurences[low_treshold > occurences].index) - # cols_drop.extend(occurences[occurences < high_treshold].index) - - # cols_keep = list(set(self.features).difference(set(cols_drop))) + # cols_drop = list(set(self.features).difference(set(cols_keep))) + # self.stats_ = {'cols_keep' : cols_keep, 'cols_drop' : cols_drop} - self.stats_ = {'cols_keep' : cols_keep, 'cols_drop' : cols_drop} + self.stats_ = {'cols_keep' : cols_keep} return self def _transform_pandas(self, df: pd.DataFrame) -> pd.DataFrame: - _validate_df(df, TENSOR_COLUMN_NAME, self._nb_features) - cols_drop = self.stats_['cols_drop'] + # _validate_df(df, TENSOR_COLUMN_NAME, self._nb_features) + cols_keep = self.stats_['cols_keep'] tensor_col = df[TENSOR_COLUMN_NAME] tensor_col = _unwrap_ndarray_object_type_if_needed(tensor_col) tensor_col = pd.DataFrame(tensor_col, columns = self.features) - tensor_col = tensor_col.drop(cols_drop, axis = 1) - tensor_col = tensor_col.to_numpy() + tensor_col = tensor_col[cols_keep].to_numpy() + # tensor_col = tensor_col.drop(cols_keep, axis = 1).to_numpy() df[TENSOR_COLUMN_NAME] = pd.Series(list(tensor_col)) From e88db22c7054650034cf00d6c9fcd211039fd60c Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Wed, 11 Oct 2023 20:40:51 -0400 Subject: [PATCH 02/92] parallelization for training reducers --- src/Caribou_reduce_features.py | 15 ++-- src/data/kmers.py | 2 +- src/data/reduction/chi2_selection.py | 34 +++++++--- src/data/reduction/occurence_exclusion.py | 83 +++++++++++++++++++++-- 4 files changed, 112 insertions(+), 22 deletions(-) diff --git a/src/Caribou_reduce_features.py b/src/Caribou_reduce_features.py index 3e735c7..0f36aa3 100644 --- a/src/Caribou_reduce_features.py +++ b/src/Caribou_reduce_features.py @@ -46,12 +46,9 @@ def features_reduction(opt): # Features reduction ################################################################################ """ - Brute -> Affined (20% recursive removal == 40% of original) - 1. OccurenceExclusion - 2. LowVarSelection - 3. Chi2 + SelectPercentile(50) / SelectKBest(X) - 4. TruncatedSVD + text -> LSA - 5. KRFE (require to train an estimator) + Brute force -> Features statistically related to classes + 1. OccurenceExclusion (5% extremes) + 2. 
Chi2 + SelectKBest() (<0.05 p-value) """ # Load data @@ -81,6 +78,7 @@ def occurence_exclusion(ds, kmers): features = kmers, percent = 0.05 ) + ds = preprocessor.fit_transform(ds) kmers = preprocessor.stats_['cols_keep'] @@ -93,6 +91,11 @@ def chi2selection(ds, kmers): features = kmers, threshold = 0.05 ) + # TODO : PARALLELIZE FITTING LIKE IN OCCURENCES + import sys + preprocessor.fit(ds) + sys.exit() + ds = preprocessor.fit_transform(ds) kmers = preprocessor.stats_['cols_keep'] diff --git a/src/data/kmers.py b/src/data/kmers.py index 1e84479..655b320 100644 --- a/src/data/kmers.py +++ b/src/data/kmers.py @@ -342,7 +342,7 @@ def _kmers_tokenization(self): self.df = tokenizer.transform(self.df) if self.method == 'seen': self.kmers_list = tokenizer.stats_['tokens(sequence)'] - self._kmers_reduction() + # self._kmers_reduction() def _kmers_reduction(self): # Exclusion of columns occuring in less 5% / more 95% of the samples diff --git a/src/data/reduction/chi2_selection.py b/src/data/reduction/chi2_selection.py index f0b9897..9784ecb 100644 --- a/src/data/reduction/chi2_selection.py +++ b/src/data/reduction/chi2_selection.py @@ -28,22 +28,40 @@ def _fit(self, ds: Dataset) -> Preprocessor: mean_chi = [] cols_keep = [] # cols_drop = [] - # Compute chi2 over batches - for batch in ds.iter_batches(batch_size = 5, batch_format = 'pandas'): - X = batch[TENSOR_COLUMN_NAME].to_numpy() + + # Function for parallel chi2 computing + def chi_sqr(batch): + X = batch[TENSOR_COLUMN_NAME] X = _unwrap_ndarray_object_type_if_needed(X) X = pd.DataFrame(X, columns = self.features) - y = batch['species'].to_numpy().ravel() - mean_chi.append(chi2(X, y)[1]) + y = batch['species'].ravel() + return {'chi' : [chi2(X, y)[1]]} + + + # Compute chi2 over batches + # for batch in ds.iter_batches(batch_size = 5, batch_format = 'pandas'): + # X = batch[TENSOR_COLUMN_NAME].to_numpy() + # X = _unwrap_ndarray_object_type_if_needed(X) + # X = pd.DataFrame(X, columns = self.features) + # y = batch['species'].to_numpy().ravel() + # mean_chi.append(chi2(X, y)[1]) + + chi = ds.map_batches(chi_sqr, batch_format = 'numpy') + + for i, row in enumerate(chi.iter_rows()): + mean_chi.append(row['chi']) # Compute the mean of chi2 by feature mean_chi = np.array(mean_chi) mean_chi = np.mean(mean_chi, axis = 0) - cols_keep = pd.Series(mean_chi, index = self.features) - cols_keep = cols_keep[cols_keep <= self.threshold] - cols_keep = list(cols_keep.index) + # cols_keep = pd.Series(mean_chi, index = self.features) + # cols_keep = cols_keep[cols_keep <= self.threshold] + # cols_keep = list(cols_keep.index) + # Construct list of features to keep by position + cols_keep = [self.features[i] for i, chi in enumerate(mean_chi) if chi < self.threshold] + # Keep all features if none are under the threshold if len(cols_keep) == 0: cols_keep = self.features diff --git a/src/data/reduction/occurence_exclusion.py b/src/data/reduction/occurence_exclusion.py index 2e87273..097155d 100644 --- a/src/data/reduction/occurence_exclusion.py +++ b/src/data/reduction/occurence_exclusion.py @@ -59,11 +59,9 @@ def _transform_pandas(self, df: pd.DataFrame) -> pd.DataFrame: def __repr__(self): return (f"{self.__class__.__name__}(features={self._nb_features!r}, num_features={self.num_features!r})") - +""" class TensorPercentOccurenceExclusion(Preprocessor): - """ - Exclusion of the features present in less than (%) / more than (100% - %) across samples to be used as a Ray preprocessor. 
- """ + def __init__(self, features: List[str], percent : int = 0.05): # Parameters @@ -101,17 +99,88 @@ def _transform_pandas(self, df: pd.DataFrame) -> pd.DataFrame: tensor_col = df[TENSOR_COLUMN_NAME] tensor_col = _unwrap_ndarray_object_type_if_needed(tensor_col) tensor_col = pd.DataFrame(tensor_col, columns = self.features) - tensor_col = tensor_col[cols_keep].to_numpy() # tensor_col = tensor_col.drop(cols_keep, axis = 1).to_numpy() df[TENSOR_COLUMN_NAME] = pd.Series(list(tensor_col)) - + return df def __repr__(self): return (f"{self.__class__.__name__}(features={self._nb_features!r}, percent={self.percent!r}%)") +""" def _validate_df(df: pd.DataFrame, column: str, nb_features: int) -> None: if len(df.loc[0, column]) != nb_features: - raise ValueError('Discordant number of features in the tensor column with the one from the dataframe used for fitting') \ No newline at end of file + raise ValueError('Discordant number of features in the tensor column with the one from the dataframe used for fitting') + + + + +class TensorPercentOccurenceExclusion(Preprocessor): + """ + Exclusion of the features present in less than (%) / more than (100% - %) across samples to be used as a Ray preprocessor. + """ + + def __init__(self, features: List[str], percent : int = 0.05): + # Parameters + self.features = features + self.percent = percent + self._nb_features = len(features) + + def _fit(self, ds: Dataset) -> Preprocessor: + nb_samples = ds.count() + low_treshold = ceil((0 + self.percent) * nb_samples) + high_treshold = floor((1 - self.percent) * nb_samples) + + # Function for parallel occurences counting + def count_occurences(batch): + batch = batch[TENSOR_COLUMN_NAME] + return {'occurences' : [np.count_nonzero(batch, axis = 0)]} + + occur = ds.map_batches(count_occurences, batch_format = 'numpy') + + occurences = np.zeros(self._nb_features) + for batch in occur.iter_batches(batch_format = 'numpy'): + batch_occur = batch['occurences'].sum(axis = 0) + occurences += batch_occur + + # Construct list of features to keep by position + cols_keep = [self.features[i] for i, occurence in enumerate(occurences) if low_treshold < occurence < high_treshold] + + self.stats_ = {'cols_keep' : cols_keep} + + """ + # Nb of occurences + for batch in ds.iter_batches(batch_format = 'numpy'): + batch = batch[TENSOR_COLUMN_NAME] + occurences += np.count_nonzero(batch, axis = 0) + # Include / Exclude by occurences thresholds across samples + cols_keep = pd.Series(occurences, index = self.features) + cols_keep = cols_keep[cols_keep.between(low_treshold, high_treshold, inclusive = 'neither')] + cols_keep = list(cols_keep.index) + + # cols_drop = list(set(self.features).difference(set(cols_keep))) + # self.stats_ = {'cols_keep' : cols_keep, 'cols_drop' : cols_drop} + + self.stats_ = {'cols_keep' : cols_keep} + """ + + return self + + def _transform_pandas(self, df: pd.DataFrame) -> pd.DataFrame: + # _validate_df(df, TENSOR_COLUMN_NAME, self._nb_features) + cols_keep = self.stats_['cols_keep'] + + tensor_col = df[TENSOR_COLUMN_NAME] + tensor_col = _unwrap_ndarray_object_type_if_needed(tensor_col) + tensor_col = pd.DataFrame(tensor_col, columns = self.features) + tensor_col = tensor_col[cols_keep].to_numpy() + # tensor_col = tensor_col.drop(cols_keep, axis = 1).to_numpy() + + df[TENSOR_COLUMN_NAME] = pd.Series(list(tensor_col)) + + return df + + def __repr__(self): + return (f"{self.__class__.__name__}(features={self._nb_features!r}, percent={self.percent!r}%)") \ No newline at end of file From 
addfbbfb0b68f546603f0e22d05f6b7130b2f1dc Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Wed, 11 Oct 2023 22:28:08 -0400 Subject: [PATCH 03/92] reducers tested locally --- src/Caribou_reduce_features.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/Caribou_reduce_features.py b/src/Caribou_reduce_features.py index 0f36aa3..b2ffc5a 100644 --- a/src/Caribou_reduce_features.py +++ b/src/Caribou_reduce_features.py @@ -91,10 +91,6 @@ def chi2selection(ds, kmers): features = kmers, threshold = 0.05 ) - # TODO : PARALLELIZE FITTING LIKE IN OCCURENCES - import sys - preprocessor.fit(ds) - sys.exit() ds = preprocessor.fit_transform(ds) From 8df0ecac9ef5fb69cec864f626e7c8d1564b25e3 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Sun, 15 Oct 2023 23:34:24 -0400 Subject: [PATCH 04/92] features reduction test in local --- src/Caribou_reduce_features.py | 102 ++++++++++++--- src/data/kmers.py | 4 +- src/data/reduction/chi2_selection.py | 97 -------------- src/data/reduction/count_hashing.py | 1 - src/data/reduction/features_selection.py | 83 ++++++++++++ src/data/reduction/low_var_selection.py | 144 +++++++++------------ src/data/reduction/occurence_exclusion.py | 30 +---- src/models/kerasTF/models.py | 2 +- src/models/preprocessors/max_abs_scaler.py | 5 +- src/models/preprocessors/min_max_scaler.py | 4 +- src/models/sklearn/models.py | 2 +- src/utils.py | 2 +- 12 files changed, 242 insertions(+), 234 deletions(-) delete mode 100644 src/data/reduction/chi2_selection.py create mode 100644 src/data/reduction/features_selection.py diff --git a/src/Caribou_reduce_features.py b/src/Caribou_reduce_features.py index b2ffc5a..379c1ef 100644 --- a/src/Caribou_reduce_features.py +++ b/src/Caribou_reduce_features.py @@ -8,7 +8,11 @@ from time import time from pathlib import Path -from data.reduction.chi2_selection import TensorChi2Selection +from ray.data.preprocessors import Chain, LabelEncoder + +from models.preprocessors.min_max_scaler import TensorMinMaxScaler +from data.reduction.low_var_selection import TensorLowVarSelection +from data.reduction.features_selection import TensorFeaturesSelection from data.reduction.occurence_exclusion import TensorPercentOccurenceExclusion __author__ = "Nicolas de Montigny" @@ -32,12 +36,6 @@ def features_reduction(opt): # Verification of k length k_length, kmers_list = verify_kmers_list_length(k_length, opt['kmers_list']) - # Not sure if needed for training KRFE - """ - # Verify that model type is valid / choose default depending on host presence - if opt['model_type'] is None: - opt['model_type'] = 'cnn' - """ outdirs = define_create_outdirs(opt['outdir']) # Initialize cluster @@ -47,36 +45,84 @@ def features_reduction(opt): ################################################################################ """ Brute force -> Features statistically related to classes - 1. OccurenceExclusion (5% extremes) - 2. Chi2 + SelectKBest() (<0.05 p-value) + 1. OccurenceExclusion (10% extremes) + 2. LowVarSelection () + 3. 
Chi2 + SelectPercentile() (75% best values) """ # Load data ds = ray.data.read_parquet(data['profile']) - # Iterate over methods for exp results + ds_train = ray.data.read_parquet(data['profile']) + # Time the computation of transformations t_start = time() - ds, kmers_list = occurence_exclusion(ds, kmers_list) - ds, data['kmers'] = chi2selection(ds, kmers_list) + ds, data['kmers'] = exclude_select(ds, ds_train, kmers_list, data['taxas'][0]) + # ds, kmers_list = occurence_exclusion(ds, kmers_list) + # ds, kmers_list = low_var_selection(ds,kmers_list) + # ds, data['kmers'] = features_selection(ds, kmers_list, data['taxas'][0]) t_end = time() t_reduction = t_end - t_start # Save reduced dataset data['profile'] = f"{data['profile']}_reduced" ds.write_parquet(data['profile']) # Save reduced K-mers - with open(os.path.join(outdirs["data_dir"],'kmers_list.txt'),'w') as handle: + with open(os.path.join(outdirs["data_dir"],'kmers_list_reduced.txt'),'w') as handle: handle.writelines("%s\n" % item for item in data['kmers']) # Save reduced data path, ext = os.path.splitext(opt['dataset']) data_file = f'{path}_reduced{ext}' save_Xy_data(data, data_file) - print(f"Caribou finished reducing k-mers features of {opt['dataset_name']} using the combined occurence and chi2 methods from the original dataset in {t_reduction} seconds.") + print(f"Caribou finished reducing k-mers features of {opt['dataset_name']} in {t_reduction} seconds.") + +def exclude_select(ds, ds_train, kmers, taxa): + # Occurence exclusion + excluder = TensorPercentOccurenceExclusion( + features = kmers, + percent = 0.1 # remove features present in less than 10% samples + ) + + ds = excluder.fit_transform(ds) + ds_train = excluder.transform(ds_train) + + kmers = excluder.stats_['cols_keep'] + + varier = TensorLowVarSelection( + features = kmers, + threshold = 0.1, # remove features with less than 10% variance + ) + + ds = varier.fit_transform(ds) + ds_train = varier.transform(ds_train) + + kmers = varier.stats_['cols_keep'] + + # Preprocessing + preprocessor = Chain( + LabelEncoder(taxa), + TensorMinMaxScaler(kmers), + ) + + ds_train = preprocessor.fit_transform(ds_train) -# Exclusion columns occuring in less / more than 10% of the columns + # Statistical features selection + selector = TensorFeaturesSelection( + features = kmers, + taxa = taxa, + threshold = 0.25, # remove lowest 25% significance + ) + + selector.fit(ds_train) + ds = selector.transform(ds) + + kmers = selector.stats_['cols_keep'] + + return ds, kmers + +# Exclusion columns occuring in less / more than 10% of the columns = 20% removed def occurence_exclusion(ds, kmers): preprocessor = TensorPercentOccurenceExclusion( features = kmers, - percent = 0.05 + percent = 0.1 # remove features present in less than 10% samples ) ds = preprocessor.fit_transform(ds) @@ -85,19 +131,33 @@ def occurence_exclusion(ds, kmers): return ds, kmers -# Chi2 evaluation of dependance between features and classes -def chi2selection(ds, kmers): - preprocessor = TensorChi2Selection( +# Exclusion of columns with variance lower than a certain threshold +def low_var_selection(ds, kmers): + preprocessor = TensorLowVarSelection( features = kmers, - threshold = 0.05 + threshold = 0.1, # remove features with less than 10% variance ) - + ds = preprocessor.fit_transform(ds) kmers = preprocessor.stats_['cols_keep'] return ds, kmers +# Chi2 evaluation of dependance between features and classes +def features_selection(ds, kmers, taxa): + preprocessor = TensorFeaturesSelection( + features = kmers, + taxa = taxa, 
+ threshold = 0.25, # remove lowest 25% significance + ) + + ds = preprocessor.fit_transform(ds) + + kmers = preprocessor.stats_['cols_keep'] + + return ds, kmers + # Argument parsing from CLI ################################################################################ diff --git a/src/data/kmers.py b/src/data/kmers.py index 655b320..18efe97 100644 --- a/src/data/kmers.py +++ b/src/data/kmers.py @@ -16,7 +16,7 @@ from data.extraction.given_kmers_vectorizer import GivenKmersVectorizer # Features selection -from data.reduction.chi2_selection import TensorChi2Selection +from data.reduction.features_selection import TensorFeaturesSelection from data.reduction.occurence_exclusion import TensorPercentOccurenceExclusion __author__ = ['Amine Remita', 'Nicolas de Montigny'] @@ -355,7 +355,7 @@ def _kmers_reduction(self): self.kmers_list = excluder.stats_['cols_keep'] # Chi2 evaluation of dependance between features and classes - selector = TensorChi2Selection( + selector = TensorFeaturesSelection( features = self.kmers_list, threshold = 0.05 ) diff --git a/src/data/reduction/chi2_selection.py b/src/data/reduction/chi2_selection.py deleted file mode 100644 index 9784ecb..0000000 --- a/src/data/reduction/chi2_selection.py +++ /dev/null @@ -1,97 +0,0 @@ - -import numpy as np -import pandas as pd - -from typing import List -from warnings import warn -from ray.data import Dataset -from sklearn.feature_selection import chi2 -from ray.data.preprocessor import Preprocessor -from ray.air.util.data_batch_conversion import _unwrap_ndarray_object_type_if_needed - -TENSOR_COLUMN_NAME = '__value__' - -class TensorChi2Selection(Preprocessor): - """ - Custom implementation of SelectKBest with Chi2 inspired by sklearn.feature_selection.SelectPercentile and sklearn.feature_selection.chi2 features selector to be used as a Ray preprocessor. 
- https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.chi2.html#sklearn.feature_selection.chi2 - https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html#sklearn.feature_selection.SelectKBest - """ - - def __init__(self, features: List[str], threshold: float = 0.05): - # Parameters - self.features = features - self.threshold = threshold - self._nb_features = len(features) - - def _fit(self, ds: Dataset) -> Preprocessor: - mean_chi = [] - cols_keep = [] - # cols_drop = [] - - # Function for parallel chi2 computing - def chi_sqr(batch): - X = batch[TENSOR_COLUMN_NAME] - X = _unwrap_ndarray_object_type_if_needed(X) - X = pd.DataFrame(X, columns = self.features) - y = batch['species'].ravel() - return {'chi' : [chi2(X, y)[1]]} - - - # Compute chi2 over batches - # for batch in ds.iter_batches(batch_size = 5, batch_format = 'pandas'): - # X = batch[TENSOR_COLUMN_NAME].to_numpy() - # X = _unwrap_ndarray_object_type_if_needed(X) - # X = pd.DataFrame(X, columns = self.features) - # y = batch['species'].to_numpy().ravel() - # mean_chi.append(chi2(X, y)[1]) - - chi = ds.map_batches(chi_sqr, batch_format = 'numpy') - - for i, row in enumerate(chi.iter_rows()): - mean_chi.append(row['chi']) - - # Compute the mean of chi2 by feature - mean_chi = np.array(mean_chi) - mean_chi = np.mean(mean_chi, axis = 0) - - # cols_keep = pd.Series(mean_chi, index = self.features) - # cols_keep = cols_keep[cols_keep <= self.threshold] - # cols_keep = list(cols_keep.index) - - # Construct list of features to keep by position - cols_keep = [self.features[i] for i, chi in enumerate(mean_chi) if chi < self.threshold] - - # Keep all features if none are under the threshold - if len(cols_keep) == 0: - cols_keep = self.features - warn('No values were found to have a chi2 p-value under the threshold, all features will be kept.\ - You can try running this feature selector again with a different threshold to reduce the number of features') - # else: - # cols_drop = list(set(self.features).difference(set(cols_keep))) - # self.stats_ = {'cols_keep' : cols_keep, 'cols_drop' : cols_drop} - self.stats_ = {'cols_keep' : cols_keep} - - return self - - def _transform_pandas(self, df: pd.DataFrame) -> pd.DataFrame: - # _validate_df(df, TENSOR_COLUMN_NAME, self._nb_features) - cols_keep = self.stats_['cols_keep'] - - tensor_col = df[TENSOR_COLUMN_NAME] - tensor_col = _unwrap_ndarray_object_type_if_needed(tensor_col) - tensor_col = pd.DataFrame(tensor_col, columns = self.features) - - tensor_col = tensor_col[cols_keep].to_numpy() - # tensor_col = tensor_col.drop(cols_keep, axis = 1).to_numpy() - - df[TENSOR_COLUMN_NAME] = pd.Series(list(tensor_col)) - - return df - - def __repr__(self): - return (f"{self.__class__.__name__}(features={self._nb_features!r}, threshold={self.threshold!r})") - -def _validate_df(df: pd.DataFrame, column: str, nb_features: int) -> None: - if len(df.loc[0, column]) != nb_features: - raise ValueError('Discordant number of features in the tensor column with the one from the dataframe used for fitting') \ No newline at end of file diff --git a/src/data/reduction/count_hashing.py b/src/data/reduction/count_hashing.py index 6171112..1b6506e 100644 --- a/src/data/reduction/count_hashing.py +++ b/src/data/reduction/count_hashing.py @@ -24,7 +24,6 @@ def __init__(self, features: List[str], num_features: int): self.num_features = num_features def _transform_pandas(self, df: pd.DataFrame): - # TODO(matt): Use sparse matrix for efficiency. 
def row_feature_hasher(row): hash_counts = collections.defaultdict(int) for feature in self.features: diff --git a/src/data/reduction/features_selection.py b/src/data/reduction/features_selection.py new file mode 100644 index 0000000..c07b515 --- /dev/null +++ b/src/data/reduction/features_selection.py @@ -0,0 +1,83 @@ +import logging + +import numpy as np +import pandas as pd + +from typing import List +from warnings import warn +from ray.data import Dataset + +from sklearn.feature_selection import chi2 +from sklearn.feature_selection import f_classif, f_oneway + +from ray.data.preprocessor import Preprocessor +from ray.air.util.data_batch_conversion import _unwrap_ndarray_object_type_if_needed + +TENSOR_COLUMN_NAME = '__value__' + +class TensorFeaturesSelection(Preprocessor): + """ + Custom implementation of SelectKBest with Chi2 inspired by sklearn.feature_selection.SelectPercentile and sklearn.feature_selection.chi2 features selector to be used as a Ray preprocessor. + https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.chi2.html#sklearn.feature_selection.chi2 + https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html#sklearn.feature_selection.SelectKBest + """ + + def __init__(self, features: List[str], taxa: str, threshold: float = 0.5): + # Parameters + self.taxa = taxa + self.features = features + self.threshold = threshold + self._nb_features = len(features) + + def _fit(self, ds: Dataset) -> Preprocessor: + # Function for parallel stats computing + def stats(batch): + X = batch[TENSOR_COLUMN_NAME] + X = _unwrap_ndarray_object_type_if_needed(X) + X = pd.DataFrame(X, columns = self.features) + y = batch[self.taxa].ravel() + return {'chi' : [chi2(X, y)[0]]} + + mean_chi = [] + cols_keep = [] + + # Chi batches means extraction + chi = ds.map_batches(stats, batch_format = 'numpy', batch_size = 32) + for i, row in enumerate(chi.iter_rows()): + mean_chi.append(row['chi']) + + # Chi mean of batches means computing + mean_chi = np.array(mean_chi) + mean_chi = np.nanmean(mean_chi, axis = 0) + + # Determine the threshold from distribution of chi values + self.threshold = np.nanquantile(mean_chi, self.threshold) + + # Keep features with values higher than the threshold + cols_keep = [self.features[i] for i, chi in enumerate(mean_chi) if chi > self.threshold] + + self.stats_ = {'cols_keep' : cols_keep} + + return self + + def _transform_pandas(self, df: pd.DataFrame) -> pd.DataFrame: + # _validate_df(df, TENSOR_COLUMN_NAME, self._nb_features) + cols_keep = self.stats_['cols_keep'] + + if len(cols_keep) < self._nb_features: + tensor_col = df[TENSOR_COLUMN_NAME] + tensor_col = _unwrap_ndarray_object_type_if_needed(tensor_col) + tensor_col = pd.DataFrame(tensor_col, columns = self.features) + + tensor_col = tensor_col[cols_keep].to_numpy() + + df[TENSOR_COLUMN_NAME] = pd.Series(list(tensor_col)) + + return df + + def __repr__(self): + return (f"{self.__class__.__name__}(features={self._nb_features!r}, taxa={self.taxa!r}, threshold={self.threshold!r})") + +def _validate_df(df: pd.DataFrame, column: str, nb_features: int) -> None: + if len(df.loc[0, column]) != nb_features: + raise ValueError('Discordant number of features in the tensor column with the one from the dataframe used for fitting') \ No newline at end of file diff --git a/src/data/reduction/low_var_selection.py b/src/data/reduction/low_var_selection.py index 52ecb77..7fa6561 100644 --- a/src/data/reduction/low_var_selection.py +++ b/src/data/reduction/low_var_selection.py @@ 
-5,6 +5,7 @@ from typing import List from ray.data import Dataset from ray.data.preprocessor import Preprocessor +from ray.air.util.data_batch_conversion import _unwrap_ndarray_object_type_if_needed TENSOR_COLUMN_NAME = '__value__' @@ -16,103 +17,76 @@ class TensorLowVarSelection(Preprocessor): """ def __init__( self, - features_list : List[str], - threshold: float = np.inf, - nb_keep : int = np.inf, + features : List[str], + threshold: float = 0.1, ): - self.features_list = features_list - if 'id' in self.features_list: - self.features_list.remove('id') - self.nb_features = len(self.features_list) + self.features = features self.threshold = threshold - self.nb_keep = nb_keep - self.removed_features = [] + self._nb_features = len(features) def _fit(self, ds: Dataset) -> Preprocessor: - nb_records = ds.count() - # - sum_arr = np.zeros(self.nb_features) - mean_arr = np.zeros(self.nb_features) - sqr_dev_arr = np.zeros(self.nb_features) - var_arr = np.zeros(self.nb_features) - # - def sum_func(arr, sum_arr): - return np.add(sum_arr, np.sum(arr, axis=0)) + cols_keep = [] + nb_samples = ds.count() + sum_arr = np.zeros(self._nb_features) + mean_arr = np.zeros(self._nb_features) + sqr_dev_arr = np.zeros(self._nb_features) + var_arr = np.zeros(self._nb_features) + + # Function for parallel sum computing + def get_sums(batch): + df = batch[TENSOR_COLUMN_NAME] + df = _unwrap_ndarray_object_type_if_needed(df) + return({'sum' : [np.sum(df, axis = 0)]}) + + # Sum per column + sums = ds.map_batches(get_sums, batch_format = 'pandas') + for row in sums.iter_rows(): + sum_arr += row['sum'] + + # Mean per column + mean_arr = sum_arr / nb_samples + + # Function for parallel squared deviation computing + def get_sqr_dev(batch): + df = batch[TENSOR_COLUMN_NAME] + df = _unwrap_ndarray_object_type_if_needed(df) + return({'sqr_dev' : [np.sum(np.power(np.subtract(df, mean_arr), 2), axis = 0)]}) + + # Sum of deviation per column + sqr_devs = ds.map_batches(get_sqr_dev, batch_format = 'pandas') + for row in sqr_devs.iter_rows(): + sqr_dev_arr += row['sqr_dev'] + + # Variance per column + var_arr = sqr_dev_arr / nb_samples + + # Compute the threshold from distribution of variance values + self.threshold = np.nanquantile(var_arr, self.threshold) + + # Keep features with values higher than the threshold + cols_keep = [self.features[i] for i, var in enumerate(var_arr) if var > self.threshold] + + self.stats_ = {'cols_keep' : cols_keep} - def mean_func(arr, nb_records): - return np.divide(arr, nb_records) - - def sqr_dev_func(arr, mean_arr, sqr_dev_arr): - return np.add(sqr_dev_arr, np.sum(np.power(np.subtract(arr, mean_arr), 2), axis = 0)) - - if self.nb_keep != np.inf or self.threshold != np.inf: - # Get sum per column - for batch in ds.iter_batches( - batch_size = 100, - batch_format = 'numpy' - ): - sum_arr = sum_func(batch, sum_arr) - # Get mean per column - mean_arr = mean_func(sum_arr, nb_records) - # Get sum of deviation - for batch in ds.iter_batches( - batch_size = 100, - batch_format = 'numpy' - ): - sqr_dev_arr = sqr_dev_func(batch, mean_arr, sqr_dev_arr) - # Get variance per column - var_arr = mean_func(sqr_dev_arr, nb_records) - p10 = int(0.1 * self.nb_features) - - if self.nb_keep != np.inf and (self.nb_keep + (p10 * 2)) < self.nb_features: - var_mapping = {ind : var_arr[ind] for ind in np.arange(self.nb_features)} - keep_arr = np.ravel(np.sort(var_arr)) - keep_arr = keep_arr[p10:(len(keep_arr) - p10)] - keep_arr = np.random.choice(keep_arr, self.nb_keep) - remove_arr = np.ravel(np.sort(var_arr)) - 
remove_arr = np.array([ind for ind in remove_arr if ind not in keep_arr]) + return self - # Switch values from keep_arr to remove if number is discordant - if len(keep_arr) > self.nb_keep: - nb_switch = len(keep_arr) - self.nb_keep - remove_arr = np.insert(remove_arr, 0, keep_arr[:nb_switch]) - keep_arr = keep_arr[nb_switch:] - elif len(keep_arr) < self.nb_keep: - nb_switch = self.nb_keep - len(keep_arr) - keep_arr = np.insert(keep_arr, 0, remove_arr[nb_switch:]) - remove_arr = remove_arr[:nb_switch] - # Loop to assign values to remove - for k, v in var_mapping.items(): - if v in remove_arr: - pos_v = int(np.where(remove_arr == v)[0][0]) - remove_arr = np.delete(remove_arr, pos_v) - self.stats_.append(k) - self.removed_features = [self.features_list[ind] for ind in self.stats_] + def _transform_pandas(self, df: pd.DataFrame) -> pd.DataFrame: + # _validate_df(df, TENSOR_COLUMN_NAME, self._nb_features) + cols_keep = self.stats_['cols_keep'] - elif self.threshold != np.inf: - for ind in np.arange(self.nb_features): - variance = var_arr[ind] - if variance <= self.threshold: - self.stats_.append(ind) - self.removed_features = [self.features_list[ind] for ind in self.stats_] + if len(cols_keep) < self._nb_features: + tensor_col = df[TENSOR_COLUMN_NAME] + tensor_col = _unwrap_ndarray_object_type_if_needed(tensor_col) + tensor_col = pd.DataFrame(tensor_col, columns = self.features) - return self + tensor_col = tensor_col[cols_keep].to_numpy() - def _transform_pandas(self, df: pd.DataFrame) -> pd.DataFrame: - if len(self.stats_) > 0 : - _validate_df(df, TENSOR_COLUMN_NAME, self.nb_features) - df_out = pd.DataFrame(columns = [TENSOR_COLUMN_NAME]) + df[TENSOR_COLUMN_NAME] = pd.Series(list(tensor_col)) - for ind, row in enumerate(df.iterrows()): - tensor = np.delete(row[1].to_numpy()[0], self.stats_, axis=0) - df_out.loc[ind, TENSOR_COLUMN_NAME] = tensor - - return df_out - else: - return df + return df def __repr__(self): - return (f"{self.__class__.__name__}(threshold={self.threshold!r}, nb_keep={self.nb_keep!r})") + return (f"{self.__class__.__name__}(features={self._nb_features!r}, threshold={self.threshold!r})") def _validate_df(df: pd.DataFrame, column: str, nb_features: int) -> None: if len(df.loc[0, column]) != nb_features: diff --git a/src/data/reduction/occurence_exclusion.py b/src/data/reduction/occurence_exclusion.py index 097155d..78ef429 100644 --- a/src/data/reduction/occurence_exclusion.py +++ b/src/data/reduction/occurence_exclusion.py @@ -150,35 +150,19 @@ def count_occurences(batch): self.stats_ = {'cols_keep' : cols_keep} - """ - # Nb of occurences - for batch in ds.iter_batches(batch_format = 'numpy'): - batch = batch[TENSOR_COLUMN_NAME] - occurences += np.count_nonzero(batch, axis = 0) - # Include / Exclude by occurences thresholds across samples - cols_keep = pd.Series(occurences, index = self.features) - cols_keep = cols_keep[cols_keep.between(low_treshold, high_treshold, inclusive = 'neither')] - cols_keep = list(cols_keep.index) - - # cols_drop = list(set(self.features).difference(set(cols_keep))) - # self.stats_ = {'cols_keep' : cols_keep, 'cols_drop' : cols_drop} - - self.stats_ = {'cols_keep' : cols_keep} - """ - return self def _transform_pandas(self, df: pd.DataFrame) -> pd.DataFrame: # _validate_df(df, TENSOR_COLUMN_NAME, self._nb_features) cols_keep = self.stats_['cols_keep'] - tensor_col = df[TENSOR_COLUMN_NAME] - tensor_col = _unwrap_ndarray_object_type_if_needed(tensor_col) - tensor_col = pd.DataFrame(tensor_col, columns = self.features) - tensor_col = 
tensor_col[cols_keep].to_numpy() - # tensor_col = tensor_col.drop(cols_keep, axis = 1).to_numpy() - - df[TENSOR_COLUMN_NAME] = pd.Series(list(tensor_col)) + if len(cols_keep) < self._nb_features: + tensor_col = df[TENSOR_COLUMN_NAME] + tensor_col = _unwrap_ndarray_object_type_if_needed(tensor_col) + tensor_col = pd.DataFrame(tensor_col, columns = self.features) + tensor_col = tensor_col[cols_keep].to_numpy() + + df[TENSOR_COLUMN_NAME] = pd.Series(list(tensor_col)) return df diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index 165f558..96ef9cc 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -9,8 +9,8 @@ from shutil import rmtree # Preprocessing -from models.preprocessors import TensorMinMaxScaler from ray.data.preprocessors import LabelEncoder, Chain +from models.preprocessors.min_max_scaler import TensorMinMaxScaler from src.models.encoders.one_hot_tensor_encoder import OneHotTensorEncoder # Parent class / models diff --git a/src/models/preprocessors/max_abs_scaler.py b/src/models/preprocessors/max_abs_scaler.py index 3914113..cce6b8c 100644 --- a/src/models/preprocessors/max_abs_scaler.py +++ b/src/models/preprocessors/max_abs_scaler.py @@ -5,6 +5,7 @@ from ray.data.preprocessor import Preprocessor from ray.data.extensions.tensor_extension import TensorArray +from ray.air.util.data_batch_conversion import _unwrap_ndarray_object_type_if_needed TENSOR_COLUMN_NAME = '__value__' @@ -38,7 +39,9 @@ def _transform_pandas(self, batch: pd.DataFrame): """ Transform the given dataset to pandas dataframe. """ - df = pd.DataFrame(np.vstack(batch[TENSOR_COLUMN_NAME]), columns = self._features_list) + df = batch[TENSOR_COLUMN_NAME] + df = _unwrap_ndarray_object_type_if_needed(df) + df = pd.DataFrame(df, columns = self._features_list) for i, col in enumerate(self._features_list): df[col] = df[col].apply(value_transform, args=[self._absmax[i]]) diff --git a/src/models/preprocessors/min_max_scaler.py b/src/models/preprocessors/min_max_scaler.py index b430525..ebf560a 100644 --- a/src/models/preprocessors/min_max_scaler.py +++ b/src/models/preprocessors/min_max_scaler.py @@ -4,6 +4,7 @@ from ray.data.dataset import Dataset from ray.data.preprocessor import Preprocessor +from ray.air.util.data_batch_conversion import _unwrap_ndarray_object_type_if_needed TENSOR_COLUMN_NAME = '__value__' @@ -54,7 +55,8 @@ def _transform_pandas(self, batch: pd.DataFrame): """ min = self.stats_['min'] max = self.stats_['max'] - df = np.vstack(batch[TENSOR_COLUMN_NAME].to_numpy()) + df = batch[TENSOR_COLUMN_NAME] + df = _unwrap_ndarray_object_type_if_needed(df) diff = max - min diff[diff == 0] = 1 diff --git a/src/models/sklearn/models.py b/src/models/sklearn/models.py index b9aa459..4f2308d 100644 --- a/src/models/sklearn/models.py +++ b/src/models/sklearn/models.py @@ -9,7 +9,7 @@ # Preprocessing from ray.data.preprocessors import Chain, BatchMapper, LabelEncoder -from models.preprocessing.ray_tensor_min_max import TensorMinMaxScaler +from models.preprocessors.min_max_scaler import TensorMinMaxScaler from src.models.encoders.onesvm_label_encoder import OneClassSVMLabelEncoder # Training diff --git a/src/utils.py b/src/utils.py index f5133e6..f7e36a6 100644 --- a/src/utils.py +++ b/src/utils.py @@ -52,8 +52,8 @@ def init_ray_cluster(workdir): object_store_memory = mem * frac, _temp_dir = str(workdir), ) - ray.data.DataContext.get_current().execution_options.verbose_progress = True logging.getLogger("ray").setLevel(logging.WARNING) + 
ray.data.DataContext.get_current().execution_options.verbose_progress = True except ValueError : ray.shutdown() frac -= 0.05 From 787ee8f401e0e59b3d423dbef7a7f8fa783376bf Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Sun, 15 Oct 2023 23:45:06 -0400 Subject: [PATCH 05/92] integration of features reduction to kmers_collection --- src/Caribou_reduce_features.py | 62 ++++------------------------------ src/data/kmers.py | 27 +++++++++++---- 2 files changed, 26 insertions(+), 63 deletions(-) diff --git a/src/Caribou_reduce_features.py b/src/Caribou_reduce_features.py index 379c1ef..a68d9d3 100644 --- a/src/Caribou_reduce_features.py +++ b/src/Caribou_reduce_features.py @@ -10,7 +10,6 @@ from ray.data.preprocessors import Chain, LabelEncoder -from models.preprocessors.min_max_scaler import TensorMinMaxScaler from data.reduction.low_var_selection import TensorLowVarSelection from data.reduction.features_selection import TensorFeaturesSelection from data.reduction.occurence_exclusion import TensorPercentOccurenceExclusion @@ -46,19 +45,17 @@ def features_reduction(opt): """ Brute force -> Features statistically related to classes 1. OccurenceExclusion (10% extremes) - 2. LowVarSelection () + 2. LowVarSelection (variance > 10%) 3. Chi2 + SelectPercentile() (75% best values) """ # Load data ds = ray.data.read_parquet(data['profile']) - ds_train = ray.data.read_parquet(data['profile']) # Time the computation of transformations t_start = time() - ds, data['kmers'] = exclude_select(ds, ds_train, kmers_list, data['taxas'][0]) - # ds, kmers_list = occurence_exclusion(ds, kmers_list) - # ds, kmers_list = low_var_selection(ds,kmers_list) - # ds, data['kmers'] = features_selection(ds, kmers_list, data['taxas'][0]) + ds, kmers_list = occurence_exclusion(ds, kmers_list) + ds, kmers_list = low_var_selection(ds,kmers_list) + ds, data['kmers'] = features_selection(ds, kmers_list, data['taxas'][0]) t_end = time() t_reduction = t_end - t_start # Save reduced dataset @@ -74,51 +71,7 @@ def features_reduction(opt): print(f"Caribou finished reducing k-mers features of {opt['dataset_name']} in {t_reduction} seconds.") -def exclude_select(ds, ds_train, kmers, taxa): - # Occurence exclusion - excluder = TensorPercentOccurenceExclusion( - features = kmers, - percent = 0.1 # remove features present in less than 10% samples - ) - - ds = excluder.fit_transform(ds) - ds_train = excluder.transform(ds_train) - - kmers = excluder.stats_['cols_keep'] - - varier = TensorLowVarSelection( - features = kmers, - threshold = 0.1, # remove features with less than 10% variance - ) - - ds = varier.fit_transform(ds) - ds_train = varier.transform(ds_train) - - kmers = varier.stats_['cols_keep'] - - # Preprocessing - preprocessor = Chain( - LabelEncoder(taxa), - TensorMinMaxScaler(kmers), - ) - - ds_train = preprocessor.fit_transform(ds_train) - - # Statistical features selection - selector = TensorFeaturesSelection( - features = kmers, - taxa = taxa, - threshold = 0.25, # remove lowest 25% significance - ) - - selector.fit(ds_train) - ds = selector.transform(ds) - - kmers = selector.stats_['cols_keep'] - - return ds, kmers - -# Exclusion columns occuring in less / more than 10% of the columns = 20% removed +# Exclusion of columns occuring in less / more than 10% of the columns = 20% removed def occurence_exclusion(ds, kmers): preprocessor = TensorPercentOccurenceExclusion( features = kmers, @@ -126,12 +79,11 @@ def occurence_exclusion(ds, kmers): ) ds = preprocessor.fit_transform(ds) - kmers = preprocessor.stats_['cols_keep'] 
return ds, kmers -# Exclusion of columns with variance lower than a certain threshold +# Exclusion of columns with less than 10% variance def low_var_selection(ds, kmers): preprocessor = TensorLowVarSelection( features = kmers, @@ -139,7 +91,6 @@ def low_var_selection(ds, kmers): ) ds = preprocessor.fit_transform(ds) - kmers = preprocessor.stats_['cols_keep'] return ds, kmers @@ -153,7 +104,6 @@ def features_selection(ds, kmers, taxa): ) ds = preprocessor.fit_transform(ds) - kmers = preprocessor.stats_['cols_keep'] return ds, kmers diff --git a/src/data/kmers.py b/src/data/kmers.py index 18efe97..50e12ba 100644 --- a/src/data/kmers.py +++ b/src/data/kmers.py @@ -16,6 +16,7 @@ from data.extraction.given_kmers_vectorizer import GivenKmersVectorizer # Features selection +from data.reduction.low_var_selection import TensorLowVarSelection from data.reduction.features_selection import TensorFeaturesSelection from data.reduction.occurence_exclusion import TensorPercentOccurenceExclusion @@ -342,25 +343,37 @@ def _kmers_tokenization(self): self.df = tokenizer.transform(self.df) if self.method == 'seen': self.kmers_list = tokenizer.stats_['tokens(sequence)'] - # self._kmers_reduction() + self._kmers_reduction() def _kmers_reduction(self): - # Exclusion of columns occuring in less 5% / more 95% of the samples + """ + Brute force -> Features statistically related to classes + 1. OccurenceExclusion (10% extremes) + 2. LowVarSelection (variance > 10%) + 3. Chi2 + SelectPercentile() (75% best values) + """ + # Exclusion of columns occuring in less / more than 10% of the columns = 20% removed excluder = TensorPercentOccurenceExclusion( features = self.kmers_list, - percent = 0.05 + percent = 0.1 ) self.df = excluder.fit_transform(self.df) - self.kmers_list = excluder.stats_['cols_keep'] - # Chi2 evaluation of dependance between features and classes + # Exclusion of columns with less than 10% variance + varier = TensorLowVarSelection( + features = self.kmers_list, + threshold = 0.1, + ) + self.df = varier.fit_transform(self.df) + self.kmers_list = varier.stats_['cols_keep'] + + # Chi2 evaluation of dependance between features and classes to keep 75% most significative selector = TensorFeaturesSelection( features = self.kmers_list, - threshold = 0.05 + threshold = 0.25 ) self.df = selector.fit_transform(self.df) - self.kmers_list = selector.stats_['cols_keep'] def _write_dataset(self): From 319cb1967cd8706c72078d81b22c62d86f3885eb Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Mon, 16 Oct 2023 00:03:20 -0400 Subject: [PATCH 06/92] occurences debug --- src/Caribou_reduce_features.py | 4 ++-- src/data/reduction/features_selection.py | 2 +- src/data/reduction/low_var_selection.py | 4 ++-- src/data/reduction/occurence_exclusion.py | 8 ++++---- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/Caribou_reduce_features.py b/src/Caribou_reduce_features.py index a68d9d3..bcb76b0 100644 --- a/src/Caribou_reduce_features.py +++ b/src/Caribou_reduce_features.py @@ -54,8 +54,8 @@ def features_reduction(opt): # Time the computation of transformations t_start = time() ds, kmers_list = occurence_exclusion(ds, kmers_list) - ds, kmers_list = low_var_selection(ds,kmers_list) - ds, data['kmers'] = features_selection(ds, kmers_list, data['taxas'][0]) + # ds, kmers_list = low_var_selection(ds,kmers_list) + # ds, data['kmers'] = features_selection(ds, kmers_list, data['taxas'][0]) t_end = time() t_reduction = t_end - t_start # Save reduced dataset diff --git a/src/data/reduction/features_selection.py 
b/src/data/reduction/features_selection.py index c07b515..c2a02be 100644 --- a/src/data/reduction/features_selection.py +++ b/src/data/reduction/features_selection.py @@ -43,7 +43,7 @@ def stats(batch): # Chi batches means extraction chi = ds.map_batches(stats, batch_format = 'numpy', batch_size = 32) - for i, row in enumerate(chi.iter_rows()): + for row in chi.iter_rows(): mean_chi.append(row['chi']) # Chi mean of batches means computing diff --git a/src/data/reduction/low_var_selection.py b/src/data/reduction/low_var_selection.py index 7fa6561..0212c8c 100644 --- a/src/data/reduction/low_var_selection.py +++ b/src/data/reduction/low_var_selection.py @@ -39,7 +39,7 @@ def get_sums(batch): return({'sum' : [np.sum(df, axis = 0)]}) # Sum per column - sums = ds.map_batches(get_sums, batch_format = 'pandas') + sums = ds.map_batches(get_sums, batch_format = 'numpy') for row in sums.iter_rows(): sum_arr += row['sum'] @@ -53,7 +53,7 @@ def get_sqr_dev(batch): return({'sqr_dev' : [np.sum(np.power(np.subtract(df, mean_arr), 2), axis = 0)]}) # Sum of deviation per column - sqr_devs = ds.map_batches(get_sqr_dev, batch_format = 'pandas') + sqr_devs = ds.map_batches(get_sqr_dev, batch_format = 'numpy') for row in sqr_devs.iter_rows(): sqr_dev_arr += row['sqr_dev'] diff --git a/src/data/reduction/occurence_exclusion.py b/src/data/reduction/occurence_exclusion.py index 78ef429..f333215 100644 --- a/src/data/reduction/occurence_exclusion.py +++ b/src/data/reduction/occurence_exclusion.py @@ -132,18 +132,18 @@ def _fit(self, ds: Dataset) -> Preprocessor: nb_samples = ds.count() low_treshold = ceil((0 + self.percent) * nb_samples) high_treshold = floor((1 - self.percent) * nb_samples) + occurences = np.zeros(self._nb_features) # Function for parallel occurences counting def count_occurences(batch): batch = batch[TENSOR_COLUMN_NAME] + batch = _unwrap_ndarray_object_type_if_needed(batch) return {'occurences' : [np.count_nonzero(batch, axis = 0)]} occur = ds.map_batches(count_occurences, batch_format = 'numpy') - occurences = np.zeros(self._nb_features) - for batch in occur.iter_batches(batch_format = 'numpy'): - batch_occur = batch['occurences'].sum(axis = 0) - occurences += batch_occur + for row in occur.iter_rows(): + occurences += row['occurences'] # Construct list of features to keep by position cols_keep = [self.features[i] for i, occurence in enumerate(occurences) if low_treshold < occurence < high_treshold] From fe50a5a1facecb0d615858a6efcef1228957890b Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Tue, 17 Oct 2023 15:33:41 -0400 Subject: [PATCH 07/92] remove feat reduction from kmers extraction --- src/data/kmers.py | 2 +- src/data/reduction/occurence_exclusion.py | 67 ++--------------------- 2 files changed, 6 insertions(+), 63 deletions(-) diff --git a/src/data/kmers.py b/src/data/kmers.py index 50e12ba..71148d4 100644 --- a/src/data/kmers.py +++ b/src/data/kmers.py @@ -343,7 +343,7 @@ def _kmers_tokenization(self): self.df = tokenizer.transform(self.df) if self.method == 'seen': self.kmers_list = tokenizer.stats_['tokens(sequence)'] - self._kmers_reduction() + # self._kmers_reduction() def _kmers_reduction(self): """ diff --git a/src/data/reduction/occurence_exclusion.py b/src/data/reduction/occurence_exclusion.py index f333215..cbd7af1 100644 --- a/src/data/reduction/occurence_exclusion.py +++ b/src/data/reduction/occurence_exclusion.py @@ -29,11 +29,8 @@ def _fit(self, ds: Dataset) -> Preprocessor: occurences += np.count_nonzero(batch, axis = 0) # Include / Exclude by sorted position - 
# cols_drop = [] cols_keep = pd.Series(occurences, index = self.features) cols_keep = cols_keep.sort_values(ascending = True) # Long operation - # cols_drop.extend(cols_keep.iloc[0 : self.num_features].index) - # cols_drop.extend(cols_keep.iloc[(self._nb_features - self.num_features) : self._nb_features].index) cols_keep = cols_keep.iloc[self.num_features : (self._nb_features - self.num_features)] cols_keep = list(cols_keep.index) @@ -51,7 +48,6 @@ def _transform_pandas(self, df: pd.DataFrame) -> pd.DataFrame: tensor_col = pd.DataFrame(tensor_col, columns = self.features) tensor_col = tensor_col[cols_keep].to_numpy() - # tensor_col = tensor_col.drop(cols_keep, axis = 1).to_numpy() df[TENSOR_COLUMN_NAME] = pd.Series(list(tensor_col)) @@ -59,63 +55,6 @@ def _transform_pandas(self, df: pd.DataFrame) -> pd.DataFrame: def __repr__(self): return (f"{self.__class__.__name__}(features={self._nb_features!r}, num_features={self.num_features!r})") -""" -class TensorPercentOccurenceExclusion(Preprocessor): - - - def __init__(self, features: List[str], percent : int = 0.05): - # Parameters - self.features = features - self.percent = percent - self._nb_features = len(features) - - def _fit(self, ds: Dataset) -> Preprocessor: - nb_samples = ds.count() - low_treshold = ceil((0 + self.percent) * nb_samples) - high_treshold = floor((1 - self.percent) * nb_samples) - - # Nb of occurences - occurences = np.zeros(self._nb_features) - for batch in ds.iter_batches(batch_format = 'numpy'): - batch = batch[TENSOR_COLUMN_NAME] - occurences += np.count_nonzero(batch, axis = 0) - - # Include / Exclude by occurences thresholds across samples - cols_keep = pd.Series(occurences, index = self.features) - cols_keep = cols_keep[cols_keep.between(low_treshold, high_treshold, inclusive = 'neither')] - cols_keep = list(cols_keep.index) - - # cols_drop = list(set(self.features).difference(set(cols_keep))) - # self.stats_ = {'cols_keep' : cols_keep, 'cols_drop' : cols_drop} - - self.stats_ = {'cols_keep' : cols_keep} - - return self - - def _transform_pandas(self, df: pd.DataFrame) -> pd.DataFrame: - # _validate_df(df, TENSOR_COLUMN_NAME, self._nb_features) - cols_keep = self.stats_['cols_keep'] - - tensor_col = df[TENSOR_COLUMN_NAME] - tensor_col = _unwrap_ndarray_object_type_if_needed(tensor_col) - tensor_col = pd.DataFrame(tensor_col, columns = self.features) - tensor_col = tensor_col[cols_keep].to_numpy() - # tensor_col = tensor_col.drop(cols_keep, axis = 1).to_numpy() - - df[TENSOR_COLUMN_NAME] = pd.Series(list(tensor_col)) - - return df - - def __repr__(self): - return (f"{self.__class__.__name__}(features={self._nb_features!r}, percent={self.percent!r}%)") -""" - -def _validate_df(df: pd.DataFrame, column: str, nb_features: int) -> None: - if len(df.loc[0, column]) != nb_features: - raise ValueError('Discordant number of features in the tensor column with the one from the dataframe used for fitting') - - - class TensorPercentOccurenceExclusion(Preprocessor): """ @@ -167,4 +106,8 @@ def _transform_pandas(self, df: pd.DataFrame) -> pd.DataFrame: return df def __repr__(self): - return (f"{self.__class__.__name__}(features={self._nb_features!r}, percent={self.percent!r}%)") \ No newline at end of file + return (f"{self.__class__.__name__}(features={self._nb_features!r}, percent={self.percent!r}%)") + +def _validate_df(df: pd.DataFrame, column: str, nb_features: int) -> None: + if len(df.loc[0, column]) != nb_features: + raise ValueError('Discordant number of features in the tensor column with the one from the dataframe used 
for fitting') \ No newline at end of file From 400d1abc903400c883b89467133c3bd224b90747 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Thu, 19 Oct 2023 09:29:02 -0400 Subject: [PATCH 08/92] features reduction debug test --- src/Caribou_reduce_features.py | 13 ++++++++----- src/data/kmers.py | 1 + 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/Caribou_reduce_features.py b/src/Caribou_reduce_features.py index bcb76b0..270880c 100644 --- a/src/Caribou_reduce_features.py +++ b/src/Caribou_reduce_features.py @@ -6,6 +6,7 @@ from utils import * from time import time +from glob import glob from pathlib import Path from ray.data.preprocessors import Chain, LabelEncoder @@ -50,12 +51,14 @@ def features_reduction(opt): """ # Load data - ds = ray.data.read_parquet(data['profile']) + files_lst = glob(os.path.join(data['profile'], '*.parquet')) + ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + # ds = ray.data.read_parquet(data['profile']) # Time the computation of transformations t_start = time() ds, kmers_list = occurence_exclusion(ds, kmers_list) - # ds, kmers_list = low_var_selection(ds,kmers_list) - # ds, data['kmers'] = features_selection(ds, kmers_list, data['taxas'][0]) + ds, kmers_list = low_var_selection(ds,kmers_list) + ds, data['kmers'] = features_selection(ds, kmers_list, data['taxas'][0]) t_end = time() t_reduction = t_end - t_start # Save reduced dataset @@ -75,7 +78,7 @@ def features_reduction(opt): def occurence_exclusion(ds, kmers): preprocessor = TensorPercentOccurenceExclusion( features = kmers, - percent = 0.1 # remove features present in less than 10% samples + percent = 0.05 # remove features present in less than 5% samples ) ds = preprocessor.fit_transform(ds) @@ -87,7 +90,7 @@ def occurence_exclusion(ds, kmers): def low_var_selection(ds, kmers): preprocessor = TensorLowVarSelection( features = kmers, - threshold = 0.1, # remove features with less than 10% variance + threshold = 0.05, # remove features with less than 5% variance ) ds = preprocessor.fit_transform(ds) diff --git a/src/data/kmers.py b/src/data/kmers.py index 71148d4..323e3a4 100644 --- a/src/data/kmers.py +++ b/src/data/kmers.py @@ -371,6 +371,7 @@ def _kmers_reduction(self): # Chi2 evaluation of dependance between features and classes to keep 75% most significative selector = TensorFeaturesSelection( features = self.kmers_list, + taxa = self.taxas[0], threshold = 0.25 ) self.df = selector.fit_transform(self.df) From b33ba2984045d7d53493c4a3b7d9f918dd76b5ec Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Thu, 19 Oct 2023 18:03:49 -0400 Subject: [PATCH 09/92] rectify imports --- src/models/kerasTF/models.py | 2 +- src/models/sklearn/models.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index 96ef9cc..e6d57bf 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -14,7 +14,7 @@ from src.models.encoders.one_hot_tensor_encoder import OneHotTensorEncoder # Parent class / models -from src.models.models_utils import ModelsUtils +from models.models_utils import ModelsUtils from models.kerasTF.build_neural_networks import * # Training diff --git a/src/models/sklearn/models.py b/src/models/sklearn/models.py index 4f2308d..d2d7e85 100644 --- a/src/models/sklearn/models.py +++ b/src/models/sklearn/models.py @@ -10,7 +10,7 @@ # Preprocessing from ray.data.preprocessors import Chain, BatchMapper, LabelEncoder from models.preprocessors.min_max_scaler 
import TensorMinMaxScaler -from src.models.encoders.onesvm_label_encoder import OneClassSVMLabelEncoder +from models.encoders.onesvm_label_encoder import OneClassSVMLabelEncoder # Training from sklearn.naive_bayes import MultinomialNB @@ -24,10 +24,10 @@ from ray.train.sklearn.sklearn_predictor import SklearnPredictor # Parent class -from src.models.models_utils import ModelsUtils -from src.models.sklearn.partial_trainer import SklearnPartialTrainer -from src.models.sklearn.tensor_predictor import SklearnTensorPredictor -from src.models.sklearn.probability_predictor import SklearnTensorProbaPredictor +from models.models_utils import ModelsUtils +from models.sklearn.partial_trainer import SklearnPartialTrainer +from models.sklearn.tensor_predictor import SklearnTensorPredictor +from models.sklearn.probability_predictor import SklearnTensorProbaPredictor __author__ = 'Nicolas de Montigny' From 6be0ed309fd08acd3995e2ea278cb1a9ded6c3e8 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Thu, 19 Oct 2023 18:43:45 -0400 Subject: [PATCH 10/92] imports rectify --- src/models/classification.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/models/classification.py b/src/models/classification.py index f26d7a9..c98042f 100644 --- a/src/models/classification.py +++ b/src/models/classification.py @@ -8,8 +8,8 @@ from glob import glob from shutil import rmtree from utils import load_Xy_data -from src.models.sklearn.models import SklearnModel -from src.models.kerasTF.models import KerasTFModel +from models.sklearn.models import SklearnModel +from models.kerasTF.models import KerasTFModel # Simulation class from models.reads_simulation import readsSimulation From db2b978e8d11deb2e230f1be4a414a5cea12cadc Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Thu, 19 Oct 2023 18:46:15 -0400 Subject: [PATCH 11/92] imports --- src/models/kerasTF/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index e6d57bf..f41828d 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -11,7 +11,7 @@ # Preprocessing from ray.data.preprocessors import LabelEncoder, Chain from models.preprocessors.min_max_scaler import TensorMinMaxScaler -from src.models.encoders.one_hot_tensor_encoder import OneHotTensorEncoder +from models.encoders.one_hot_tensor_encoder import OneHotTensorEncoder # Parent class / models from models.models_utils import ModelsUtils From 4ee5a77c840f211e4dad9bf8e7c0c1560f6ff52c Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Sun, 22 Oct 2023 08:33:45 -0400 Subject: [PATCH 12/92] reduce 5 -> 10%, sklearn pd -> np --- src/Caribou_reduce_features.py | 4 +- src/models/encoders/model_label_encoder.py | 39 ++++++++ src/models/encoders/one_hot_tensor_encoder.py | 6 +- src/models/encoders/onesvm_label_encoder.py | 4 + src/models/kerasTF/build_neural_networks.py | 8 +- src/models/kerasTF/models.py | 89 +++++++++++-------- src/models/sklearn/models.py | 28 +++--- src/models/sklearn/partial_trainer.py | 30 ++++--- src/models/sklearn/probability_predictor.py | 2 +- src/models/sklearn/tensor_predictor.py | 2 +- 10 files changed, 137 insertions(+), 75 deletions(-) create mode 100644 src/models/encoders/model_label_encoder.py diff --git a/src/Caribou_reduce_features.py b/src/Caribou_reduce_features.py index 270880c..d3e289c 100644 --- a/src/Caribou_reduce_features.py +++ b/src/Caribou_reduce_features.py @@ -78,7 +78,7 @@ def features_reduction(opt): def 
occurence_exclusion(ds, kmers): preprocessor = TensorPercentOccurenceExclusion( features = kmers, - percent = 0.05 # remove features present in less than 5% samples + percent = 0.1 # remove features present in less than 5% samples ) ds = preprocessor.fit_transform(ds) @@ -90,7 +90,7 @@ def occurence_exclusion(ds, kmers): def low_var_selection(ds, kmers): preprocessor = TensorLowVarSelection( features = kmers, - threshold = 0.05, # remove features with less than 5% variance + threshold = 0.1, # remove features with less than 5% variance ) ds = preprocessor.fit_transform(ds) diff --git a/src/models/encoders/model_label_encoder.py b/src/models/encoders/model_label_encoder.py new file mode 100644 index 0000000..2ed90e1 --- /dev/null +++ b/src/models/encoders/model_label_encoder.py @@ -0,0 +1,39 @@ +from collections import Counter, OrderedDict +from functools import partial +from typing import Dict, List, Optional + +import numpy as np +import pandas as pd +import pandas.api.types + +from ray.data import Dataset +from ray.data.preprocessor import Preprocessor +from ray.data.preprocessors.encoder import _get_unique_value_indices, _validate_df + +LABELS_COLUMN_NAME = 'labels' + +class ModelLabelEncoder(Preprocessor): + """ + Custom implementation of Ray's LabelEncoder to set column name as it encodes labels. + """ + def __init__(self, label_column: str): + self.label_column = label_column + + def _fit(self, dataset: Dataset) -> Preprocessor: + self.stats_ = _get_unique_value_indices(dataset, [self.label_column]) + return self + + def _transform_pandas(self, df: pd.DataFrame): + _validate_df(df, self.label_column) + + def column_label_encoder(s: pd.Series): + s_values = self.stats_[f"unique_values({s.name})"] + return s.map(s_values) + + df[self.label_column] = df[self.label_column].transform(column_label_encoder) + df = df.rename(columns = {self.label_column : LABELS_COLUMN_NAME}) + + return df + + def __repr__(self): + return f"{self.__class__.__name__}(label_column={self.label_column!r})" \ No newline at end of file diff --git a/src/models/encoders/one_hot_tensor_encoder.py b/src/models/encoders/one_hot_tensor_encoder.py index 0adff44..8acd7fe 100644 --- a/src/models/encoders/one_hot_tensor_encoder.py +++ b/src/models/encoders/one_hot_tensor_encoder.py @@ -29,6 +29,9 @@ def _fit(self, dataset: Dataset) -> Preprocessor: def _transform_pandas(self, df: pd.DataFrame) -> pd.DataFrame: df = _validate_df(df, self.column) + values = self.stats_[f"unique_values({self.column})"] + nb_unique = len(values.keys()) + def tensor_col_encoding(label, nb_unique): tensor = np.zeros(nb_unique, dtype = np.int32) @@ -37,9 +40,6 @@ def tensor_col_encoding(label, nb_unique): return tensor - values = self.stats_[f"unique_values({self.column})"] - nb_unique = len(values.keys()) - df = df.assign(labels = lambda x: TensorArray([tensor_col_encoding(x.loc[ind,self.column], nb_unique) for ind in df.index])) return df diff --git a/src/models/encoders/onesvm_label_encoder.py b/src/models/encoders/onesvm_label_encoder.py index 14186fa..9464011 100644 --- a/src/models/encoders/onesvm_label_encoder.py +++ b/src/models/encoders/onesvm_label_encoder.py @@ -6,6 +6,8 @@ from ray.data.preprocessor import Preprocessor from ray.data.preprocessors.encoder import _get_unique_value_indices, _validate_df, LabelEncoder +LABELS_COLUMN_NAME = 'labels' + class OneClassSVMLabelEncoder(LabelEncoder): """ Class adapted from Ray's LabelEncoder class to encode labels as integer targets for Scikit-Learn SGDOneClassSVM model. 
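For reference, the encoding added in the new ModelLabelEncoder above amounts to mapping each known class name to an integer code and renaming the taxa column to the shared 'labels' column. A minimal pandas sketch of that transform, with a made-up rank name and a stand-in for the fitted mapping, could look like this:

import pandas as pd

LABELS_COLUMN_NAME = 'labels'
taxa = 'domain'                            # hypothetical taxonomic rank
mapping = {'bacteria': 0, 'host': 1}       # stands in for the fitted stats_ of the preprocessor

df = pd.DataFrame({taxa: ['bacteria', 'host', 'bacteria']})
df[taxa] = df[taxa].map(mapping)           # string labels -> integer codes
df = df.rename(columns={taxa: LABELS_COLUMN_NAME})
print(df[LABELS_COLUMN_NAME].tolist())     # [0, 1, 0]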
@@ -33,4 +35,6 @@ def column_label_encoder(s: pd.Series): return s df[self.label_column] = df[self.label_column].transform(column_label_encoder) + df = df.rename(columns = {self.label_column : LABELS_COLUMN_NAME}) + return df diff --git a/src/models/kerasTF/build_neural_networks.py b/src/models/kerasTF/build_neural_networks.py index aed5532..97a8489 100644 --- a/src/models/kerasTF/build_neural_networks.py +++ b/src/models/kerasTF/build_neural_networks.py @@ -25,7 +25,7 @@ def build_attention(nb_kmers): x = Dense(128, activation = "relu")(x) x = Dropout(0.1)(x) - x = Dense(2, activation = "tanh")(x) + x = Dense(1, activation = "tanh")(x) model = Model(inputs = inputs, outputs = x) model.compile(loss = BinaryCrossentropy(from_logits = False), optimizer = 'adam', metrics = ['accuracy']) @@ -45,7 +45,7 @@ def build_LSTM(nb_kmers): x = LSTM(128, recurrent_dropout = 0.1, dropout = 0.1)(x) - x = Dense(2, activation = 'tanh')(x) + x = Dense(1, activation = 'tanh')(x) model = Model(inputs = inputs, outputs = x) model.compile(loss=BinaryCrossentropy(from_logits = False), optimizer='adam', metrics=['accuracy']) @@ -77,9 +77,9 @@ def build_deepLSTM(nb_kmers): net = Dense(10, activation='relu', name='D_%d'%10)(net) net = Dropout(0.1,name='fr_same')(net) - outputs = Dense(2, activation='tanh', name='score')(net) + outputs = Dense(1, activation='sigmoid', name='score')(net) model = Model(inputs=inputs, outputs=outputs) - model.compile(loss=BinaryCrossentropy(from_logits = True), optimizer='adam', metrics=['accuracy']) + model.compile(loss=BinaryCrossentropy(from_logits = False), optimizer='adam', metrics=['accuracy']) return model diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index f41828d..39bd8d3 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -11,6 +11,7 @@ # Preprocessing from ray.data.preprocessors import LabelEncoder, Chain from models.preprocessors.min_max_scaler import TensorMinMaxScaler +from models.encoders.model_label_encoder import ModelLabelEncoder from models.encoders.one_hot_tensor_encoder import OneHotTensorEncoder # Parent class / models @@ -37,6 +38,9 @@ __all__ = ['KerasTFModel'] +TENSOR_COLUMN_NAME = '__value__' +LABELS_COLUMN_NAME = 'labels' + # Ignore warnings to have a more comprehensible output on stdout os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' warnings.filterwarnings('ignore') @@ -138,11 +142,17 @@ def preprocess(self, df): for row in df.iter_rows(): labels.append(row[self.taxa]) self._nb_classes = len(np.unique(labels)) - self._preprocessor = Chain( - TensorMinMaxScaler(self.kmers), - LabelEncoder(self.taxa), - OneHotTensorEncoder(self.taxa), - ) + if self._nb_classes == 2: + self._preprocessor = Chain( + TensorMinMaxScaler(self.kmers), + ModelLabelEncoder(self.taxa), + ) + else: + self._preprocessor = Chain( + TensorMinMaxScaler(self.kmers), + LabelEncoder(self.taxa), + OneHotTensorEncoder(self.taxa), + ) self._preprocessor.fit(df) def _label_decode(self, predict): @@ -220,6 +230,7 @@ def _fit_model(self, datasets): ), datasets=datasets, ) + training_result = self._trainer.fit() self._model_ckpt = training_result.best_checkpoints[0][0] @@ -227,7 +238,7 @@ def predict(self, df, threshold=0.8): print('predict') if df.count() > 0: if len(df.schema().names) > 1: - col_2_drop = [col for col in df.schema().names if col != '__value__'] + col_2_drop = [col for col in df.schema().names if col != TENSOR_COLUMN_NAME] df = df.drop_columns(col_2_drop) # Preprocess @@ -235,12 +246,12 @@ def predict(self, df, threshold=0.8): 
print('number of classes :', self._nb_classes) - predictor = BatchPredictor.from_checkpoint( + self._predictor = BatchPredictor.from_checkpoint( self._model_ckpt, TensorflowPredictor, model_definition = lambda: build_model(self.classifier, self._nb_classes, len(self.kmers)) ) - predictions = predictor.predict( + predictions = self._predictor.predict( data = df, batch_size = self.batch_size ) @@ -258,46 +269,44 @@ def predict(self, df, threshold=0.8): def _prob_2_cls(self, predictions, threshold): print('_prob_2_cls') def map_predicted_label_binary(df, threshold): - # lower_threshold = 0.5 - (threshold * 0.5) - # upper_threshold = 0.5 + (threshold * 0.5) - predictions = pd.DataFrame({ - 'best_proba': [df['predictions'][i][np.argmax(df['predictions'][i])] for i in range(len(df))], - 'predicted_label': df["predictions"].map(lambda x: np.array(x).argmax()) # GET POSITION OF ARGMAX + df = np.ravel(df['predictions']) + lower_threshold = 0.5 - (threshold * 0.5) + upper_threshold = 0.5 + (threshold * 0.5) + predict = pd.DataFrame({ + 'proba': df, + 'predicted_label': np.full(len(df), -1) }) - print('map_predicted_label_binary') - print(predictions) - # predict = pd.DataFrame({ - # 'proba': df['predictions'], - # 'predicted_label': np.zeros(len(df), dtype = np.float32) - # }) # predict['predicted_label'] = np.round(predict['proba']) - # predict.loc[predict['proba'] >= upper_threshold, 'predicted_label'] = 1 - # predict.loc[predict['proba'] <= lower_threshold, 'predicted_label'] = 0 - return predictions['predicted_label'].to_numpy(dtype = np.int32) + predict.loc[predict['proba'] >= upper_threshold, 'predicted_label'] = 1 + predict.loc[predict['proba'] <= lower_threshold, 'predicted_label'] = 0 + return {'predictions' : predict['predicted_label'].to_numpy(dtype = np.int32)} def map_predicted_label_multiclass(df, threshold): - predictions = pd.DataFrame({ - 'best_proba': [df['predictions'][i][np.argmax(df['predictions'][i])] for i in range(len(df))], - 'predicted_label': df["predictions"].map(lambda x: np.array(x).argmax()) + df = df['predictions'] + pred = pd.DataFrame({ + 'best_proba': [df[i][np.argmax(df[i])] for i in range(len(df))], + 'predicted_label': df.map(lambda x: np.array(x).argmax()) }) - predictions.loc[predictions['best_proba'] < threshold, 'predicted_label'] = -1 - return predictions['predicted_label'].to_numpy(dtype = np.int32) + pred.loc[pred['best_proba'] < threshold, 'predicted_label'] = -1 + return {'predictions' : pred['predicted_label'].to_numpy(dtype = np.int32)} if self._nb_classes == 2: + print('map_predicted_label_binary') fn = map_predicted_label_binary else: + print('map_predicted_label_multiclass') fn = map_predicted_label_multiclass predict = [] - for batch in predictions.iter_batches(batch_size = self.batch_size): - predict.append(lambda : fn(batch, threshold)) - - import sys - predictions.materialize() - print(predict) - sys.exit() + predictions = predictions.map_batches( + lambda batch : fn(batch, threshold), + batch_format = 'numpy', + batch_size = self.batch_size + ) + for row in predictions.iter_rows(): + predict.append(row['predictions']) - return np.concatenate(predict) + return predict # Training/building function outside of the class as mentioned on the Ray discussion @@ -317,6 +326,8 @@ def train_func(config): nb_cls = config.get('nb_cls') model = config.get('model') + + # Model construction model = build_model(model, nb_cls, size) @@ -326,15 +337,15 @@ def train_func(config): for _ in range(epochs): batch_train = train_data.to_tf( - feature_columns = 
'__value__', - label_columns = 'labels', + feature_columns = TENSOR_COLUMN_NAME, + label_columns = LABELS_COLUMN_NAME, batch_size = batch_size, local_shuffle_buffer_size = batch_size, local_shuffle_seed = int(np.random.randint(1,10000, size = 1)) ) batch_val = val_data.to_tf( - feature_columns = '__value__', - label_columns = 'labels', + feature_columns = TENSOR_COLUMN_NAME, + label_columns = LABELS_COLUMN_NAME, batch_size = batch_size, local_shuffle_buffer_size = batch_size, local_shuffle_seed = int(np.random.randint(1,10000, size = 1)) diff --git a/src/models/sklearn/models.py b/src/models/sklearn/models.py index d2d7e85..9499230 100644 --- a/src/models/sklearn/models.py +++ b/src/models/sklearn/models.py @@ -8,27 +8,29 @@ from shutil import rmtree # Preprocessing -from ray.data.preprocessors import Chain, BatchMapper, LabelEncoder +from ray.data.preprocessors import Chain, BatchMapper +from models.encoders.model_label_encoder import ModelLabelEncoder from models.preprocessors.min_max_scaler import TensorMinMaxScaler from models.encoders.onesvm_label_encoder import OneClassSVMLabelEncoder # Training from sklearn.naive_bayes import MultinomialNB from sklearn.linear_model import SGDOneClassSVM, SGDClassifier +from models.sklearn.partial_trainer import SklearnPartialTrainer +from models.sklearn.tensor_predictor import SklearnTensorPredictor # Tuning -from ray.air.config import RunConfig, ScalingConfig +from ray.air.config import RunConfig # Predicting from ray.train.batch_predictor import BatchPredictor -from ray.train.sklearn.sklearn_predictor import SklearnPredictor +from models.sklearn.probability_predictor import SklearnTensorProbaPredictor # Parent class from models.models_utils import ModelsUtils -from models.sklearn.partial_trainer import SklearnPartialTrainer -from models.sklearn.tensor_predictor import SklearnTensorPredictor -from models.sklearn.probability_predictor import SklearnTensorProbaPredictor +TENSOR_COLUMN_NAME = '__value__' +LABELS_COLUMN_NAME = 'labels' __author__ = 'Nicolas de Montigny' @@ -105,7 +107,7 @@ def preprocess(self, df): self._encoded = np.array([1,-1], dtype = np.int32) labels = np.array(['bacteria', 'unknown'], dtype = object) else: - self._encoder = LabelEncoder(self.taxa) + self._encoder = ModelLabelEncoder(self.taxa) self._preprocessor = Chain( TensorMinMaxScaler(self.kmers), @@ -211,7 +213,6 @@ def _fit_model(self, datasets): # Define trainer self._trainer = SklearnPartialTrainer( estimator=self._clf, - label_column=self.taxa, labels_list=training_labels, features_list=self.kmers, params=self._train_params, @@ -238,10 +239,8 @@ def _predict_cv(self, df): print('_predict_cv') if df.count() > 0: predict_kwargs = {'features':self.kmers, 'num_estimator_cpus':-1} -# BATCHPREDICTOR DEPRECATED : https://docs.ray.io/en/releases-2.6.3/ray-air/api/doc/ray.train.batch_predictor.BatchPredictor.html#ray.train.batch_predictor.BatchPredictor -# MUST BE CHANGED TO MAP_BATCHES self._predictor = BatchPredictor.from_checkpoint(self._model_ckpt, SklearnTensorPredictor) - predictions = self._predictor.predict(df, batch_size = self.batch_size, feature_columns = ['__value__'], **predict_kwargs) + predictions = self._predictor.predict(df, batch_size = self.batch_size, feature_columns = [TENSOR_COLUMN_NAME], **predict_kwargs) predictions = np.array(predictions.to_pandas()).reshape(-1) return self._label_decode(predictions) @@ -255,12 +254,12 @@ def predict(self, df, threshold = 0.8): if self.classifier == 'onesvm': predict_kwargs = {'features':self.kmers, 
'num_estimator_cpus':-1} self._predictor = BatchPredictor.from_checkpoint(self._models_collection['domain'], SklearnTensorPredictor) - predictions = self._predictor.predict(df, batch_size = self.batch_size, feature_columns = ['__value__'], **predict_kwargs) + predictions = self._predictor.predict(df, batch_size = self.batch_size, feature_columns = [TENSOR_COLUMN_NAME], **predict_kwargs) predictions = np.array(predictions.to_pandas()).reshape(-1) else: predict_kwargs = {'features':self.kmers, 'num_estimator_cpus':-1} self._predictor = BatchPredictor.from_checkpoint(self._model_ckpt, SklearnTensorProbaPredictor) - predictions = self._predictor.predict(df, batch_size = self.batch_size, feature_columns = ['__value__'], **predict_kwargs) + predictions = self._predictor.predict(df, batch_size = self.batch_size, feature_columns = [TENSOR_COLUMN_NAME], **predict_kwargs) predictions = self._prob_2_cls(predictions, len(self._encoded), threshold) return self._label_decode(predictions) else: @@ -279,8 +278,7 @@ def map_predicted_label(df : pd.DataFrame): if nb_cls == 1: predict = np.round(abs(np.concatenate(predict.to_pandas()['predictions']))) else: - mapper = BatchMapper(map_predicted_label, batch_format = 'pandas') - predict = mapper.transform(predict) + predict = predict.map_batches(map_predicted_label, batch_format = 'pandas') predict = np.ravel(np.array(predict.to_pandas())) return predict \ No newline at end of file diff --git a/src/models/sklearn/partial_trainer.py b/src/models/sklearn/partial_trainer.py index e6fa0d8..e2ff877 100644 --- a/src/models/sklearn/partial_trainer.py +++ b/src/models/sklearn/partial_trainer.py @@ -26,6 +26,8 @@ from ray.train.sklearn import SklearnTrainer +LABELS_COLUMN_NAME = 'labels' + simplefilter(action='ignore', category=FutureWarning) class SklearnPartialTrainer(SklearnTrainer): @@ -38,7 +40,6 @@ def __init__( *, estimator, datasets, - label_column = None, labels_list = None, features_list = None, params = None, @@ -57,7 +58,7 @@ def __init__( super().__init__( estimator = estimator, datasets = datasets, - label_column = label_column, + label_column = LABELS_COLUMN_NAME, params = params, scoring = scoring, cv = cv, @@ -204,17 +205,20 @@ def training_loop(self): start_time = time() for batch_X, batch_y in zip( epoch_X.iter_batches( - batch_size = self._batch_size, + # batch_size = self._batch_size, + batch_size = 1, batch_format = 'numpy' ), epoch_y.iter_batches( - batch_size = self._batch_size, + # batch_size = self._batch_size, + batch_size = 1, batch_format = 'numpy' ) ): if isinstance(batch_X, dict): batch_X = batch_X['__value__'] - + + """ try: batch_X = pd.DataFrame(batch_X, columns = self._features_list) except ValueError: @@ -224,6 +228,7 @@ def training_loop(self): Removing the last {} additionnal values, this may influence training.\ If this persists over multiple samples, please rerun the K-mers extraction".format(len(batch_X[i]) - len(self._features_list))) batch_X[i] = batch_X[i][:len(self._features_list)] + """ batch_y = np.ravel(batch_y[self.label_column]) try: self.estimator.partial_fit(batch_X, batch_y, classes = self._labels, **self.fit_params) @@ -240,8 +245,10 @@ def training_loop(self): )): X_calib_df[ind] = batch['__value__'] + """ X_calib = pd.DataFrame(X_calib_df, columns = self._features_list) - y_calib = y_calib.to_pandas() + """ + y_calib = y_calib.to_pandas().to_numpy() self.estimator = CalibratedClassifierCV( estimator = self.estimator, method = 'sigmoid', @@ -300,16 +307,19 @@ def _score_on_validation_sets( start_time = time() for 
batch, labels in zip(X_test.iter_batches( - batch_size = self._batch_size, + # batch_size = self._batch_size, + batch_size = 1, batch_format = 'numpy' ), y_test.iter_batches( - batch_size=self._batch_size, + # batch_size = self._batch_size, + batch_size = 1, batch_format = 'numpy' ) ): if isinstance(batch, dict): batch = batch['__value__'] + """ try: batch = pd.DataFrame(batch, columns = self._features_list) except ValueError: @@ -319,10 +329,10 @@ def _score_on_validation_sets( Removing the last {} additionnal values, this may influence training.\ If this persists over multiple samples, please rerun the K-mers extraction".format(len(batch[i]) - len(self._features_list))) batch[i] = batch[i][:len(self._features_list)] + """ + labels = np.ravel(labels[self.label_column]) - print(batch) - try: test_scores.append(_score(estimator, batch, labels, scorers)) except Exception: diff --git a/src/models/sklearn/probability_predictor.py b/src/models/sklearn/probability_predictor.py index b0f0b22..0b291bd 100644 --- a/src/models/sklearn/probability_predictor.py +++ b/src/models/sklearn/probability_predictor.py @@ -41,7 +41,7 @@ def _predict_pandas( if TENSOR_COLUMN_NAME in data: data = data[TENSOR_COLUMN_NAME] data = _unwrap_ndarray_object_type_if_needed(data) - data = pd.DataFrame(data, columns = features) + # data = pd.DataFrame(data, columns = features) with parallel_backend("ray", n_jobs=num_estimator_cpus): df = pd.DataFrame(self.estimator.predict_proba(data, **predict_kwargs)) diff --git a/src/models/sklearn/tensor_predictor.py b/src/models/sklearn/tensor_predictor.py index c03e4a8..e94538a 100644 --- a/src/models/sklearn/tensor_predictor.py +++ b/src/models/sklearn/tensor_predictor.py @@ -40,7 +40,7 @@ def _predict_pandas( data = data[TENSOR_COLUMN_NAME] data = _unwrap_ndarray_object_type_if_needed(data) - data = pd.DataFrame(data, columns = features) + # data = pd.DataFrame(data, columns = features) with parallel_backend("ray", n_jobs=num_estimator_cpus): df = pd.DataFrame(self.estimator.predict(data)) From b98e74d03e4e07a2ff6cf71b557c31913e82c762 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Mon, 23 Oct 2023 16:07:14 -0400 Subject: [PATCH 13/92] all models debugged in local --- src/models/kerasTF/models.py | 33 +++++++++++---------------- src/models/sklearn/models.py | 1 + src/models/sklearn/partial_trainer.py | 2 +- 3 files changed, 15 insertions(+), 21 deletions(-) diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index 39bd8d3..d1e8443 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -139,6 +139,7 @@ def __init__( def preprocess(self, df): print('preprocess') labels = [] + encoded = [] for row in df.iter_rows(): labels.append(row[self.taxa]) self._nb_classes = len(np.unique(labels)) @@ -154,25 +155,23 @@ def preprocess(self, df): OneHotTensorEncoder(self.taxa), ) self._preprocessor.fit(df) + # Labels mapping + labels = list(self._preprocessor.preprocessors[1].stats_[f'unique_values({self.taxa})'].keys()) + encoded = np.arange(len(labels)) + labels = np.append(labels, 'unknown') + encoded = np.append(encoded, -1) + self._labels_map = zip(labels, encoded) def _label_decode(self, predict): print('_label_decode') - if self._labels_map is None: - encoded = [] - encoded.append(-1) - labels = ['unknown'] - for k, v in self._preprocessor.preprocessors[1].stats_['unique_values({})'.format(self.taxa)].items(): - encoded.append(v) - labels.append(k) decoded = pd.Series(np.empty(len(predict), dtype=object)) - for label, coded in 
zip(labels, encoded): - decoded[predict == coded] = label + for label, encoded in self._labels_map: + decoded[predict == encoded] = label return np.array(decoded) def train(self, datasets, kmers_ds, cv = True): print('train') - if cv: self._cross_validation(datasets, kmers_ds) else: @@ -180,7 +179,6 @@ def train(self, datasets, kmers_ds, cv = True): def _cross_validation(self, datasets, kmers_ds): print('_cross_validation') - df_test = datasets.pop('test') self._fit_model(datasets) @@ -189,7 +187,7 @@ def _cross_validation(self, datasets, kmers_ds): for row in df_test.iter_rows(): y_true.append(row[self.taxa]) - y_pred = self.predict(df_test.drop_columns([self.taxa]), threshold = 0) + y_pred = self.predict(df_test.drop_columns([self.taxa]), threshold = 0.8) self._cv_score(y_true, y_pred) @@ -197,7 +195,6 @@ def _fit_model(self, datasets): print('_fit_model') # Preprocessing loop for name, ds in datasets.items(): - print(f'dataset preprocessing : {name}') ds = ds.drop_columns(['id']) ds = self._preprocessor.transform(ds) datasets[name] = ds @@ -244,8 +241,6 @@ def predict(self, df, threshold=0.8): # Preprocess df = self._preprocessor.preprocessors[0].transform(df) - print('number of classes :', self._nb_classes) - self._predictor = BatchPredictor.from_checkpoint( self._model_ckpt, TensorflowPredictor, @@ -256,8 +251,6 @@ def predict(self, df, threshold=0.8): batch_size = self.batch_size ) - print(predictions.to_pandas()) - # Convert predictions to labels predictions = self._prob_2_cls(predictions, threshold) @@ -276,7 +269,6 @@ def map_predicted_label_binary(df, threshold): 'proba': df, 'predicted_label': np.full(len(df), -1) }) - # predict['predicted_label'] = np.round(predict['proba']) predict.loc[predict['proba'] >= upper_threshold, 'predicted_label'] = 1 predict.loc[predict['proba'] <= lower_threshold, 'predicted_label'] = 0 return {'predictions' : predict['predicted_label'].to_numpy(dtype = np.int32)} @@ -284,10 +276,11 @@ def map_predicted_label_binary(df, threshold): def map_predicted_label_multiclass(df, threshold): df = df['predictions'] pred = pd.DataFrame({ - 'best_proba': [df[i][np.argmax(df[i])] for i in range(len(df))], - 'predicted_label': df.map(lambda x: np.array(x).argmax()) + 'best_proba': [np.max(arr) for arr in df], + 'predicted_label' : [np.argmax(arr) for arr in df] }) pred.loc[pred['best_proba'] < threshold, 'predicted_label'] = -1 + return {'predictions' : pred['predicted_label'].to_numpy(dtype = np.int32)} if self._nb_classes == 2: diff --git a/src/models/sklearn/models.py b/src/models/sklearn/models.py index 9499230..903fe93 100644 --- a/src/models/sklearn/models.py +++ b/src/models/sklearn/models.py @@ -114,6 +114,7 @@ def preprocess(self, df): self._encoder, ) self._preprocessor.fit(df) + # Labels mapping if self.classifier != 'onesvm': labels = list(self._preprocessor.preprocessors[1].stats_[f'unique_values({self.taxa})'].keys()) self._encoded = np.arange(len(labels)) diff --git a/src/models/sklearn/partial_trainer.py b/src/models/sklearn/partial_trainer.py index e2ff877..e9a51c6 100644 --- a/src/models/sklearn/partial_trainer.py +++ b/src/models/sklearn/partial_trainer.py @@ -255,7 +255,7 @@ def training_loop(self): cv = 'prefit', ) self.estimator.fit( - X_calib, + X_calib_df, y_calib, ) From 486036b9cc505a4a53b614295998811688372537 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Tue, 24 Oct 2023 12:52:06 -0400 Subject: [PATCH 14/92] features reduction to keep 25% best chi2 --- src/Caribou_reduce_features.py | 2 +- src/models/classification.py | 4 
++-- src/models/encoders/onesvm_label_encoder.py | 12 ++++-------- 3 files changed, 7 insertions(+), 11 deletions(-) diff --git a/src/Caribou_reduce_features.py b/src/Caribou_reduce_features.py index d3e289c..851eb77 100644 --- a/src/Caribou_reduce_features.py +++ b/src/Caribou_reduce_features.py @@ -103,7 +103,7 @@ def features_selection(ds, kmers, taxa): preprocessor = TensorFeaturesSelection( features = kmers, taxa = taxa, - threshold = 0.25, # remove lowest 25% significance + threshold = 0.75, # Keep 25% higest results ) ds = preprocessor.fit_transform(ds) diff --git a/src/models/classification.py b/src/models/classification.py index c98042f..10e93dc 100644 --- a/src/models/classification.py +++ b/src/models/classification.py @@ -535,5 +535,5 @@ def split_sim_cv_ds(self, ds, data, name): ############################################################################### def convert_archaea_bacteria(df): - df.loc[df['domain'].str.lower() == 'archaea', 'domain'] = 'Bacteria' - return df \ No newline at end of file + df.loc[df['domain'].str.lower() == 'archaea', 'domain'] = 'Bacteria' + return df \ No newline at end of file diff --git a/src/models/encoders/onesvm_label_encoder.py b/src/models/encoders/onesvm_label_encoder.py index 9464011..1743f95 100644 --- a/src/models/encoders/onesvm_label_encoder.py +++ b/src/models/encoders/onesvm_label_encoder.py @@ -26,15 +26,11 @@ def _fit(self, dataset : Dataset) -> Preprocessor: def _transform_pandas(self, df: pd.DataFrame): _validate_df(df, self.label_column) + mapping = self.stats_[f"unique_values({self.label_column})"] + df[self.label_column] = df[self.label_column].str.lower() + df[self.label_column] = df[self.label_column].map(mapping) + df[self.label_column] = df[self.label_column].fillna(-1) - def column_label_encoder(s: pd.Series): - s_values = self.stats_[f"unique_values({s.name})"] - s = s.str.lower() - s = s.map(s_values) - s = s.fillna(-1) - return s - - df[self.label_column] = df[self.label_column].transform(column_label_encoder) df = df.rename(columns = {self.label_column : LABELS_COLUMN_NAME}) return df From 84dc20858822c76c3e5d0bb33f93108d2c464083 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Wed, 25 Oct 2023 17:57:05 -0400 Subject: [PATCH 15/92] read_parquet parallelism == -1 instead of nb of files --- src/Caribou_reduce_features.py | 4 +--- src/data/kmers.py | 2 +- src/models/classification.py | 18 +++++++++--------- .../sklearn}/scoring_one_svm.py | 0 src/supplement/sklearn_tuning.py | 18 +++++++++--------- 5 files changed, 20 insertions(+), 22 deletions(-) rename src/{supplement => models/sklearn}/scoring_one_svm.py (100%) diff --git a/src/Caribou_reduce_features.py b/src/Caribou_reduce_features.py index 851eb77..748691f 100644 --- a/src/Caribou_reduce_features.py +++ b/src/Caribou_reduce_features.py @@ -51,9 +51,7 @@ def features_reduction(opt): """ # Load data - files_lst = glob(os.path.join(data['profile'], '*.parquet')) - ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - # ds = ray.data.read_parquet(data['profile']) + ds = ray.data.read_parquet(data['profile'], parallelism = -1) # Time the computation of transformations t_start = time() ds, kmers_list = occurence_exclusion(ds, kmers_list) diff --git a/src/data/kmers.py b/src/data/kmers.py index 323e3a4..5f3a2ff 100644 --- a/src/data/kmers.py +++ b/src/data/kmers.py @@ -324,7 +324,7 @@ def _make_ray_ds(self): self.df = self.df.repartition(int(self.df.count()/10)) else: self._files_list = glob(os.path.join(self._tmp_dir, '*.parquet')) - 
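The two loading strategies that this patch and the following one toggle between can be summarised in a short sketch. The calls below are the same ray.data APIs used in the diffs (Ray 2.x); the profile directory is hypothetical:

import os
from glob import glob
import ray

profile_dir = '/path/to/Xy_genome_profile'                  # hypothetical dataset directory
files_lst = glob(os.path.join(profile_dir, '*.parquet'))

# Previous behaviour (restored in the next patch): one read task per parquet file
ds = ray.data.read_parquet_bulk(files_lst, parallelism=len(files_lst))

# Change tried in this patch: let Ray choose the degree of read parallelism
# ds = ray.data.read_parquet_bulk(files_lst, parallelism=-1)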
self.df = ray.data.read_parquet_bulk(self._files_list, parallelism = len(self._files_list)) + self.df = ray.data.read_parquet_bulk(self._files_list, parallelism = -1) def _kmers_tokenization(self): print('_kmers_tokenization') diff --git a/src/models/classification.py b/src/models/classification.py index 10e93dc..245b804 100644 --- a/src/models/classification.py +++ b/src/models/classification.py @@ -112,7 +112,7 @@ def __init__( def execute_training_prediction(self, data2classify): print('execute_training_prediction') files_lst = glob(os.path.join(data2classify['profile'],'*.parquet')) - df2classify = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + df2classify = ray.data.read_parquet_bulk(files_lst, parallelism = -1) ids2classify = data2classify['ids'] for i, taxa in enumerate(self._taxas_order): if taxa in self._taxas: @@ -248,7 +248,7 @@ def _multiclass_training(self, taxa): def execute_classification(self, data2classify): print('execute_classification') files_lst = glob(os.path.join(data2classify['profile'],'*.parquet')) - df = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + df = ray.data.read_parquet_bulk(files_lst, parallelism = -1) ids = data2classify['ids'] if len(self.classified_data['sequence']) == 0: raise ValueError('Please train a model before executing classification') @@ -437,12 +437,12 @@ def _merge_database_host(self, database_data, host_data): if os.path.exists(self._merged_database_host['profile']): files_lst = glob(os.path.join(self._merged_database_host['profile'],'*.parquet')) - df_merged = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + df_merged = ray.data.read_parquet_bulk(files_lst, parallelism = -1) else: files_lst = glob(os.path.join(database_data['profile'],'*.parquet')) - df_db = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + df_db = ray.data.read_parquet_bulk(files_lst, parallelism = -1) files_lst = glob(os.path.join(host_data['profile'],'*.parquet')) - df_host = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + df_host = ray.data.read_parquet_bulk(files_lst, parallelism = -1) cols2drop = [] for col in df_db.schema().names: @@ -470,7 +470,7 @@ def _load_training_data_merged(self, taxa): print('_load_training_data_merged') if self._classifier_binary == 'onesvm' and taxa == 'domain': files_lst = glob(os.path.join(self._database_data['profile'],'*.parquet')) - df_train = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + df_train = ray.data.read_parquet_bulk(files_lst, parallelism = -1) df_train = df_train.map_batches(convert_archaea_bacteria, batch_format = 'pandas') df_val_test = self._merge_database_host(self._database_data, self._host_data) df_val_test = df_val_test.map_batches(convert_archaea_bacteria, batch_format = 'pandas') @@ -492,7 +492,7 @@ def _load_training_data_merged(self, taxa): def _load_training_data(self): print('_load_training_data') files_lst = glob(os.path.join(self._database_data['profile'],'*.parquet')) - df_train = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + df_train = ray.data.read_parquet_bulk(files_lst, parallelism = -1) df_train = df_train.map_batches(convert_archaea_bacteria, batch_format = 'pandas') df_val = self.split_sim_cv_ds(df_train,self._database_data, 'validation') self._training_datasets = {'train': df_train, 'validation': df_val} @@ -512,7 +512,7 @@ def _sim_4_cv(self, df, kmers_ds, name): cv_sim = readsSimulation(kmers_ds['fasta'], cls, list(cls['id']), 'miseq', 
sim_outdir, name) sim_data = cv_sim.simulation(self._k, kmers_ds['kmers']) files_lst = glob(os.path.join(sim_data['profile'],'*.parquet')) - df = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + df = ray.data.read_parquet_bulk(files_lst, parallelism = -1) return df def split_sim_cv_ds(self, ds, data, name): @@ -522,7 +522,7 @@ def split_sim_cv_ds(self, ds, data, name): ) if os.path.exists(ds_path): files_lst = glob(os.path.join(ds_path,'*.parquet')) - cv_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + cv_ds = ray.data.read_parquet_bulk(files_lst, parallelism = -1) else: cv_ds = ds.random_sample(0.1) if cv_ds.count() == 0: diff --git a/src/supplement/scoring_one_svm.py b/src/models/sklearn/scoring_one_svm.py similarity index 100% rename from src/supplement/scoring_one_svm.py rename to src/models/sklearn/scoring_one_svm.py diff --git a/src/supplement/sklearn_tuning.py b/src/supplement/sklearn_tuning.py index 272ddd7..76f88ad 100644 --- a/src/supplement/sklearn_tuning.py +++ b/src/supplement/sklearn_tuning.py @@ -21,7 +21,7 @@ # Preprocessing from ray.data.preprocessors import Chain, LabelEncoder # Training -from supplement.scoring_one_svm import ScoringSGDOneClassSVM +from models.sklearn.scoring_one_svm import ScoringSGDOneClassSVM from sklearn.naive_bayes import MultinomialNB from sklearn.linear_model import SGDClassifier # Tuning @@ -42,12 +42,12 @@ def merge_db_host(db_data, host_data): if os.path.exists(merged_db_host['profile']): files_lst = glob(os.path.join(merged_db_host['profile'], '*.parquet')) - df_merged = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + df_merged = ray.data.read_parquet_bulk(files_lst, parallelism = -1) else: files_lst = glob(os.path.join(db_data['profile'], '*.parquet')) - df_db = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + df_db = ray.data.read_parquet_bulk(files_lst, parallelism = -1) files_lst = glob(os.path.join(host_data['profile'], '*.parquet')) - df_host = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + df_host = ray.data.read_parquet_bulk(files_lst, parallelism = -1) col2drop = [] for col in df_db.schema().names: @@ -84,7 +84,7 @@ def sim_4_cv(df, database_data, name): cv_sim = readsSimulation(database_data['fasta'], cls, list(cls['id']), 'miseq', sim_outdir, name) sim_data = cv_sim.simulation(k, database_data['kmers']) files_lst = glob(os.path.join(sim_data['profile'], '*.parquet')) - df = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + df = ray.data.read_parquet_bulk(files_lst, parallelism = -1) return df def convert_archaea_bacteria(df): @@ -106,7 +106,7 @@ def split_val_test_ds(ds, data): test_path = os.path.join(os.path.dirname(data['profile']), f'Xy_genome_simulation_test_data_K{len(data["kmers"][0])}') if os.path.exists(val_path): files_lst = glob(os.path.join(val_path, '*.parquet')) - val_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + val_ds = ray.data.read_parquet_bulk(files_lst, parallelism = -1) val_ds = val_ds.map_batches( convert_archaea_bacteria, batch_format = 'pandas' @@ -119,7 +119,7 @@ def split_val_test_ds(ds, data): val_ds = sim_4_cv(val_ds, data, 'validation') if os.path.exists(test_path): files_lst = glob(os.path.join(test_path, '*.parquet')) - test_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + test_ds = ray.data.read_parquet_bulk(files_lst, parallelism = -1) test_ds = test_ds.map_batches( convert_archaea_bacteria, batch_format = 'pandas' 
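The convert_archaea_bacteria helper used throughout these classification and tuning scripts is a plain pandas batch mapper. A self-contained sketch of how it is applied through Ray's map_batches, on toy domain labels, is:

import pandas as pd
import ray

def convert_archaea_bacteria(df: pd.DataFrame) -> pd.DataFrame:
    # Fold the archaeal domain into 'Bacteria' so the bacteria/host split stays binary
    df.loc[df['domain'].str.lower() == 'archaea', 'domain'] = 'Bacteria'
    return df

ds = ray.data.from_pandas(pd.DataFrame({'domain': ['Archaea', 'Bacteria', 'archaea']}))
ds = ds.map_batches(convert_archaea_bacteria, batch_format='pandas')
print(ds.take_all())     # every row now carries 'Bacteria'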
@@ -164,7 +164,7 @@ def split_val_test_ds(ds, data): val_ds, test_ds = split_val_test_ds(test_val_ds,test_val_data) db_data = verify_load_data(opt['data']) files_lst = glob(os.path.join(db_data['profile'], '*.parquet')) - train_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + train_ds = ray.data.read_parquet_bulk(files_lst, parallelism = -1) elif opt['classifier'] == 'linearsvm' and opt['taxa'] == 'domain': if opt['data_host'] is None: raise ValueError('To tune for a domain taxa, a host species is required.\ @@ -175,7 +175,7 @@ def split_val_test_ds(ds, data): else: db_data = verify_load_data(opt['data']) files_lst = glob(os.path.join(db_data['profile'], '*.parquet')) - train_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + train_ds = ray.data.read_parquet_bulk(files_lst, parallelism = -1) val_ds, test_ds = split_val_test_ds(train_ds, db_data) # Preprocessing From bcf1bf0e45ceeb0e5d94d26254723a791e4488fc Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Mon, 30 Oct 2023 07:25:58 -0400 Subject: [PATCH 16/92] revert parallelism to nb of files in bulk read --- src/Caribou_reduce_features.py | 4 +++- src/data/kmers.py | 3 ++- src/models/classification.py | 35 +++++++++++++++++++++------------- 3 files changed, 27 insertions(+), 15 deletions(-) diff --git a/src/Caribou_reduce_features.py b/src/Caribou_reduce_features.py index 748691f..21ccc6c 100644 --- a/src/Caribou_reduce_features.py +++ b/src/Caribou_reduce_features.py @@ -51,7 +51,9 @@ def features_reduction(opt): """ # Load data - ds = ray.data.read_parquet(data['profile'], parallelism = -1) + files_lst = glob(data['profile']) + ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + # ds = ray.data.read_parquet(data['profile'], parallelism = -1) # Time the computation of transformations t_start = time() ds, kmers_list = occurence_exclusion(ds, kmers_list) diff --git a/src/data/kmers.py b/src/data/kmers.py index 5f3a2ff..13c1fa8 100644 --- a/src/data/kmers.py +++ b/src/data/kmers.py @@ -324,7 +324,8 @@ def _make_ray_ds(self): self.df = self.df.repartition(int(self.df.count()/10)) else: self._files_list = glob(os.path.join(self._tmp_dir, '*.parquet')) - self.df = ray.data.read_parquet_bulk(self._files_list, parallelism = -1) + self.df = ray.data.read_parquet_bulk(self._files_list, parallelism = len(self._files_list)) + # self.df = ray.data.read_parquet_bulk(self._files_list, parallelism = -1) def _kmers_tokenization(self): print('_kmers_tokenization') diff --git a/src/models/classification.py b/src/models/classification.py index 245b804..cc8fc5e 100644 --- a/src/models/classification.py +++ b/src/models/classification.py @@ -112,7 +112,8 @@ def __init__( def execute_training_prediction(self, data2classify): print('execute_training_prediction') files_lst = glob(os.path.join(data2classify['profile'],'*.parquet')) - df2classify = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + df2classify = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + # df2classify = ray.data.read_parquet_bulk(files_lst, parallelism = -1) ids2classify = data2classify['ids'] for i, taxa in enumerate(self._taxas_order): if taxa in self._taxas: @@ -129,9 +130,9 @@ def execute_training_prediction(self, data2classify): # Predicting try: if i == 0: - df2classify = self._classify_first(df2classify, taxa, ids2classify, file2classify) + df2classify = self._classify_first(df2classify, taxa, ids2classify, data2classify['profile']) else: - df2classify = 
self._classify_subsequent(df2classify, taxa, ids2classify, file2classify) + df2classify = self._classify_subsequent(df2classify, taxa, ids2classify, data2classify['profile']) except ValueError: print('Stopping classification prematurelly because there are no more sequences to classify') return taxa @@ -248,16 +249,17 @@ def _multiclass_training(self, taxa): def execute_classification(self, data2classify): print('execute_classification') files_lst = glob(os.path.join(data2classify['profile'],'*.parquet')) - df = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + df = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + # df = ray.data.read_parquet_bulk(files_lst, parallelism = -1) ids = data2classify['ids'] if len(self.classified_data['sequence']) == 0: raise ValueError('Please train a model before executing classification') for i, taxa in enumerate(self.classified_data['sequence']): try: if i == 0: - df = self._classify_first(df, taxa, ids, df_file) + df = self._classify_first(df, taxa, ids, data2classify['profile']) else: - df = self._classify_subsequent(df, taxa, ids, df_file) + df = self._classify_subsequent(df, taxa, ids, data2classify['profile']) except ValueError: print('Stopping classification prematurelly because there are no more sequences to classify') return taxa @@ -437,12 +439,15 @@ def _merge_database_host(self, database_data, host_data): if os.path.exists(self._merged_database_host['profile']): files_lst = glob(os.path.join(self._merged_database_host['profile'],'*.parquet')) - df_merged = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + df_merged = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + # df_merged = ray.data.read_parquet_bulk(files_lst, parallelism = -1) else: files_lst = glob(os.path.join(database_data['profile'],'*.parquet')) - df_db = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + df_db = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + # df_db = ray.data.read_parquet_bulk(files_lst, parallelism = -1) files_lst = glob(os.path.join(host_data['profile'],'*.parquet')) - df_host = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + df_host = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + # df_host = ray.data.read_parquet_bulk(files_lst, parallelism = -1) cols2drop = [] for col in df_db.schema().names: @@ -470,7 +475,8 @@ def _load_training_data_merged(self, taxa): print('_load_training_data_merged') if self._classifier_binary == 'onesvm' and taxa == 'domain': files_lst = glob(os.path.join(self._database_data['profile'],'*.parquet')) - df_train = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + df_train = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + # df_train = ray.data.read_parquet_bulk(files_lst, parallelism = -1) df_train = df_train.map_batches(convert_archaea_bacteria, batch_format = 'pandas') df_val_test = self._merge_database_host(self._database_data, self._host_data) df_val_test = df_val_test.map_batches(convert_archaea_bacteria, batch_format = 'pandas') @@ -492,7 +498,8 @@ def _load_training_data_merged(self, taxa): def _load_training_data(self): print('_load_training_data') files_lst = glob(os.path.join(self._database_data['profile'],'*.parquet')) - df_train = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + df_train = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + # df_train = ray.data.read_parquet_bulk(files_lst, parallelism = -1) df_train = 
df_train.map_batches(convert_archaea_bacteria, batch_format = 'pandas') df_val = self.split_sim_cv_ds(df_train,self._database_data, 'validation') self._training_datasets = {'train': df_train, 'validation': df_val} @@ -512,7 +519,8 @@ def _sim_4_cv(self, df, kmers_ds, name): cv_sim = readsSimulation(kmers_ds['fasta'], cls, list(cls['id']), 'miseq', sim_outdir, name) sim_data = cv_sim.simulation(self._k, kmers_ds['kmers']) files_lst = glob(os.path.join(sim_data['profile'],'*.parquet')) - df = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + df = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + # df = ray.data.read_parquet_bulk(files_lst, parallelism = -1) return df def split_sim_cv_ds(self, ds, data, name): @@ -522,7 +530,8 @@ def split_sim_cv_ds(self, ds, data, name): ) if os.path.exists(ds_path): files_lst = glob(os.path.join(ds_path,'*.parquet')) - cv_ds = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + cv_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + # cv_ds = ray.data.read_parquet_bulk(files_lst, parallelism = -1) else: cv_ds = ds.random_sample(0.1) if cv_ds.count() == 0: From 7feeaa036b99874d9c57936500cd35091a373c10 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Wed, 1 Nov 2023 11:21:06 -0400 Subject: [PATCH 17/92] features selection using rdf + tf-idf --- requirements.txt | 2 + src/Caribou_reduce_features.py | 54 ++++++++++-- ...selection.py => chi_features_selection.py} | 2 +- src/data/reduction/occurence_exclusion.py | 9 +- src/data/reduction/rdf_features_selection.py | 85 +++++++++++++++++++ src/models/kerasTF/build_neural_networks.py | 15 ++-- src/models/preprocessors/max_abs_scaler.py | 16 ++-- src/models/preprocessors/min_max_scaler.py | 6 +- src/models/preprocessors/power_transformer.py | 10 +-- src/models/preprocessors/tfidf_transformer.py | 66 ++++++++++++++ src/models/sklearn/models.py | 13 +-- 11 files changed, 236 insertions(+), 42 deletions(-) rename src/data/reduction/{features_selection.py => chi_features_selection.py} (98%) create mode 100644 src/data/reduction/rdf_features_selection.py create mode 100644 src/models/preprocessors/tfidf_transformer.py diff --git a/requirements.txt b/requirements.txt index 2e05ef0..a8d4ad5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -93,3 +93,5 @@ Werkzeug==2.3.6 wrapt==1.15.0 yarl==1.9.2 zipp==3.16.2 +xgboost==2.0.1 +xgboost_ray==0.1.18 \ No newline at end of file diff --git a/src/Caribou_reduce_features.py b/src/Caribou_reduce_features.py index 21ccc6c..879009f 100644 --- a/src/Caribou_reduce_features.py +++ b/src/Caribou_reduce_features.py @@ -4,15 +4,18 @@ import os.path import argparse +import numpy as np + from utils import * from time import time from glob import glob from pathlib import Path -from ray.data.preprocessors import Chain, LabelEncoder +from models.preprocessors.tfidf_transformer import TensorTfIdfTransformer from data.reduction.low_var_selection import TensorLowVarSelection -from data.reduction.features_selection import TensorFeaturesSelection +from data.reduction.chi_features_selection import TensorChiFeaturesSelection +from data.reduction.rdf_features_selection import TensorRDFFeaturesSelection from data.reduction.occurence_exclusion import TensorPercentOccurenceExclusion __author__ = "Nicolas de Montigny" @@ -44,21 +47,33 @@ def features_reduction(opt): # Features reduction ################################################################################ """ + First option : Select features relevant to classification by Random 
Forest of decision trees + Brute force -> Features statistically related to classes 1. OccurenceExclusion (10% extremes) 2. LowVarSelection (variance > 10%) 3. Chi2 + SelectPercentile() (75% best values) """ + """ + TODO: Add to preprocessing in model training + 1. Replace the MinMaxScaling -> TfidfTransformer to scale down the impact of tokens that occur very frequently (https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html#sklearn.feature_extraction.text.TfidfTransformer) + 2. TruncatedSVD to reduce dimensions and keep 10 000 features ~PCA (https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html#sklearn.decomposition.TruncatedSVD) + """ + # Load data - files_lst = glob(data['profile']) + files_lst = glob(os.path.join(data['profile'],'*.parquet')) ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) # ds = ray.data.read_parquet(data['profile'], parallelism = -1) # Time the computation of transformations t_start = time() - ds, kmers_list = occurence_exclusion(ds, kmers_list) - ds, kmers_list = low_var_selection(ds,kmers_list) - ds, data['kmers'] = features_selection(ds, kmers_list, data['taxas'][0]) + ds = tfidf_transform(ds, kmers_list) + ds, kmers_list = tree_relevant_features(ds, kmers_list, 'phylum') + print(len(kmers_list)) + if len(kmers_list) == 0: + ds, kmers_list = occurence_exclusion(ds, opt['kmers_list']) + ds, kmers_list = low_var_selection(ds,kmers_list) + ds, data['kmers'] = features_selection(ds, kmers_list, 'phylum') t_end = time() t_reduction = t_end - t_start # Save reduced dataset @@ -78,7 +93,7 @@ def features_reduction(opt): def occurence_exclusion(ds, kmers): preprocessor = TensorPercentOccurenceExclusion( features = kmers, - percent = 0.1 # remove features present in less than 5% samples + percent = 0.1 # remove features present in less than 10% samples ) ds = preprocessor.fit_transform(ds) @@ -100,7 +115,7 @@ def low_var_selection(ds, kmers): # Chi2 evaluation of dependance between features and classes def features_selection(ds, kmers, taxa): - preprocessor = TensorFeaturesSelection( + preprocessor = TensorChiFeaturesSelection( features = kmers, taxa = taxa, threshold = 0.75, # Keep 25% higest results @@ -108,9 +123,32 @@ def features_selection(ds, kmers, taxa): ds = preprocessor.fit_transform(ds) kmers = preprocessor.stats_['cols_keep'] + print(len(kmers)) return ds, kmers +# TF-IDF scaling of the features +def tfidf_transform(ds, kmers): + preprocessor = TensorTfIdfTransformer( + features = kmers + ) + ds = preprocessor.fit_transform(ds) + + return ds + +# Decision tree feature selection to keep only those identified as relevant to classification +def tree_relevant_features(ds, kmers, taxa): + preprocessor = TensorRDFFeaturesSelection( + features = kmers, + taxa = taxa + ) + preprocessor.fit_transform(ds) + + kmers = preprocessor.stats_['cols_keep'] + + return ds, kmers + + # Argument parsing from CLI ################################################################################ diff --git a/src/data/reduction/features_selection.py b/src/data/reduction/chi_features_selection.py similarity index 98% rename from src/data/reduction/features_selection.py rename to src/data/reduction/chi_features_selection.py index c2a02be..95fd013 100644 --- a/src/data/reduction/features_selection.py +++ b/src/data/reduction/chi_features_selection.py @@ -15,7 +15,7 @@ TENSOR_COLUMN_NAME = '__value__' -class TensorFeaturesSelection(Preprocessor): +class 
TensorChiFeaturesSelection(Preprocessor): """ Custom implementation of SelectKBest with Chi2 inspired by sklearn.feature_selection.SelectPercentile and sklearn.feature_selection.chi2 features selector to be used as a Ray preprocessor. https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.chi2.html#sklearn.feature_selection.chi2 diff --git a/src/data/reduction/occurence_exclusion.py b/src/data/reduction/occurence_exclusion.py index cbd7af1..17da804 100644 --- a/src/data/reduction/occurence_exclusion.py +++ b/src/data/reduction/occurence_exclusion.py @@ -19,7 +19,7 @@ def __init__(self, features: List[str], num_features: int): # Parameters self.features = features self._nb_features = len(features) - self.num_features = int((self._nb_features - num_features) / 2) + self._num_features = int(self._nb_features - num_features) def _fit(self, ds: Dataset) -> Preprocessor: # Nb of occurences @@ -31,7 +31,7 @@ def _fit(self, ds: Dataset) -> Preprocessor: # Include / Exclude by sorted position cols_keep = pd.Series(occurences, index = self.features) cols_keep = cols_keep.sort_values(ascending = True) # Long operation - cols_keep = cols_keep.iloc[self.num_features : (self._nb_features - self.num_features)] + cols_keep = cols_keep.iloc[0 : self._num_features] cols_keep = list(cols_keep.index) # self.stats_ = {'cols_keep' : cols_keep, 'cols_drop' : cols_drop} @@ -54,7 +54,7 @@ def _transform_pandas(self, df: pd.DataFrame) -> pd.DataFrame: return df def __repr__(self): - return (f"{self.__class__.__name__}(features={self._nb_features!r}, num_features={self.num_features!r})") + return (f"{self.__class__.__name__}(features={self._nb_features!r}, num_features={self._num_features!r})") class TensorPercentOccurenceExclusion(Preprocessor): """ @@ -69,7 +69,6 @@ def __init__(self, features: List[str], percent : int = 0.05): def _fit(self, ds: Dataset) -> Preprocessor: nb_samples = ds.count() - low_treshold = ceil((0 + self.percent) * nb_samples) high_treshold = floor((1 - self.percent) * nb_samples) occurences = np.zeros(self._nb_features) @@ -85,7 +84,7 @@ def count_occurences(batch): occurences += row['occurences'] # Construct list of features to keep by position - cols_keep = [self.features[i] for i, occurence in enumerate(occurences) if low_treshold < occurence < high_treshold] + cols_keep = [self.features[i] for i, occurence in enumerate(occurences) if occurence < high_treshold] self.stats_ = {'cols_keep' : cols_keep} diff --git a/src/data/reduction/rdf_features_selection.py b/src/data/reduction/rdf_features_selection.py new file mode 100644 index 0000000..4a12e44 --- /dev/null +++ b/src/data/reduction/rdf_features_selection.py @@ -0,0 +1,85 @@ +import os +import logging + +import numpy as np +import pandas as pd + +from typing import List +from warnings import warn +from ray.data import Dataset + +from xgboost import XGBRFClassifier + +from ray.air.config import ScalingConfig +from sklearn.preprocessing import LabelEncoder + + +from ray.data.preprocessor import Preprocessor +from ray.air.util.data_batch_conversion import _unwrap_ndarray_object_type_if_needed + +TENSOR_COLUMN_NAME = '__value__' + +class TensorRDFFeaturesSelection(Preprocessor): + """ + Wrapper class for using Random Forest Classifier from XGBoost in features selection as a Ray preprocessor. + XGBRFClassifier trains a random forest of decision trees that is used to determine the features that are most useful in classification. 
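A standalone sketch of the selection idea behind this new preprocessor: fit an XGBoost random forest on k-mer counts and keep only the columns that the trees actually split on, which is what the per-batch xgboost_batch function further below does on real data. The counts and feature names here are toy values for illustration:

import numpy as np
import pandas as pd
from xgboost import XGBRFClassifier

rng = np.random.default_rng(0)
features = [f'kmer_{i}' for i in range(20)]                # hypothetical k-mer names
X = pd.DataFrame(rng.integers(0, 50, size=(100, 20)), columns=features)
y = (X['kmer_3'] > 25).astype(int)                         # single informative feature

forest = XGBRFClassifier(n_estimators=50)
forest.fit(X, y)
relevant = list(forest.get_booster().get_fscore().keys())  # features used by at least one split
print(relevant)                                            # expected to include 'kmer_3'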
+ https://machinelearningmastery.com/feature-importance-and-feature-selection-with-xgboost-in-python/ + """ + + def __init__(self, features: List[str], taxa: str): + # Parameters + self.taxa = taxa + self.features = features + self._nb_features = len(features) + + def _fit(self, ds: Dataset) -> Preprocessor: + def xgboost_batch(arr: np.array): + # Labels data + y = arr[self.taxa] + encoder = LabelEncoder() + y = encoder.fit_transform(y) + # Features data + X = _unwrap_ndarray_object_type_if_needed(arr[TENSOR_COLUMN_NAME]) + X = pd.DataFrame(X, columns = self.features) + # XGBoost tree + tree = XGBRFClassifier() + tree.fit(X,y) + # Used features in the tree + tree = tree.get_booster() + relevant_features = tree.get_fscore() + relevant_features = [feat for feat in relevant_features.keys()] + + return {'features':[relevant_features]} + + cols_keep = [] + + relevant_features = ds.map_batches(xgboost_batch, batch_format = 'numpy') + for row in relevant_features.iter_rows(): + cols_keep.extend(row['features']) + cols_keep = np.unique(cols_keep) + + self.stats_ = {'cols_keep' : cols_keep} + + return self + + def _transform_pandas(self, df: pd.DataFrame) -> pd.DataFrame: + # _validate_df(df, TENSOR_COLUMN_NAME, self._nb_features) + cols_keep = self.stats_['cols_keep'] + + if len(cols_keep) < self._nb_features and len(cols_keep) > 0 : + tensor_col = df[TENSOR_COLUMN_NAME] + tensor_col = _unwrap_ndarray_object_type_if_needed(tensor_col) + tensor_col = pd.DataFrame(tensor_col, columns = self.features) + + tensor_col = tensor_col[cols_keep].to_numpy() + + df[TENSOR_COLUMN_NAME] = pd.Series(list(tensor_col)) + + return df + + def __repr__(self): + return (f"{self.__class__.__name__}(features={self._nb_features!r}, taxa={self.taxa!r}, threshold={self.threshold!r})") + +def _validate_df(df: pd.DataFrame, column: str, nb_features: int) -> None: + if len(df.loc[0, column]) != nb_features: + raise ValueError('Discordant number of features in the tensor column with the one from the dataframe used for fitting') \ No newline at end of file diff --git a/src/models/kerasTF/build_neural_networks.py b/src/models/kerasTF/build_neural_networks.py index 97a8489..81751dc 100644 --- a/src/models/kerasTF/build_neural_networks.py +++ b/src/models/kerasTF/build_neural_networks.py @@ -1,8 +1,11 @@ from keras.models import Model, Sequential +from tensorflow.keras import mixed_precision from tensorflow.keras.losses import BinaryCrossentropy, CategoricalCrossentropy from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Conv1D, Conv2D, MaxPooling1D, MaxPooling2D, Concatenate, Flatten, Attention, Activation, Bidirectional, Reshape + + from models.kerasTF.attentionLayer import AttentionWeightedAverage __author__ = "Nicolas de Montigny" @@ -28,7 +31,7 @@ def build_attention(nb_kmers): x = Dense(1, activation = "tanh")(x) model = Model(inputs = inputs, outputs = x) - model.compile(loss = BinaryCrossentropy(from_logits = False), optimizer = 'adam', metrics = ['accuracy']) + model.compile(loss = BinaryCrossentropy(from_logits = False), optimizer = 'adam', metrics = ['accuracy'], jit_compile = True) return model @@ -48,7 +51,7 @@ def build_LSTM(nb_kmers): x = Dense(1, activation = 'tanh')(x) model = Model(inputs = inputs, outputs = x) - model.compile(loss=BinaryCrossentropy(from_logits = False), optimizer='adam', metrics=['accuracy']) + model.compile(loss=BinaryCrossentropy(from_logits = False), optimizer='adam', metrics=['accuracy'], jit_compile = True) return model @@ -79,7 +82,7 @@ def build_deepLSTM(nb_kmers): outputs 
= Dense(1, activation='sigmoid', name='score')(net) model = Model(inputs=inputs, outputs=outputs) - model.compile(loss=BinaryCrossentropy(from_logits = False), optimizer='adam', metrics=['accuracy']) + model.compile(loss=BinaryCrossentropy(from_logits = False), optimizer='adam', metrics=['accuracy'], jit_compile = True) return model @@ -105,7 +108,7 @@ def build_LSTM_attention(nb_kmers, nb_classes): net = Dense(nb_classes)(net) outputs = Activation('softmax')(net) model = Model(inputs = inputs, outputs = outputs) - model.compile(loss=CategoricalCrossentropy(), optimizer='adam', metrics=['accuracy']) + model.compile(loss=CategoricalCrossentropy(), optimizer='adam', metrics=['accuracy'], jit_compile = True) return model @@ -131,7 +134,7 @@ def build_CNN(nb_kmers, nb_classes): model.add(Dropout(0.5)) model.add(Dense(nb_classes)) model.add(Activation('softmax')) - model.compile(loss=CategoricalCrossentropy(), optimizer='adam', metrics=['accuracy']) + model.compile(loss=CategoricalCrossentropy(), optimizer='adam', metrics=['accuracy'], jit_compile = True) return model @@ -169,6 +172,6 @@ def build_wideCNN(nb_kmers, nb_classes): net = Dense(nb_classes)(net) outputs = Activation('softmax')(net) model = Model(inputs = inputs, outputs = outputs) - model.compile(loss=CategoricalCrossentropy(), optimizer='adam', metrics=['accuracy']) + model.compile(loss=CategoricalCrossentropy(), optimizer='adam', metrics=['accuracy'], jit_compile = True) return model diff --git a/src/models/preprocessors/max_abs_scaler.py b/src/models/preprocessors/max_abs_scaler.py index cce6b8c..7148d4b 100644 --- a/src/models/preprocessors/max_abs_scaler.py +++ b/src/models/preprocessors/max_abs_scaler.py @@ -14,9 +14,9 @@ class TensorMaxAbsScaler(Preprocessor): Custom implementation of Ray's MaxAbsScaler for usage with tensor column in ray.data.dataset.Dataset. """ - def __init__(self, features_list): + def __init__(self, features): # Parameters - self._features_list = features_list + self._features = features # Empty inits self._absmax = None @@ -24,9 +24,9 @@ def _fit(self, dataset:ray.data.dataset.Dataset): """ Fit the MaxAbsScaler to the given dataset. 
""" - self._absmax = np.zeros(len(self._features_list), dtype = np.int32) + self._absmax = np.zeros(len(self._features), dtype = np.int32) for batch in dataset.iter_batches(batch_format = "numpy"): - for i in np.arange(len(self._features_list)): + for i in np.arange(len(self._features)): local_max = max(batch[TENSOR_COLUMN_NAME][:,i]) if local_max > self._absmax[i]: self._absmax[i] = local_max @@ -41,8 +41,8 @@ def _transform_pandas(self, batch: pd.DataFrame): """ df = batch[TENSOR_COLUMN_NAME] df = _unwrap_ndarray_object_type_if_needed(df) - df = pd.DataFrame(df, columns = self._features_list) - for i, col in enumerate(self._features_list): + df = pd.DataFrame(df, columns = self._features) + for i, col in enumerate(self._features): df[col] = df[col].apply(value_transform, args=[self._absmax[i]]) batch[TENSOR_COLUMN_NAME] = TensorArray(np.array(df)) @@ -55,7 +55,7 @@ def _transform_numpy(self, batch: dict): """ df = np.array(batch[TENSOR_COLUMN_NAME], dtype = np.float32) vecfunc = np.vectorize(value_transform) - for i in np.arange(len(self._features_list)): + for i in np.arange(len(self._features)): df[:,i] = vecfunc(df[:,i], self._absmax[i]) batch[TENSOR_COLUMN_NAME] = df @@ -63,7 +63,7 @@ def _transform_numpy(self, batch: dict): return batch def __repr__(self): - return f"{self.__class__.__name__}(columns={self._features_list!r})" + return f"{self.__class__.__name__}(columns={self._features!r})" # Function to map to the data, used by both data representations def value_transform(x, _min, _max): diff --git a/src/models/preprocessors/min_max_scaler.py b/src/models/preprocessors/min_max_scaler.py index ebf560a..1cb6aa0 100644 --- a/src/models/preprocessors/min_max_scaler.py +++ b/src/models/preprocessors/min_max_scaler.py @@ -13,9 +13,9 @@ class TensorMinMaxScaler(Preprocessor): Custom implementation of Ray's MinMax Scaler for usage with tensor column in ray.data.dataset.Dataset. """ - def __init__(self, features_list): + def __init__(self, features): # Parameters - self._features_list = features_list + self._features = features def _fit(self, ds: Dataset) -> Preprocessor: """ @@ -23,7 +23,7 @@ def _fit(self, ds: Dataset) -> Preprocessor: """ min = [] max = [] - nb_features = len(self._features_list) + nb_features = len(self._features) def Min(dct): arr = dct[TENSOR_COLUMN_NAME] diff --git a/src/models/preprocessors/power_transformer.py b/src/models/preprocessors/power_transformer.py index 1cc9c8d..fd5cb2c 100644 --- a/src/models/preprocessors/power_transformer.py +++ b/src/models/preprocessors/power_transformer.py @@ -14,8 +14,8 @@ class TensorPowerTransformer(Preprocessor): """ Custom implementation of Ray's PowerTransformer for usage with tensor column in ray.data.dataset.Dataset. 
""" - def __init__(self, features_list: List[str]): - self._features_list = features_list + def __init__(self, features: List[str]): + self._features = features self.method = "yeo-johnson" self.stats_ = {} @@ -25,7 +25,7 @@ def _fit(self, ds: ray.data.dataset.Dataset): """ nb_samples = ds.count() dct_values = {} - for feature in self._features_list: + for feature in self._features: dct_values[feature] = np.zeros(nb_samples, dtype = np.int32) previous_pos = 0 @@ -33,7 +33,7 @@ def _fit(self, ds: ray.data.dataset.Dataset): for batch in ds.iter_batches(batch_format = 'numpy'): batch = batch[TENSOR_COLUMN_NAME] batch_size = len(batch) - for i, feature in enumerate(self._features_list): + for i, feature in enumerate(self._features): dct_values[feature][previous_pos:(previous_pos+batch_size)] = batch[:,i] previous_pos = previous_pos + batch_size @@ -49,7 +49,7 @@ def _transform_pandas(self, batch: pd.DataFrame): """ Transform the given dataset to pandas dataframe. """ - df = pd.DataFrame(np.vstack(batch[TENSOR_COLUMN_NAME]), columns = self._features_list) + df = pd.DataFrame(np.vstack(batch[TENSOR_COLUMN_NAME]), columns = self._features) for feature, transformer in self.stats_.items(): transformed = df[feature].to_numpy().reshape(-1,1) transformed = transformer.transform(transformed) diff --git a/src/models/preprocessors/tfidf_transformer.py b/src/models/preprocessors/tfidf_transformer.py new file mode 100644 index 0000000..a6032fa --- /dev/null +++ b/src/models/preprocessors/tfidf_transformer.py @@ -0,0 +1,66 @@ + +import numpy as np +import pandas as pd +import scipy.sparse as sp + + +from ray.data.dataset import Dataset +from sklearn.preprocessing import normalize +from ray.data.preprocessor import Preprocessor +from ray.air.util.data_batch_conversion import _unwrap_ndarray_object_type_if_needed + +TENSOR_COLUMN_NAME = '__value__' + +class TensorTfIdfTransformer(Preprocessor): + """ + Custom implementation of TF-IDF transformation inspired by sklearn.feature_extraction.text.TfidfTransformer features scaler to be used as a Ray preprocessor. + https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html#sklearn.feature_extraction.text.TfidfTransformer + TF-IDF transformation is used to scale down the impact of tokens that occur very frequently and scale up the impact of those that occur very rarely. 
+ """ + + def __init__(self, features): + # Parameters + self._features = features + self._nb_features = len(features) + + def _fit(self, ds: Dataset) -> Preprocessor: + nb_samples = ds.count() + + # Nb of occurences + occurences = np.zeros(self._nb_features) + for batch in ds.iter_batches(batch_format = 'numpy'): + batch = batch[TENSOR_COLUMN_NAME] + occurences += np.count_nonzero(batch, axis = 0) + + idf = np.log(nb_samples / occurences) + 1 + + idf_diag = sp.diags( + idf, + offsets=0, + shape=(self._nb_features, self._nb_features), + format="csr", + dtype=np.float64, + ) + + self.stats_ = {'idf_diag' : idf_diag} + + return self + + def _transform_pandas(self, batch: pd.DataFrame) -> pd.DataFrame: + # _validate_df(df, TENSOR_COLUMN_NAME, self._nb_features) + idf_diag = self.stats_['idf_diag'] + + df = batch[TENSOR_COLUMN_NAME] + df = _unwrap_ndarray_object_type_if_needed(df) + + df = df * idf_diag + + df = normalize(df, norm = 'l2', copy = False) + + batch[TENSOR_COLUMN_NAME] = pd.Series(list(df)) + + return batch + +def _validate_df(df: pd.DataFrame, column: str, nb_features: int) -> None: + if len(df.loc[0, column]) != nb_features: + raise ValueError('Discordant number of features in the tensor column with the one from the dataframe used for fitting') \ No newline at end of file diff --git a/src/models/sklearn/models.py b/src/models/sklearn/models.py index 903fe93..bc91fab 100644 --- a/src/models/sklearn/models.py +++ b/src/models/sklearn/models.py @@ -14,6 +14,7 @@ from models.encoders.onesvm_label_encoder import OneClassSVMLabelEncoder # Training +from ray.air.config import ScalingConfig from sklearn.naive_bayes import MultinomialNB from sklearn.linear_model import SGDOneClassSVM, SGDClassifier from models.sklearn.partial_trainer import SklearnPartialTrainer @@ -155,7 +156,7 @@ def _cross_validation(self, datasets, kmers_ds): y_true = list(y_true) y_pred = self._predict_cv(df_test.drop_columns([self.taxa])) - + self._cv_score(y_true, y_pred) def _build(self): @@ -221,11 +222,11 @@ def _fit_model(self, datasets): batch_size=self.batch_size, training_epochs=self._training_epochs, set_estimator_cpus=True, - # scaling_config=ScalingConfig( - # trainer_resources={ - # 'CPU': int(os.cpu_count()*0.6) - # } - # ), + scaling_config=ScalingConfig( + trainer_resources={ + 'CPU': int(os.cpu_count()*0.6) + } + ), run_config=RunConfig( name=self.classifier, local_dir=self._workdir From 58cbe4da4fea3f037db3b8c6b0af565a3e1cfed3 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Thu, 2 Nov 2023 09:47:12 -0400 Subject: [PATCH 18/92] add rdf features selection & svd reduction + reduction in training steps --- src/Caribou_reduce_features.py | 24 +++--- src/data/kmers.py | 40 +--------- src/data/reduction/rdf_features_selection.py | 5 -- src/data/reduction/truncated_svd_reduction.py | 80 +++++++++++++++++++ src/models/kerasTF/models.py | 35 ++++++-- src/models/models_utils.py | 2 + src/models/sklearn/models.py | 29 ++++--- 7 files changed, 143 insertions(+), 72 deletions(-) create mode 100644 src/data/reduction/truncated_svd_reduction.py diff --git a/src/Caribou_reduce_features.py b/src/Caribou_reduce_features.py index 879009f..44c3b8a 100644 --- a/src/Caribou_reduce_features.py +++ b/src/Caribou_reduce_features.py @@ -11,11 +11,12 @@ from glob import glob from pathlib import Path -from models.preprocessors.tfidf_transformer import TensorTfIdfTransformer from data.reduction.low_var_selection import TensorLowVarSelection +from models.preprocessors.tfidf_transformer import TensorTfIdfTransformer from 
data.reduction.chi_features_selection import TensorChiFeaturesSelection from data.reduction.rdf_features_selection import TensorRDFFeaturesSelection +from data.reduction.truncated_svd_reduction import TensorTruncatedSVDReduction from data.reduction.occurence_exclusion import TensorPercentOccurenceExclusion __author__ = "Nicolas de Montigny" @@ -55,12 +56,6 @@ def features_reduction(opt): 3. Chi2 + SelectPercentile() (75% best values) """ - """ - TODO: Add to preprocessing in model training - 1. Replace the MinMaxScaling -> TfidfTransformer to scale down the impact of tokens that occur very frequently (https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html#sklearn.feature_extraction.text.TfidfTransformer) - 2. TruncatedSVD to reduce dimensions and keep 10 000 features ~PCA (https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html#sklearn.decomposition.TruncatedSVD) - """ - # Load data files_lst = glob(os.path.join(data['profile'],'*.parquet')) ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) @@ -68,12 +63,11 @@ def features_reduction(opt): # Time the computation of transformations t_start = time() ds = tfidf_transform(ds, kmers_list) - ds, kmers_list = tree_relevant_features(ds, kmers_list, 'phylum') - print(len(kmers_list)) + ds, kmers_list = tree_relevant_features(ds, kmers_list, opt['taxa']) if len(kmers_list) == 0: ds, kmers_list = occurence_exclusion(ds, opt['kmers_list']) ds, kmers_list = low_var_selection(ds,kmers_list) - ds, data['kmers'] = features_selection(ds, kmers_list, 'phylum') + ds, data['kmers'] = features_selection(ds, kmers_list, opt['taxa']) t_end = time() t_reduction = t_end - t_start # Save reduced dataset @@ -148,6 +142,15 @@ def tree_relevant_features(ds, kmers, taxa): return ds, kmers +# Features decomposition for dimension reduction +def truncated_svd(ds, kmers): + preprocessor = TensorTruncatedSVDReduction( + features = kmers, + nb_components = 10 + ) + ds = preprocessor.fit_transform(ds) + + return ds # Argument parsing from CLI ################################################################################ @@ -159,6 +162,7 @@ def tree_relevant_features(ds, kmers, taxa): parser.add_argument('-dt','--dataset_name', default='dataset', help='Name of the dataset used to name files') parser.add_argument('-l','--kmers_list', default=None, type=Path, help='PATH to a file containing a list of k-mers that will be reduced') # Parameters + parser.add_argument('-t','--taxa', default='phylum', help='The taxonomic level to use for the classification, defaults to Phylum.') parser.add_argument('-o','--outdir', required=True, type=Path, help='PATH to a directory on file where outputs will be saved') parser.add_argument('-wd','--workdir', default='/tmp/spill', type=Path, help='Optional. 
Path to a working directory where tuning data will be spilled') args = parser.parse_args() diff --git a/src/data/kmers.py b/src/data/kmers.py index 13c1fa8..5a0fbe5 100644 --- a/src/data/kmers.py +++ b/src/data/kmers.py @@ -15,11 +15,6 @@ from data.extraction.seen_kmers_vectorizer import SeenKmersVectorizer from data.extraction.given_kmers_vectorizer import GivenKmersVectorizer -# Features selection -from data.reduction.low_var_selection import TensorLowVarSelection -from data.reduction.features_selection import TensorFeaturesSelection -from data.reduction.occurence_exclusion import TensorPercentOccurenceExclusion - __author__ = ['Amine Remita', 'Nicolas de Montigny'] __all__ = ['KmersCollection'] @@ -344,40 +339,7 @@ def _kmers_tokenization(self): self.df = tokenizer.transform(self.df) if self.method == 'seen': self.kmers_list = tokenizer.stats_['tokens(sequence)'] - # self._kmers_reduction() - - def _kmers_reduction(self): - """ - Brute force -> Features statistically related to classes - 1. OccurenceExclusion (10% extremes) - 2. LowVarSelection (variance > 10%) - 3. Chi2 + SelectPercentile() (75% best values) - """ - # Exclusion of columns occuring in less / more than 10% of the columns = 20% removed - excluder = TensorPercentOccurenceExclusion( - features = self.kmers_list, - percent = 0.1 - ) - self.df = excluder.fit_transform(self.df) - self.kmers_list = excluder.stats_['cols_keep'] - - # Exclusion of columns with less than 10% variance - varier = TensorLowVarSelection( - features = self.kmers_list, - threshold = 0.1, - ) - self.df = varier.fit_transform(self.df) - self.kmers_list = varier.stats_['cols_keep'] - - # Chi2 evaluation of dependance between features and classes to keep 75% most significative - selector = TensorFeaturesSelection( - features = self.kmers_list, - taxa = self.taxas[0], - threshold = 0.25 - ) - self.df = selector.fit_transform(self.df) - self.kmers_list = selector.stats_['cols_keep'] - + def _write_dataset(self): self.df.write_parquet(self.Xy_file) rmtree(self._tmp_dir) diff --git a/src/data/reduction/rdf_features_selection.py b/src/data/reduction/rdf_features_selection.py index 4a12e44..7b67d0c 100644 --- a/src/data/reduction/rdf_features_selection.py +++ b/src/data/reduction/rdf_features_selection.py @@ -1,16 +1,11 @@ -import os -import logging - import numpy as np import pandas as pd from typing import List -from warnings import warn from ray.data import Dataset from xgboost import XGBRFClassifier -from ray.air.config import ScalingConfig from sklearn.preprocessing import LabelEncoder diff --git a/src/data/reduction/truncated_svd_reduction.py b/src/data/reduction/truncated_svd_reduction.py new file mode 100644 index 0000000..9e64773 --- /dev/null +++ b/src/data/reduction/truncated_svd_reduction.py @@ -0,0 +1,80 @@ +import numpy as np +import pandas as pd + +from typing import List +from warnings import warn +from ray.data import Dataset + +from sklearn.utils.extmath import randomized_svd + +from ray.data.preprocessor import Preprocessor +from ray.air.util.data_batch_conversion import _unwrap_ndarray_object_type_if_needed + +TENSOR_COLUMN_NAME = '__value__' + +class TensorTruncatedSVDReduction(Preprocessor): + """ + Custom class for using a mix of TruncatedSVD inspired by sklearn.decomposition.TruncatedSVD and applying a batched strategy inspired by sklearn.decomposition.IncrementalPCA to process batches in parallel. + This makes it possible to use the class as a Ray preprocessor in a features reduction strategy. 
+ TruncatedSVD performs linear dimensionality reduction by means of truncated singular value decomposition (SVD). + When it is applied following the TF-IDF normalisation, it becomes a latent semantic analysis (LSA). + https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html#sklearn.decomposition.TruncatedSVD + """ + + def __init__(self, features: List[str], nb_components: int = 10000): + # Parameters + self.features = features + self._nb_features = len(features) + self._nb_components = nb_components + + + def _fit(self, ds: Dataset) -> Preprocessor: + def svd_batch(arr: np.array): + df = arr['__value__'] + df = _unwrap_ndarray_object_type_if_needed(df) + U, Sigma, VT = randomized_svd( + df, + n_components = self._nb_components, + n_iter = 5, + n_oversamples = 10, + power_iteration_normalizer = 'LU', + random_state = None + ) + + return {'VT': [VT]} + + if self._nb_features > self._nb_components: + # Exec svd + components = [] + svd_vt = ds.map_batches(svd_batch, batch_format = 'numpy') + + for row in svd_vt.iter_rows(): + components.append(row['VT']) + + components = np.mean(components, axis = 0) + + self.stats_ = {'components' : components} + else: + warn('No features reduction to do because the number of features is already lower than the required number of components') + self.stats_ = {'components' : False} + + return self + + def _transform_pandas(self, df: pd.DataFrame) -> pd.DataFrame: + # _validate_df(df, TENSOR_COLUMN_NAME, self._nb_features) + components = self.stats_['components'] + + if components is not False: + tensor_col = df[TENSOR_COLUMN_NAME] + tensor_col = _unwrap_ndarray_object_type_if_needed(tensor_col) + tensor_col = np.dot(tensor_col, components.T) + df[TENSOR_COLUMN_NAME] = pd.Series(list(tensor_col)) + + return df + + def __repr__(self): + return (f"{self.__class__.__name__}(features={self._nb_features!r}, taxa={self.taxa!r}, threshold={self.threshold!r})") + +def _validate_df(df: pd.DataFrame, column: str, nb_features: int) -> None: + if len(df.loc[0, column]) != nb_features: + raise ValueError('Discordant number of features in the tensor column with the one from the dataframe used for fitting') diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index d1e8443..79c763d 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -8,6 +8,11 @@ from glob import glob from shutil import rmtree +# Dimensions reduction +from models.preprocessors.tfidf_transformer import TensorTfIdfTransformer +from data.reduction.rdf_features_selection import TensorRDFFeaturesSelection +from data.reduction.truncated_svd_reduction import TensorTruncatedSVDReduction + # Preprocessing from ray.data.preprocessors import LabelEncoder, Chain from models.preprocessors.min_max_scaler import TensorMinMaxScaler @@ -144,19 +149,31 @@ def preprocess(self, df): labels.append(row[self.taxa]) self._nb_classes = len(np.unique(labels)) if self._nb_classes == 2: + self._encoder = ModelLabelEncoder(self.taxa) self._preprocessor = Chain( - TensorMinMaxScaler(self.kmers), - ModelLabelEncoder(self.taxa), + TensorTfIdfTransformer(self.kmers), + TensorRDFFeaturesSelection(self.kmers, self.taxa), ) else: - self._preprocessor = Chain( - TensorMinMaxScaler(self.kmers), + self._encoder = Chain( LabelEncoder(self.taxa), - OneHotTensorEncoder(self.taxa), + OneHotTensorEncoder(self.taxa) + ) + self._preprocessor = Chain( + TensorTfIdfTransformer(self.kmers), + TensorRDFFeaturesSelection(self.kmers, self.taxa), ) - self._preprocessor.fit(df) + + + 
self._encoder.fit(df) + df = self._preprocessor.fit_transform(df) + self._reductor = TensorTruncatedSVDReduction(self.kmers) + self._reductor.fit(df) # Labels mapping - labels = list(self._preprocessor.preprocessors[1].stats_[f'unique_values({self.taxa})'].keys()) + if self._nb_classes == 2: + labels = list(self._encoder.stats_[f'unique_values({self.taxa})'].keys()) + else: + labels = list(self._encoder.preprocessors[0].stats_[f'unique_values({self.taxa})'].keys()) encoded = np.arange(len(labels)) labels = np.append(labels, 'unknown') encoded = np.append(encoded, -1) @@ -196,7 +213,9 @@ def _fit_model(self, datasets): # Preprocessing loop for name, ds in datasets.items(): ds = ds.drop_columns(['id']) + ds = self._encoder.transform(ds) ds = self._preprocessor.transform(ds) + ds = self._reductor.transform(ds) datasets[name] = ds # Training parameters @@ -239,7 +258,7 @@ def predict(self, df, threshold=0.8): df = df.drop_columns(col_2_drop) # Preprocess - df = self._preprocessor.preprocessors[0].transform(df) + df = self._preprocessor.transform(df) self._predictor = BatchPredictor.from_checkpoint( self._model_ckpt, diff --git a/src/models/models_utils.py b/src/models/models_utils.py index 2f38ff8..c38ca25 100644 --- a/src/models/models_utils.py +++ b/src/models/models_utils.py @@ -85,7 +85,9 @@ def __init__( self._predict_ids = [] # Initialize Ray variables self._clf = None + self._encoder = None self._preprocessor = None + self._reductor = None self._model_ckpt = None self._trainer = None self._train_params = {} diff --git a/src/models/sklearn/models.py b/src/models/sklearn/models.py index bc91fab..2557952 100644 --- a/src/models/sklearn/models.py +++ b/src/models/sklearn/models.py @@ -7,8 +7,13 @@ from glob import glob from shutil import rmtree +# Dimensions reduction +from models.preprocessors.tfidf_transformer import TensorTfIdfTransformer +from data.reduction.rdf_features_selection import TensorRDFFeaturesSelection +from data.reduction.truncated_svd_reduction import TensorTruncatedSVDReduction + # Preprocessing -from ray.data.preprocessors import Chain, BatchMapper +from ray.data.preprocessors import Chain from models.encoders.model_label_encoder import ModelLabelEncoder from models.preprocessors.min_max_scaler import TensorMinMaxScaler from models.encoders.onesvm_label_encoder import OneClassSVMLabelEncoder @@ -97,7 +102,6 @@ def __init__( ) # Parameters self._encoded = [] - self._encoder = None # Computes self._build() @@ -111,13 +115,18 @@ def preprocess(self, df): self._encoder = ModelLabelEncoder(self.taxa) self._preprocessor = Chain( - TensorMinMaxScaler(self.kmers), - self._encoder, + TensorTfIdfTransformer(self.kmers), + TensorRDFFeaturesSelection(self.kmers, self.taxa), ) - self._preprocessor.fit(df) + self._encoder.fit(df) + df = self._preprocessor.fit_transform(df) + self.kmers = self._preprocessor.preprocessors[1].stats_['cols_keep'] + self._reductor = TensorTruncatedSVDReduction(self.kmers) + self._reductor.fit(df) + # Labels mapping if self.classifier != 'onesvm': - labels = list(self._preprocessor.preprocessors[1].stats_[f'unique_values({self.taxa})'].keys()) + labels = list(self._encoder.stats_[f'unique_values({self.taxa})'].keys()) self._encoded = np.arange(len(labels)) labels = np.append(labels, 'unknown') self._encoded = np.append(self._encoded, -1) @@ -146,8 +155,6 @@ def _cross_validation(self, datasets, kmers_ds): self._fit_model(datasets) - df_test = self._preprocessor.preprocessors[0].transform(df_test) - y_true = [] for row in df_test.iter_rows(): 
y_true.append(row[self.taxa]) @@ -202,9 +209,10 @@ def _fit_model(self, datasets): print('_fit_model') for name, ds in datasets.items(): ds = ds.drop_columns(['id']) + ds = self._encoder.transform(ds) ds = self._preprocessor.transform(ds) + ds = self._reductor.transform(ds) datasets[name] = ray.put(ds) - try: training_labels = self._encoded.copy() training_labels = np.delete( @@ -252,7 +260,8 @@ def _predict_cv(self, df): def predict(self, df, threshold = 0.8): print('predict') if df.count() > 0: - df = self._preprocessor.preprocessors[0].transform(df) + df = self._preprocessor.transform(df) + df = self._reductor.transform(df) if self.classifier == 'onesvm': predict_kwargs = {'features':self.kmers, 'num_estimator_cpus':-1} self._predictor = BatchPredictor.from_checkpoint(self._models_collection['domain'], SklearnTensorPredictor) From a9629932e4361d5da7eb158b8d2b5c2449144578 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Sat, 4 Nov 2023 10:19:45 -0400 Subject: [PATCH 19/92] features reduction 2.0 + tf-rdf scaling --- src/Caribou_reduce_features.py | 109 ++++++++----------- src/data/reduction/chi_features_selection.py | 5 +- src/data/reduction/low_var_selection.py | 7 +- src/data/reduction/occurence_exclusion.py | 5 +- src/data/reduction/rdf_features_selection.py | 7 +- src/models/sklearn/models.py | 1 + src/models/sklearn/partial_trainer.py | 45 ++++---- 7 files changed, 90 insertions(+), 89 deletions(-) diff --git a/src/Caribou_reduce_features.py b/src/Caribou_reduce_features.py index 44c3b8a..0157423 100644 --- a/src/Caribou_reduce_features.py +++ b/src/Caribou_reduce_features.py @@ -38,7 +38,7 @@ def features_reduction(opt): verify_file(opt['kmers_list']) # Verification of k length - k_length, kmers_list = verify_kmers_list_length(k_length, opt['kmers_list']) + k_length, kmers = verify_kmers_list_length(k_length, opt['kmers_list']) outdirs = define_create_outdirs(opt['outdir']) @@ -48,31 +48,38 @@ def features_reduction(opt): # Features reduction ################################################################################ """ - First option : Select features relevant to classification by Random Forest of decision trees - - Brute force -> Features statistically related to classes - 1. OccurenceExclusion (10% extremes) - 2. LowVarSelection (variance > 10%) - 3. Chi2 + SelectPercentile() (75% best values) + Two-step features reduction : + 0. Features scaling + 1. TF-IDF scaling (diminish impact of more present and augment impact of less present) + 1. Brute force features exclusion + 1. OccurenceExclusion (exclusion of features present in more than 95% of samples) + 2. LowVarSelection (exclusion of features with less than 5% variance) + 2. Statistical features selection + 1. Chi2 + SelectPercentile() (select 25% of features with highest Chi2 values) + 3. In training features selection + 1. RandomForestClassification (select features identified as useful for classification) + 2. 
TruncatedSVD decomposition (map the features to 10 000 decomposed features if there is still more) """ # Load data files_lst = glob(os.path.join(data['profile'],'*.parquet')) - ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - # ds = ray.data.read_parquet(data['profile'], parallelism = -1) + export_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + train_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) # Time the computation of transformations t_start = time() - ds = tfidf_transform(ds, kmers_list) - ds, kmers_list = tree_relevant_features(ds, kmers_list, opt['taxa']) - if len(kmers_list) == 0: - ds, kmers_list = occurence_exclusion(ds, opt['kmers_list']) - ds, kmers_list = low_var_selection(ds,kmers_list) - ds, data['kmers'] = features_selection(ds, kmers_list, opt['taxa']) + # Features scaling + train_ds = tfidf_transform(train_ds, kmers) + # Brute force features exclusion + train_ds, export_ds, kmers = occurence_exclusion(train_ds, export_ds, kmers) + train_ds, export_ds, kmers = low_var_selection(train_ds, export_ds, kmers) + # Statistical features selection + train_ds, export_ds, data['kmers'] = features_selection(train_ds, export_ds, kmers, opt['taxa']) + # Time the computation of transformations t_end = time() t_reduction = t_end - t_start # Save reduced dataset data['profile'] = f"{data['profile']}_reduced" - ds.write_parquet(data['profile']) + export_ds.write_parquet(data['profile']) # Save reduced K-mers with open(os.path.join(outdirs["data_dir"],'kmers_list_reduced.txt'),'w') as handle: handle.writelines("%s\n" % item for item in data['kmers']) @@ -83,74 +90,54 @@ def features_reduction(opt): print(f"Caribou finished reducing k-mers features of {opt['dataset_name']} in {t_reduction} seconds.") -# Exclusion of columns occuring in less / more than 10% of the columns = 20% removed -def occurence_exclusion(ds, kmers): +# TF-IDF scaling of the features +def tfidf_transform(ds, kmers): + preprocessor = TensorTfIdfTransformer( + features = kmers + ) + ds = preprocessor.fit_transform(ds) + + return ds + +# Exclusion of columns occuring in more than 95% of the samples +def occurence_exclusion(train_ds, export_ds, kmers): preprocessor = TensorPercentOccurenceExclusion( features = kmers, - percent = 0.1 # remove features present in less than 10% samples + percent = 0.5 ) - ds = preprocessor.fit_transform(ds) + train_ds = preprocessor.fit_transform(train_ds) + export_ds = preprocessor.transform(export_ds) kmers = preprocessor.stats_['cols_keep'] - return ds, kmers + return train_ds, export_ds, kmers -# Exclusion of columns with less than 10% variance -def low_var_selection(ds, kmers): +# Exclusion of columns with less than 5% variance +def low_var_selection(train_ds, export_ds, kmers): preprocessor = TensorLowVarSelection( features = kmers, - threshold = 0.1, # remove features with less than 5% variance + threshold = 0.05, ) - ds = preprocessor.fit_transform(ds) + train_ds = preprocessor.fit_transform(train_ds) + export_ds = preprocessor.transform(export_ds) kmers = preprocessor.stats_['cols_keep'] - return ds, kmers + return train_ds, export_ds, kmers # Chi2 evaluation of dependance between features and classes -def features_selection(ds, kmers, taxa): +def features_selection(train_ds, export_ds, kmers, taxa): preprocessor = TensorChiFeaturesSelection( features = kmers, taxa = taxa, threshold = 0.75, # Keep 25% higest results ) - ds = preprocessor.fit_transform(ds) - kmers = preprocessor.stats_['cols_keep'] - 
print(len(kmers)) - - return ds, kmers - -# TF-IDF scaling of the features -def tfidf_transform(ds, kmers): - preprocessor = TensorTfIdfTransformer( - features = kmers - ) - ds = preprocessor.fit_transform(ds) - - return ds - -# Decision tree feature selection to keep only those identified as relevant to classification -def tree_relevant_features(ds, kmers, taxa): - preprocessor = TensorRDFFeaturesSelection( - features = kmers, - taxa = taxa - ) - preprocessor.fit_transform(ds) - + train_ds = preprocessor.fit_transform(train_ds) + export_ds = preprocessor.transform(export_ds) kmers = preprocessor.stats_['cols_keep'] - return ds, kmers - -# Features decomposition for dimension reduction -def truncated_svd(ds, kmers): - preprocessor = TensorTruncatedSVDReduction( - features = kmers, - nb_components = 10 - ) - ds = preprocessor.fit_transform(ds) - - return ds + return train_ds, export_ds, kmers # Argument parsing from CLI ################################################################################ diff --git a/src/data/reduction/chi_features_selection.py b/src/data/reduction/chi_features_selection.py index 95fd013..9f51310 100644 --- a/src/data/reduction/chi_features_selection.py +++ b/src/data/reduction/chi_features_selection.py @@ -56,7 +56,10 @@ def stats(batch): # Keep features with values higher than the threshold cols_keep = [self.features[i] for i, chi in enumerate(mean_chi) if chi > self.threshold] - self.stats_ = {'cols_keep' : cols_keep} + if 0 < len(cols_keep) : + self.stats_ = {'cols_keep' : cols_keep} + else: + self.stats_ = {'cols_keep' : self.features} return self diff --git a/src/data/reduction/low_var_selection.py b/src/data/reduction/low_var_selection.py index 0212c8c..912e65a 100644 --- a/src/data/reduction/low_var_selection.py +++ b/src/data/reduction/low_var_selection.py @@ -18,7 +18,7 @@ class TensorLowVarSelection(Preprocessor): def __init__( self, features : List[str], - threshold: float = 0.1, + threshold: float = 0.05, ): self.features = features self.threshold = threshold @@ -66,7 +66,10 @@ def get_sqr_dev(batch): # Keep features with values higher than the threshold cols_keep = [self.features[i] for i, var in enumerate(var_arr) if var > self.threshold] - self.stats_ = {'cols_keep' : cols_keep} + if 0 < len(cols_keep) : + self.stats_ = {'cols_keep' : cols_keep} + else: + self.stats_ = {'cols_keep' : self.features} return self diff --git a/src/data/reduction/occurence_exclusion.py b/src/data/reduction/occurence_exclusion.py index 17da804..fe9b45d 100644 --- a/src/data/reduction/occurence_exclusion.py +++ b/src/data/reduction/occurence_exclusion.py @@ -86,7 +86,10 @@ def count_occurences(batch): # Construct list of features to keep by position cols_keep = [self.features[i] for i, occurence in enumerate(occurences) if occurence < high_treshold] - self.stats_ = {'cols_keep' : cols_keep} + if 0 < len(cols_keep) : + self.stats_ = {'cols_keep' : cols_keep} + else: + self.stats_ = {'cols_keep' : self.features} return self diff --git a/src/data/reduction/rdf_features_selection.py b/src/data/reduction/rdf_features_selection.py index 7b67d0c..c2ad667 100644 --- a/src/data/reduction/rdf_features_selection.py +++ b/src/data/reduction/rdf_features_selection.py @@ -53,7 +53,10 @@ def xgboost_batch(arr: np.array): cols_keep.extend(row['features']) cols_keep = np.unique(cols_keep) - self.stats_ = {'cols_keep' : cols_keep} + if 0 < len(cols_keep) : + self.stats_ = {'cols_keep' : cols_keep} + else: + self.stats_ = {'cols_keep' : self.features} return self @@ -61,7 +64,7 @@ def 
_transform_pandas(self, df: pd.DataFrame) -> pd.DataFrame: # _validate_df(df, TENSOR_COLUMN_NAME, self._nb_features) cols_keep = self.stats_['cols_keep'] - if len(cols_keep) < self._nb_features and len(cols_keep) > 0 : + if len(cols_keep) < self._nb_features: tensor_col = df[TENSOR_COLUMN_NAME] tensor_col = _unwrap_ndarray_object_type_if_needed(tensor_col) tensor_col = pd.DataFrame(tensor_col, columns = self.features) diff --git a/src/models/sklearn/models.py b/src/models/sklearn/models.py index 2557952..fa8139e 100644 --- a/src/models/sklearn/models.py +++ b/src/models/sklearn/models.py @@ -188,6 +188,7 @@ def _build(self): 'eta0' : 0.001, 'n_jobs' : -1 } +# TODO: Test performances for classifiers, if need more accuracy -> sklearn.multiclass.OneVsRestClassifier for multiple binary problems elif self.classifier == 'sgd': print('Training multiclass SGD classifier') self._clf = SGDClassifier() diff --git a/src/models/sklearn/partial_trainer.py b/src/models/sklearn/partial_trainer.py index e9a51c6..f08dd7c 100644 --- a/src/models/sklearn/partial_trainer.py +++ b/src/models/sklearn/partial_trainer.py @@ -236,28 +236,29 @@ def training_loop(self): self.estimator.partial_fit(batch_X, batch_y, **self.fit_params) fit_time = time() - start_time - if len(self._labels) > 2: - with parallel_backend("ray", n_jobs=num_cpus): - X_calib_df = np.empty((X_calib.count(), len(self._features_list))) - for ind, batch in enumerate(X_calib.iter_batches( - batch_size = 1, - batch_format = 'numpy' - )): - X_calib_df[ind] = batch['__value__'] - - """ - X_calib = pd.DataFrame(X_calib_df, columns = self._features_list) - """ - y_calib = y_calib.to_pandas().to_numpy() - self.estimator = CalibratedClassifierCV( - estimator = self.estimator, - method = 'sigmoid', - cv = 'prefit', - ) - self.estimator.fit( - X_calib_df, - y_calib, - ) + # Calibrated classifier was meant to give the predict_proba method but all used models implement it and learning should be faster without it + # if len(self._labels) > 2: + # with parallel_backend("ray", n_jobs=num_cpus): + # X_calib_df = np.empty((X_calib.count(), len(self._features_list))) + # for ind, batch in enumerate(X_calib.iter_batches( + # batch_size = 1, + # batch_format = 'numpy' + # )): + # X_calib_df[ind] = batch['__value__'] + + # """ + # X_calib = pd.DataFrame(X_calib_df, columns = self._features_list) + # """ + # y_calib = y_calib.to_pandas().to_numpy() + # self.estimator = CalibratedClassifierCV( + # estimator = self.estimator, + # method = 'sigmoid', + # cv = 'prefit', + # ) + # self.estimator.fit( + # X_calib_df, + # y_calib, + # ) with tune.checkpoint_dir(step=1) as checkpoint_dir: with open(os.path.join(checkpoint_dir, MODEL_KEY), "wb") as f: From 940af1c10db17ab1a9ccac3066bf535aed6c3f9c Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Sat, 4 Nov 2023 22:41:36 -0400 Subject: [PATCH 20/92] script for sim --- setup.cfg | 1 + src/Caribou_classification.py | 40 +-- src/Caribou_classification_train_cv.py | 31 ++- src/Caribou_extraction.py | 27 +- src/Caribou_extraction_train_cv.py | 26 +- src/Caribou_pipeline.py | 2 +- src/Caribou_reduce_features.py | 6 +- src/Caribou_simulate_test_val.py | 90 +++++++ src/models/classification.py | 360 ++++++------------------- src/models/classification_old.py | 335 +++++++++++++++++++++++ src/models/kerasTF/models.py | 1 - src/supplement/sklearn_tuning.py | 39 +-- src/utils.py | 104 ++++++- 13 files changed, 715 insertions(+), 347 deletions(-) create mode 100644 src/Caribou_simulate_test_val.py create mode 100644 
src/models/classification_old.py diff --git a/setup.cfg b/setup.cfg index 6e6c672..fce114a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -34,6 +34,7 @@ scripts = src/Caribou_pipeline.py src/Caribou_kmers.py src/Caribou_reduce_features.py + src/Caribou_simulate_test_val.py src/Caribou_extraction.py src/Caribou_classification.py src/Caribou_extraction_train_cv.py diff --git a/src/Caribou_classification.py b/src/Caribou_classification.py index d421389..9c29172 100644 --- a/src/Caribou_classification.py +++ b/src/Caribou_classification.py @@ -6,7 +6,7 @@ from utils import * from time import time from pathlib import Path -from models.classification import ClassificationMethods +from models.classification_old import ClassificationMethods __author__ = "Nicolas de Montigny" @@ -15,17 +15,8 @@ # Initialisation / validation of parameters from CLI ################################################################################ def bacteria_classification(opt): - # Verify existence of files and load data - data_bacteria = verify_load_data(opt['data_bacteria']) - data_metagenome = verify_load_data(opt['data_metagenome']) - k_length = len(data_bacteria['kmers'][0]) - if opt['preclassified_data'] is not None: - preclassified_data = verify_load_preclassified(opt['preclassified_data']) - else: - preclassified_data = None - - # Verify that model type is valid / choose default depending on host presence + # Verify that model type is valid / choose default if opt['model_type'] is None: opt['model_type'] = 'cnn' @@ -35,22 +26,37 @@ def bacteria_classification(opt): outdirs = define_create_outdirs(opt['outdir']) + # Initialize cluster + init_ray_cluster(opt['workdir']) + +# Data loading +################################################################################ + + db_data, db_ds = verify_load_db(opt['data_bacteria']) + data_metagenome = verify_load_data(opt['data_metagenome']) + + k_length = len(db_data['kmers'][0]) + + if opt['preclassified_data'] is not None: + preclassified_data = verify_load_preclassified(opt['preclassified_data']) + else: + preclassified_data = None + # Validate and extract list of taxas if opt['taxa'] is not None: - lst_taxas = verify_taxas(opt['taxa'], data_bacteria['taxas']) + lst_taxas = verify_taxas(opt['taxa'], db_data['taxas']) else: - lst_taxas = data_bacteria['taxas'].copy() + lst_taxas = db_data['taxas'].copy() if 'domain' in lst_taxas: lst_taxas.remove('domain') - # Initialize cluster - init_ray_cluster(opt['workdir']) + val_ds = split_sim_dataset(db_ds, db_data, 'validation') # Definition of model for bacteria taxonomic classification + training ################################################################################ clf = ClassificationMethods( - database_k_mers = data_bacteria, + database_k_mers = db_data, k = k_length, outdirs = outdirs, database = opt['database_name'], @@ -66,7 +72,7 @@ def bacteria_classification(opt): ################################################################################ t_start = time() - end_taxa = clf.execute_training_prediction(data_metagenome) + end_taxa = clf.fit_predict(data_metagenome) t_end = time() t_classif = t_end - t_start clf_data = merge_save_data( diff --git a/src/Caribou_classification_train_cv.py b/src/Caribou_classification_train_cv.py index 856c7fe..d7d25a5 100644 --- a/src/Caribou_classification_train_cv.py +++ b/src/Caribou_classification_train_cv.py @@ -7,7 +7,7 @@ from time import time from pathlib import Path from logging import ERROR -from models.classification import ClassificationMethods +from 
models.classification_old import ClassificationMethods warnings.filterwarnings('ignore') @@ -18,11 +18,8 @@ # Initialisation / validation of parameters from CLI ################################################################################ def bacteria_classification_train_cv(opt): - # Verify existence of files and load data - data_bacteria = verify_load_data(opt['data_bacteria']) - k_length = len(data_bacteria['kmers'][0]) - # Verify that model type is valid / choose default depending on host presence + # Verify that model type is valid / choose default if opt['model_type'] is None: opt['model_type'] = 'cnn' @@ -32,24 +29,34 @@ def bacteria_classification_train_cv(opt): outdirs = define_create_outdirs(opt['outdir']) + # Initialize cluster + init_ray_cluster(opt['workdir']) + +# Data loading +################################################################################ + + db_data, db_ds = verify_load_db(opt['data_bacteria']) + + k_length = len(db_data['kmers'][0]) + # Validate and extract list of taxas if opt['taxa'] is not None: - lst_taxas = verify_taxas(opt['taxa'], data_bacteria['taxas']) + lst_taxas = verify_taxas(opt['taxa'], db_data['taxas']) else: - lst_taxas = data_bacteria['taxas'].copy() + lst_taxas = db_data['taxas'].copy() if 'domain' in lst_taxas: lst_taxas.remove('domain') - - # Initialize cluster - init_ray_cluster(opt['workdir']) + + test_ds = split_sim_dataset(db_ds, db_data, 'test') + val_ds = split_sim_dataset(db_ds, db_data, 'validation') # Training and cross-validation of models for classification of bacterias ################################################################################ t_start = time() ClassificationMethods( - database_k_mers = data_bacteria, + database_k_mers = db_data, k = k_length, outdirs = outdirs, database = opt['database_name'], @@ -60,7 +67,7 @@ def bacteria_classification_train_cv(opt): training_epochs = opt['training_epochs'], verbose = opt['verbose'], cv = True - ).execute_training() + ).fit() t_end = time() t_classify = t_end - t_start print( diff --git a/src/Caribou_extraction.py b/src/Caribou_extraction.py index c0dbaa0..d3ea11f 100644 --- a/src/Caribou_extraction.py +++ b/src/Caribou_extraction.py @@ -5,7 +5,7 @@ from utils import * from time import time from pathlib import Path -from models.classification import ClassificationMethods +from models.classification_old import ClassificationMethods __author__ = "Nicolas de Montigny" @@ -14,14 +14,6 @@ # Initialisation / validation of parameters from CLI ################################################################################ def bacteria_extraction(opt): - # Verify existence of files and load data - data_bacteria = verify_load_data(opt['data_bacteria']) - if opt['data_host'] is not None: - data_host = verify_load_data(opt['data_host']) - verify_concordance_klength(len(data_bacteria['kmers'][0]), len(data_host['kmers'][0])) - data_metagenome = verify_load_data(opt['data_metagenome']) - - k_length = len(data_bacteria['kmers'][0]) # Verify that model type is valid / choose default depending on host presence if opt['host_name'] is None: @@ -38,11 +30,24 @@ def bacteria_extraction(opt): # Initialize cluster init_ray_cluster(opt['workdir']) +# Data loading +################################################################################ + + if opt['data_host'] is not None: + db_data, db_ds = verify_load_host_merge(opt['data_bacteria'], opt['data_host']) + else: + db_data, db_ds = verify_load_db(opt['data_bacteria']) + data_metagenome = verify_load_data(opt['data_metagenome']) + 
+ k_length = len(db_data['kmers'][0]) + + val_ds = split_sim_dataset(db_ds, db_data, 'validation') + # Definition of model for bacteria extraction / host removal + execution ################################################################################ if opt['host_name'] is None: clf = ClassificationMethods( - database_k_mers = data_bacteria, + database_k_mers = (db_data, db_ds), k = k_length, outdirs = outdirs, database = opt['database_name'], @@ -55,7 +60,7 @@ def bacteria_extraction(opt): ) else: clf = ClassificationMethods( - database_k_mers = (data_bacteria, data_host), + database_k_mers = (db_data, db_ds), k = k_length, outdirs = outdirs, database = opt['database_name'], diff --git a/src/Caribou_extraction_train_cv.py b/src/Caribou_extraction_train_cv.py index aafa701..fccb0c5 100644 --- a/src/Caribou_extraction_train_cv.py +++ b/src/Caribou_extraction_train_cv.py @@ -5,7 +5,7 @@ from utils import * from time import time from pathlib import Path -from models.classification import ClassificationMethods +from models.classification_old import ClassificationMethods __author__ = "Nicolas de Montigny" @@ -14,13 +14,6 @@ # Initialisation / validation of parameters from CLI ################################################################################ def bacteria_extraction_train_cv(opt): - # Verify existence of files and load data - data_bacteria = verify_load_data(opt['data_bacteria']) - if opt['data_host'] is not None: - data_host = verify_load_data(opt['data_host']) - verify_concordance_klength(len(data_bacteria['kmers'][0]), len(data_host['kmers'][0])) - - k_length = len(data_bacteria['kmers'][0]) # Validate training parameters verify_positive_int(opt['batch_size'], 'batch_size') @@ -31,6 +24,19 @@ def bacteria_extraction_train_cv(opt): # Initialize cluster init_ray_cluster(opt['workdir']) +# Data loading +################################################################################ + + if opt['data_host'] is not None: + db_data, db_ds = verify_load_host_merge(opt['data_bacteria'], opt['data_host']) + else: + db_data, db_ds = verify_load_db(opt['data_bacteria']) + + k_length = len(db_data['kmers'][0]) + + test_ds = split_sim_dataset(db_ds, db_data, 'test') + val_ds = split_sim_dataset(db_ds, db_data, 'validation') + # Training and cross-validation of models for bacteria extraction / host removal ################################################################################ @@ -38,7 +44,7 @@ def bacteria_extraction_train_cv(opt): if opt['host_name'] is None: ClassificationMethods( - database_k_mers = data_bacteria, + database_k_mers = (db_data, db_ds), k = k_length, outdirs = outdirs, database = opt['database_name'], @@ -51,7 +57,7 @@ def bacteria_extraction_train_cv(opt): ).execute_training() else: ClassificationMethods( - database_k_mers = (data_bacteria, data_host), + database_k_mers = (db_data, db_ds), k = k_length, outdirs = outdirs, database = opt['database_name'], diff --git a/src/Caribou_pipeline.py b/src/Caribou_pipeline.py index b81800b..f6b1fe5 100644 --- a/src/Caribou_pipeline.py +++ b/src/Caribou_pipeline.py @@ -9,7 +9,7 @@ from pathlib import Path from outputs.out import Outputs from data.build_data import build_load_save_data -from models.classification import ClassificationMethods +from models.classification_old import ClassificationMethods __author__ = 'Nicolas de Montigny' diff --git a/src/Caribou_reduce_features.py b/src/Caribou_reduce_features.py index 0157423..412b8bc 100644 --- a/src/Caribou_reduce_features.py +++ b/src/Caribou_reduce_features.py @@ -15,8 
+15,6 @@ from data.reduction.low_var_selection import TensorLowVarSelection from models.preprocessors.tfidf_transformer import TensorTfIdfTransformer from data.reduction.chi_features_selection import TensorChiFeaturesSelection -from data.reduction.rdf_features_selection import TensorRDFFeaturesSelection -from data.reduction.truncated_svd_reduction import TensorTruncatedSVDReduction from data.reduction.occurence_exclusion import TensorPercentOccurenceExclusion __author__ = "Nicolas de Montigny" @@ -25,7 +23,6 @@ """ This script computes features reduction to a given K-mers dataset and then applies it. -The method is based on the KRFE algorithm (Lebatteux et al., 2019) """ # Initialisation / validation of parameters from CLI @@ -126,11 +123,12 @@ def low_var_selection(train_ds, export_ds, kmers): return train_ds, export_ds, kmers # Chi2 evaluation of dependance between features and classes +# Select 25% of features with highest Chi2 values def features_selection(train_ds, export_ds, kmers, taxa): preprocessor = TensorChiFeaturesSelection( features = kmers, taxa = taxa, - threshold = 0.75, # Keep 25% higest results + threshold = 0.75, ) train_ds = preprocessor.fit_transform(train_ds) diff --git a/src/Caribou_simulate_test_val.py b/src/Caribou_simulate_test_val.py new file mode 100644 index 0000000..a1ee878 --- /dev/null +++ b/src/Caribou_simulate_test_val.py @@ -0,0 +1,90 @@ +#!/usr/bin python3 + +import argparse + +from utils import * +from time import time +from pathlib import Path + +__author__ = "Nicolas de Montigny" + +__all__ = ['simulation'] + +""" +This script simulate sequencing reads for validation and/or testing dataset(s) from a whole genome dataset +The dataset should be in the form of a k-mers counts matrix and could have the k-mers reduced as well +The script leverages the InSilicoSeq package for simulation of sequencing reads +""" + +# Initialisation / validation of parameters from CLI +################################################################################ +def simulation(opt): + """ + 1. Verify existence of files and load data + 2. Verify k-mers length concordance + 3. Initialize cluster + """ + if opt['hostset'] is not None: + db_data, db_ds = verify_load_host_merge(opt['dataset'], opt['hostset']) + else: + db_data, db_ds = verify_load_db(opt['dataset']) + + verify_file(opt['kmers_list']) + + outdirs = define_create_outdirs(opt['outdir']) + + init_ray_cluster(opt['workdir']) + +# Dataset(s) simulation +################################################################################ + """ + 1. Verify the datasets to simulate + 2. Split the database dataset (possibly merged) into required dataset + 3. 
Run the simulation for each dataset required + """ + t_test = None + t_val = None + if opt['test']: + t_s = time() + test_ds = split_dataset(db_ds, db_data, 'test') + if test_ds is not None: + sim_dataset(test_ds, db_data, 'test') + t_test = time() - t_s + if opt['validation']: + t_s = time() + val_ds = split_dataset(db_ds, db_data, 'validation') + if val_ds is not None: + sim_dataset(val_ds, db_data, 'validation') + t_val = time() - t_s + + if t_test is not None: + print(f'Caribou finished generating the test dataset in {t_test} seconds') + if t_val is not None: + print(f'Caribou finished generating the validation dataset simulated in {t_val} seconds') + +# Argument parsing from CLI +################################################################################ + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='This script simulate sequencing reads for validation and/or testing dataset(s) from a whole genome dataset') + # Database + parser.add_argument('-db','--dataset', required=True, type=Path, help='PATH to a npz file containing the data corresponding to the k-mers profile for the bacteria database') + parser.add_argument('-dt','--dataset_name', default='dataset', help='Name of the dataset used to name files') + # Host + parser.add_argument('-dh','--hostset', default=None, type=Path, help='Path to .npz data for extracted k-mers profile of host') + parser.add_argument('-ds','--hostset_name', default=None, help='Name of the host database used to name files') + # Simulation flags + parser.add_argument('-v', '--validation', action='store_true', help='Flag argument for making a "validation"-named simulated dataset') + parser.add_argument('-t', '--test', action='store_true', help='Flag argument for making a "test"-named simulated dataset') + # Parameters + parser.add_argument('-l','--kmers_list', type=Path, default=None, help='Optional. PATH to a file containing a list of k-mers to be extracted after the simulation. Should be the same as the reference database') + parser.add_argument('-o','--outdir', required=True, type=Path, help='Path to folder for outputing tuning results') + parser.add_argument('-wd','--workdir', default='/tmp/spill', type=Path, help='Optional. 
Path to a working directory where tuning data will be spilled') + args = parser.parse_args() + + opt = vars(args) + + if not opt['test'] and not opt['validation']: + raise ValueError('Missing flags for datasets to simulate, please use the -v and/or -t flags to decide which dataset to generate.') + else: + simulation(opt) \ No newline at end of file diff --git a/src/models/classification.py b/src/models/classification.py index cc8fc5e..0b1412b 100644 --- a/src/models/classification.py +++ b/src/models/classification.py @@ -6,6 +6,7 @@ import pandas as pd from glob import glob +from typing import Dict from shutil import rmtree from utils import load_Xy_data from models.sklearn.models import SklearnModel @@ -45,18 +46,18 @@ class ClassificationMethods(): """ def __init__( self, - database_k_mers, - k, - outdirs, - database, - classifier_binary = 'deeplstm', - classifier_multiclass = 'widecnn', - taxa = None, - threshold = 0.8, - batch_size = 32, - training_epochs = 100, - verbose = True, - cv = False + database_k_mers: Dict, + k: int, + outdirs: Dict, + database: str, + classifier_binary: str = 'deeplstm', + classifier_multiclass: str = 'widecnn', + taxa: str = None, + threshold: float = 0.8, + batch_size: int = 32, + training_epochs: int = 100, + verbose: bool = True, + cv: bool = False ): # Parameters self._k = k @@ -104,20 +105,48 @@ def __init__( self._taxas_order.reverse() # Automatic executions self._verify_assign_taxas(taxa) - - # Main functions - ######################################################################################################### - # Wrapper function for training and predicting over each known taxa - def execute_training_prediction(self, data2classify): - print('execute_training_prediction') - files_lst = glob(os.path.join(data2classify['profile'],'*.parquet')) - df2classify = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - # df2classify = ray.data.read_parquet_bulk(files_lst, parallelism = -1) - ids2classify = data2classify['ids'] - for i, taxa in enumerate(self._taxas_order): + # Public functions + ######################################################################################################### +# TODO: Revise documentation in heading +# TODO: Remove parameters from global if they are only required for certain functions +# TODO: Finish transfering the functions & calls from the old version +# TODO: Validation of params before execution of private functions + def fit(self): + """ + Wrapper function to call the fitting method + """ + # TODO: Pass training/validation data here + + def predict(self): + """ + Wrapper function to call the predicting method + """ + # TODO: Pass data to predict here + + def fit_predict(self): + """ + Wrapper function for calling fit and predict + """ + # TODO: Pass training/validation data here + # TODO: Pass data to predict here + + def cross_validation(self): + """ + Wrapper function to call the cross-validation method + """ + # TODO: Pass training/validation data here + # TODO: Pass testing data here + + # Private principal functions + ######################################################################################################### +# TODO: Pass training/validation data here + def _fit(self): + """ + Fit the given model to the training dataset + """ + for taxa in self._taxas_order: if taxa in self._taxas: - # Training if taxa in ['domain','bacteria','host']: clf = self._classifier_binary else: @@ -126,46 +155,44 @@ def execute_training_prediction(self, data2classify): self._model_file = 
os.path.join(self._outdirs['models_dir'], f'{clf}_{taxa}.pkl') train = self._verify_load_data_model(self._data_file, self._model_file, taxa) if train: - self._train_model(taxa) - # Predicting - try: - if i == 0: - df2classify = self._classify_first(df2classify, taxa, ids2classify, data2classify['profile']) + if taxa in ['domain','bacteria','host']: + self._binary_training(taxa) else: - df2classify = self._classify_subsequent(df2classify, taxa, ids2classify, data2classify['profile']) - except ValueError: - print('Stopping classification prematurelly because there are no more sequences to classify') - return taxa - return None + self._multiclass_training(taxa) - - # Execute training of model(s) - def execute_training(self): - print('execute_training') - for taxa in self._taxas_order: - if taxa in self._taxas: - if taxa in ['domain','bacteria','host']: - clf = self._classifier_binary +# TODO: Pass data to predict here + def _predict(self, data2classify): + """ + Predict the given data using the trained model + """ + files_lst = glob(os.path.join(data2classify['profile'],'*.parquet')) + df = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + # df = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + ids = data2classify['ids'] + if len(self.classified_data['sequence']) == 0: + raise ValueError('Please train a model before executing classification') + for i, taxa in enumerate(self.classified_data['sequence']): + try: + if i == 0: + df = self._classify_first(df, taxa, ids, data2classify['profile']) else: - clf = self._classifier_multiclass - self._data_file = os.path.join(self._outdirs['data_dir'], f'Xy_{taxa}_database_K{self._k}_{clf}_{self._database}_data.npz') - self._model_file = os.path.join(self._outdirs['models_dir'], f'{clf}_{taxa}.pkl') - train = self._verify_load_data_model(self._data_file, self._model_file, taxa) - if train: - self._train_model(taxa) - - # Train model according to passed taxa - def _train_model(self, taxa): - print('_train_model') - if taxa in ['domain','bacteria','host']: - self._binary_training(taxa) - else: - self._multiclass_training(taxa) + df = self._classify_subsequent(df, taxa, ids, data2classify['profile']) + except ValueError: + print('Stopping classification prematurelly because there are no more sequences to classify') + return taxa + return None + + def _cross_validation(self): + """ + Execute cross-validation of a model by fitting a model and predicting over a test dataset + """ + # Private training secondary functions + ######################################################################################################### +# TODO: Remove data loading & verification from inside these functions def _binary_training(self, taxa): print('_binary_training') self._verify_classifier_binary() - self._load_training_data_merged(taxa) if self._classifier_binary == 'onesvm': self.models[taxa] = SklearnModel( self._classifier_binary, @@ -209,7 +236,7 @@ def _binary_training(self, taxa): self.models[taxa].preprocess(self._merged_training_datasets['train']) self.models[taxa].train(self._merged_training_datasets, self._merged_database_host, self._cv) - self._save_model(self._model_file, taxa) + self._save_model(self._model_file, taxa) def _multiclass_training(self, taxa): print('_multiclass_training') @@ -244,27 +271,10 @@ def _multiclass_training(self, taxa): self.models[taxa].preprocess(self._training_datasets['train']) self.models[taxa].train(self._training_datasets, self._database_data, self._cv) self._save_model(self._model_file, taxa) - - 
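Editor's note: the hunks above replace the old execute_* entry points with a fit/predict facade that walks the taxonomic levels and hands each one to a binary or multiclass trainer. As a reading aid, here is a minimal, self-contained sketch of that control flow only; TrainerSketch, BINARY_TAXAS and the placeholder training methods are illustrative assumptions, not the project's classes.

    # Minimal sketch of the fit()/_fit() split introduced in the diff above.
    # Only the dispatch logic mirrors the real code; everything else is a stand-in.
    BINARY_TAXAS = {'domain', 'bacteria', 'host'}   # levels handled by the binary classifier

    class TrainerSketch:
        def __init__(self, taxas_order, classifier_binary='deeplstm', classifier_multiclass='widecnn'):
            self.taxas_order = taxas_order                      # e.g. ['species', ..., 'domain']
            self.classifier_binary = classifier_binary
            self.classifier_multiclass = classifier_multiclass
            self.models = {}

        def fit(self, datasets):
            # Public wrapper: the real method is meant to validate parameters before delegating
            self._fit(datasets)

        def _fit(self, datasets):
            for taxa in self.taxas_order:
                if taxa in BINARY_TAXAS:
                    self.models[taxa] = self._binary_training(taxa, datasets)
                else:
                    self.models[taxa] = self._multiclass_training(taxa, datasets)

        def _binary_training(self, taxa, datasets):
            # Placeholder: the project builds a SklearnModel or KerasTFModel here
            return f'{self.classifier_binary} for {taxa}'

        def _multiclass_training(self, taxa, datasets):
            return f'{self.classifier_multiclass} for {taxa}'

    trainer = TrainerSketch(['species', 'genus', 'domain'])
    trainer.fit(datasets={})
    print(trainer.models)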
# Execute classification using trained model(s) over a given taxa - def execute_classification(self, data2classify): - print('execute_classification') - files_lst = glob(os.path.join(data2classify['profile'],'*.parquet')) - df = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - # df = ray.data.read_parquet_bulk(files_lst, parallelism = -1) - ids = data2classify['ids'] - if len(self.classified_data['sequence']) == 0: - raise ValueError('Please train a model before executing classification') - for i, taxa in enumerate(self.classified_data['sequence']): - try: - if i == 0: - df = self._classify_first(df, taxa, ids, data2classify['profile']) - else: - df = self._classify_subsequent(df, taxa, ids, data2classify['profile']) - except ValueError: - print('Stopping classification prematurelly because there are no more sequences to classify') - return taxa - return None + # Private predicting secondary functions + ######################################################################################################### +# TODO: Revise these functions to parallelise with Ray + ease process # Classify sequences for first iteration def _classify_first(self, df, taxa, ids, df_file): print('_classify_first') @@ -351,198 +361,6 @@ def _extract_subset(self, df, df_file, ids, taxa, status): df_clf.write_parquet(clf_file) return df_clf, clf_file - # Utils functions + # Helper functions ######################################################################################################### - - # Verify taxas and assign to class variable - def _verify_assign_taxas(self, taxa): - print('_verify_assign_taxas') - if taxa is None: - self._taxas = self._database_data['taxas'].copy() - elif isinstance(taxa, list): - self._taxas = taxa - elif isinstance(taxa, str): - self._taxas = [taxa] - else: - raise ValueError("Invalid taxa option, it must either be absent/None, be a list of taxas to extract or a string identifiying a taxa to extract") - self._verify_taxas() - - # Verify if selected taxas are in database - def _verify_taxas(self): - print('_verify_taxas') - for taxa in self._taxas: - if taxa not in self._database_data['taxas']: - raise ValueError("Taxa {} not found in database".format(taxa)) - - # Caller function for verifying if the data and model already exist - def _verify_load_data_model(self, data_file, model_file, taxa): - print('_verify_load_data_model') - self._verify_files(data_file, taxa) - return self._verify_load_model(model_file, taxa) - - # Load extracted data if already exists - def _verify_files(self, file, taxa): - print('_verify_files') - self.classified_data['sequence'].append(taxa) - if os.path.isfile(file): - self.classified_data[taxa] = load_Xy_data(file) - else: - self.classified_data[taxa] = {} - - # Load model if already exists - def _verify_load_model(self, model_file, taxa): - print('_verify_load_model') - if os.path.exists(model_file): - with open(model_file, 'rb') as f: - self.models[taxa] = cloudpickle.load(f) - return False - else: - return True - - def _save_model(self, model_file, taxa): - print('_save_model') - with open(model_file, 'wb') as f: - cloudpickle.dump(self.models[taxa], f) - - def _verify_classifier_binary(self): - print('_verify_classifier_binary') - if self._classifier_binary == 'onesvm': - if self._cv == True and self._host == True: - pass - elif self._cv == True and self._host == False: - raise ValueError('Classifier One-Class SVM cannot be cross-validated with bacteria data only!\nEither add host data from parameters or choose to predict 
directly using this method') - elif self._cv == False and self._host == True: - raise ValueError('Classifier One-Class SVM cannot classify with host data!\nEither remove host data from parameters or choose another bacteria extraction method') - elif self._cv == False and self._host == False: - pass - elif self._classifier_binary == 'onesvm' and self._host == False: - pass - elif self._classifier_binary in ['linearsvm','attention','lstm','deeplstm'] and self._host == True: - pass - elif self._classifier_binary in ['linearsvm','attention','lstm','deeplstm'] and self._host == False: - raise ValueError('Classifier {} cannot classify without host data!\nEither add host data to config file or choose the One-Class SVM classifier'.format(self._classifier_binary)) - else: - raise ValueError('Invalid classifier option for bacteria extraction!\n\tModels implemented at this moment are :\n\tBacteria isolator : One Class SVM (onesvm)\n\tClassic algorithm : Linear SVM (linearsvm)\n\tNeural networks : Attention (attention), Shallow LSTM (lstm) and Deep LSTM (deeplstm)') - def _verify_classifier_multiclass(self): - print('_verify_classifier_multiclass') - if self._classifier_multiclass in ['sgd','mnb','lstm_attention','cnn','widecnn']: - pass - else: - raise ValueError('Invalid classifier option for bacteria classification!\n\tModels implemented at this moment are :\n\tClassic algorithm : Stochastic Gradient Descent (sgd) and Multinomial Naïve Bayes (mnb)\n\tNeural networks : Deep hybrid between LSTM and Attention (lstm_attention), CNN (cnn) and Wide CNN (widecnn)') - - # Merge database and host reference data for bacteria extraction training - def _merge_database_host(self, database_data, host_data): - print('_merge_database_host') - self._merged_database_host = {} - self._merged_database_host['profile'] = f"{database_data['profile']}_host_merged" # Kmers profile - - if os.path.exists(self._merged_database_host['profile']): - files_lst = glob(os.path.join(self._merged_database_host['profile'],'*.parquet')) - df_merged = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - # df_merged = ray.data.read_parquet_bulk(files_lst, parallelism = -1) - else: - files_lst = glob(os.path.join(database_data['profile'],'*.parquet')) - df_db = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - # df_db = ray.data.read_parquet_bulk(files_lst, parallelism = -1) - files_lst = glob(os.path.join(host_data['profile'],'*.parquet')) - df_host = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - # df_host = ray.data.read_parquet_bulk(files_lst, parallelism = -1) - - cols2drop = [] - for col in df_db.schema().names: - if col not in ['id','domain','__value__']: - cols2drop.append(col) - df_db = df_db.drop_columns(cols2drop) - cols2drop = [] - for col in df_host.schema().names: - if col not in ['id','domain','__value__']: - cols2drop.append(col) - df_host = df_host.drop_columns(cols2drop) - - df_merged = df_db.union(df_host) - df_merged.write_parquet(self._merged_database_host['profile']) - - self._merged_database_host['ids'] = np.concatenate((database_data["ids"], host_data["ids"])) # IDs - self._merged_database_host['kmers'] = database_data["kmers"] # Features - self._merged_database_host['taxas'] = ['domain'] # Known taxas for classification - self._merged_database_host['fasta'] = (database_data['fasta'], host_data['fasta']) # Fasta file needed for reads simulation - - return df_merged - - # Load, merge db + host & simulate validation / test datasets - def 
_load_training_data_merged(self, taxa): - print('_load_training_data_merged') - if self._classifier_binary == 'onesvm' and taxa == 'domain': - files_lst = glob(os.path.join(self._database_data['profile'],'*.parquet')) - df_train = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - # df_train = ray.data.read_parquet_bulk(files_lst, parallelism = -1) - df_train = df_train.map_batches(convert_archaea_bacteria, batch_format = 'pandas') - df_val_test = self._merge_database_host(self._database_data, self._host_data) - df_val_test = df_val_test.map_batches(convert_archaea_bacteria, batch_format = 'pandas') - df_val = self.split_sim_cv_ds(df_val_test,self._merged_database_host, 'merged_validation') - self._merged_training_datasets = {'train': df_train, 'validation': df_val} - if self._cv: - df_test = self.split_sim_cv_ds(df_val_test,self._merged_database_host, 'merged_test') - self._merged_training_datasets['test'] = df_test - else: - df_train = self._merge_database_host(self._database_data, self._host_data) - df_train = df_train.map_batches(convert_archaea_bacteria, batch_format = 'pandas') - df_val = self.split_sim_cv_ds(df_train,self._merged_database_host, 'merged_validation') - self._merged_training_datasets = {'train': df_train, 'validation': df_val} - if self._cv: - df_test = self.split_sim_cv_ds(df_train,self._merged_database_host, 'merged_test') - self._merged_training_datasets['test'] = df_test - - # Load db & simulate validation / test datasets - def _load_training_data(self): - print('_load_training_data') - files_lst = glob(os.path.join(self._database_data['profile'],'*.parquet')) - df_train = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - # df_train = ray.data.read_parquet_bulk(files_lst, parallelism = -1) - df_train = df_train.map_batches(convert_archaea_bacteria, batch_format = 'pandas') - df_val = self.split_sim_cv_ds(df_train,self._database_data, 'validation') - self._training_datasets = {'train': df_train, 'validation': df_val} - if self._cv: - df_test = self.split_sim_cv_ds(df_train,self._database_data, 'test') - self._training_datasets['test'] = df_test - - def _sim_4_cv(self, df, kmers_ds, name): - print('_sim_4_cv') - cols = ['id'] - cols.extend(kmers_ds['taxas']) - cls = pd.DataFrame(columns = cols) - for batch in df.iter_batches(batch_format = 'pandas'): - cls = pd.concat([cls, batch[cols]], axis = 0, ignore_index = True) - - sim_outdir = os.path.dirname(kmers_ds['profile']) - cv_sim = readsSimulation(kmers_ds['fasta'], cls, list(cls['id']), 'miseq', sim_outdir, name) - sim_data = cv_sim.simulation(self._k, kmers_ds['kmers']) - files_lst = glob(os.path.join(sim_data['profile'],'*.parquet')) - df = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - # df = ray.data.read_parquet_bulk(files_lst, parallelism = -1) - return df - - def split_sim_cv_ds(self, ds, data, name): - ds_path = os.path.join( - os.path.dirname(data['profile']), - f'Xy_genome_simulation_{name}_data_K{len(data["kmers"][0])}' - ) - if os.path.exists(ds_path): - files_lst = glob(os.path.join(ds_path,'*.parquet')) - cv_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - # cv_ds = ray.data.read_parquet_bulk(files_lst, parallelism = -1) - else: - cv_ds = ds.random_sample(0.1) - if cv_ds.count() == 0: - nb_smpl = round(ds.count() * 0.1) - cv_ds = ds.random_shuffle().limit(nb_smpl) - cv_ds = self._sim_4_cv(cv_ds, data, name) - return cv_ds - -# Helper functions outside of class 
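Editor's note: split_sim_cv_ds above draws a roughly 10% subset for validation/test simulation and falls back to shuffle-and-limit when random sampling returns nothing (very small datasets). A self-contained sketch of just that sampling pattern, using a toy Ray dataset in place of the parquet-backed k-mers profile; the max(1, ...) guard is an addition for the toy case, not in the original.

    import ray

    def sample_fraction(ds, fraction=0.1):
        # Take roughly `fraction` of the rows; fall back to shuffle+limit if the sample is empty
        sampled = ds.random_sample(fraction)
        if sampled.count() == 0:
            n = max(1, round(ds.count() * fraction))   # assumption: guard against n == 0 on toy data
            sampled = ds.random_shuffle().limit(n)
        return sampled

    if __name__ == '__main__':
        toy = ray.data.range(50)                       # stand-in for the k-mers profile
        print(sample_fraction(toy).count())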
-############################################################################### - -def convert_archaea_bacteria(df): - df.loc[df['domain'].str.lower() == 'archaea', 'domain'] = 'Bacteria' - return df \ No newline at end of file diff --git a/src/models/classification_old.py b/src/models/classification_old.py new file mode 100644 index 0000000..7419d92 --- /dev/null +++ b/src/models/classification_old.py @@ -0,0 +1,335 @@ +import os +import ray +import cloudpickle + +import numpy as np +import pandas as pd + +from glob import glob +from shutil import rmtree +from utils import load_Xy_data +from models.sklearn.models import SklearnModel +from models.kerasTF.models import KerasTFModel + +# Simulation class +from models.reads_simulation import readsSimulation + +__author__ = 'Nicolas de Montigny' + +__all__ = ['ClassificationMethods'] + +class ClassificationMethods(): + """ + Utilities class for classifying sequences from metagenomes using ray + + ---------- + Attributes + ---------- + + classified_data : dictionary + Dictionary containing the classified data for each classified taxonomic level + + models : dictionary + Dictionary containing the trained models for each taxonomic level + + ---------- + Methods + ---------- + + execute_training : launch the training of the models for the chosen taxonomic levels + no parameters to pass + + execute_classification : + data2classify : a dictionnary containing the data to classify produced by the function Caribou.src.data.build_data.build_X_data + + """ + def __init__( + self, + database_k_mers, + k, + outdirs, + database, + classifier_binary = 'deeplstm', + classifier_multiclass = 'widecnn', + taxa = None, + threshold = 0.8, + batch_size = 32, + training_epochs = 100, + verbose = True, + cv = False + ): + # Parameters + self._k = k + self._cv = cv + self._taxas = taxa + self._outdirs = outdirs + self._database = database + self._verbose = verbose + self._threshold = threshold + self._classifier_binary = classifier_binary + self._classifier_multiclass = classifier_multiclass + self._batch_size = batch_size + self._training_epochs = training_epochs + # Initialize with values + self.classified_data = { + 'sequence': [], + 'classification' : None, + 'classified_ids' : [], + 'unknown_ids' : [] + } + # Empty initializations + self.models = {} + self._host = False + self._taxas_order = [] + self._host_data = None + self._database_data = None + self._training_datasets = None + self._merged_training_datasets = None + self._merged_database_host = None + self.previous_taxa_unclassified = None + # Extract database data + if isinstance(database_k_mers, tuple): + self._host = True + self._database_data = database_k_mers[0] + self._host_data = database_k_mers[1] + else: + self._database_data = database_k_mers + # Remove 'id' from kmers if present + if 'id' in self._database_data['kmers']: + self._database_data['kmers'].remove('id') + if self._host and 'id' in self._host_data['kmers']: + self._host_data['kmers'].remove('id') + # Assign taxas order for top-down strategy + self._taxas_order = self._database_data['taxas'].copy() + self._taxas_order.reverse() + # Automatic executions + self._verify_assign_taxas(taxa) + + # Main functions + ######################################################################################################### + + # Wrapper function for training and predicting over each known taxa + def execute_training_prediction(self, data2classify): + print('execute_training_prediction') + files_lst = 
glob(os.path.join(data2classify['profile'],'*.parquet')) + df2classify = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + # df2classify = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + ids2classify = data2classify['ids'] + for i, taxa in enumerate(self._taxas_order): + if taxa in self._taxas: + # Training + if taxa in ['domain','bacteria','host']: + clf = self._classifier_binary + else: + clf = self._classifier_multiclass + self._data_file = os.path.join(self._outdirs['data_dir'], f'Xy_{taxa}_database_K{self._k}_{clf}_{self._database}_data.npz') + self._model_file = os.path.join(self._outdirs['models_dir'], f'{clf}_{taxa}.pkl') + train = self._verify_load_data_model(self._data_file, self._model_file, taxa) + if train: + self._train_model(taxa) + # Predicting + try: + if i == 0: + df2classify = self._classify_first(df2classify, taxa, ids2classify, data2classify['profile']) + else: + df2classify = self._classify_subsequent(df2classify, taxa, ids2classify, data2classify['profile']) + except ValueError: + print('Stopping classification prematurelly because there are no more sequences to classify') + return taxa + return None + + # Utils functions + ######################################################################################################### + + # Verify taxas and assign to class variable + def _verify_assign_taxas(self, taxa): + print('_verify_assign_taxas') + if taxa is None: + self._taxas = self._database_data['taxas'].copy() + elif isinstance(taxa, list): + self._taxas = taxa + elif isinstance(taxa, str): + self._taxas = [taxa] + else: + raise ValueError("Invalid taxa option, it must either be absent/None, be a list of taxas to extract or a string identifiying a taxa to extract") + self._verify_taxas() + + # Verify if selected taxas are in database + def _verify_taxas(self): + print('_verify_taxas') + for taxa in self._taxas: + if taxa not in self._database_data['taxas']: + raise ValueError("Taxa {} not found in database".format(taxa)) + + # Caller function for verifying if the data and model already exist + def _verify_load_data_model(self, data_file, model_file, taxa): + print('_verify_load_data_model') + self._verify_files(data_file, taxa) + return self._verify_load_model(model_file, taxa) + + # Load extracted data if already exists + def _verify_files(self, file, taxa): + print('_verify_files') + self.classified_data['sequence'].append(taxa) + if os.path.isfile(file): + self.classified_data[taxa] = load_Xy_data(file) + else: + self.classified_data[taxa] = {} + + # Load model if already exists + def _verify_load_model(self, model_file, taxa): + print('_verify_load_model') + if os.path.exists(model_file): + with open(model_file, 'rb') as f: + self.models[taxa] = cloudpickle.load(f) + return False + else: + return True + + def _save_model(self, model_file, taxa): + print('_save_model') + with open(model_file, 'wb') as f: + cloudpickle.dump(self.models[taxa], f) + + def _verify_classifier_binary(self): + print('_verify_classifier_binary') + if self._classifier_binary == 'onesvm': + if self._cv == True and self._host == True: + pass + elif self._cv == True and self._host == False: + raise ValueError('Classifier One-Class SVM cannot be cross-validated with bacteria data only!\nEither add host data from parameters or choose to predict directly using this method') + elif self._cv == False and self._host == True: + raise ValueError('Classifier One-Class SVM cannot classify with host data!\nEither remove host data from parameters or choose another 
bacteria extraction method') + elif self._cv == False and self._host == False: + pass + elif self._classifier_binary == 'onesvm' and self._host == False: + pass + elif self._classifier_binary in ['linearsvm','attention','lstm','deeplstm'] and self._host == True: + pass + elif self._classifier_binary in ['linearsvm','attention','lstm','deeplstm'] and self._host == False: + raise ValueError('Classifier {} cannot classify without host data!\nEither add host data to config file or choose the One-Class SVM classifier'.format(self._classifier_binary)) + else: + raise ValueError('Invalid classifier option for bacteria extraction!\n\tModels implemented at this moment are :\n\tBacteria isolator : One Class SVM (onesvm)\n\tClassic algorithm : Linear SVM (linearsvm)\n\tNeural networks : Attention (attention), Shallow LSTM (lstm) and Deep LSTM (deeplstm)') + + def _verify_classifier_multiclass(self): + print('_verify_classifier_multiclass') + if self._classifier_multiclass in ['sgd','mnb','lstm_attention','cnn','widecnn']: + pass + else: + raise ValueError('Invalid classifier option for bacteria classification!\n\tModels implemented at this moment are :\n\tClassic algorithm : Stochastic Gradient Descent (sgd) and Multinomial Naïve Bayes (mnb)\n\tNeural networks : Deep hybrid between LSTM and Attention (lstm_attention), CNN (cnn) and Wide CNN (widecnn)') + + # Merge database and host reference data for bacteria extraction training + def _merge_database_host(self, database_data, host_data): + print('_merge_database_host') + self._merged_database_host = {} + self._merged_database_host['profile'] = f"{database_data['profile']}_host_merged" # Kmers profile + + if os.path.exists(self._merged_database_host['profile']): + files_lst = glob(os.path.join(self._merged_database_host['profile'],'*.parquet')) + df_merged = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + # df_merged = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + else: + files_lst = glob(os.path.join(database_data['profile'],'*.parquet')) + df_db = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + # df_db = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + files_lst = glob(os.path.join(host_data['profile'],'*.parquet')) + df_host = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + # df_host = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + + cols2drop = [] + for col in df_db.schema().names: + if col not in ['id','domain','__value__']: + cols2drop.append(col) + df_db = df_db.drop_columns(cols2drop) + cols2drop = [] + for col in df_host.schema().names: + if col not in ['id','domain','__value__']: + cols2drop.append(col) + df_host = df_host.drop_columns(cols2drop) + + df_merged = df_db.union(df_host) + df_merged.write_parquet(self._merged_database_host['profile']) + + self._merged_database_host['ids'] = np.concatenate((database_data["ids"], host_data["ids"])) # IDs + self._merged_database_host['kmers'] = database_data["kmers"] # Features + self._merged_database_host['taxas'] = ['domain'] # Known taxas for classification + self._merged_database_host['fasta'] = (database_data['fasta'], host_data['fasta']) # Fasta file needed for reads simulation + + return df_merged + + # Load, merge db + host & simulate validation / test datasets + def _load_training_data_merged(self, taxa): + print('_load_training_data_merged') + if self._classifier_binary == 'onesvm' and taxa == 'domain': + files_lst = glob(os.path.join(self._database_data['profile'],'*.parquet')) + df_train = 
ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + # df_train = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + df_train = df_train.map_batches(convert_archaea_bacteria, batch_format = 'pandas') + df_val_test = self._merge_database_host(self._database_data, self._host_data) + df_val_test = df_val_test.map_batches(convert_archaea_bacteria, batch_format = 'pandas') + df_val = self.split_sim_cv_ds(df_val_test,self._merged_database_host, 'merged_validation') + self._merged_training_datasets = {'train': df_train, 'validation': df_val} + if self._cv: + df_test = self.split_sim_cv_ds(df_val_test,self._merged_database_host, 'merged_test') + self._merged_training_datasets['test'] = df_test + else: + df_train = self._merge_database_host(self._database_data, self._host_data) + df_train = df_train.map_batches(convert_archaea_bacteria, batch_format = 'pandas') + df_val = self.split_sim_cv_ds(df_train,self._merged_database_host, 'merged_validation') + self._merged_training_datasets = {'train': df_train, 'validation': df_val} + if self._cv: + df_test = self.split_sim_cv_ds(df_train,self._merged_database_host, 'merged_test') + self._merged_training_datasets['test'] = df_test + + # Load db & simulate validation / test datasets + def _load_training_data(self): + print('_load_training_data') + files_lst = glob(os.path.join(self._database_data['profile'],'*.parquet')) + df_train = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + # df_train = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + df_train = df_train.map_batches(convert_archaea_bacteria, batch_format = 'pandas') + df_val = self.split_sim_cv_ds(df_train,self._database_data, 'validation') + self._training_datasets = {'train': df_train, 'validation': df_val} + if self._cv: + df_test = self.split_sim_cv_ds(df_train,self._database_data, 'test') + self._training_datasets['test'] = df_test + + def _sim_4_cv(self, df, kmers_ds, name): + print('_sim_4_cv') + cols = ['id'] + cols.extend(kmers_ds['taxas']) + cls = pd.DataFrame(columns = cols) + for batch in df.iter_batches(batch_format = 'pandas'): + cls = pd.concat([cls, batch[cols]], axis = 0, ignore_index = True) + + sim_outdir = os.path.dirname(kmers_ds['profile']) + cv_sim = readsSimulation(kmers_ds['fasta'], cls, list(cls['id']), 'miseq', sim_outdir, name) + sim_data = cv_sim.simulation(self._k, kmers_ds['kmers']) + files_lst = glob(os.path.join(sim_data['profile'],'*.parquet')) + df = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + # df = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + return df + + def split_sim_cv_ds(self, ds, data, name): + ds_path = os.path.join( + os.path.dirname(data['profile']), + f'Xy_genome_simulation_{name}_data_K{len(data["kmers"][0])}' + ) + if os.path.exists(ds_path): + files_lst = glob(os.path.join(ds_path,'*.parquet')) + cv_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + # cv_ds = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + else: + cv_ds = ds.random_sample(0.1) + if cv_ds.count() == 0: + nb_smpl = round(ds.count() * 0.1) + cv_ds = ds.random_shuffle().limit(nb_smpl) + cv_ds = self._sim_4_cv(cv_ds, data, name) + return cv_ds + +# Helper functions outside of class +############################################################################### + +def convert_archaea_bacteria(df): + df.loc[df['domain'].str.lower() == 'archaea', 'domain'] = 'Bacteria' + return df \ No newline at end of file diff --git a/src/models/kerasTF/models.py 
b/src/models/kerasTF/models.py index 79c763d..cd57ef5 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -164,7 +164,6 @@ def preprocess(self, df): TensorRDFFeaturesSelection(self.kmers, self.taxa), ) - self._encoder.fit(df) df = self._preprocessor.fit_transform(df) self._reductor = TensorTruncatedSVDReduction(self.kmers) diff --git a/src/supplement/sklearn_tuning.py b/src/supplement/sklearn_tuning.py index 76f88ad..ed52dc3 100644 --- a/src/supplement/sklearn_tuning.py +++ b/src/supplement/sklearn_tuning.py @@ -14,16 +14,19 @@ from utils import * from models.reads_simulation import readsSimulation from models.ray_tensor_min_max import TensorMinMaxScaler + # from ray.data.preprocessors import MinMaxScaler from src.models.sklearn.partial_trainer import SklearnPartialTrainer from src.models.encoders.onesvm_label_encoder import OneClassSVMLabelEncoder # Preprocessing from ray.data.preprocessors import Chain, LabelEncoder + # Training from models.sklearn.scoring_one_svm import ScoringSGDOneClassSVM from sklearn.naive_bayes import MultinomialNB from sklearn.linear_model import SGDClassifier + # Tuning from ray import tune from ray.tune import Tuner, TuneConfig @@ -42,41 +45,41 @@ def merge_db_host(db_data, host_data): if os.path.exists(merged_db_host['profile']): files_lst = glob(os.path.join(merged_db_host['profile'], '*.parquet')) - df_merged = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + merged_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) else: files_lst = glob(os.path.join(db_data['profile'], '*.parquet')) - df_db = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + db_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) files_lst = glob(os.path.join(host_data['profile'], '*.parquet')) - df_host = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + host_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) col2drop = [] - for col in df_db.schema().names: + for col in db_ds.schema().names: if col not in ['id','domain','__value__']: col2drop.append(col) - df_db = df_db.drop_columns(col2drop) + db_ds = db_ds.drop_columns(col2drop) col2drop = [] - for col in df_host.schema().names: + for col in host_ds.schema().names: if col not in ['id','domain','__value__']: col2drop.append(col) - df_host = df_host.drop_columns(col2drop) + host_ds = host_ds.drop_columns(col2drop) - df_merged = df_db.union(df_host) - df_merged.write_parquet(merged_db_host['profile']) + merged_ds = db_ds.union(host_ds) + merged_ds.write_parquet(merged_db_host['profile']) merged_db_host['ids'] = np.concatenate((db_data["ids"], host_data["ids"])) # IDs merged_db_host['kmers'] = db_data['kmers'] # Features merged_db_host['taxas'] = ['domain'] # Known taxas for classification merged_db_host['fasta'] = (db_data['fasta'], host_data['fasta']) # Fasta file needed for reads simulation - return merged_db_host, df_merged + return merged_db_host, merged_ds -def sim_4_cv(df, database_data, name): +def sim_4_cv(ds, database_data, name): print('_sim_4_cv') k = len(database_data['kmers'][0]) cols = ['id'] cols.extend(database_data['taxas']) cls = pd.DataFrame(columns = cols) - for batch in df.iter_batches(batch_format = 'pandas'): + for batch in ds.iter_batches(batch_format = 'pandas'): cls = pd.concat([cls, batch[cols]], axis = 0, ignore_index = True) sim_outdir = os.path.dirname(database_data['profile']) @@ -84,8 +87,8 @@ def sim_4_cv(df, database_data, name): cv_sim = readsSimulation(database_data['fasta'], cls, 
list(cls['id']), 'miseq', sim_outdir, name) sim_data = cv_sim.simulation(k, database_data['kmers']) files_lst = glob(os.path.join(sim_data['profile'], '*.parquet')) - df = ray.data.read_parquet_bulk(files_lst, parallelism = -1) - return df + ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + return ds def convert_archaea_bacteria(df): df.loc[df['domain'].str.lower() == 'archaea', 'domain'] = 'Bacteria' @@ -106,7 +109,7 @@ def split_val_test_ds(ds, data): test_path = os.path.join(os.path.dirname(data['profile']), f'Xy_genome_simulation_test_data_K{len(data["kmers"][0])}') if os.path.exists(val_path): files_lst = glob(os.path.join(val_path, '*.parquet')) - val_ds = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + val_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) val_ds = val_ds.map_batches( convert_archaea_bacteria, batch_format = 'pandas' @@ -119,7 +122,7 @@ def split_val_test_ds(ds, data): val_ds = sim_4_cv(val_ds, data, 'validation') if os.path.exists(test_path): files_lst = glob(os.path.join(test_path, '*.parquet')) - test_ds = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + test_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) test_ds = test_ds.map_batches( convert_archaea_bacteria, batch_format = 'pandas' @@ -164,7 +167,7 @@ def split_val_test_ds(ds, data): val_ds, test_ds = split_val_test_ds(test_val_ds,test_val_data) db_data = verify_load_data(opt['data']) files_lst = glob(os.path.join(db_data['profile'], '*.parquet')) - train_ds = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + train_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) elif opt['classifier'] == 'linearsvm' and opt['taxa'] == 'domain': if opt['data_host'] is None: raise ValueError('To tune for a domain taxa, a host species is required.\ @@ -175,7 +178,7 @@ def split_val_test_ds(ds, data): else: db_data = verify_load_data(opt['data']) files_lst = glob(os.path.join(db_data['profile'], '*.parquet')) - train_ds = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + train_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) val_ds, test_ds = split_val_test_ds(train_ds, db_data) # Preprocessing diff --git a/src/utils.py b/src/utils.py index f7e36a6..8b826b0 100644 --- a/src/utils.py +++ b/src/utils.py @@ -2,14 +2,19 @@ import ray import json import logging +import warnings + import numpy as np import pandas as pd import pyarrow as pa +from glob import glob from pathlib import Path from warnings import warn from psutil import virtual_memory +from models.reads_simulation import readsSimulation + __author__ = "Nicolas de Montigny" __all__ = [ @@ -36,7 +41,13 @@ 'verify_load_preclassified', 'merge_save_data', 'zip_X_y', - 'ensure_length_ds' + 'ensure_length_ds', + 'convert_archaea_bacteria', + 'verify_load_db', + 'verify_load_host_merge', + 'merge_db_host', + 'split_sim_dataset', + 'sim_dataset' ] # System @@ -58,7 +69,7 @@ def init_ray_cluster(workdir): ray.shutdown() frac -= 0.05 -# Data handling +# Data I/O ######################################################################################################### # Load data from file @@ -282,3 +293,92 @@ def ensure_length_ds(len_x, len_y): if len_x != len_y: raise ValueError( 'X and y have different lengths: {} and {}'.format(len_x, len_y)) + +# Datasets handling +######################################################################################################### + +def convert_archaea_bacteria(df): + 
df.loc[df['domain'].str.lower() == 'archaea', 'domain'] = 'Bacteria' + return df + +def verify_load_db(db_data): + """ + Wrapper function for verifying and loading the db dataset + """ + db_data = verify_load_data(db_data) + files_lst = glob(os.path.join(db_data['profile'], '*.parquet')) + db_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + db_ds = db_ds.map_batches(convert_archaea_bacteria, batch_format = 'pandas') + + return db_data, db_ds + +def verify_load_host_merge(db_data, host_data): + """ + Wrapper function for verifying, loading and merging both datasets + """ + db_data = verify_load_data(db_data) + host_data = verify_load_data(host_data) + verify_concordance_klength(len(db_data['kmers'][0]), len(host_data['kmers'][0])) + merged_data, merged_ds = merge_db_host(db_data, host_data) + + return merged_data, merged_ds + +def merge_db_host(db_data, host_data): + """ + Merge the two databases along the rows axis + """ + merged_db_host = {} + merged_db_host['profile'] = f"{db_data['profile']}_host_merged" + + if os.path.exists(merged_db_host['profile']): + files_lst = glob(os.path.join(merged_db_host['profile'], '*.parquet')) + merged_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + else: + files_lst = glob(os.path.join(db_data['profile'], '*.parquet')) + db_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + files_lst = glob(os.path.join(host_data['profile'], '*.parquet')) + host_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + + merged_ds = db_ds.union(host_ds) + merged_ds = merged_ds.map_batches(convert_archaea_bacteria, batch_format = 'pandas') + merged_ds.write_parquet(merged_db_host['profile']) + + merged_db_host['ids'] = np.concatenate((db_data["ids"], host_data["ids"])) # IDs + merged_db_host['kmers'] = db_data['kmers'] # Features + merged_db_host['taxas'] = ['domain'] # Known taxas for classification + merged_db_host['fasta'] = (db_data['fasta'], host_data['fasta']) # Fasta file needed for reads simulation + + return merged_db_host, merged_ds + +def split_sim_dataset(ds, data, name): + splitted_path = os.path.join(os.path.dirname(data['profile']), f'Xy_genome_simulation_{name}_data_K{len(data["kmers"][0])}') + if os.path.exists(splitted_path): + warnings.warn(f'Splitted dataset {name} already exists, skipping simulation') + return None + else: + splitted_ds = ds.random_sample(0.1) + if splitted_ds.count() == 0: + nb_samples = round(ds.count() * 0.1) + splitted_ds = ds.random_shuffle().limit(nb_samples) + + sim_dataset(ds, data, name) + return splitted_ds + +def sim_dataset(ds, data, name): + """ + Simulate the dataset from the database and generate its data + """ + k = len(data['kmers'][0]) + cols = ['id'] + cols.extend(data['taxas']) + cls = pd.DataFrame(columns = cols) + for batch in ds.iter_batches(batch_format = 'pandas'): + cls = pd.concat([cls, batch[cols]], axis = 0, ignore_index = True) + + sim_outdir = os.path.dirname(data['profile']) + cv_sim = readsSimulation(data['fasta'], cls, list(cls['id']), 'miseq', sim_outdir, name) + sim_data = cv_sim.simulation(k, data['kmers']) + files_lst = glob(os.path.join(sim_data['profile'], '*.parquet')) + sim_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + return sim_ds + From a44dcf7459330b212917eb5aba09cbb77fc7e13e Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Sat, 4 Nov 2023 22:50:49 -0400 Subject: [PATCH 21/92] debug circular import --- src/Caribou_simulate_test_val.py | 9 +++----- 
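Editor's note: the utils.py helpers added above all share the same loading idiom: glob the parquet fragments of a profile directory, read them with ray.data.read_parquet_bulk, then harmonise the domain labels through map_batches. A compact sketch of that idiom follows; load_profile is an illustrative name, not a function in the codebase.

    import os
    import ray
    import pandas as pd
    from glob import glob

    def convert_archaea_bacteria(df: pd.DataFrame) -> pd.DataFrame:
        # Archaea are folded into the Bacteria label for the binary (domain) task
        df.loc[df['domain'].str.lower() == 'archaea', 'domain'] = 'Bacteria'
        return df

    def load_profile(profile_dir: str) -> ray.data.Dataset:
        # One read task per parquet fragment, mirroring parallelism = len(files) in the diff
        files = glob(os.path.join(profile_dir, '*.parquet'))
        ds = ray.data.read_parquet_bulk(files, parallelism=len(files))
        return ds.map_batches(convert_archaea_bacteria, batch_format='pandas')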
src/models/reads_simulation.py | 39 ++++++++++++++++++++++++++++++-- src/utils.py | 39 +------------------------------- 3 files changed, 41 insertions(+), 46 deletions(-) diff --git a/src/Caribou_simulate_test_val.py b/src/Caribou_simulate_test_val.py index a1ee878..969f533 100644 --- a/src/Caribou_simulate_test_val.py +++ b/src/Caribou_simulate_test_val.py @@ -5,6 +5,7 @@ from utils import * from time import time from pathlib import Path +from models.reads_simulation import split_sim_dataset __author__ = "Nicolas de Montigny" @@ -46,15 +47,11 @@ def simulation(opt): t_val = None if opt['test']: t_s = time() - test_ds = split_dataset(db_ds, db_data, 'test') - if test_ds is not None: - sim_dataset(test_ds, db_data, 'test') + test_ds = split_sim_dataset(db_ds, db_data, 'test') t_test = time() - t_s if opt['validation']: t_s = time() - val_ds = split_dataset(db_ds, db_data, 'validation') - if val_ds is not None: - sim_dataset(val_ds, db_data, 'validation') + val_ds = split_sim_dataset(db_ds, db_data, 'validation') t_val = time() - t_s if t_test is not None: diff --git a/src/models/reads_simulation.py b/src/models/reads_simulation.py index 0c47f81..5437dbd 100644 --- a/src/models/reads_simulation.py +++ b/src/models/reads_simulation.py @@ -15,7 +15,7 @@ __author__ = "Nicolas de Montigny" -__all__ = ['ReadsSimulation'] +__all__ = ['ReadsSimulation','split_sim_dataset','sim_dataset'] # Reduce number of cpus used to reduce nb of tmp files # reduce number of reads generated @@ -203,4 +203,39 @@ def _verify_sim_arguments(self, k, kmers_list): elif k is not None and kmers_list is None: warn("K is provided but k-mers list is None, k-mers list will be generated") raise ValueError("k value was provided but not k-mers list, please provide a k-mers list or no k value") - return k, kmers_list \ No newline at end of file + return k, kmers_list + +# Helper functions +######################################################################################################### + +def split_sim_dataset(ds, data, name): + splitted_path = os.path.join(os.path.dirname(data['profile']), f'Xy_genome_simulation_{name}_data_K{len(data["kmers"][0])}') + if os.path.exists(splitted_path): + warnings.warn(f'Splitted dataset {name} already exists, skipping simulation') + return None + else: + splitted_ds = ds.random_sample(0.1) + if splitted_ds.count() == 0: + nb_samples = round(ds.count() * 0.1) + splitted_ds = ds.random_shuffle().limit(nb_samples) + + sim_dataset(ds, data, name) + return splitted_ds + +def sim_dataset(ds, data, name): + """ + Simulate the dataset from the database and generate its data + """ + k = len(data['kmers'][0]) + cols = ['id'] + cols.extend(data['taxas']) + cls = pd.DataFrame(columns = cols) + for batch in ds.iter_batches(batch_format = 'pandas'): + cls = pd.concat([cls, batch[cols]], axis = 0, ignore_index = True) + + sim_outdir = os.path.dirname(data['profile']) + cv_sim = readsSimulation(data['fasta'], cls, list(cls['id']), 'miseq', sim_outdir, name) + sim_data = cv_sim.simulation(k, data['kmers']) + files_lst = glob(os.path.join(sim_data['profile'], '*.parquet')) + sim_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + return sim_ds \ No newline at end of file diff --git a/src/utils.py b/src/utils.py index 8b826b0..f52d467 100644 --- a/src/utils.py +++ b/src/utils.py @@ -13,8 +13,6 @@ from warnings import warn from psutil import virtual_memory -from models.reads_simulation import readsSimulation - __author__ = "Nicolas de Montigny" __all__ = [ @@ -45,9 +43,7 @@ 
'convert_archaea_bacteria', 'verify_load_db', 'verify_load_host_merge', - 'merge_db_host', - 'split_sim_dataset', - 'sim_dataset' + 'merge_db_host' ] # System @@ -349,36 +345,3 @@ def merge_db_host(db_data, host_data): merged_db_host['fasta'] = (db_data['fasta'], host_data['fasta']) # Fasta file needed for reads simulation return merged_db_host, merged_ds - -def split_sim_dataset(ds, data, name): - splitted_path = os.path.join(os.path.dirname(data['profile']), f'Xy_genome_simulation_{name}_data_K{len(data["kmers"][0])}') - if os.path.exists(splitted_path): - warnings.warn(f'Splitted dataset {name} already exists, skipping simulation') - return None - else: - splitted_ds = ds.random_sample(0.1) - if splitted_ds.count() == 0: - nb_samples = round(ds.count() * 0.1) - splitted_ds = ds.random_shuffle().limit(nb_samples) - - sim_dataset(ds, data, name) - return splitted_ds - -def sim_dataset(ds, data, name): - """ - Simulate the dataset from the database and generate its data - """ - k = len(data['kmers'][0]) - cols = ['id'] - cols.extend(data['taxas']) - cls = pd.DataFrame(columns = cols) - for batch in ds.iter_batches(batch_format = 'pandas'): - cls = pd.concat([cls, batch[cols]], axis = 0, ignore_index = True) - - sim_outdir = os.path.dirname(data['profile']) - cv_sim = readsSimulation(data['fasta'], cls, list(cls['id']), 'miseq', sim_outdir, name) - sim_data = cv_sim.simulation(k, data['kmers']) - files_lst = glob(os.path.join(sim_data['profile'], '*.parquet')) - sim_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - return sim_ds - From b081221da800e14d6d8112a63001f47bb58117b2 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Sun, 5 Nov 2023 17:22:31 -0500 Subject: [PATCH 22/92] reads_simulation debug --- src/models/reads_simulation.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/models/reads_simulation.py b/src/models/reads_simulation.py index 5437dbd..b78d7d9 100644 --- a/src/models/reads_simulation.py +++ b/src/models/reads_simulation.py @@ -4,7 +4,9 @@ import pandas as pd import os +import ray import gzip +import warnings from Bio import SeqIO from glob import glob From 5393fdba745d27f4b9267e482e6dc239ce4471c8 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Sun, 5 Nov 2023 18:50:59 -0500 Subject: [PATCH 23/92] datasets loading in split for test/val + host merge --- src/Caribou_classification.py | 5 ++++- src/Caribou_classification_train_cv.py | 8 ++++++-- src/Caribou_extraction.py | 7 ++++++- src/Caribou_extraction_train_cv.py | 10 ++++++++-- src/Caribou_pipeline.py | 1 + src/Caribou_simulate_test_val.py | 25 ++++++++++++++++--------- src/models/classification.py | 16 +++++++++++----- src/models/reads_simulation.py | 16 ++++++++++------ src/utils.py | 19 +++++++++++++++++-- 9 files changed, 79 insertions(+), 28 deletions(-) diff --git a/src/Caribou_classification.py b/src/Caribou_classification.py index 9c29172..a1992c7 100644 --- a/src/Caribou_classification.py +++ b/src/Caribou_classification.py @@ -6,12 +6,15 @@ from utils import * from time import time from pathlib import Path +from models.reads_simulation import split_sim_dataset from models.classification_old import ClassificationMethods __author__ = "Nicolas de Montigny" __all__ = ['bacteria_classification_train_cv'] +VALIDATION_DATASET_NAME = 'validation' + # Initialisation / validation of parameters from CLI ################################################################################ def bacteria_classification(opt): @@ -51,7 +54,7 @@ def bacteria_classification(opt): if 
'domain' in lst_taxas: lst_taxas.remove('domain') - val_ds = split_sim_dataset(db_ds, db_data, 'validation') + val_ds, val_data = split_sim_dataset(db_ds, db_data, VALIDATION_DATASET_NAME) # Definition of model for bacteria taxonomic classification + training ################################################################################ diff --git a/src/Caribou_classification_train_cv.py b/src/Caribou_classification_train_cv.py index d7d25a5..aac75d8 100644 --- a/src/Caribou_classification_train_cv.py +++ b/src/Caribou_classification_train_cv.py @@ -7,6 +7,7 @@ from time import time from pathlib import Path from logging import ERROR +from models.reads_simulation import split_sim_dataset from models.classification_old import ClassificationMethods warnings.filterwarnings('ignore') @@ -15,6 +16,9 @@ __all__ = ['bacteria_classification_train_cv'] +VALIDATION_DATASET_NAME = 'validation' +TEST_DATASET_NAME = 'test' + # Initialisation / validation of parameters from CLI ################################################################################ def bacteria_classification_train_cv(opt): @@ -48,8 +52,8 @@ def bacteria_classification_train_cv(opt): if 'domain' in lst_taxas: lst_taxas.remove('domain') - test_ds = split_sim_dataset(db_ds, db_data, 'test') - val_ds = split_sim_dataset(db_ds, db_data, 'validation') + test_ds, test_data = split_sim_dataset(db_ds, db_data, TEST_DATASET_NAME) + val_ds, val_data = split_sim_dataset(db_ds, db_data, VALIDATION_DATASET_NAME) # Training and cross-validation of models for classification of bacterias ################################################################################ diff --git a/src/Caribou_extraction.py b/src/Caribou_extraction.py index d3ea11f..eda156b 100644 --- a/src/Caribou_extraction.py +++ b/src/Caribou_extraction.py @@ -5,12 +5,15 @@ from utils import * from time import time from pathlib import Path +from models.reads_simulation import split_sim_dataset from models.classification_old import ClassificationMethods __author__ = "Nicolas de Montigny" __all__ = ['bacteria_extraction_train_cv'] +VALIDATION_DATASET_NAME = 'validation' + # Initialisation / validation of parameters from CLI ################################################################################ def bacteria_extraction(opt): @@ -35,13 +38,15 @@ def bacteria_extraction(opt): if opt['data_host'] is not None: db_data, db_ds = verify_load_host_merge(opt['data_bacteria'], opt['data_host']) + db_name = 'host_merged' else: db_data, db_ds = verify_load_db(opt['data_bacteria']) + db_name = opt['dataset_name'] data_metagenome = verify_load_data(opt['data_metagenome']) k_length = len(db_data['kmers'][0]) - val_ds = split_sim_dataset(db_ds, db_data, 'validation') + val_ds, val_data = split_sim_dataset(db_ds, db_data, f'{VALIDATION_DATASET_NAME}_{db_name}') # Definition of model for bacteria extraction / host removal + execution ################################################################################ diff --git a/src/Caribou_extraction_train_cv.py b/src/Caribou_extraction_train_cv.py index fccb0c5..2a77471 100644 --- a/src/Caribou_extraction_train_cv.py +++ b/src/Caribou_extraction_train_cv.py @@ -5,12 +5,16 @@ from utils import * from time import time from pathlib import Path +from models.reads_simulation import split_sim_dataset from models.classification_old import ClassificationMethods __author__ = "Nicolas de Montigny" __all__ = ['bacteria_extraction_train_cv'] +VALIDATION_DATASET_NAME = 'validation' +TEST_DATASET_NAME = 'test' + # Initialisation / validation of 
parameters from CLI ################################################################################ def bacteria_extraction_train_cv(opt): @@ -29,13 +33,15 @@ def bacteria_extraction_train_cv(opt): if opt['data_host'] is not None: db_data, db_ds = verify_load_host_merge(opt['data_bacteria'], opt['data_host']) + db_name = 'host_merged' else: db_data, db_ds = verify_load_db(opt['data_bacteria']) + db_name = opt['dataset_name'] k_length = len(db_data['kmers'][0]) - test_ds = split_sim_dataset(db_ds, db_data, 'test') - val_ds = split_sim_dataset(db_ds, db_data, 'validation') + test_ds, test_data = split_sim_dataset(db_ds, db_data, f'{TEST_DATASET_NAME}_{db_name}') + val_ds, val_data = split_sim_dataset(db_ds, db_data, f'{VALIDATION_DATASET_NAME}_{db_name}') # Training and cross-validation of models for bacteria extraction / host removal ################################################################################ diff --git a/src/Caribou_pipeline.py b/src/Caribou_pipeline.py index f6b1fe5..a6fdb0b 100644 --- a/src/Caribou_pipeline.py +++ b/src/Caribou_pipeline.py @@ -15,6 +15,7 @@ __all__ = ['caribou'] + # Part 0 - Initialisation / extraction of parameters from config file ################################################################################ def caribou(opt): diff --git a/src/Caribou_simulate_test_val.py b/src/Caribou_simulate_test_val.py index 969f533..ca45c4d 100644 --- a/src/Caribou_simulate_test_val.py +++ b/src/Caribou_simulate_test_val.py @@ -17,6 +17,9 @@ The script leverages the InSilicoSeq package for simulation of sequencing reads """ +VALIDATION_DATASET_NAME = 'validation' +TEST_DATASET_NAME = 'test' + # Initialisation / validation of parameters from CLI ################################################################################ def simulation(opt): @@ -24,18 +27,22 @@ def simulation(opt): 1. Verify existence of files and load data 2. Verify k-mers length concordance 3. Initialize cluster + 4. 
Load data and merge if necessary """ - if opt['hostset'] is not None: - db_data, db_ds = verify_load_host_merge(opt['dataset'], opt['hostset']) - else: - db_data, db_ds = verify_load_db(opt['dataset']) - + verify_file(opt['kmers_list']) outdirs = define_create_outdirs(opt['outdir']) init_ray_cluster(opt['workdir']) + if opt['hostset'] is not None: + db_data, db_ds = verify_load_host_merge(opt['dataset'], opt['hostset']) + db_name = 'host_merged' + else: + db_data, db_ds = verify_load_db(opt['dataset']) + db_name = opt['dataset_name'] + # Dataset(s) simulation ################################################################################ """ @@ -47,17 +54,17 @@ def simulation(opt): t_val = None if opt['test']: t_s = time() - test_ds = split_sim_dataset(db_ds, db_data, 'test') + test_ds, test_data = split_sim_dataset(db_ds, db_data, f'{TEST_DATASET_NAME}_{db_name}') t_test = time() - t_s if opt['validation']: t_s = time() - val_ds = split_sim_dataset(db_ds, db_data, 'validation') + val_ds, val_data = split_sim_dataset(db_ds, db_data, f'{VALIDATION_DATASET_NAME}_{db_name}') t_val = time() - t_s if t_test is not None: - print(f'Caribou finished generating the test dataset in {t_test} seconds') + print(f'Caribou finished generating the {TEST_DATASET_NAME} dataset in {t_test} seconds') if t_val is not None: - print(f'Caribou finished generating the validation dataset simulated in {t_val} seconds') + print(f'Caribou finished generating the {VALIDATION_DATASET_NAME} dataset simulated in {t_val} seconds') # Argument parsing from CLI ################################################################################ diff --git a/src/models/classification.py b/src/models/classification.py index 0b1412b..b9bc523 100644 --- a/src/models/classification.py +++ b/src/models/classification.py @@ -19,6 +19,10 @@ __all__ = ['ClassificationMethods'] +TRAINING_DATASET_NAME = 'train' +VALIDATION_DATASET_NAME = 'validation' +TEST_DATASET_NAME = 'test' + class ClassificationMethods(): """ Utilities class for classifying sequences from metagenomes using ray @@ -37,12 +41,14 @@ class ClassificationMethods(): Methods ---------- - execute_training : launch the training of the models for the chosen taxonomic levels - no parameters to pass + fit : function to call the fitting method + + predict : function to call the predicting method - execute_classification : - data2classify : a dictionnary containing the data to classify produced by the function Caribou.src.data.build_data.build_X_data + fit_predict : wrapper function for calling fit and predict + cross_validation : function to call the cross-validation process + """ def __init__( self, @@ -112,7 +118,7 @@ def __init__( # TODO: Remove parameters from global if they are only required for certain functions # TODO: Finish transfering the functions & calls from the old version # TODO: Validation of params before execution of private functions - def fit(self): + def fit(self, datasets, ): """ Wrapper function to call the fitting method """ diff --git a/src/models/reads_simulation.py b/src/models/reads_simulation.py index b78d7d9..e517cc4 100644 --- a/src/models/reads_simulation.py +++ b/src/models/reads_simulation.py @@ -8,6 +8,7 @@ import gzip import warnings +from utils import * from Bio import SeqIO from glob import glob from pathlib import Path @@ -211,18 +212,21 @@ def _verify_sim_arguments(self, k, kmers_list): ######################################################################################################### def split_sim_dataset(ds, data, name): - 
splitted_path = os.path.join(os.path.dirname(data['profile']), f'Xy_genome_simulation_{name}_data_K{len(data["kmers"][0])}') + splitted_path = os.path.join(os.path.dirname(data['profile']), f'Xy_genome_simulation_{name}_data_K{len(data["kmers"][0])}.npz') if os.path.exists(splitted_path): - warnings.warn(f'Splitted dataset {name} already exists, skipping simulation') - return None + warnings.warn(f'The {name} dataset already exists, skipping simulation and loading the dataset') + splitted_data = load_Xy_data(splitted_path) + files_lst = glob(os.path.join(splitted_data['profile'],'*.parquet')) + splitted_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + return splitted_ds, splitted_data else: splitted_ds = ds.random_sample(0.1) if splitted_ds.count() == 0: nb_samples = round(ds.count() * 0.1) splitted_ds = ds.random_shuffle().limit(nb_samples) - sim_dataset(ds, data, name) - return splitted_ds + splitted_ds, splitted_data = sim_dataset(ds, data, name) + return splitted_ds, splitted_data def sim_dataset(ds, data, name): """ @@ -240,4 +244,4 @@ def sim_dataset(ds, data, name): sim_data = cv_sim.simulation(k, data['kmers']) files_lst = glob(os.path.join(sim_data['profile'], '*.parquet')) sim_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - return sim_ds \ No newline at end of file + return sim_ds, sim_data \ No newline at end of file diff --git a/src/utils.py b/src/utils.py index f52d467..f9f1d4b 100644 --- a/src/utils.py +++ b/src/utils.py @@ -46,6 +46,11 @@ 'merge_db_host' ] +# Constants +######################################################################################################### + +TENSOR_COLUMN_NAME = '__value__' + # System ######################################################################################################### @@ -324,17 +329,25 @@ def merge_db_host(db_data, host_data): Merge the two databases along the rows axis """ merged_db_host = {} - merged_db_host['profile'] = f"{db_data['profile']}_host_merged" + merged_db_host_file = f"{db_data['profile']}_host_merged.npz" - if os.path.exists(merged_db_host['profile']): + if os.path.exists(merged_db_host_file): + merged_db_host = load_Xy_data(merged_db_host_file) files_lst = glob(os.path.join(merged_db_host['profile'], '*.parquet')) merged_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) else: + merged_db_host['profile'] = f"{db_data['profile']}_host_merged" files_lst = glob(os.path.join(db_data['profile'], '*.parquet')) db_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) files_lst = glob(os.path.join(host_data['profile'], '*.parquet')) host_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + cols2drop = [col for col in db_ds.schema().names if col not in ['id','domain',TENSOR_COLUMN_NAME]] + db_ds = db_ds.drop_columns(cols2drop) + + cols2drop = [col for col in host_ds.schema().names if col not in ['id','domain',TENSOR_COLUMN_NAME]] + host_ds = host_ds.drop_columns(cols2drop) + merged_ds = db_ds.union(host_ds) merged_ds = merged_ds.map_batches(convert_archaea_bacteria, batch_format = 'pandas') merged_ds.write_parquet(merged_db_host['profile']) @@ -344,4 +357,6 @@ def merge_db_host(db_data, host_data): merged_db_host['taxas'] = ['domain'] # Known taxas for classification merged_db_host['fasta'] = (db_data['fasta'], host_data['fasta']) # Fasta file needed for reads simulation + save_Xy_data(merged_db_host, merged_db_host_file) + return merged_db_host, merged_ds From 
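Editor's note: both split_sim_dataset and merge_db_host above now follow a cache-or-build pattern: if the artefact already exists on disk it is reloaded, otherwise it is built, written as parquet and described by a small metadata file. A condensed sketch of the merge side, limited to the shared column subset; merge_profiles is an illustrative name and the .npz metadata bookkeeping is omitted here.

    import os
    import ray
    from glob import glob

    KEEP_COLS = ['id', 'domain', '__value__']          # shared schema kept for the merged profile

    def merge_profiles(db_profile: str, host_profile: str, out_profile: str) -> ray.data.Dataset:
        if os.path.exists(out_profile):                # cache hit: reuse the previous merge
            files = glob(os.path.join(out_profile, '*.parquet'))
            return ray.data.read_parquet_bulk(files, parallelism=len(files))

        def _load(profile):
            files = glob(os.path.join(profile, '*.parquet'))
            ds = ray.data.read_parquet_bulk(files, parallelism=len(files))
            extra = [c for c in ds.schema().names if c not in KEEP_COLS]
            return ds.drop_columns(extra)

        merged = _load(db_profile).union(_load(host_profile))
        merged.write_parquet(out_profile)              # persist so the next run takes the cache path
        return merged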
5cc20ee4e59052b7d30674d0f14ad8dd72d80cc2 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Mon, 6 Nov 2023 08:20:13 -0500 Subject: [PATCH 24/92] simulation wrong dataset passed --- src/models/reads_simulation.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/models/reads_simulation.py b/src/models/reads_simulation.py index e517cc4..8057421 100644 --- a/src/models/reads_simulation.py +++ b/src/models/reads_simulation.py @@ -12,6 +12,7 @@ from Bio import SeqIO from glob import glob from pathlib import Path +from shutil import rmtree from warnings import warn from data.build_data import build_load_save_data from joblib import Parallel, delayed, parallel_backend @@ -92,7 +93,11 @@ def __init__( self._cls_out = os.path.join(outdir, f'sim_{self._name}_class.csv') # Dataset variables self.kmers_data = {} - os.mkdir(self._tmp_path) + try: + os.mkdir(self._tmp_path) + except FileExistsError: + rmtree(self._tmp_path) + os.mkdir(self._tmp_path) def simulation(self, k = None, kmers_list = None): k, kmers_list = self._verify_sim_arguments(k, kmers_list) @@ -224,8 +229,7 @@ def split_sim_dataset(ds, data, name): if splitted_ds.count() == 0: nb_samples = round(ds.count() * 0.1) splitted_ds = ds.random_shuffle().limit(nb_samples) - - splitted_ds, splitted_data = sim_dataset(ds, data, name) + splitted_ds, splitted_data = sim_dataset(splitted_ds, data, name) return splitted_ds, splitted_data def sim_dataset(ds, data, name): @@ -238,7 +242,6 @@ def sim_dataset(ds, data, name): cls = pd.DataFrame(columns = cols) for batch in ds.iter_batches(batch_format = 'pandas'): cls = pd.concat([cls, batch[cols]], axis = 0, ignore_index = True) - sim_outdir = os.path.dirname(data['profile']) cv_sim = readsSimulation(data['fasta'], cls, list(cls['id']), 'miseq', sim_outdir, name) sim_data = cv_sim.simulation(k, data['kmers']) From d94bd2cfd50e081f2affce0b5280c41d471c28a7 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Mon, 6 Nov 2023 17:29:46 -0500 Subject: [PATCH 25/92] read parallelism -1 + reduce nb simulation --- src/data/kmers.py | 3 +-- src/models/classification.py | 3 +-- src/models/classification_old.py | 24 ++++++++---------------- src/models/reads_simulation.py | 6 +++--- src/utils.py | 8 ++++---- 5 files changed, 17 insertions(+), 27 deletions(-) diff --git a/src/data/kmers.py b/src/data/kmers.py index 5a0fbe5..7b1cb52 100644 --- a/src/data/kmers.py +++ b/src/data/kmers.py @@ -319,8 +319,7 @@ def _make_ray_ds(self): self.df = self.df.repartition(int(self.df.count()/10)) else: self._files_list = glob(os.path.join(self._tmp_dir, '*.parquet')) - self.df = ray.data.read_parquet_bulk(self._files_list, parallelism = len(self._files_list)) - # self.df = ray.data.read_parquet_bulk(self._files_list, parallelism = -1) + self.df = ray.data.read_parquet_bulk(self._files_list, parallelism = -1) def _kmers_tokenization(self): print('_kmers_tokenization') diff --git a/src/models/classification.py b/src/models/classification.py index b9bc523..a2e8613 100644 --- a/src/models/classification.py +++ b/src/models/classification.py @@ -172,8 +172,7 @@ def _predict(self, data2classify): Predict the given data using the trained model """ files_lst = glob(os.path.join(data2classify['profile'],'*.parquet')) - df = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - # df = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + df = ray.data.read_parquet_bulk(files_lst, parallelism = -1) ids = data2classify['ids'] if len(self.classified_data['sequence']) == 0: raise 
ValueError('Please train a model before executing classification') diff --git a/src/models/classification_old.py b/src/models/classification_old.py index 7419d92..55847ec 100644 --- a/src/models/classification_old.py +++ b/src/models/classification_old.py @@ -112,8 +112,7 @@ def __init__( def execute_training_prediction(self, data2classify): print('execute_training_prediction') files_lst = glob(os.path.join(data2classify['profile'],'*.parquet')) - df2classify = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - # df2classify = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + df2classify = ray.data.read_parquet_bulk(files_lst, parallelism = -1) ids2classify = data2classify['ids'] for i, taxa in enumerate(self._taxas_order): if taxa in self._taxas: @@ -226,15 +225,12 @@ def _merge_database_host(self, database_data, host_data): if os.path.exists(self._merged_database_host['profile']): files_lst = glob(os.path.join(self._merged_database_host['profile'],'*.parquet')) - df_merged = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - # df_merged = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + df_merged = ray.data.read_parquet_bulk(files_lst, parallelism = -1) else: files_lst = glob(os.path.join(database_data['profile'],'*.parquet')) - df_db = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - # df_db = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + df_db = ray.data.read_parquet_bulk(files_lst, parallelism = -1) files_lst = glob(os.path.join(host_data['profile'],'*.parquet')) - df_host = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - # df_host = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + df_host = ray.data.read_parquet_bulk(files_lst, parallelism = -1) cols2drop = [] for col in df_db.schema().names: @@ -262,8 +258,7 @@ def _load_training_data_merged(self, taxa): print('_load_training_data_merged') if self._classifier_binary == 'onesvm' and taxa == 'domain': files_lst = glob(os.path.join(self._database_data['profile'],'*.parquet')) - df_train = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - # df_train = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + df_train = ray.data.read_parquet_bulk(files_lst, parallelism = -1) df_train = df_train.map_batches(convert_archaea_bacteria, batch_format = 'pandas') df_val_test = self._merge_database_host(self._database_data, self._host_data) df_val_test = df_val_test.map_batches(convert_archaea_bacteria, batch_format = 'pandas') @@ -285,8 +280,7 @@ def _load_training_data_merged(self, taxa): def _load_training_data(self): print('_load_training_data') files_lst = glob(os.path.join(self._database_data['profile'],'*.parquet')) - df_train = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - # df_train = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + df_train = ray.data.read_parquet_bulk(files_lst, parallelism = -1) df_train = df_train.map_batches(convert_archaea_bacteria, batch_format = 'pandas') df_val = self.split_sim_cv_ds(df_train,self._database_data, 'validation') self._training_datasets = {'train': df_train, 'validation': df_val} @@ -306,8 +300,7 @@ def _sim_4_cv(self, df, kmers_ds, name): cv_sim = readsSimulation(kmers_ds['fasta'], cls, list(cls['id']), 'miseq', sim_outdir, name) sim_data = cv_sim.simulation(self._k, kmers_ds['kmers']) files_lst = glob(os.path.join(sim_data['profile'],'*.parquet')) - df = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - # 
df = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + df = ray.data.read_parquet_bulk(files_lst, parallelism = -1) return df def split_sim_cv_ds(self, ds, data, name): @@ -317,8 +310,7 @@ def split_sim_cv_ds(self, ds, data, name): ) if os.path.exists(ds_path): files_lst = glob(os.path.join(ds_path,'*.parquet')) - cv_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - # cv_ds = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + cv_ds = ray.data.read_parquet_bulk(files_lst, parallelism = -1) else: cv_ds = ds.random_sample(0.1) if cv_ds.count() == 0: diff --git a/src/models/reads_simulation.py b/src/models/reads_simulation.py index 8057421..630f86f 100644 --- a/src/models/reads_simulation.py +++ b/src/models/reads_simulation.py @@ -79,7 +79,7 @@ def __init__( self._fasta_host = None self._cls_in = cls self._genomes = genomes - self._nb_reads = len(genomes) * 5 + self._nb_reads = len(genomes) * 3 self._sequencing = sequencing self._path = outdir self._tmp_path = os.path.join(outdir,'tmp') @@ -222,7 +222,7 @@ def split_sim_dataset(ds, data, name): warnings.warn(f'The {name} dataset already exists, skipping simulation and loading the dataset') splitted_data = load_Xy_data(splitted_path) files_lst = glob(os.path.join(splitted_data['profile'],'*.parquet')) - splitted_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + splitted_ds = ray.data.read_parquet_bulk(files_lst, parallelism = -1) return splitted_ds, splitted_data else: splitted_ds = ds.random_sample(0.1) @@ -246,5 +246,5 @@ def sim_dataset(ds, data, name): cv_sim = readsSimulation(data['fasta'], cls, list(cls['id']), 'miseq', sim_outdir, name) sim_data = cv_sim.simulation(k, data['kmers']) files_lst = glob(os.path.join(sim_data['profile'], '*.parquet')) - sim_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + sim_ds = ray.data.read_parquet_bulk(files_lst, parallelism = -1) return sim_ds, sim_data \ No newline at end of file diff --git a/src/utils.py b/src/utils.py index f9f1d4b..71c564d 100644 --- a/src/utils.py +++ b/src/utils.py @@ -308,7 +308,7 @@ def verify_load_db(db_data): """ db_data = verify_load_data(db_data) files_lst = glob(os.path.join(db_data['profile'], '*.parquet')) - db_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + db_ds = ray.data.read_parquet_bulk(files_lst, parallelism = -1) db_ds = db_ds.map_batches(convert_archaea_bacteria, batch_format = 'pandas') return db_data, db_ds @@ -334,13 +334,13 @@ def merge_db_host(db_data, host_data): if os.path.exists(merged_db_host_file): merged_db_host = load_Xy_data(merged_db_host_file) files_lst = glob(os.path.join(merged_db_host['profile'], '*.parquet')) - merged_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + merged_ds = ray.data.read_parquet_bulk(files_lst, parallelism = -1) else: merged_db_host['profile'] = f"{db_data['profile']}_host_merged" files_lst = glob(os.path.join(db_data['profile'], '*.parquet')) - db_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + db_ds = ray.data.read_parquet_bulk(files_lst, parallelism = -1) files_lst = glob(os.path.join(host_data['profile'], '*.parquet')) - host_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + host_ds = ray.data.read_parquet_bulk(files_lst, parallelism = -1) cols2drop = [col for col in db_ds.schema().names if col not in ['id','domain',TENSOR_COLUMN_NAME]] db_ds = db_ds.drop_columns(cols2drop) From 75bd9d1fdfbedb1cf257b8eeda77751df494133a Mon Sep 
17 00:00:00 2001 From: Nicolas de Montigny Date: Mon, 6 Nov 2023 17:35:35 -0500 Subject: [PATCH 26/92] reads parallelism = len(files_lst)/100 --- src/data/kmers.py | 2 +- src/models/classification.py | 2 +- src/models/classification_old.py | 16 ++++++++-------- src/models/reads_simulation.py | 4 ++-- src/utils.py | 8 ++++---- 5 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/data/kmers.py b/src/data/kmers.py index 7b1cb52..bc19b21 100644 --- a/src/data/kmers.py +++ b/src/data/kmers.py @@ -319,7 +319,7 @@ def _make_ray_ds(self): self.df = self.df.repartition(int(self.df.count()/10)) else: self._files_list = glob(os.path.join(self._tmp_dir, '*.parquet')) - self.df = ray.data.read_parquet_bulk(self._files_list, parallelism = -1) + self.df = ray.data.read_parquet_bulk(self._files_list, parallelism = len(files_lst)/100) def _kmers_tokenization(self): print('_kmers_tokenization') diff --git a/src/models/classification.py b/src/models/classification.py index a2e8613..e1e65c1 100644 --- a/src/models/classification.py +++ b/src/models/classification.py @@ -172,7 +172,7 @@ def _predict(self, data2classify): Predict the given data using the trained model """ files_lst = glob(os.path.join(data2classify['profile'],'*.parquet')) - df = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + df = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)/100) ids = data2classify['ids'] if len(self.classified_data['sequence']) == 0: raise ValueError('Please train a model before executing classification') diff --git a/src/models/classification_old.py b/src/models/classification_old.py index 55847ec..15d8f23 100644 --- a/src/models/classification_old.py +++ b/src/models/classification_old.py @@ -112,7 +112,7 @@ def __init__( def execute_training_prediction(self, data2classify): print('execute_training_prediction') files_lst = glob(os.path.join(data2classify['profile'],'*.parquet')) - df2classify = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + df2classify = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)/100) ids2classify = data2classify['ids'] for i, taxa in enumerate(self._taxas_order): if taxa in self._taxas: @@ -225,12 +225,12 @@ def _merge_database_host(self, database_data, host_data): if os.path.exists(self._merged_database_host['profile']): files_lst = glob(os.path.join(self._merged_database_host['profile'],'*.parquet')) - df_merged = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + df_merged = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)/100) else: files_lst = glob(os.path.join(database_data['profile'],'*.parquet')) - df_db = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + df_db = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)/100) files_lst = glob(os.path.join(host_data['profile'],'*.parquet')) - df_host = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + df_host = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)/100) cols2drop = [] for col in df_db.schema().names: @@ -258,7 +258,7 @@ def _load_training_data_merged(self, taxa): print('_load_training_data_merged') if self._classifier_binary == 'onesvm' and taxa == 'domain': files_lst = glob(os.path.join(self._database_data['profile'],'*.parquet')) - df_train = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + df_train = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)/100) df_train = df_train.map_batches(convert_archaea_bacteria, batch_format = 'pandas') df_val_test = 
self._merge_database_host(self._database_data, self._host_data) df_val_test = df_val_test.map_batches(convert_archaea_bacteria, batch_format = 'pandas') @@ -280,7 +280,7 @@ def _load_training_data_merged(self, taxa): def _load_training_data(self): print('_load_training_data') files_lst = glob(os.path.join(self._database_data['profile'],'*.parquet')) - df_train = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + df_train = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)/100) df_train = df_train.map_batches(convert_archaea_bacteria, batch_format = 'pandas') df_val = self.split_sim_cv_ds(df_train,self._database_data, 'validation') self._training_datasets = {'train': df_train, 'validation': df_val} @@ -300,7 +300,7 @@ def _sim_4_cv(self, df, kmers_ds, name): cv_sim = readsSimulation(kmers_ds['fasta'], cls, list(cls['id']), 'miseq', sim_outdir, name) sim_data = cv_sim.simulation(self._k, kmers_ds['kmers']) files_lst = glob(os.path.join(sim_data['profile'],'*.parquet')) - df = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + df = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)/100) return df def split_sim_cv_ds(self, ds, data, name): @@ -310,7 +310,7 @@ def split_sim_cv_ds(self, ds, data, name): ) if os.path.exists(ds_path): files_lst = glob(os.path.join(ds_path,'*.parquet')) - cv_ds = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + cv_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)/100) else: cv_ds = ds.random_sample(0.1) if cv_ds.count() == 0: diff --git a/src/models/reads_simulation.py b/src/models/reads_simulation.py index 630f86f..eb61077 100644 --- a/src/models/reads_simulation.py +++ b/src/models/reads_simulation.py @@ -222,7 +222,7 @@ def split_sim_dataset(ds, data, name): warnings.warn(f'The {name} dataset already exists, skipping simulation and loading the dataset') splitted_data = load_Xy_data(splitted_path) files_lst = glob(os.path.join(splitted_data['profile'],'*.parquet')) - splitted_ds = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + splitted_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)/100) return splitted_ds, splitted_data else: splitted_ds = ds.random_sample(0.1) @@ -246,5 +246,5 @@ def sim_dataset(ds, data, name): cv_sim = readsSimulation(data['fasta'], cls, list(cls['id']), 'miseq', sim_outdir, name) sim_data = cv_sim.simulation(k, data['kmers']) files_lst = glob(os.path.join(sim_data['profile'], '*.parquet')) - sim_ds = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + sim_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)/100) return sim_ds, sim_data \ No newline at end of file diff --git a/src/utils.py b/src/utils.py index 71c564d..5127ed3 100644 --- a/src/utils.py +++ b/src/utils.py @@ -308,7 +308,7 @@ def verify_load_db(db_data): """ db_data = verify_load_data(db_data) files_lst = glob(os.path.join(db_data['profile'], '*.parquet')) - db_ds = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + db_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)/100) db_ds = db_ds.map_batches(convert_archaea_bacteria, batch_format = 'pandas') return db_data, db_ds @@ -334,13 +334,13 @@ def merge_db_host(db_data, host_data): if os.path.exists(merged_db_host_file): merged_db_host = load_Xy_data(merged_db_host_file) files_lst = glob(os.path.join(merged_db_host['profile'], '*.parquet')) - merged_ds = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + merged_ds = ray.data.read_parquet_bulk(files_lst, 
parallelism = len(files_lst)/100) else: merged_db_host['profile'] = f"{db_data['profile']}_host_merged" files_lst = glob(os.path.join(db_data['profile'], '*.parquet')) - db_ds = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + db_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)/100) files_lst = glob(os.path.join(host_data['profile'], '*.parquet')) - host_ds = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + host_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)/100) cols2drop = [col for col in db_ds.schema().names if col not in ['id','domain',TENSOR_COLUMN_NAME]] db_ds = db_ds.drop_columns(cols2drop) From 9e9a663de32beb3d4e72afd18a9d8a48016e8147 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Mon, 6 Nov 2023 17:39:00 -0500 Subject: [PATCH 27/92] parallelism = len(files_lst) --- src/data/kmers.py | 2 +- src/models/classification.py | 2 +- src/models/classification_old.py | 16 ++++++++-------- src/models/reads_simulation.py | 4 ++-- src/utils.py | 8 ++++---- 5 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/data/kmers.py b/src/data/kmers.py index bc19b21..a42f7d9 100644 --- a/src/data/kmers.py +++ b/src/data/kmers.py @@ -319,7 +319,7 @@ def _make_ray_ds(self): self.df = self.df.repartition(int(self.df.count()/10)) else: self._files_list = glob(os.path.join(self._tmp_dir, '*.parquet')) - self.df = ray.data.read_parquet_bulk(self._files_list, parallelism = len(files_lst)/100) + self.df = ray.data.read_parquet_bulk(self._files_list, parallelism = len(files_lst)) def _kmers_tokenization(self): print('_kmers_tokenization') diff --git a/src/models/classification.py b/src/models/classification.py index e1e65c1..6a0cbc0 100644 --- a/src/models/classification.py +++ b/src/models/classification.py @@ -172,7 +172,7 @@ def _predict(self, data2classify): Predict the given data using the trained model """ files_lst = glob(os.path.join(data2classify['profile'],'*.parquet')) - df = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)/100) + df = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) ids = data2classify['ids'] if len(self.classified_data['sequence']) == 0: raise ValueError('Please train a model before executing classification') diff --git a/src/models/classification_old.py b/src/models/classification_old.py index 15d8f23..7638c17 100644 --- a/src/models/classification_old.py +++ b/src/models/classification_old.py @@ -112,7 +112,7 @@ def __init__( def execute_training_prediction(self, data2classify): print('execute_training_prediction') files_lst = glob(os.path.join(data2classify['profile'],'*.parquet')) - df2classify = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)/100) + df2classify = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) ids2classify = data2classify['ids'] for i, taxa in enumerate(self._taxas_order): if taxa in self._taxas: @@ -225,12 +225,12 @@ def _merge_database_host(self, database_data, host_data): if os.path.exists(self._merged_database_host['profile']): files_lst = glob(os.path.join(self._merged_database_host['profile'],'*.parquet')) - df_merged = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)/100) + df_merged = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) else: files_lst = glob(os.path.join(database_data['profile'],'*.parquet')) - df_db = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)/100) + df_db = ray.data.read_parquet_bulk(files_lst, parallelism = 
len(files_lst)) files_lst = glob(os.path.join(host_data['profile'],'*.parquet')) - df_host = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)/100) + df_host = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) cols2drop = [] for col in df_db.schema().names: @@ -258,7 +258,7 @@ def _load_training_data_merged(self, taxa): print('_load_training_data_merged') if self._classifier_binary == 'onesvm' and taxa == 'domain': files_lst = glob(os.path.join(self._database_data['profile'],'*.parquet')) - df_train = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)/100) + df_train = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) df_train = df_train.map_batches(convert_archaea_bacteria, batch_format = 'pandas') df_val_test = self._merge_database_host(self._database_data, self._host_data) df_val_test = df_val_test.map_batches(convert_archaea_bacteria, batch_format = 'pandas') @@ -280,7 +280,7 @@ def _load_training_data_merged(self, taxa): def _load_training_data(self): print('_load_training_data') files_lst = glob(os.path.join(self._database_data['profile'],'*.parquet')) - df_train = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)/100) + df_train = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) df_train = df_train.map_batches(convert_archaea_bacteria, batch_format = 'pandas') df_val = self.split_sim_cv_ds(df_train,self._database_data, 'validation') self._training_datasets = {'train': df_train, 'validation': df_val} @@ -300,7 +300,7 @@ def _sim_4_cv(self, df, kmers_ds, name): cv_sim = readsSimulation(kmers_ds['fasta'], cls, list(cls['id']), 'miseq', sim_outdir, name) sim_data = cv_sim.simulation(self._k, kmers_ds['kmers']) files_lst = glob(os.path.join(sim_data['profile'],'*.parquet')) - df = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)/100) + df = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) return df def split_sim_cv_ds(self, ds, data, name): @@ -310,7 +310,7 @@ def split_sim_cv_ds(self, ds, data, name): ) if os.path.exists(ds_path): files_lst = glob(os.path.join(ds_path,'*.parquet')) - cv_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)/100) + cv_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) else: cv_ds = ds.random_sample(0.1) if cv_ds.count() == 0: diff --git a/src/models/reads_simulation.py b/src/models/reads_simulation.py index eb61077..463c077 100644 --- a/src/models/reads_simulation.py +++ b/src/models/reads_simulation.py @@ -222,7 +222,7 @@ def split_sim_dataset(ds, data, name): warnings.warn(f'The {name} dataset already exists, skipping simulation and loading the dataset') splitted_data = load_Xy_data(splitted_path) files_lst = glob(os.path.join(splitted_data['profile'],'*.parquet')) - splitted_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)/100) + splitted_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) return splitted_ds, splitted_data else: splitted_ds = ds.random_sample(0.1) @@ -246,5 +246,5 @@ def sim_dataset(ds, data, name): cv_sim = readsSimulation(data['fasta'], cls, list(cls['id']), 'miseq', sim_outdir, name) sim_data = cv_sim.simulation(k, data['kmers']) files_lst = glob(os.path.join(sim_data['profile'], '*.parquet')) - sim_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)/100) + sim_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) return sim_ds, sim_data \ No newline at end of file diff 
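# The three patches above flip the `parallelism` argument of
# ray.data.read_parquet_bulk between len(files_lst), -1 (let Ray choose) and
# len(files_lst)/100; note that the kmers.py hunk passes len(files_lst) in a
# scope whose surrounding code defines self._files_list. `parallelism` is a
# count of read tasks, so a fractional value needs integer division. A hedged
# sketch, with a hypothetical glob pattern standing in for the real profile
# directory:

import ray
from glob import glob

files = glob('/path/to/profile/*.parquet')   # hypothetical location
read_tasks = max(1, len(files) // 100)       # integer, never below 1
ds = ray.data.read_parquet_bulk(files, parallelism=read_tasks)
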
--git a/src/utils.py b/src/utils.py index 5127ed3..f9f1d4b 100644 --- a/src/utils.py +++ b/src/utils.py @@ -308,7 +308,7 @@ def verify_load_db(db_data): """ db_data = verify_load_data(db_data) files_lst = glob(os.path.join(db_data['profile'], '*.parquet')) - db_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)/100) + db_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) db_ds = db_ds.map_batches(convert_archaea_bacteria, batch_format = 'pandas') return db_data, db_ds @@ -334,13 +334,13 @@ def merge_db_host(db_data, host_data): if os.path.exists(merged_db_host_file): merged_db_host = load_Xy_data(merged_db_host_file) files_lst = glob(os.path.join(merged_db_host['profile'], '*.parquet')) - merged_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)/100) + merged_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) else: merged_db_host['profile'] = f"{db_data['profile']}_host_merged" files_lst = glob(os.path.join(db_data['profile'], '*.parquet')) - db_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)/100) + db_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) files_lst = glob(os.path.join(host_data['profile'], '*.parquet')) - host_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)/100) + host_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) cols2drop = [col for col in db_ds.schema().names if col not in ['id','domain',TENSOR_COLUMN_NAME]] db_ds = db_ds.drop_columns(cols2drop) From 8e828f3d3724ce9e8127207f4eb6a7551493bd79 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Thu, 9 Nov 2023 17:50:56 -0500 Subject: [PATCH 28/92] new version of ClassificationMethods without data loading + move cv out of framework specific classes + debug sklearn --- src/Caribou_classification.py | 77 +-- src/Caribou_classification_train_cv.py | 59 +- src/Caribou_extraction.py | 109 ++-- src/Caribou_extraction_train_cv.py | 87 ++- src/models/classification.py | 596 ++++++++++-------- src/models/classification_old.py | 327 ---------- src/models/encoders/model_label_encoder.py | 1 + src/models/encoders/one_hot_tensor_encoder.py | 2 +- src/models/kerasTF/models.py | 79 +-- src/models/models_utils.py | 59 +- src/models/preprocessors/tfidf_transformer.py | 2 +- src/models/sklearn/models.py | 94 +-- src/models/sklearn/partial_trainer.py | 7 +- src/utils.py | 22 +- 14 files changed, 564 insertions(+), 957 deletions(-) delete mode 100644 src/models/classification_old.py diff --git a/src/Caribou_classification.py b/src/Caribou_classification.py index a1992c7..0c4b460 100644 --- a/src/Caribou_classification.py +++ b/src/Caribou_classification.py @@ -7,12 +7,13 @@ from time import time from pathlib import Path from models.reads_simulation import split_sim_dataset -from models.classification_old import ClassificationMethods +from models.classification import ClassificationMethods __author__ = "Nicolas de Montigny" __all__ = ['bacteria_classification_train_cv'] +TRAINING_DATASET_NAME = 'train' VALIDATION_DATASET_NAME = 'validation' # Initialisation / validation of parameters from CLI @@ -36,14 +37,6 @@ def bacteria_classification(opt): ################################################################################ db_data, db_ds = verify_load_db(opt['data_bacteria']) - data_metagenome = verify_load_data(opt['data_metagenome']) - - k_length = len(db_data['kmers'][0]) - - if opt['preclassified_data'] is not None: - preclassified_data = 
verify_load_preclassified(opt['preclassified_data']) - else: - preclassified_data = None # Validate and extract list of taxas if opt['taxa'] is not None: @@ -56,60 +49,60 @@ def bacteria_classification(opt): val_ds, val_data = split_sim_dataset(db_ds, db_data, VALIDATION_DATASET_NAME) -# Definition of model for bacteria taxonomic classification + training + datasets = { + TRAINING_DATASET_NAME : db_ds, + VALIDATION_DATASET_NAME : val_ds + } + + metagenome_data, metagenome_ds = verify_load_metagenome(opt['data_metagenome']) + +# Definition of model for bacteria taxonomic classification ################################################################################ + clf = ClassificationMethods( - database_k_mers = db_data, - k = k_length, + db_data = db_data, outdirs = outdirs, - database = opt['database_name'], - classifier_multiclass = opt['model_type'], - taxa = lst_taxas, + db_name = opt['database_name'], + clf_multiclass = opt['model_type'], + taxa = 'domain', batch_size = opt['batch_size'], - training_epochs = opt['training_epochs'], - verbose = opt['verbose'], - cv = False + training_epochs = opt['training_epochs'] ) # Execution of bacteria taxonomic classification on metagenome + save results ################################################################################ - t_start = time() - end_taxa = clf.fit_predict(data_metagenome) - t_end = time() - t_classif = t_end - t_start - clf_data = merge_save_data( - clf.classified_data, - data_metagenome, - end_taxa, - outdirs['results_dir'], - opt['metagenome_name'], - preclassified = preclassified_data, - ) - if opt['taxa'] is None: - opt['taxa'] = 'all' - clf_data['classification'].to_csv(os.path.join(outdirs['results_dir'], f"classification_K{k_length}_{opt['taxa']}_{opt['model_type']}.csv")) - if end_taxa is None: - print(f"Caribou finished training the {opt['model_type']} model and classifying bacterial sequences at {opt['taxa']} taxonomic level with it. \ - \nThe training and classification steps took {t_classif} seconds to execute.") - else: - print(f"Caribou finished training the {opt['model_type']} model and classifying bacterial sequences at {opt['taxa']} taxonomic level until {end_taxa} because there were no more sequences to classify. \ - \nThe training and classification steps took {t_classif} seconds to execute.") + t_s = time() + clf.fit(datasets) + t_fit = time() - t_s + + t_s = time() + predictions = clf.predict(metagenome_ds) + t_clf = time() - t_s + + Xy_file = os.path.join(outdirs['results_dir'], f"extracted_bacteria_{opt['metagenome_name']}_{opt['model_type']}.npz") + save_Xy_data(predictions, Xy_file) + + print(f""" + Caribou finished training the {opt['model_type']} model in {t_fit} seconds. + Classification of bacteria from {opt['metagenome_name']} dataset was then executed in {t_clf} seconds. 
+ """) # Argument parsing from CLI ################################################################################ if __name__ == "__main__": parser = argparse.ArgumentParser(description='This script trains a model and classifies bacteria sequences iteratively over known taxonomic levels.') + # Database parser.add_argument('-db','--data_bacteria', required=True, type=Path, help='PATH to a npz file containing the data corresponding to the k-mers profile for the bacteria database') - parser.add_argument('-mg','--data_metagenome', required=True, type=Path, help='PATH to a npz file containing the data corresponding to the k-mers profile for the metagenome to classify') parser.add_argument('-dt','--database_name', required=True, help='Name of the bacteria database used to name files') + # Dataset + parser.add_argument('-mg','--data_metagenome', required=True, type=Path, help='PATH to a npz file containing the data corresponding to the k-mers profile for the metagenome to classify') parser.add_argument('-mn','--metagenome_name', required=True, help='Name of the metagenome to classify used to name files') - parser.add_argument('-pc','--preclassified_data', default=None, type=Path,help='Optional. PATH to a .npz file contianing classified data at another taxonomic level than the ones in the current analysis') + # Parameters parser.add_argument('-model','--model_type', default='lstm_attention', choices=['sgd','mnb','lstm_attention','cnn','widecnn'], help='The type of model to train') parser.add_argument('-t','--taxa', default=None, help='The taxonomic level to use for the classification, defaults to species. Can be one level or a list of levels separated by commas.') parser.add_argument('-bs','--batch_size', default=32, type=int, help='Size of the batch size to use, defaults to 32') parser.add_argument('-e','--training_epochs', default=100, type=int, help='The number of training iterations for the neural networks models if one ise chosen, defaults to 100') - parser.add_argument('-v','--verbose', action='store_true', help='Should the program be verbose') parser.add_argument('-o','--outdir', required=True, type=Path, help='PATH to a directory on file where outputs will be saved') parser.add_argument('-wd','--workdir', default='/tmp/spill', type=Path, help='Optional. 
Path to a working directory where Ray Tune will output and spill tuning data') args = parser.parse_args() diff --git a/src/Caribou_classification_train_cv.py b/src/Caribou_classification_train_cv.py index aac75d8..f6832a8 100644 --- a/src/Caribou_classification_train_cv.py +++ b/src/Caribou_classification_train_cv.py @@ -8,7 +8,7 @@ from pathlib import Path from logging import ERROR from models.reads_simulation import split_sim_dataset -from models.classification_old import ClassificationMethods +from models.classification import ClassificationMethods warnings.filterwarnings('ignore') @@ -16,6 +16,7 @@ __all__ = ['bacteria_classification_train_cv'] +TRAINING_DATASET_NAME = 'train' VALIDATION_DATASET_NAME = 'validation' TEST_DATASET_NAME = 'test' @@ -41,53 +42,59 @@ def bacteria_classification_train_cv(opt): db_data, db_ds = verify_load_db(opt['data_bacteria']) - k_length = len(db_data['kmers'][0]) - # Validate and extract list of taxas if opt['taxa'] is not None: lst_taxas = verify_taxas(opt['taxa'], db_data['taxas']) else: lst_taxas = db_data['taxas'].copy() - + if 'domain' in lst_taxas: lst_taxas.remove('domain') + + for taxa in lst_taxas: + + test_ds, test_data = split_sim_dataset(db_ds, db_data, TEST_DATASET_NAME) + val_ds, val_data = split_sim_dataset(db_ds, db_data, VALIDATION_DATASET_NAME) - test_ds, test_data = split_sim_dataset(db_ds, db_data, TEST_DATASET_NAME) - val_ds, val_data = split_sim_dataset(db_ds, db_data, VALIDATION_DATASET_NAME) + datasets = { + TRAINING_DATASET_NAME : db_ds, + TEST_DATASET_NAME : test_ds, + VALIDATION_DATASET_NAME : val_ds + } # Training and cross-validation of models for classification of bacterias ################################################################################ - t_start = time() - ClassificationMethods( - database_k_mers = db_data, - k = k_length, - outdirs = outdirs, - database = opt['database_name'], - classifier_binary = None, - classifier_multiclass = opt['model_type'], - taxa = lst_taxas, - batch_size = opt['batch_size'], - training_epochs = opt['training_epochs'], - verbose = opt['verbose'], - cv = True - ).fit() - t_end = time() - t_classify = t_end - t_start - print( - f"Caribou finished training and cross-validating the {opt['model_type']} model in {t_classify} seconds") + clf = ClassificationMethods( + db_data = db_data, + outdirs = outdirs, + db_name = opt['database_name'], + clf_multiclass = opt['model_type'], + taxa = taxa, + batch_size = opt['batch_size'], + training_epochs = opt['training_epochs'] + ) + + t_s = time() + + cv_scores = clf.cross_validation(datasets) + + t_clf = time() - t_s + + print(f"Caribou finished training and cross-validating the {opt['model_type']} model at taxa {taxa} in {t_clf} seconds") # Argument parsing from CLI ################################################################################ if __name__ == "__main__": parser = argparse.ArgumentParser(description='This script trains and cross-validates a model for the bacteria classification step.') + # Database parser.add_argument('-db','--data_bacteria', required=True, type=Path, help='PATH to a npz file containing the data corresponding to the k-mers profile for the bacteria database') - parser.add_argument('-dt','--database_name', required=True, help='Name of the bacteria database used to name files') + parser.add_argument('-dn','--database_name', required=True, help='Name of the bacteria database used to name files') + # Parameters parser.add_argument('-model','--model_type', default='lstm_attention', 
choices=['sgd','mnb','lstm_attention','cnn','widecnn'], help='The type of model to train') parser.add_argument('-t','--taxa', default=None, help='The taxonomic level to use for the classification, defaults to None. Can be one level or a list of levels separated by commas.') parser.add_argument('-bs','--batch_size', default=32, type=int, help='Size of the batch size to use, defaults to 32') parser.add_argument('-e','--training_epochs', default=100, type=int, help='The number of training iterations for the neural networks models if one ise chosen, defaults to 100') - parser.add_argument('-v','--verbose', action='store_true', help='Should the program be verbose') parser.add_argument('-o','--outdir', required=True, type=Path, help='PATH to a directory on file where outputs will be saved') parser.add_argument('-wd','--workdir', default='/tmp/spill', type=Path, help='Optional. Path to a working directory where Ray Tune will output and spill tuning data') args = parser.parse_args() diff --git a/src/Caribou_extraction.py b/src/Caribou_extraction.py index eda156b..3876f2b 100644 --- a/src/Caribou_extraction.py +++ b/src/Caribou_extraction.py @@ -1,17 +1,19 @@ #!/usr/bin python3 +import os import argparse from utils import * from time import time from pathlib import Path from models.reads_simulation import split_sim_dataset -from models.classification_old import ClassificationMethods +from models.classification import ClassificationMethods __author__ = "Nicolas de Montigny" __all__ = ['bacteria_extraction_train_cv'] +TRAINING_DATASET_NAME = 'train' VALIDATION_DATASET_NAME = 'validation' # Initialisation / validation of parameters from CLI @@ -36,82 +38,79 @@ def bacteria_extraction(opt): # Data loading ################################################################################ - if opt['data_host'] is not None: + if opt['model_type'] != 'onesvm': + if opt['data_host'] is not None: + db_data, db_ds = verify_load_host_merge(opt['data_bacteria'], opt['data_host']) + db_name = 'host_merged' + else: + db_data, db_ds = verify_load_db(opt['data_bacteria']) + db_name = opt['dataset_name'] + + val_ds, val_data = split_sim_dataset(db_ds, db_data, f'{VALIDATION_DATASET_NAME}_{db_name}') + else: db_data, db_ds = verify_load_host_merge(opt['data_bacteria'], opt['data_host']) db_name = 'host_merged' - else: + + val_ds, val_data = split_sim_dataset(db_ds, db_data, f'{VALIDATION_DATASET_NAME}_{db_name}') + db_data, db_ds = verify_load_db(opt['data_bacteria']) db_name = opt['dataset_name'] - data_metagenome = verify_load_data(opt['data_metagenome']) - k_length = len(db_data['kmers'][0]) + datasets = { + TRAINING_DATASET_NAME : db_ds, + VALIDATION_DATASET_NAME : val_ds + } - val_ds, val_data = split_sim_dataset(db_ds, db_data, f'{VALIDATION_DATASET_NAME}_{db_name}') + metagenome_data, metagenome_ds = verify_load_metagenome(opt['data_metagenome']) -# Definition of model for bacteria extraction / host removal + execution +# Definition of model for bacteria extraction / host removal ################################################################################ - if opt['host_name'] is None: - clf = ClassificationMethods( - database_k_mers = (db_data, db_ds), - k = k_length, - outdirs = outdirs, - database = opt['database_name'], - classifier_binary = opt['model_type'], - taxa = 'domain', - batch_size = opt['batch_size'], - training_epochs = opt['training_epochs'], - verbose = opt['verbose'], - cv = False - ) - else: - clf = ClassificationMethods( - database_k_mers = (db_data, db_ds), - k = k_length, - outdirs = 
outdirs, - database = opt['database_name'], - classifier_binary = opt['model_type'], - taxa = 'domain', - batch_size = opt['batch_size'], - training_epochs = opt['training_epochs'], - verbose = opt['verbose'], - cv = False - ) + + clf = ClassificationMethods( + db_data = db_data, + outdirs = outdirs, + db_name = opt['database_name'], + clf_binary = opt['model_type'], + taxa = 'domain', + batch_size = opt['batch_size'], + training_epochs = opt['training_epochs'] + ) + # Execution of bacteria extraction / host removal on metagenome + save results ################################################################################ - t_start = time() - end_taxa = clf.execute_training_prediction(data_metagenome) - t_end = time() - t_classify = t_end - t_start - - if end_taxa is None: - clf_data = merge_save_data( - clf.classified_data, - data_bacteria, - end_taxa, - outdirs['results_dir'], - opt['metagenome_name'], - ) - print(f"Caribou finished training the {opt['model_type']} model and extracting bacteria with it. \ - \nThe training and classification steps took {t_classify} seconds.") - else: - print(f"Caribou finished training the {opt['model_type']} model but there was no data to classify. \ - \nThe training and classification steps took {t_classify} seconds.") + t_s = time() + clf.fit(datasets) + t_fit = time() - t_s + + t_s = time() + predictions = clf.predict(metagenome_ds) + t_clf = time() - t_s + + Xy_file = os.path.join(outdirs['results_dir'], f"extracted_bacteria_{opt['metagenome_name']}_{opt['model_type']}.npz") + save_Xy_data(predictions, Xy_file) + + print(f""" + Caribou finished training the {opt['model_type']} model in {t_fit} seconds. + Extraction of bacteria from {opt['metagenome_name']} dataset was then executed in {t_clf} seconds. + """) # Argument parsing from CLI ################################################################################ if __name__ == "__main__": parser = argparse.ArgumentParser(description='This script trains a model and extracts bacteria / host sequences.') + # Database parser.add_argument('-db','--data_bacteria', required=True, type=Path, help='PATH to a npz file containing the data corresponding to the k-mers profile for the bacteria database') parser.add_argument('-dh','--data_host', default=None, type=Path, help='PATH to a npz file containing the data corresponding to the k-mers profile for the host') - parser.add_argument('-mg','--data_metagenome', required=True, type=Path, help='PATH to a npz file containing the data corresponding to the k-mers profile for the metagenome to classify') - parser.add_argument('-dt','--database_name', required=True, help='Name of the bacteria database used to name files') - parser.add_argument('-ds','--host_name', default=None, help='Name of the host database used to name files') + parser.add_argument('-dn','--database_name', required=True, help='Name of the bacteria database used to name files') + parser.add_argument('-hn','--host_name', default=None, help='Name of the host database used to name files') + # Dataset + parser.add_argument('-dm','--data_metagenome', required=True, type=Path, help='PATH to a npz file containing the data corresponding to the k-mers profile for the metagenome to classify') parser.add_argument('-mn','--metagenome_name', required=True, help='Name of the metagenome to classify used to name files') + # Parameters parser.add_argument('-model','--model_type', default=None, choices=[None,'onesvm','linearsvm','attention','lstm','deeplstm'], help='The type of model to train') 
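# After the refactor above, the command-line scripts build a plain mapping of
# named Ray Datasets and hand it to ClassificationMethods instead of letting
# the class load data itself. A minimal sketch of that mapping, assuming db_ds,
# val_ds and test_ds come from verify_load_db / split_sim_dataset as in the
# scripts:

datasets = {
    'train': db_ds,
    'validation': val_ds,
}
# cross_validation additionally expects a held-out split:
cv_datasets = {**datasets, 'test': test_ds}
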
parser.add_argument('-bs','--batch_size', default=32, type=int, help='Size of the batch size to use, defaults to 32') parser.add_argument('-e','--training_epochs', default=100, type=int, help='The number of training iterations for the neural networks models if one ise chosen, defaults to 100') - parser.add_argument('-v','--verbose', action='store_true', help='Should the program be verbose') parser.add_argument('-o','--outdir', required=True, type=Path, help='PATH to a directory on file where outputs will be saved') parser.add_argument('-wd','--workdir', default='/tmp/spill', type=Path, help='Optional. Path to a working directory where Ray Tune will output and spill tuning data') args = parser.parse_args() diff --git a/src/Caribou_extraction_train_cv.py b/src/Caribou_extraction_train_cv.py index 2a77471..1c73cad 100644 --- a/src/Caribou_extraction_train_cv.py +++ b/src/Caribou_extraction_train_cv.py @@ -6,12 +6,13 @@ from time import time from pathlib import Path from models.reads_simulation import split_sim_dataset -from models.classification_old import ClassificationMethods +from models.classification import ClassificationMethods __author__ = "Nicolas de Montigny" __all__ = ['bacteria_extraction_train_cv'] +TRAINING_DATASET_NAME = 'train' VALIDATION_DATASET_NAME = 'validation' TEST_DATASET_NAME = 'test' @@ -31,68 +32,66 @@ def bacteria_extraction_train_cv(opt): # Data loading ################################################################################ - if opt['data_host'] is not None: + if opt['model_type'] != 'onesvm': + if opt['data_host'] is not None: + db_data, db_ds = verify_load_host_merge(opt['data_bacteria'], opt['data_host']) + db_name = 'host_merged' + else: + db_data, db_ds = verify_load_db(opt['data_bacteria']) + db_name = opt['database_name'] + + test_ds, test_data = split_sim_dataset(db_ds, db_data, f'{TEST_DATASET_NAME}_{db_name}') + val_ds, val_data = split_sim_dataset(db_ds, db_data, f'{VALIDATION_DATASET_NAME}_{db_name}') + else: db_data, db_ds = verify_load_host_merge(opt['data_bacteria'], opt['data_host']) db_name = 'host_merged' - else: - db_data, db_ds = verify_load_db(opt['data_bacteria']) - db_name = opt['dataset_name'] - k_length = len(db_data['kmers'][0]) + test_ds, test_data = split_sim_dataset(db_ds, db_data, f'{TEST_DATASET_NAME}_{db_name}') + val_ds, val_data = split_sim_dataset(db_ds, db_data, f'{VALIDATION_DATASET_NAME}_{db_name}') + + db_data, db_ds = verify_load_db(opt['data_bacteria']) + db_name = opt['database_name'] - test_ds, test_data = split_sim_dataset(db_ds, db_data, f'{TEST_DATASET_NAME}_{db_name}') - val_ds, val_data = split_sim_dataset(db_ds, db_data, f'{VALIDATION_DATASET_NAME}_{db_name}') + datasets = { + TRAINING_DATASET_NAME : db_ds, + TEST_DATASET_NAME : test_ds, + VALIDATION_DATASET_NAME : val_ds + } # Training and cross-validation of models for bacteria extraction / host removal ################################################################################ - t_start = time() - - if opt['host_name'] is None: - ClassificationMethods( - database_k_mers = (db_data, db_ds), - k = k_length, - outdirs = outdirs, - database = opt['database_name'], - classifier_binary = opt['model_type'], - taxa = 'domain', - batch_size = opt['batch_size'], - training_epochs = opt['training_epochs'], - verbose = opt['verbose'], - cv = True - ).execute_training() - else: - ClassificationMethods( - database_k_mers = (db_data, db_ds), - k = k_length, - outdirs = outdirs, - database = opt['database_name'], - classifier_binary = opt['model_type'], - taxa = 
'domain', - batch_size = opt['batch_size'], - training_epochs = opt['training_epochs'], - verbose = opt['verbose'], - cv = True - ).execute_training() - - t_end = time() - t_classify = t_end - t_start - print( - f"Caribou finished training and cross-validating the {opt['model_type']} model in {t_classify} seconds") + clf = ClassificationMethods( + db_data = db_data, + outdirs = outdirs, + db_name = opt['database_name'], + clf_binary = opt['model_type'], + taxa = 'domain', + batch_size = opt['batch_size'], + training_epochs = opt['training_epochs'] + ) + + t_s = time() + + cv_scores = clf.cross_validation(datasets) + + t_clf = time() - t_s + print(f"Caribou finished training and cross-validating the {opt['model_type']} model in {t_clf} seconds") # Argument parsing from CLI ################################################################################ if __name__ == "__main__": parser = argparse.ArgumentParser(description='This script trains and cross-validates a model for the bacteria extraction / host removal step.') + # Database parser.add_argument('-db','--data_bacteria', required=True, type=Path, help='PATH to a npz file containing the data corresponding to the k-mers profile for the bacteria database') parser.add_argument('-dh','--data_host', default=None, type=Path, help='PATH to a npz file containing the data corresponding to the k-mers profile for the host') - parser.add_argument('-dt','--database_name', required=True, help='Name of the bacteria database used to name files') - parser.add_argument('-ds','--host_name', default=None, help='Name of the host database used to name files') + parser.add_argument('-dn','--database_name', required=True, help='Name of the bacteria database used to name files') + parser.add_argument('-hn','--host_name', default=None, help='Name of the host database used to name files') + # Parameters parser.add_argument('-model','--model_type', required = True, choices=['onesvm','linearsvm','attention','lstm','deeplstm'], help='The type of model to train') parser.add_argument('-bs','--batch_size', default=32, type=int, help='Size of the batch size to use, defaults to 32') parser.add_argument('-e','--training_epochs', default=100, type=int, help='The number of training iterations for the neural networks models if one is chosen, defaults to 100') - parser.add_argument('-v','--verbose', action='store_true', help='Should the program be verbose') parser.add_argument('-o','--outdir', required=True, type=Path, help='PATH to a directory on file where outputs will be saved') parser.add_argument('-wd','--workdir', default='/tmp/spill', type=Path, help='Optional. 
Path to a working directory where Ray Tune will output and spill tuning data') args = parser.parse_args() diff --git a/src/models/classification.py b/src/models/classification.py index 6a0cbc0..cbad2be 100644 --- a/src/models/classification.py +++ b/src/models/classification.py @@ -5,15 +5,13 @@ import numpy as np import pandas as pd -from glob import glob -from typing import Dict -from shutil import rmtree -from utils import load_Xy_data +from warnings import warn +from typing import Dict, List from models.sklearn.models import SklearnModel from models.kerasTF.models import KerasTFModel -# Simulation class -from models.reads_simulation import readsSimulation +# CV metrics +from sklearn.metrics import precision_recall_fscore_support __author__ = 'Nicolas de Montigny' @@ -22,20 +20,11 @@ TRAINING_DATASET_NAME = 'train' VALIDATION_DATASET_NAME = 'validation' TEST_DATASET_NAME = 'test' +TENSOR_COLUMN_NAME = '__value__' class ClassificationMethods(): """ - Utilities class for classifying sequences from metagenomes using ray - - ---------- - Attributes - ---------- - - classified_data : dictionary - Dictionary containing the classified data for each classified taxonomic level - - models : dictionary - Dictionary containing the trained models for each taxonomic level + Class for classifying sequences from metagenomes in a recursive manner ---------- Methods @@ -52,320 +41,387 @@ class ClassificationMethods(): """ def __init__( self, - database_k_mers: Dict, - k: int, + db_data: Dict, outdirs: Dict, - database: str, - classifier_binary: str = 'deeplstm', - classifier_multiclass: str = 'widecnn', - taxa: str = None, - threshold: float = 0.8, + db_name: str, + clf_binary: str = None, + clf_multiclass: str = None, + taxa: [str, List] = None, batch_size: int = 32, - training_epochs: int = 100, - verbose: bool = True, - cv: bool = False + training_epochs: int = 100 ): # Parameters - self._k = k - self._cv = cv self._taxas = taxa self._outdirs = outdirs - self._database = database - self._verbose = verbose - self._threshold = threshold - self._classifier_binary = classifier_binary - self._classifier_multiclass = classifier_multiclass + self._database = db_name + self._database_data = db_data + self._classifier_binary = clf_binary + self._classifier_multiclass = clf_multiclass self._batch_size = batch_size self._training_epochs = training_epochs - # Initialize with values - self.classified_data = { - 'sequence': [], - 'classification' : None, - 'classified_ids' : [], - 'unknown_ids' : [] - } - # Empty initializations - self.models = {} - self._host = False - self._taxas_order = [] - self._host_data = None - self._database_data = None - self._training_datasets = None - self._merged_training_datasets = None - self._merged_database_host = None - self.previous_taxa_unclassified = None - # Extract database data - if isinstance(database_k_mers, tuple): - self._host = True - self._database_data = database_k_mers[0] - self._host_data = database_k_mers[1] - else: - self._database_data = database_k_mers - # Remove 'id' from kmers if present - if 'id' in self._database_data['kmers']: - self._database_data['kmers'].remove('id') - if self._host and 'id' in self._host_data['kmers']: - self._host_data['kmers'].remove('id') - # Assign taxas order for top-down strategy - self._taxas_order = self._database_data['taxas'].copy() - self._taxas_order.reverse() - # Automatic executions - self._verify_assign_taxas(taxa) + # Init not fitted + self.is_fitted = False # Public functions 
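# A short usage sketch of the public interface declared below, mirroring the
# way the refactored Caribou_classification.py and Caribou_extraction.py
# scripts call it. The argument values are illustrative, and db_data, outdirs,
# db_ds, val_ds and metagenome_ds stand in for objects loaded earlier in those
# scripts:

clf = ClassificationMethods(
    db_data=db_data,            # k-mers profile metadata loaded from .npz
    outdirs=outdirs,
    db_name='my_database',      # illustrative name
    clf_multiclass='sgd',       # or clf_binary='linearsvm' for domain-level extraction
    taxa='species',             # illustrative taxonomic level
    batch_size=32,
    training_epochs=100,
)
clf.fit({'train': db_ds, 'validation': val_ds})
predictions = clf.predict(metagenome_ds)
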
######################################################################################################### -# TODO: Revise documentation in heading -# TODO: Remove parameters from global if they are only required for certain functions -# TODO: Finish transfering the functions & calls from the old version -# TODO: Validation of params before execution of private functions - def fit(self, datasets, ): + + def fit(self, datasets): """ - Wrapper function to call the fitting method + Public function to call the fitting method after validation of parameters """ - # TODO: Pass training/validation data here - - def predict(self): + self._valid_assign_taxas() + self._valid_classifier() + tax_map = self._verify_model_trained() + + self._fit(datasets, tax_map) + + def predict(self, dataset): """ - Wrapper function to call the predicting method + Public function to call the predicting method after validation of parameters """ - # TODO: Pass data to predict here + model_mapping = self._verify_load_model() + predictions = self._predict(dataset, model_mapping) + + return predictions - def fit_predict(self): + def fit_predict(self, datasets, predict_ds): """ - Wrapper function for calling fit and predict + Public function for calling fit and predict after validation of parameters """ - # TODO: Pass training/validation data here - # TODO: Pass data to predict here + self._valid_assign_taxas() + self._valid_classifier() + tax_map = self._verify_model_trained() + + self._fit(datasets, tax_map) + + model_mapping = self._verify_load_model() + predictions = self._predict(predict_ds, model_mapping) - def cross_validation(self): + return predictions + + def cross_validation(self, datasets): """ - Wrapper function to call the cross-validation method + Public function to call the cross-validation method after validation of parameters + Executes cross-validation of a model by fitting it and predicting over a test dataset """ - # TODO: Pass training/validation data here - # TODO: Pass testing data here + + if isinstance(self._taxas, str): + self._valid_assign_taxas() + tax_map = self._verify_model_trained() + + test_ds = datasets.pop(TEST_DATASET_NAME) + y_true, test_ds = self._get_true_classif(test_ds, self._taxas) + + self._fit(datasets, tax_map) + + model_mapping = self._verify_load_model() + y_pred = self._cv_predict(test_ds, model_mapping) + cv_scores = self._score_cv(y_true, y_pred, self._taxas[0]) + + return cv_scores + else: + raise ValueError('Cross-validation can only be done on one taxa, please pass one taxa while initiating the ClassificationMethods object') + # Private principal functions ######################################################################################################### -# TODO: Pass training/validation data here - def _fit(self): + + def _fit(self, datasets, tax_map): """ Fit the given model to the training dataset """ - for taxa in self._taxas_order: - if taxa in self._taxas: - if taxa in ['domain','bacteria','host']: - clf = self._classifier_binary - else: - clf = self._classifier_multiclass - self._data_file = os.path.join(self._outdirs['data_dir'], f'Xy_{taxa}_database_K{self._k}_{clf}_{self._database}_data.npz') - self._model_file = os.path.join(self._outdirs['models_dir'], f'{clf}_{taxa}.pkl') - train = self._verify_load_data_model(self._data_file, self._model_file, taxa) - if train: - if taxa in ['domain','bacteria','host']: - self._binary_training(taxa) - else: - self._multiclass_training(taxa) - -# TODO: Pass data to predict here - def _predict(self, data2classify): - """ 
- Predict the given data using the trained model - """ - files_lst = glob(os.path.join(data2classify['profile'],'*.parquet')) - df = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - ids = data2classify['ids'] - if len(self.classified_data['sequence']) == 0: - raise ValueError('Please train a model before executing classification') - for i, taxa in enumerate(self.classified_data['sequence']): + for taxa, file in tax_map.items(): + if taxa in ['domain','bacteria','host']: + self._binary_training(datasets, taxa, file) + else: + self._multiclass_training(datasets, taxa, file) + self.is_fitted = True + + def _predict(self, ds, model_map): + """ + Predict the given data using the trained model in a recursive manner over taxas using a top-down approach + Returns a mapping of the predictions made by the models for the targeted taxas + """ + mapping = {} + if self.is_fitted: try: - if i == 0: - df = self._classify_first(df, taxa, ids, data2classify['profile']) - else: - df = self._classify_subsequent(df, taxa, ids, data2classify['profile']) + for taxa, model in model_map.items(): + predictions = model.predict(ds) # np.array + ds, predictions, ids = self._remove_unknown(ds, predictions) + file = self._save_dataset(ds, taxa) + mapping[taxa] = { + 'classification' : predictions, + 'ids' : ids, + 'dataset' : file + } + return mapping except ValueError: print('Stopping classification prematurelly because there are no more sequences to classify') - return taxa - return None - - def _cross_validation(self): + return mapping + else: + raise ValueError('The model was not fitted yet! Please call either the `fit` or the `fit_predict` method before making predictions') + + def _cv_predict(self, ds, model_map): """ - Execute cross-validation of a model by fitting a model and predicting over a test dataset + Predict the given data using the trained model for cross-validation + Returns a mapping of the predictions made by the models for the targeted taxas """ + mapping = {} + for taxa, model in model_map.items(): + mapping[taxa] = model.predict(ds) # np.array + return mapping # Private training secondary functions ######################################################################################################### -# TODO: Remove data loading & verification from inside these functions - def _binary_training(self, taxa): + + def _binary_training(self, datasets, taxa, file): print('_binary_training') - self._verify_classifier_binary() if self._classifier_binary == 'onesvm': - self.models[taxa] = SklearnModel( + model = SklearnModel( + self._classifier_binary, + self._outdirs['models_dir'], + self._batch_size, + self._training_epochs, + taxa, + self._database_data['kmers'] + ) + elif self._classifier_binary == 'linearsvm': + model = SklearnModel( self._classifier_binary, - self._database, self._outdirs['models_dir'], - self._outdirs['results_dir'], self._batch_size, self._training_epochs, - self._k, taxa, - self._database_data['kmers'], - self._verbose + self._database_data['kmers'] ) else: - if self._classifier_binary == 'linearsvm': - self.models[taxa] = SklearnModel( - self._classifier_binary, - self._database, - self._outdirs['models_dir'], - self._outdirs['results_dir'], - self._batch_size, - self._training_epochs, - self._k, - taxa, - self._merged_database_host['kmers'], - self._verbose - ) - else: - self.models[taxa] = KerasTFModel( - self._classifier_binary, - self._database, - self._outdirs['models_dir'], - self._outdirs['results_dir'], - self._batch_size, - self._training_epochs, - 
self._k, - taxa, - self._merged_database_host['kmers'], - self._verbose - ) - self.models[taxa].preprocess(self._merged_training_datasets['train']) - self.models[taxa].train(self._merged_training_datasets, self._merged_database_host, self._cv) - - self._save_model(self._model_file, taxa) - - def _multiclass_training(self, taxa): + model = KerasTFModel( + self._classifier_binary, + self._outdirs['models_dir'], + self._batch_size, + self._training_epochs, + taxa, + self._database_data['kmers'] + ) + model.preprocess(datasets[TRAINING_DATASET_NAME]) + model.fit(datasets) + + self._save_model(model, file) + + def _multiclass_training(self, datasets, taxa, file): print('_multiclass_training') - self._verify_classifier_multiclass() - self._load_training_data() if self._classifier_multiclass in ['sgd','mnb']: - self.models[taxa] = SklearnModel( + model = SklearnModel( self._classifier_multiclass, - self._database, self._outdirs['models_dir'], - self._outdirs['results_dir'], self._batch_size, self._training_epochs, - self._k, taxa, - self._database_data['kmers'], - self._verbose + self._database_data['kmers'] ) else: - self.models[taxa] = KerasTFModel( + model = KerasTFModel( self._classifier_multiclass, - self._database, self._outdirs['models_dir'], - self._outdirs['results_dir'], self._batch_size, self._training_epochs, - self._k, taxa, - self._database_data['kmers'], - self._verbose + self._database_data['kmers'] ) - self.models[taxa].preprocess(self._training_datasets['train']) - self.models[taxa].train(self._training_datasets, self._database_data, self._cv) - self._save_model(self._model_file, taxa) + model.preprocess(datasets[TRAINING_DATASET_NAME]) + model.fit(datasets) + + self._save_model(model, file) # Private predicting secondary functions ######################################################################################################### -# TODO: Revise these functions to parallelise with Ray + ease process - # Classify sequences for first iteration - def _classify_first(self, df, taxa, ids, df_file): - print('_classify_first') - try: - pred_df = self._predict_sequences(df, taxa, ids) - not_pred_df = pred_df[pred_df[taxa] == 'unknown'] - pred_df = pred_df[pred_df[taxa] != 'unknown'] - - self.classified_data['classified_ids'] = list(pred_df['id'].values) - self.classified_data['unknown_ids'] = list(not_pred_df['id'].values) - - self.classified_data['classification'] = pred_df - - if taxa == 'domain': - if self._host == True: - pred_df_host = pred_df[pred_df['domain'] == 'host'] - pred_df = pred_df[pred_df['domain'] != 'host'] - classified_host, classified_host_file = self._extract_subset(df, df_file, list(pred_df_host['id'].values), taxa, 'bacteria') - self.classified_data[taxa]['host'] = { - 'classification' : classified_host_file - } - classified, classified_file = self._extract_subset(df, df_file, self.classified_data['classified_ids'], taxa, 'bacteria') - self.classified_data[taxa]['bacteria'] = classified_file - not_classified, not_classified_file = self._extract_subset(df, df_file, self.classified_data['unknown_ids'], taxa, 'unknown') - self.classified_data[taxa]['unknown'] = not_classified_file - return classified - else: - classified, classified_file = self._extract_subset(df, df_file, self.classified_data['classified_ids'], taxa, 'bacteria') - self.classified_data[taxa]['classified'] = classified_file - not_classified, not_classified_file = self._extract_subset(df, df_file, self.classified_data['unknown_ids'], taxa, 'unknown') - self.classified_data[taxa]['unknown'] = 
not_classified_file - return classified - except: - raise ValueError('No sequences to classify for {}.'.format(taxa)) - - # Classify sequences according to passed taxa and model - def _classify_subsequent(self, df, taxa, ids, df_file): - print('_classify_subsequent') - try: - pred_df = self._predict_sequences(df, taxa, ids) - not_pred_df = pred_df[pred_df[taxa] == 'unknown'] - pred_df = pred_df[pred_df[taxa] != 'unknown'] - - self.classified_data['classification'] = self.classified_data['classification'].join(pred_df, how = 'outer', on = 'id') - - classified, classified_file = self._extract_subset(df, df_file, list(pred_df['id'].values), taxa, 'classified') - self.classified_data[taxa]['classified'] = classified_file - not_classified, not_classified_file = self._extract_subset(df, df_file, list(not_pred_df['id'].values), taxa, 'unknown') - self.classified_data[taxa]['unknown'] = not_classified_file - - return classified - except: - raise ValueError('No sequences to classify for {}.'.format(taxa)) - - # Make predictions - def _predict_sequences(self, df, taxa, ids): - print('_predict_sequences') - try: - predictions = self.models[taxa].predict(df, self._threshold) - pred_df = pd.DataFrame({'id': ids, taxa: predictions.values}) - - taxa_pos = self.classified_data['sequence'].index(taxa) - lst_taxa = self.classified_data['sequence'][taxa_pos:] - db_df = pd.DataFrame( - self._database_data['classes'], - columns=self._database_data['taxas'] - )[lst_taxa] - pred_df = pred_df.merge(db_df, on=taxa, how='left') - - return pred_df - except ValueError: - raise ValueError('No sequences to classify for {}.'.format(taxa)) - - # Extract subset of classified or not classified sequences - def _extract_subset(self, df, df_file, ids, taxa, status): - print('_extract_subset') - clf_file = df_file + '_{}_{}'.format(taxa, status) - rows_clf = [] - for row in df.iter_rows(): - if row['id'] in ids: - rows_clf.append(row) - df_clf = ray.data.from_items(rows_clf) - if df_clf.count() > 0: - df_clf.write_parquet(clf_file) - return df_clf, clf_file - - # Helper functions + + def _remove_unknown(self, ds, predict): + ids = [] + for row in ds.iter_rows(): + ids.append(row['id']) + mapping = pd.DataFrame({ + 'ids' : ids, + 'predictions' : predict + }) + mapping = mapping[mapping['predictions'] != -1] + ids = mapping['ids'] + predict = mapping['predictions'] + + def remove_unknown(df): + df = df[df['ids'].isin(ids)] + return df + + ds = ds.map_batches(remove_unknown, batch_format = 'pandas') + + return ds, predict, ids + + # Private cross-validation secondary methods ######################################################################################################### + def _get_true_classif(self, ds, taxas): + """ + Extract the true classification of the dataset used for cross-validation + """ + classif = {taxa : [] for taxa in taxas} + + cols2drop = [col for col in ds.schema().names if col not in ['id', taxas[0]]] + classif_ds = ds.drop_columns(cols2drop) + + cols2drop = [col for col in ds.schema().names if col not in ['id',TENSOR_COLUMN_NAME]] + ds = ds.drop_columns(cols2drop) + + for row in classif_ds.iter_rows(): + for taxa in taxas: + classif[taxa].append(row[taxa]) + + return classif, ds + + def _score_cv(self, y_true, y_pred, taxa): + """ + Compute the cross validation scores + """ + if self._classifier_binary is not None: + model = self._classifier_binary + else : + model = self._classifier_multiclass + + cv_csv = os.path.join(self._outdirs['results_dir'],f'{self._database}_{model}_{taxa}_cv_scores.csv') + + + 
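The scoring that follows reduces to scikit-learn's precision_recall_fscore_support with a weighted average over the lower-cased true and predicted labels of the popped test split. A standalone sketch of that computation with made-up labels (values are illustrative only):

    import pandas as pd
    from sklearn.metrics import precision_recall_fscore_support

    # Illustrative labels; in the patch they come from _get_true_classif and _cv_predict
    y_true = pd.Series(['Bacteroides', 'Prevotella', 'Bacteroides', 'Clostridium']).str.lower()
    y_pred = pd.Series(['Bacteroides', 'Bacteroides', 'Bacteroides', 'Clostridium']).str.lower()

    precision, recall, fscore, _ = precision_recall_fscore_support(
        y_true, y_pred, average = 'weighted', zero_division = 0
    )

    scores = pd.DataFrame({'genus': [precision, recall, fscore]},
                          index = ['Precision', 'Recall', 'F-score'])
    print(scores.T)   # one row per taxa, the layout written to the *_cv_scores.csv file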
y_compare = pd.DataFrame({ + 'y_true': y_true[taxa], + 'y_pred': y_pred[taxa] + }) + y_compare['y_true'] = y_compare['y_true'].str.lower() + y_compare['y_pred'] = y_compare['y_pred'].str.lower() + y_compare.to_csv(os.path.join(self._outdirs['models_dir'], f'y_compare_{self._database}_{model}_{taxa}.csv')) + + support = precision_recall_fscore_support( + y_compare['y_true'], + y_compare['y_pred'], + average = 'weighted' + ) + + scores = pd.DataFrame({ + taxa : [support[0],support[1],support[2]] + }, + index = ['Precision','Recall','F-score'] + ) + + scores.T.to_csv(cv_csv, index = True) + + return scores + + # Validation & verification methods + ######################################################################################################### + + def _valid_assign_taxas(self): + """ + Validate taxas and assign to class variable + Assign order for top-down strategy + """ + print('_valid_assign_taxas') + if self._taxas is None: + self._taxas = self._database_data['taxas'].copy() + elif isinstance(self._taxas, list): + self._taxas = self._taxas + elif isinstance(self._taxas, str): + self._taxas = [self._taxas] + else: + raise ValueError("Invalid taxa option, it must either be absent/None, be a list of taxas to extract or a string identifiying a taxa to extract") + self._valid_taxas() + self._taxas = [taxa for taxa in self._database_data['taxas'] if taxa in self._taxas] + self._taxas.reverse() + + def _valid_taxas(self): + """ + Validate that selected taxas are in database + """ + print('_valid_taxas') + for taxa in self._taxas: + if taxa not in self._database_data['taxas']: + raise ValueError("Taxa {} not found in database".format(taxa)) + + def _valid_classifier(self): + if self._classifier_binary is not None: + if self._classifier_binary not in ['onesvm','linearsvm','attention','lstm','deeplstm']: + raise ValueError(""" + Invalid classifier option for bacteria extraction! + Models implemented at this moment are : + Classic algorithm : One-class SVM (onesvm) and Linear SVM (linearsvm) + Neural networks : Attention (attention), LSTM (lstm) and Deep LSTM (deeplstm) + """) + if self._classifier_multiclass is not None: + if self._classifier_multiclass not in ['sgd','mnb','lstm_attention','cnn','widecnn']: + raise ValueError(""" + Invalid classifier option for bacteria classification! 
+ Models implemented at this moment are : + Classic algorithm : Stochastic Gradient Descent (sgd) and Multinomial Naïve Bayes (mnb) + Neural networks : Deep hybrid between LSTM and Attention (lstm_attention), CNN (cnn) and Wide CNN (widecnn) + """) + + def _verify_model_trained(self): + """ + Verify if the model is already trained for all desired taxas + Taxas for which a model is already trained will be removed from the list + Returns a mapping of the file per taxa to train + """ + mapping = {} + for taxa in self._taxas: + if taxa in ['domain','bacteria','host']: + clf = self._classifier_binary + else: + clf = self._classifier_multiclass + file = os.path.join(self._outdirs['models_dir'], f'{clf}_{taxa}.pkl') + if not os.path.isfile(file): + mapping[taxa] = file + + return mapping + + def _verify_load_model(self): + """ + Verify if the model is already trained for all desired taxas + Taxas for which no model was not trained will raise a ValueError + Returns a mapping of the model per taxa for predicting + """ + mapping = {} + for taxa in self._taxas: + if taxa in ['domain','bacteria','host']: + clf = self._classifier_binary + else: + clf = self._classifier_multiclass + file = os.path.join(self._outdirs['models_dir'], f'{clf}_{taxa}.pkl') + if not os.path.isfile(file): + raise ValueError(f'No model found for {taxa}') + else: + mapping[taxa] = self._load_model(file, taxa) + return mapping + + def _load_model(self, file, taxa): + """ + Load a model from the specified file + """ + print('_load_model') + with open(file, 'rb') as handle: + return cloudpickle.load(handle) + + def _save_model(self, model, file): + """ + Save a model to a specified file + """ + print('_save_model') + with open(file, 'wb') as handle: + cloudpickle.dump(model, handle) + + def _save_dataset(self, ds, taxa): + """ + Save a dataset to disk and return the filename + """ + if taxa in ['domain','bacteria','host']: + model = self._classifier_binary + else: + model = self._classifier_multiclass + file = os.path.join(self._outdirs['results'], f'data_classified_{model}_{taxa}.parquet') + ds.write_parquet(file) + return file \ No newline at end of file diff --git a/src/models/classification_old.py b/src/models/classification_old.py deleted file mode 100644 index 7638c17..0000000 --- a/src/models/classification_old.py +++ /dev/null @@ -1,327 +0,0 @@ -import os -import ray -import cloudpickle - -import numpy as np -import pandas as pd - -from glob import glob -from shutil import rmtree -from utils import load_Xy_data -from models.sklearn.models import SklearnModel -from models.kerasTF.models import KerasTFModel - -# Simulation class -from models.reads_simulation import readsSimulation - -__author__ = 'Nicolas de Montigny' - -__all__ = ['ClassificationMethods'] - -class ClassificationMethods(): - """ - Utilities class for classifying sequences from metagenomes using ray - - ---------- - Attributes - ---------- - - classified_data : dictionary - Dictionary containing the classified data for each classified taxonomic level - - models : dictionary - Dictionary containing the trained models for each taxonomic level - - ---------- - Methods - ---------- - - execute_training : launch the training of the models for the chosen taxonomic levels - no parameters to pass - - execute_classification : - data2classify : a dictionnary containing the data to classify produced by the function Caribou.src.data.build_data.build_X_data - - """ - def __init__( - self, - database_k_mers, - k, - outdirs, - database, - classifier_binary = 'deeplstm', - 
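Model persistence in the new classification.py is reduced to a cloudpickle round-trip on one '{classifier}_{taxa}.pkl' file per taxa under models_dir. A minimal sketch of that round-trip; the file name below is a hypothetical example and 'model' can be any picklable object:

    import os
    import cloudpickle

    def save_model(model, file):
        # cloudpickle rather than plain pickle: it also serializes locally defined
        # functions and closures, which plain pickle refuses
        with open(file, 'wb') as handle:
            cloudpickle.dump(model, handle)

    def load_model(file):
        with open(file, 'rb') as handle:
            return cloudpickle.load(handle)

    model_file = os.path.join('models_dir', 'sgd_genus.pkl')   # hypothetical per-taxa file name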
classifier_multiclass = 'widecnn', - taxa = None, - threshold = 0.8, - batch_size = 32, - training_epochs = 100, - verbose = True, - cv = False - ): - # Parameters - self._k = k - self._cv = cv - self._taxas = taxa - self._outdirs = outdirs - self._database = database - self._verbose = verbose - self._threshold = threshold - self._classifier_binary = classifier_binary - self._classifier_multiclass = classifier_multiclass - self._batch_size = batch_size - self._training_epochs = training_epochs - # Initialize with values - self.classified_data = { - 'sequence': [], - 'classification' : None, - 'classified_ids' : [], - 'unknown_ids' : [] - } - # Empty initializations - self.models = {} - self._host = False - self._taxas_order = [] - self._host_data = None - self._database_data = None - self._training_datasets = None - self._merged_training_datasets = None - self._merged_database_host = None - self.previous_taxa_unclassified = None - # Extract database data - if isinstance(database_k_mers, tuple): - self._host = True - self._database_data = database_k_mers[0] - self._host_data = database_k_mers[1] - else: - self._database_data = database_k_mers - # Remove 'id' from kmers if present - if 'id' in self._database_data['kmers']: - self._database_data['kmers'].remove('id') - if self._host and 'id' in self._host_data['kmers']: - self._host_data['kmers'].remove('id') - # Assign taxas order for top-down strategy - self._taxas_order = self._database_data['taxas'].copy() - self._taxas_order.reverse() - # Automatic executions - self._verify_assign_taxas(taxa) - - # Main functions - ######################################################################################################### - - # Wrapper function for training and predicting over each known taxa - def execute_training_prediction(self, data2classify): - print('execute_training_prediction') - files_lst = glob(os.path.join(data2classify['profile'],'*.parquet')) - df2classify = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - ids2classify = data2classify['ids'] - for i, taxa in enumerate(self._taxas_order): - if taxa in self._taxas: - # Training - if taxa in ['domain','bacteria','host']: - clf = self._classifier_binary - else: - clf = self._classifier_multiclass - self._data_file = os.path.join(self._outdirs['data_dir'], f'Xy_{taxa}_database_K{self._k}_{clf}_{self._database}_data.npz') - self._model_file = os.path.join(self._outdirs['models_dir'], f'{clf}_{taxa}.pkl') - train = self._verify_load_data_model(self._data_file, self._model_file, taxa) - if train: - self._train_model(taxa) - # Predicting - try: - if i == 0: - df2classify = self._classify_first(df2classify, taxa, ids2classify, data2classify['profile']) - else: - df2classify = self._classify_subsequent(df2classify, taxa, ids2classify, data2classify['profile']) - except ValueError: - print('Stopping classification prematurelly because there are no more sequences to classify') - return taxa - return None - - # Utils functions - ######################################################################################################### - - # Verify taxas and assign to class variable - def _verify_assign_taxas(self, taxa): - print('_verify_assign_taxas') - if taxa is None: - self._taxas = self._database_data['taxas'].copy() - elif isinstance(taxa, list): - self._taxas = taxa - elif isinstance(taxa, str): - self._taxas = [taxa] - else: - raise ValueError("Invalid taxa option, it must either be absent/None, be a list of taxas to extract or a string identifiying a taxa to 
extract") - self._verify_taxas() - - # Verify if selected taxas are in database - def _verify_taxas(self): - print('_verify_taxas') - for taxa in self._taxas: - if taxa not in self._database_data['taxas']: - raise ValueError("Taxa {} not found in database".format(taxa)) - - # Caller function for verifying if the data and model already exist - def _verify_load_data_model(self, data_file, model_file, taxa): - print('_verify_load_data_model') - self._verify_files(data_file, taxa) - return self._verify_load_model(model_file, taxa) - - # Load extracted data if already exists - def _verify_files(self, file, taxa): - print('_verify_files') - self.classified_data['sequence'].append(taxa) - if os.path.isfile(file): - self.classified_data[taxa] = load_Xy_data(file) - else: - self.classified_data[taxa] = {} - - # Load model if already exists - def _verify_load_model(self, model_file, taxa): - print('_verify_load_model') - if os.path.exists(model_file): - with open(model_file, 'rb') as f: - self.models[taxa] = cloudpickle.load(f) - return False - else: - return True - - def _save_model(self, model_file, taxa): - print('_save_model') - with open(model_file, 'wb') as f: - cloudpickle.dump(self.models[taxa], f) - - def _verify_classifier_binary(self): - print('_verify_classifier_binary') - if self._classifier_binary == 'onesvm': - if self._cv == True and self._host == True: - pass - elif self._cv == True and self._host == False: - raise ValueError('Classifier One-Class SVM cannot be cross-validated with bacteria data only!\nEither add host data from parameters or choose to predict directly using this method') - elif self._cv == False and self._host == True: - raise ValueError('Classifier One-Class SVM cannot classify with host data!\nEither remove host data from parameters or choose another bacteria extraction method') - elif self._cv == False and self._host == False: - pass - elif self._classifier_binary == 'onesvm' and self._host == False: - pass - elif self._classifier_binary in ['linearsvm','attention','lstm','deeplstm'] and self._host == True: - pass - elif self._classifier_binary in ['linearsvm','attention','lstm','deeplstm'] and self._host == False: - raise ValueError('Classifier {} cannot classify without host data!\nEither add host data to config file or choose the One-Class SVM classifier'.format(self._classifier_binary)) - else: - raise ValueError('Invalid classifier option for bacteria extraction!\n\tModels implemented at this moment are :\n\tBacteria isolator : One Class SVM (onesvm)\n\tClassic algorithm : Linear SVM (linearsvm)\n\tNeural networks : Attention (attention), Shallow LSTM (lstm) and Deep LSTM (deeplstm)') - - def _verify_classifier_multiclass(self): - print('_verify_classifier_multiclass') - if self._classifier_multiclass in ['sgd','mnb','lstm_attention','cnn','widecnn']: - pass - else: - raise ValueError('Invalid classifier option for bacteria classification!\n\tModels implemented at this moment are :\n\tClassic algorithm : Stochastic Gradient Descent (sgd) and Multinomial Naïve Bayes (mnb)\n\tNeural networks : Deep hybrid between LSTM and Attention (lstm_attention), CNN (cnn) and Wide CNN (widecnn)') - - # Merge database and host reference data for bacteria extraction training - def _merge_database_host(self, database_data, host_data): - print('_merge_database_host') - self._merged_database_host = {} - self._merged_database_host['profile'] = f"{database_data['profile']}_host_merged" # Kmers profile - - if os.path.exists(self._merged_database_host['profile']): - files_lst = 
glob(os.path.join(self._merged_database_host['profile'],'*.parquet')) - df_merged = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - else: - files_lst = glob(os.path.join(database_data['profile'],'*.parquet')) - df_db = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - files_lst = glob(os.path.join(host_data['profile'],'*.parquet')) - df_host = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - - cols2drop = [] - for col in df_db.schema().names: - if col not in ['id','domain','__value__']: - cols2drop.append(col) - df_db = df_db.drop_columns(cols2drop) - cols2drop = [] - for col in df_host.schema().names: - if col not in ['id','domain','__value__']: - cols2drop.append(col) - df_host = df_host.drop_columns(cols2drop) - - df_merged = df_db.union(df_host) - df_merged.write_parquet(self._merged_database_host['profile']) - - self._merged_database_host['ids'] = np.concatenate((database_data["ids"], host_data["ids"])) # IDs - self._merged_database_host['kmers'] = database_data["kmers"] # Features - self._merged_database_host['taxas'] = ['domain'] # Known taxas for classification - self._merged_database_host['fasta'] = (database_data['fasta'], host_data['fasta']) # Fasta file needed for reads simulation - - return df_merged - - # Load, merge db + host & simulate validation / test datasets - def _load_training_data_merged(self, taxa): - print('_load_training_data_merged') - if self._classifier_binary == 'onesvm' and taxa == 'domain': - files_lst = glob(os.path.join(self._database_data['profile'],'*.parquet')) - df_train = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - df_train = df_train.map_batches(convert_archaea_bacteria, batch_format = 'pandas') - df_val_test = self._merge_database_host(self._database_data, self._host_data) - df_val_test = df_val_test.map_batches(convert_archaea_bacteria, batch_format = 'pandas') - df_val = self.split_sim_cv_ds(df_val_test,self._merged_database_host, 'merged_validation') - self._merged_training_datasets = {'train': df_train, 'validation': df_val} - if self._cv: - df_test = self.split_sim_cv_ds(df_val_test,self._merged_database_host, 'merged_test') - self._merged_training_datasets['test'] = df_test - else: - df_train = self._merge_database_host(self._database_data, self._host_data) - df_train = df_train.map_batches(convert_archaea_bacteria, batch_format = 'pandas') - df_val = self.split_sim_cv_ds(df_train,self._merged_database_host, 'merged_validation') - self._merged_training_datasets = {'train': df_train, 'validation': df_val} - if self._cv: - df_test = self.split_sim_cv_ds(df_train,self._merged_database_host, 'merged_test') - self._merged_training_datasets['test'] = df_test - - # Load db & simulate validation / test datasets - def _load_training_data(self): - print('_load_training_data') - files_lst = glob(os.path.join(self._database_data['profile'],'*.parquet')) - df_train = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - df_train = df_train.map_batches(convert_archaea_bacteria, batch_format = 'pandas') - df_val = self.split_sim_cv_ds(df_train,self._database_data, 'validation') - self._training_datasets = {'train': df_train, 'validation': df_val} - if self._cv: - df_test = self.split_sim_cv_ds(df_train,self._database_data, 'test') - self._training_datasets['test'] = df_test - - def _sim_4_cv(self, df, kmers_ds, name): - print('_sim_4_cv') - cols = ['id'] - cols.extend(kmers_ds['taxas']) - cls = pd.DataFrame(columns = cols) - for batch in 
df.iter_batches(batch_format = 'pandas'): - cls = pd.concat([cls, batch[cols]], axis = 0, ignore_index = True) - - sim_outdir = os.path.dirname(kmers_ds['profile']) - cv_sim = readsSimulation(kmers_ds['fasta'], cls, list(cls['id']), 'miseq', sim_outdir, name) - sim_data = cv_sim.simulation(self._k, kmers_ds['kmers']) - files_lst = glob(os.path.join(sim_data['profile'],'*.parquet')) - df = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - return df - - def split_sim_cv_ds(self, ds, data, name): - ds_path = os.path.join( - os.path.dirname(data['profile']), - f'Xy_genome_simulation_{name}_data_K{len(data["kmers"][0])}' - ) - if os.path.exists(ds_path): - files_lst = glob(os.path.join(ds_path,'*.parquet')) - cv_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - else: - cv_ds = ds.random_sample(0.1) - if cv_ds.count() == 0: - nb_smpl = round(ds.count() * 0.1) - cv_ds = ds.random_shuffle().limit(nb_smpl) - cv_ds = self._sim_4_cv(cv_ds, data, name) - return cv_ds - -# Helper functions outside of class -############################################################################### - -def convert_archaea_bacteria(df): - df.loc[df['domain'].str.lower() == 'archaea', 'domain'] = 'Bacteria' - return df \ No newline at end of file diff --git a/src/models/encoders/model_label_encoder.py b/src/models/encoders/model_label_encoder.py index 2ed90e1..7084b2b 100644 --- a/src/models/encoders/model_label_encoder.py +++ b/src/models/encoders/model_label_encoder.py @@ -2,6 +2,7 @@ from functools import partial from typing import Dict, List, Optional +import ray import numpy as np import pandas as pd import pandas.api.types diff --git a/src/models/encoders/one_hot_tensor_encoder.py b/src/models/encoders/one_hot_tensor_encoder.py index 8acd7fe..3ae7950 100644 --- a/src/models/encoders/one_hot_tensor_encoder.py +++ b/src/models/encoders/one_hot_tensor_encoder.py @@ -23,7 +23,7 @@ def _fit(self, dataset: Dataset) -> Preprocessor: [self.column], encode_lists = False, ) - + return self def _transform_pandas(self, df: pd.DataFrame) -> pd.DataFrame: diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index cd57ef5..ff51baa 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -73,7 +73,7 @@ class KerasTFModel(ModelsUtils): train : train a model using the given datasets predict : predict the classes of a dataset - df : ray.data.Dataset + ds : ray.data.Dataset Dataset containing K-mers profiles of sequences to be classified threshold : float @@ -86,27 +86,19 @@ class KerasTFModel(ModelsUtils): def __init__( self, classifier, - dataset, outdir_model, - outdir_results, batch_size, training_epochs, - k, taxa, - kmers_list, - verbose + kmers_list ): super().__init__( classifier, - dataset, outdir_model, - outdir_results, batch_size, training_epochs, - k, taxa, - kmers_list, - verbose + kmers_list ) # Parameters # Initialize hidden @@ -141,11 +133,11 @@ def __init__( elif self.classifier == 'widecnn': print('Training multiclass classifier based on Wide CNN Network') - def preprocess(self, df): + def preprocess(self, ds): print('preprocess') labels = [] encoded = [] - for row in df.iter_rows(): + for row in ds.iter_rows(): labels.append(row[self.taxa]) self._nb_classes = len(np.unique(labels)) if self._nb_classes == 2: @@ -164,10 +156,10 @@ def preprocess(self, df): TensorRDFFeaturesSelection(self.kmers, self.taxa), ) - self._encoder.fit(df) - df = self._preprocessor.fit_transform(df) + self._encoder.fit(ds) + ds = 
self._preprocessor.fit_transform(ds) self._reductor = TensorTruncatedSVDReduction(self.kmers) - self._reductor.fit(df) + self._reductor.fit(ds) # Labels mapping if self._nb_classes == 2: labels = list(self._encoder.stats_[f'unique_values({self.taxa})'].keys()) @@ -186,29 +178,8 @@ def _label_decode(self, predict): return np.array(decoded) - def train(self, datasets, kmers_ds, cv = True): - print('train') - if cv: - self._cross_validation(datasets, kmers_ds) - else: - self._fit_model(datasets) - - def _cross_validation(self, datasets, kmers_ds): - print('_cross_validation') - df_test = datasets.pop('test') - - self._fit_model(datasets) - - y_true = [] - for row in df_test.iter_rows(): - y_true.append(row[self.taxa]) - - y_pred = self.predict(df_test.drop_columns([self.taxa]), threshold = 0.8) - - self._cv_score(y_true, y_pred) - - def _fit_model(self, datasets): - print('_fit_model') + def fit(self, datasets): + print('fit') # Preprocessing loop for name, ds in datasets.items(): ds = ds.drop_columns(['id']) @@ -249,15 +220,15 @@ def _fit_model(self, datasets): training_result = self._trainer.fit() self._model_ckpt = training_result.best_checkpoints[0][0] - def predict(self, df, threshold=0.8): + def predict(self, ds, threshold=0.8): print('predict') - if df.count() > 0: - if len(df.schema().names) > 1: - col_2_drop = [col for col in df.schema().names if col != TENSOR_COLUMN_NAME] - df = df.drop_columns(col_2_drop) + if ds.count() > 0: + if len(ds.schema().names) > 1: + col_2_drop = [col for col in ds.schema().names if col != TENSOR_COLUMN_NAME] + ds = ds.drop_columns(col_2_drop) # Preprocess - df = self._preprocessor.transform(df) + ds = self._preprocessor.transform(ds) self._predictor = BatchPredictor.from_checkpoint( self._model_ckpt, @@ -265,7 +236,7 @@ def predict(self, df, threshold=0.8): model_definition = lambda: build_model(self.classifier, self._nb_classes, len(self.kmers)) ) predictions = self._predictor.predict( - data = df, + data = ds, batch_size = self.batch_size ) @@ -279,23 +250,23 @@ def predict(self, df, threshold=0.8): # Iterate over batches of predictions to transform probabilities to labels without mapping def _prob_2_cls(self, predictions, threshold): print('_prob_2_cls') - def map_predicted_label_binary(df, threshold): - df = np.ravel(df['predictions']) + def map_predicted_label_binary(ds, threshold): + ds = np.ravel(ds['predictions']) lower_threshold = 0.5 - (threshold * 0.5) upper_threshold = 0.5 + (threshold * 0.5) predict = pd.DataFrame({ - 'proba': df, - 'predicted_label': np.full(len(df), -1) + 'proba': ds, + 'predicted_label': np.full(len(ds), -1) }) predict.loc[predict['proba'] >= upper_threshold, 'predicted_label'] = 1 predict.loc[predict['proba'] <= lower_threshold, 'predicted_label'] = 0 return {'predictions' : predict['predicted_label'].to_numpy(dtype = np.int32)} - def map_predicted_label_multiclass(df, threshold): - df = df['predictions'] + def map_predicted_label_multiclass(ds, threshold): + ds = ds['predictions'] pred = pd.DataFrame({ - 'best_proba': [np.max(arr) for arr in df], - 'predicted_label' : [np.argmax(arr) for arr in df] + 'best_proba': [np.max(arr) for arr in ds], + 'predicted_label' : [np.argmax(arr) for arr in ds] }) pred.loc[pred['best_proba'] < threshold, 'predicted_label'] = -1 diff --git a/src/models/models_utils.py b/src/models/models_utils.py index c38ca25..9ccc27d 100644 --- a/src/models/models_utils.py +++ b/src/models/models_utils.py @@ -5,9 +5,6 @@ # Class construction from abc import ABC, abstractmethod -# CV metrics -from 
sklearn.metrics import precision_recall_fscore_support - __author__ = 'Nicolas de Montigny' __all__ = ['ModelsUtils'] @@ -43,14 +40,11 @@ class ModelsUtils(ABC): Methods ---------- - train : only train or cross-validate training of classifier + fit : only train or cross-validate training of classifier X : ray.data.Dataset Dataset containing the K-mers profiles of sequences for learning y : ray.data.Dataset Dataset containing the classes of sequences for learning - cv : boolean - Should cross-validation be verified or not. - Defaults to True. predict : abstract method to predict the classes of a dataset @@ -58,31 +52,22 @@ class ModelsUtils(ABC): def __init__( self, classifier, - dataset, outdir_model, - outdir_results, batch_size, training_epochs, - k, taxa, - kmers_list, - verbose + kmers_list ): # Parameters self.classifier = classifier - self.dataset = dataset - self.outdir_results = outdir_results self.batch_size = batch_size - self.k = k self.taxa = taxa self.kmers = kmers_list - self.verbose = verbose # Initialize hidden self._nb_kmers = len(kmers_list) self._training_epochs = training_epochs # Initialize empty self._labels_map = None - self._predict_ids = [] # Initialize Ray variables self._clf = None self._encoder = None @@ -93,53 +78,17 @@ def __init__( self._train_params = {} self._predictor = None self._workdir = outdir_model - # Files - self._cv_csv = os.path.join(self.outdir_results,'{}_{}_K{}_cv_scores.csv'.format(self.classifier, self.taxa, self.k)) - - @abstractmethod - def preprocess(self, df): - """ - """ @abstractmethod - def train(self): + def preprocess(self, ds): """ """ @abstractmethod - def _fit_model(self): + def fit(self): """ """ - @abstractmethod - def _cross_validation(self): - """ - """ - - def _cv_score(self, y_true, y_pred): - print('_cv_score') - - y_compare = pd.DataFrame({ - 'y_true': y_true, - 'y_pred': y_pred - }) - y_compare['y_true'] = y_compare['y_true'].str.lower() - y_compare['y_pred'] = y_compare['y_pred'].str.lower() - y_compare.to_csv(os.path.join(self._workdir, f'y_compare_{self.dataset}_{self.classifier}.csv')) - - support = precision_recall_fscore_support( - y_compare['y_true'], - y_compare['y_pred'], - average = 'weighted' - ) - - scores = pd.DataFrame( - {self.classifier : [support[0],support[1],support[2]]}, - index = ['Precision','Recall','F-score'] - ) - - scores.to_csv(self._cv_csv, index = True) - @abstractmethod def predict(self): """ diff --git a/src/models/preprocessors/tfidf_transformer.py b/src/models/preprocessors/tfidf_transformer.py index a6032fa..88d899c 100644 --- a/src/models/preprocessors/tfidf_transformer.py +++ b/src/models/preprocessors/tfidf_transformer.py @@ -47,7 +47,7 @@ def _fit(self, ds: Dataset) -> Preprocessor: return self def _transform_pandas(self, batch: pd.DataFrame) -> pd.DataFrame: - # _validate_df(df, TENSOR_COLUMN_NAME, self._nb_features) + # _validate_df(batch, TENSOR_COLUMN_NAME, self._nb_features) idf_diag = self.stats_['idf_diag'] df = batch[TENSOR_COLUMN_NAME] diff --git a/src/models/sklearn/models.py b/src/models/sklearn/models.py index fa8139e..93bebaf 100644 --- a/src/models/sklearn/models.py +++ b/src/models/sklearn/models.py @@ -21,8 +21,9 @@ # Training from ray.air.config import ScalingConfig from sklearn.naive_bayes import MultinomialNB -from sklearn.linear_model import SGDOneClassSVM, SGDClassifier +from sklearn.linear_model import SGDClassifier from models.sklearn.partial_trainer import SklearnPartialTrainer +from models.sklearn.scoring_one_svm import ScoringSGDOneClassSVM from 
models.sklearn.tensor_predictor import SklearnTensorPredictor # Tuning @@ -66,7 +67,7 @@ class SklearnModel(ModelsUtils): train : train a model using the given datasets predict : predict the classes of a dataset - df : ray.data.Dataset + ds : ray.data.Dataset Dataset containing K-mers profiles of sequences to be classified threshold : float @@ -78,34 +79,26 @@ class SklearnModel(ModelsUtils): def __init__( self, classifier, - dataset, outdir_model, - outdir_results, batch_size, training_epochs, - k, taxa, - kmers_list, - verbose + kmers_list ): super().__init__( classifier, - dataset, outdir_model, - outdir_results, batch_size, training_epochs, - k, taxa, - kmers_list, - verbose + kmers_list ) # Parameters self._encoded = [] # Computes self._build() - def preprocess(self, df): + def preprocess(self, ds): print('preprocess') if self.classifier == 'onesvm': self._encoder = OneClassSVMLabelEncoder(self.taxa) @@ -118,11 +111,11 @@ def preprocess(self, df): TensorTfIdfTransformer(self.kmers), TensorRDFFeaturesSelection(self.kmers, self.taxa), ) - self._encoder.fit(df) - df = self._preprocessor.fit_transform(df) + self._encoder.fit(ds) + ds = self._preprocessor.fit_transform(ds) self.kmers = self._preprocessor.preprocessors[1].stats_['cols_keep'] self._reductor = TensorTruncatedSVDReduction(self.kmers) - self._reductor.fit(df) + self._reductor.fit(ds) # Labels mapping if self.classifier != 'onesvm': @@ -140,37 +133,11 @@ def _label_decode(self, predict): return np.array(decoded) - def train(self, datasets, kmers_ds, cv = True): - print('train') - - if cv: - self._cross_validation(datasets, kmers_ds) - else: - self._fit_model(datasets) - - def _cross_validation(self, datasets, kmers_ds): - print('_cross_validation') - - df_test = datasets.pop('test') - - self._fit_model(datasets) - - y_true = [] - for row in df_test.iter_rows(): - y_true.append(row[self.taxa]) - - y_true = np.array(y_true) - y_true = list(y_true) - - y_pred = self._predict_cv(df_test.drop_columns([self.taxa])) - - self._cv_score(y_true, y_pred) - def _build(self): print('_build') if self.classifier == 'onesvm': print('Training bacterial extractor with One Class SVM') - self._clf = SGDOneClassSVM() + self._clf = ScoringSGDOneClassSVM() self._train_params = { 'nu' : 0.026441491, 'learning_rate' : 'constant', @@ -206,7 +173,7 @@ def _build(self): 'fit_prior' : True } - def _fit_model(self, datasets): + def fit(self, datasets): print('_fit_model') for name, ds in datasets.items(): ds = ds.drop_columns(['id']) @@ -216,8 +183,7 @@ def _fit_model(self, datasets): datasets[name] = ray.put(ds) try: training_labels = self._encoded.copy() - training_labels = np.delete( - training_labels, np.where(training_labels == -1)) + training_labels = np.delete(training_labels, np.where(training_labels == -1)) except: pass @@ -246,43 +212,25 @@ def _fit_model(self, datasets): training_result = self._trainer.fit() self._model_ckpt = training_result.checkpoint - def _predict_cv(self, df): - print('_predict_cv') - if df.count() > 0: + def predict(self, ds, threshold = 0.8): + print('predict') + if ds.count() > 0: + ds = self._preprocessor.transform(ds) + ds = self._reductor.transform(ds) predict_kwargs = {'features':self.kmers, 'num_estimator_cpus':-1} self._predictor = BatchPredictor.from_checkpoint(self._model_ckpt, SklearnTensorPredictor) - predictions = self._predictor.predict(df, batch_size = self.batch_size, feature_columns = [TENSOR_COLUMN_NAME], **predict_kwargs) + predictions = self._predictor.predict(ds, batch_size = self.batch_size, 
feature_columns = [TENSOR_COLUMN_NAME], **predict_kwargs) predictions = np.array(predictions.to_pandas()).reshape(-1) - - return self._label_decode(predictions) - else: - raise ValueError('No data to predict') - - def predict(self, df, threshold = 0.8): - print('predict') - if df.count() > 0: - df = self._preprocessor.transform(df) - df = self._reductor.transform(df) - if self.classifier == 'onesvm': - predict_kwargs = {'features':self.kmers, 'num_estimator_cpus':-1} - self._predictor = BatchPredictor.from_checkpoint(self._models_collection['domain'], SklearnTensorPredictor) - predictions = self._predictor.predict(df, batch_size = self.batch_size, feature_columns = [TENSOR_COLUMN_NAME], **predict_kwargs) - predictions = np.array(predictions.to_pandas()).reshape(-1) - else: - predict_kwargs = {'features':self.kmers, 'num_estimator_cpus':-1} - self._predictor = BatchPredictor.from_checkpoint(self._model_ckpt, SklearnTensorProbaPredictor) - predictions = self._predictor.predict(df, batch_size = self.batch_size, feature_columns = [TENSOR_COLUMN_NAME], **predict_kwargs) - predictions = self._prob_2_cls(predictions, len(self._encoded), threshold) return self._label_decode(predictions) else: raise ValueError('No data to predict') def _prob_2_cls(self, predict, nb_cls, threshold): print('_prob_2_cls') - def map_predicted_label(df : pd.DataFrame): + def map_predicted_label(ds : pd.DataFrame): predict = pd.DataFrame({ - 'best_proba': [max(df.iloc[i].values) for i in range(len(df))], - 'predicted_label': [np.argmax(df.iloc[i].values) for i in range(len(df))] + 'best_proba': [max(ds.iloc[i].values) for i in range(len(ds))], + 'predicted_label': [np.argmax(ds.iloc[i].values) for i in range(len(ds))] }) predict.loc[predict['best_proba'] < threshold, 'predicted_label'] = -1 return pd.DataFrame(predict['predicted_label']) diff --git a/src/models/sklearn/partial_trainer.py b/src/models/sklearn/partial_trainer.py index f08dd7c..021d9ce 100644 --- a/src/models/sklearn/partial_trainer.py +++ b/src/models/sklearn/partial_trainer.py @@ -26,6 +26,7 @@ from ray.train.sklearn import SklearnTrainer +TENSOR_COLUMN_NAME = '__value__' LABELS_COLUMN_NAME = 'labels' simplefilter(action='ignore', category=FutureWarning) @@ -216,7 +217,7 @@ def training_loop(self): ) ): if isinstance(batch_X, dict): - batch_X = batch_X['__value__'] + batch_X = batch_X[TENSOR_COLUMN_NAME] """ try: @@ -244,7 +245,7 @@ def training_loop(self): # batch_size = 1, # batch_format = 'numpy' # )): - # X_calib_df[ind] = batch['__value__'] + # X_calib_df[ind] = batch[TENSOR_COLUMN_NAME] # """ # X_calib = pd.DataFrame(X_calib_df, columns = self._features_list) @@ -318,7 +319,7 @@ def _score_on_validation_sets( ) ): if isinstance(batch, dict): - batch = batch['__value__'] + batch = batch[TENSOR_COLUMN_NAME] """ try: diff --git a/src/utils.py b/src/utils.py index f9f1d4b..5e7924f 100644 --- a/src/utils.py +++ b/src/utils.py @@ -1,8 +1,6 @@ import os import ray -import json import logging -import warnings import numpy as np import pandas as pd @@ -41,6 +39,7 @@ 'zip_X_y', 'ensure_length_ds', 'convert_archaea_bacteria', + 'verify_load_metagenome', 'verify_load_db', 'verify_load_host_merge', 'merge_db_host' @@ -75,12 +74,12 @@ def init_ray_cluster(workdir): # Load data from file def load_Xy_data(Xy_file): - with np.load(Xy_file, allow_pickle=True) as f: - return f['data'].tolist() + with np.load(Xy_file, allow_pickle=True) as handle: + return handle['data'].tolist() # Save data to file -def save_Xy_data(df, Xy_file): - np.savez(Xy_file, data = df) +def 
save_Xy_data(data, Xy_file): + np.savez(Xy_file, data = data) # User arguments verification ######################################################################################################### @@ -302,6 +301,17 @@ def convert_archaea_bacteria(df): df.loc[df['domain'].str.lower() == 'archaea', 'domain'] = 'Bacteria' return df +def verify_load_metagenome(data): + """ + Wrapper function for verifying and loading the metagenome dataset + """ + data = verify_load_data(data) + files_lst = glob(os.path.join(data['profile'], '*.parquet')) + ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + + return data, ds + + def verify_load_db(db_data): """ Wrapper function for verifying and loading the db dataset From 9babd8fb4d66e2243bab7957d713694499005c03 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Fri, 10 Nov 2023 08:24:09 -0500 Subject: [PATCH 29/92] remove XGBoost model for features selection from training --- src/models/kerasTF/models.py | 27 ++++++++++++++++----------- src/models/models_utils.py | 1 + src/models/sklearn/models.py | 21 +++++++++++++-------- 3 files changed, 30 insertions(+), 19 deletions(-) diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index ff51baa..f2d0a7d 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -142,22 +142,25 @@ def preprocess(self, ds): self._nb_classes = len(np.unique(labels)) if self._nb_classes == 2: self._encoder = ModelLabelEncoder(self.taxa) - self._preprocessor = Chain( - TensorTfIdfTransformer(self.kmers), - TensorRDFFeaturesSelection(self.kmers, self.taxa), - ) + self._scaler = TensorTfIdfTransformer(self.kmers) + # self._preprocessor = Chain( + # TensorTfIdfTransformer(self.kmers), + # TensorRDFFeaturesSelection(self.kmers, self.taxa), + # ) else: self._encoder = Chain( LabelEncoder(self.taxa), OneHotTensorEncoder(self.taxa) ) - self._preprocessor = Chain( - TensorTfIdfTransformer(self.kmers), - TensorRDFFeaturesSelection(self.kmers, self.taxa), - ) + self._scaler = TensorTfIdfTransformer(self.kmers) + # self._preprocessor = Chain( + # TensorTfIdfTransformer(self.kmers), + # TensorRDFFeaturesSelection(self.kmers, self.taxa), + # ) self._encoder.fit(ds) - ds = self._preprocessor.fit_transform(ds) + ds = self._scaler.fit_transform(ds) + # ds = self._preprocessor.fit_transform(ds) self._reductor = TensorTruncatedSVDReduction(self.kmers) self._reductor.fit(ds) # Labels mapping @@ -184,7 +187,8 @@ def fit(self, datasets): for name, ds in datasets.items(): ds = ds.drop_columns(['id']) ds = self._encoder.transform(ds) - ds = self._preprocessor.transform(ds) + ds = self._scaler.transform(ds) + # ds = self._preprocessor.transform(ds) ds = self._reductor.transform(ds) datasets[name] = ds @@ -228,7 +232,8 @@ def predict(self, ds, threshold=0.8): ds = ds.drop_columns(col_2_drop) # Preprocess - ds = self._preprocessor.transform(ds) + ds = self._scaler.transform(ds) + # ds = self._preprocessor.transform(ds) self._predictor = BatchPredictor.from_checkpoint( self._model_ckpt, diff --git a/src/models/models_utils.py b/src/models/models_utils.py index 9ccc27d..7e6e50f 100644 --- a/src/models/models_utils.py +++ b/src/models/models_utils.py @@ -71,6 +71,7 @@ def __init__( # Initialize Ray variables self._clf = None self._encoder = None + self._scaler = None self._preprocessor = None self._reductor = None self._model_ckpt = None diff --git a/src/models/sklearn/models.py b/src/models/sklearn/models.py index 93bebaf..73ca634 100644 --- a/src/models/sklearn/models.py +++ 
b/src/models/sklearn/models.py @@ -107,13 +107,16 @@ def preprocess(self, ds): else: self._encoder = ModelLabelEncoder(self.taxa) - self._preprocessor = Chain( - TensorTfIdfTransformer(self.kmers), - TensorRDFFeaturesSelection(self.kmers, self.taxa), - ) + self._scaler = TensorTfIdfTransformer(self.kmers) + + # self._preprocessor = Chain( + # TensorTfIdfTransformer(self.kmers), + # TensorRDFFeaturesSelection(self.kmers, self.taxa), + # ) self._encoder.fit(ds) - ds = self._preprocessor.fit_transform(ds) - self.kmers = self._preprocessor.preprocessors[1].stats_['cols_keep'] + ds = self._scaler.fit_transform(ds) + # ds = self._preprocessor.fit_transform(ds) + # self.kmers = self._preprocessor.preprocessors[1].stats_['cols_keep'] self._reductor = TensorTruncatedSVDReduction(self.kmers) self._reductor.fit(ds) @@ -178,7 +181,8 @@ def fit(self, datasets): for name, ds in datasets.items(): ds = ds.drop_columns(['id']) ds = self._encoder.transform(ds) - ds = self._preprocessor.transform(ds) + ds = self._scaler.transform(ds) + # ds = self._preprocessor.transform(ds) ds = self._reductor.transform(ds) datasets[name] = ray.put(ds) try: @@ -215,7 +219,8 @@ def fit(self, datasets): def predict(self, ds, threshold = 0.8): print('predict') if ds.count() > 0: - ds = self._preprocessor.transform(ds) + ds = self._scaler.transform(ds) + # ds = self._preprocessor.transform(ds) ds = self._reductor.transform(ds) predict_kwargs = {'features':self.kmers, 'num_estimator_cpus':-1} self._predictor = BatchPredictor.from_checkpoint(self._model_ckpt, SklearnTensorPredictor) From 1796002d97c6605e9bd5cb7baafcf861e4442ffe Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Fri, 10 Nov 2023 15:03:54 -0500 Subject: [PATCH 30/92] truncated svd incremental fitting --- src/data/reduction/truncated_svd_reduction.py | 45 ++++++++++++++++--- 1 file changed, 39 insertions(+), 6 deletions(-) diff --git a/src/data/reduction/truncated_svd_reduction.py b/src/data/reduction/truncated_svd_reduction.py index 9e64773..4d2a43a 100644 --- a/src/data/reduction/truncated_svd_reduction.py +++ b/src/data/reduction/truncated_svd_reduction.py @@ -18,7 +18,9 @@ class TensorTruncatedSVDReduction(Preprocessor): This makes it possible to use the class as a Ray preprocessor in a features reduction strategy. TruncatedSVD performs linear dimensionality reduction by means of truncated singular value decomposition (SVD). When it is applied following the TF-IDF normalisation, it becomes a latent semantic analysis (LSA). 
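The incremental _fit below approximates a truncated SVD of the whole dataset without materializing it, in the spirit of sklearn.decomposition.IncrementalPCA: for each new batch, the previously retained components (scaled by their singular values) are stacked on top of the incoming rows and re-decomposed with randomized_svd. A self-contained sketch of that idea on plain NumPy arrays (batch and component sizes are arbitrary, and the result is an approximation of the exact SVD):

    import numpy as np
    from sklearn.utils.extmath import randomized_svd

    def incremental_truncated_svd(batches, n_components):
        """Approximate the top right-singular vectors of vstack(batches), one batch at a time."""
        components, singular_values = None, None
        for batch in batches:                    # each batch: (n_rows, n_features) ndarray
            if components is not None:
                # carry the previous batches forward as Sigma * VT stacked on the new rows
                batch = np.vstack((singular_values.reshape(-1, 1) * components, batch))
            _, singular_values, components = randomized_svd(
                batch,
                n_components = n_components,
                n_iter = 1,
                power_iteration_normalizer = 'LU',
            )
        return components                        # shape (n_components, n_features)

    rng = np.random.default_rng(0)
    batches = [rng.random((64, 200)) for _ in range(5)]
    vt = incremental_truncated_svd(batches, n_components = 10)
    reduced = batches[0] @ vt.T                  # project a batch onto the reduced space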
+ https://scikit-learn.org/stable/modules/decomposition.html#truncated-singular-value-decomposition-and-latent-semantic-analysis https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html#sklearn.decomposition.TruncatedSVD + https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.IncrementalPCA.html#sklearn.decomposition.IncrementalPCA """ def __init__(self, features: List[str], nb_components: int = 10000): @@ -26,17 +28,20 @@ def __init__(self, features: List[str], nb_components: int = 10000): self.features = features self._nb_features = len(features) self._nb_components = nb_components - - + self._n_samples_seen = 0 + self._mean = 0.0 + self._var = 0.0 + def _fit(self, ds: Dataset) -> Preprocessor: + # Parallel + """ def svd_batch(arr: np.array): - df = arr['__value__'] + df = arr[TENSOR_COLUMN_NAME] df = _unwrap_ndarray_object_type_if_needed(df) U, Sigma, VT = randomized_svd( df, n_components = self._nb_components, - n_iter = 5, - n_oversamples = 10, + n_iter = 1, power_iteration_normalizer = 'LU', random_state = None ) @@ -51,8 +56,36 @@ def svd_batch(arr: np.array): for row in svd_vt.iter_rows(): components.append(row['VT']) - components = np.mean(components, axis = 0) + components = np.concatenate(components, axis = 0) + self.stats_ = {'components' : components} + """ + # Incremental + # If too long to exec, will have to parallelise internal SVD computations + components = None + singular_values = None + if self._nb_features > self._nb_components: + for batch in ds.iter_batches(batch_format = 'numpy'): + batch = batch[TENSOR_COLUMN_NAME] + batch = _unwrap_ndarray_object_type_if_needed(batch) + if components is not None: + # Build matrix of previous computations + batch = np.vstack( + ( + singular_values.reshape((-1, 1)) * components, + batch, + ) + ) + + U, Sigma, VT = randomized_svd( + batch, + n_components = self._nb_components, + n_iter = 1, + power_iteration_normalizer = 'LU', + ) + components = VT + singular_values = Sigma + self.stats_ = {'components' : components} else: warn('No features reduction to do because the number of features is already lower than the required number of components') From b02f7c7c4414a7bafa2dfe0775243c3d1070250d Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Fri, 10 Nov 2023 16:05:49 -0500 Subject: [PATCH 31/92] tqdm for TruncSVD + materialize ds after preprocessing --- src/data/reduction/truncated_svd_reduction.py | 3 ++- src/models/kerasTF/models.py | 4 +++- src/models/sklearn/models.py | 3 +++ 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/data/reduction/truncated_svd_reduction.py b/src/data/reduction/truncated_svd_reduction.py index 4d2a43a..ccf0187 100644 --- a/src/data/reduction/truncated_svd_reduction.py +++ b/src/data/reduction/truncated_svd_reduction.py @@ -1,6 +1,7 @@ import numpy as np import pandas as pd +from tqdm import tqdm from typing import List from warnings import warn from ray.data import Dataset @@ -65,7 +66,7 @@ def svd_batch(arr: np.array): components = None singular_values = None if self._nb_features > self._nb_components: - for batch in ds.iter_batches(batch_format = 'numpy'): + for batch in tqdm(ds.iter_batches(batch_format = 'numpy')): batch = batch[TENSOR_COLUMN_NAME] batch = _unwrap_ndarray_object_type_if_needed(batch) if components is not None: diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index f2d0a7d..cf1936d 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -190,6 +190,9 @@ def fit(self, 
datasets): ds = self._scaler.transform(ds) # ds = self._preprocessor.transform(ds) ds = self._reductor.transform(ds) + # Trigger the preprocessing computations before ingest in trainer + # Otherwise, it would be executed at each epoch + ds = ds.materialize() datasets[name] = ds # Training parameters @@ -321,7 +324,6 @@ def train_func(config): train_data = session.get_dataset_shard('train') val_data = session.get_dataset_shard('validation') - for _ in range(epochs): batch_train = train_data.to_tf( feature_columns = TENSOR_COLUMN_NAME, diff --git a/src/models/sklearn/models.py b/src/models/sklearn/models.py index 73ca634..04b38b2 100644 --- a/src/models/sklearn/models.py +++ b/src/models/sklearn/models.py @@ -184,6 +184,9 @@ def fit(self, datasets): ds = self._scaler.transform(ds) # ds = self._preprocessor.transform(ds) ds = self._reductor.transform(ds) + # Trigger the preprocessing computations before ingest in trainer + # Otherwise, it would be executed at each epoch + ds = ds.materialize() datasets[name] = ray.put(ds) try: training_labels = self._encoded.copy() From 9b827f11aaa8a37bd0b66fb5986b6439cf792f7d Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Mon, 13 Nov 2023 09:33:03 -0500 Subject: [PATCH 32/92] parallel occurence counting --- src/data/reduction/occurence_exclusion.py | 15 +++++++--- src/data/reduction/truncated_svd_reduction.py | 28 ++----------------- src/models/preprocessors/tfidf_transformer.py | 14 ++++++++-- 3 files changed, 25 insertions(+), 32 deletions(-) diff --git a/src/data/reduction/occurence_exclusion.py b/src/data/reduction/occurence_exclusion.py index fe9b45d..ab65389 100644 --- a/src/data/reduction/occurence_exclusion.py +++ b/src/data/reduction/occurence_exclusion.py @@ -22,12 +22,19 @@ def __init__(self, features: List[str], num_features: int): self._num_features = int(self._nb_features - num_features) def _fit(self, ds: Dataset) -> Preprocessor: + def get_occurences(batch): + batch = batch[TENSOR_COLUMN_NAME] + return {'occurences' : np.count_nonzero(batch, axis = 0)} + # Nb of occurences occurences = np.zeros(self._nb_features) - for batch in ds.iter_batches(batch_format = 'numpy'): - batch = batch[TENSOR_COLUMN_NAME] - occurences += np.count_nonzero(batch, axis = 0) - + occur = ds.map_batches(get_occurences, batch_format = 'numpy') + # for batch in ds.iter_batches(batch_format = 'numpy'): + # batch = batch[TENSOR_COLUMN_NAME] + # occurences += np.count_nonzero(batch, axis = 0) + for row in occur.iter_rows(): + occurences += row['occurences'] + # Include / Exclude by sorted position cols_keep = pd.Series(occurences, index = self.features) cols_keep = cols_keep.sort_values(ascending = True) # Long operation diff --git a/src/data/reduction/truncated_svd_reduction.py b/src/data/reduction/truncated_svd_reduction.py index ccf0187..ed653cb 100644 --- a/src/data/reduction/truncated_svd_reduction.py +++ b/src/data/reduction/truncated_svd_reduction.py @@ -36,33 +36,11 @@ def __init__(self, features: List[str], nb_components: int = 10000): def _fit(self, ds: Dataset) -> Preprocessor: # Parallel """ - def svd_batch(arr: np.array): - df = arr[TENSOR_COLUMN_NAME] - df = _unwrap_ndarray_object_type_if_needed(df) - U, Sigma, VT = randomized_svd( - df, - n_components = self._nb_components, - n_iter = 1, - power_iteration_normalizer = 'LU', - random_state = None - ) - - return {'VT': [VT]} - - if self._nb_features > self._nb_components: - # Exec svd - components = [] - svd_vt = ds.map_batches(svd_batch, batch_format = 'numpy') - - for row in svd_vt.iter_rows(): - 
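The parallel variant introduced above pushes the per-batch np.count_nonzero into ds.map_batches and reduces the resulting count vectors on the driver, instead of streaming every batch through iter_batches. A sketch of that pattern as a standalone function; the extra leading axis on the returned counts is an assumption made here so that every mapped batch yields exactly one output row holding the whole count vector (this detail differs from the patch code):

    import numpy as np
    import ray

    TENSOR_COLUMN_NAME = '__value__'

    def count_occurences(ds: ray.data.Dataset, nb_features: int) -> np.ndarray:
        """Count, per feature, in how many rows of ds the value is non-zero."""
        def get_occurences(batch):
            counts = np.count_nonzero(batch[TENSOR_COLUMN_NAME], axis = 0)
            return {'occurences': counts[np.newaxis, :]}   # one output row per input batch

        occurences = np.zeros(nb_features)
        for row in ds.map_batches(get_occurences, batch_format = 'numpy').iter_rows():
            occurences += row['occurences']                # reduce the per-batch count vectors
        return occurences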
components.append(row['VT']) - - components = np.concatenate(components, axis = 0) - - self.stats_ = {'components' : components} + # TODO: implement parallel computation for svd + # https://docs.scipy.org/doc/scipy/reference/generated/scipy.linalg.svd.html#scipy.linalg.svd + # https://github.com/scipy/scipy/blob/v1.11.3/scipy/linalg/_decomp_svd.py#L13-L138 """ # Incremental - # If too long to exec, will have to parallelise internal SVD computations components = None singular_values = None if self._nb_features > self._nb_components: diff --git a/src/models/preprocessors/tfidf_transformer.py b/src/models/preprocessors/tfidf_transformer.py index 88d899c..ba4dcc3 100644 --- a/src/models/preprocessors/tfidf_transformer.py +++ b/src/models/preprocessors/tfidf_transformer.py @@ -27,10 +27,18 @@ def _fit(self, ds: Dataset) -> Preprocessor: nb_samples = ds.count() # Nb of occurences - occurences = np.zeros(self._nb_features) - for batch in ds.iter_batches(batch_format = 'numpy'): + def get_occurences(batch): batch = batch[TENSOR_COLUMN_NAME] - occurences += np.count_nonzero(batch, axis = 0) + return {'occurences' : np.count_nonzero(batch, axis = 0)} + + # Nb of occurences + occurences = np.zeros(self._nb_features) + occur = ds.map_batches(get_occurences, batch_format = 'numpy') + # for batch in ds.iter_batches(batch_format = 'numpy'): + # batch = batch[TENSOR_COLUMN_NAME] + # occurences += np.count_nonzero(batch, axis = 0) + for row in occur.iter_rows(): + occurences += row['occurences'] idf = np.log(nb_samples / occurences) + 1 From f5ccbec396590506118b4762deb2de39e78ff585 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Mon, 13 Nov 2023 09:40:54 -0500 Subject: [PATCH 33/92] serial occurence counting --- src/data/reduction/occurence_exclusion.py | 12 ++++++------ src/models/preprocessors/tfidf_transformer.py | 12 ++++++------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/data/reduction/occurence_exclusion.py b/src/data/reduction/occurence_exclusion.py index ab65389..05aa6d3 100644 --- a/src/data/reduction/occurence_exclusion.py +++ b/src/data/reduction/occurence_exclusion.py @@ -28,12 +28,12 @@ def get_occurences(batch): # Nb of occurences occurences = np.zeros(self._nb_features) - occur = ds.map_batches(get_occurences, batch_format = 'numpy') - # for batch in ds.iter_batches(batch_format = 'numpy'): - # batch = batch[TENSOR_COLUMN_NAME] - # occurences += np.count_nonzero(batch, axis = 0) - for row in occur.iter_rows(): - occurences += row['occurences'] + # occur = ds.map_batches(get_occurences, batch_format = 'numpy') + for batch in ds.iter_batches(batch_format = 'numpy'): + batch = batch[TENSOR_COLUMN_NAME] + occurences += np.count_nonzero(batch, axis = 0) + # for row in occur.iter_rows(): + # occurences += row['occurences'] # Include / Exclude by sorted position cols_keep = pd.Series(occurences, index = self.features) diff --git a/src/models/preprocessors/tfidf_transformer.py b/src/models/preprocessors/tfidf_transformer.py index ba4dcc3..188ae0b 100644 --- a/src/models/preprocessors/tfidf_transformer.py +++ b/src/models/preprocessors/tfidf_transformer.py @@ -33,12 +33,12 @@ def get_occurences(batch): # Nb of occurences occurences = np.zeros(self._nb_features) - occur = ds.map_batches(get_occurences, batch_format = 'numpy') - # for batch in ds.iter_batches(batch_format = 'numpy'): - # batch = batch[TENSOR_COLUMN_NAME] - # occurences += np.count_nonzero(batch, axis = 0) - for row in occur.iter_rows(): - occurences += row['occurences'] + # occur = 
ds.map_batches(get_occurences, batch_format = 'numpy') + for batch in ds.iter_batches(batch_format = 'numpy'): + batch = batch[TENSOR_COLUMN_NAME] + occurences += np.count_nonzero(batch, axis = 0) + # for row in occur.iter_rows(): + # occurences += row['occurences'] idf = np.log(nb_samples / occurences) + 1 From a071131454a8d85baea9c333f3162c48f0ca9943 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Mon, 13 Nov 2023 17:40:12 -0500 Subject: [PATCH 34/92] parallel LSA --- src/data/reduction/occurence_exclusion.py | 7 -- ...tion.py => truncated_svd_decomposition.py} | 80 +++++++++++++------ src/models/kerasTF/models.py | 4 +- src/models/preprocessors/tfidf_transformer.py | 8 -- src/models/sklearn/models.py | 4 +- 5 files changed, 58 insertions(+), 45 deletions(-) rename src/data/reduction/{truncated_svd_reduction.py => truncated_svd_decomposition.py} (56%) diff --git a/src/data/reduction/occurence_exclusion.py b/src/data/reduction/occurence_exclusion.py index 05aa6d3..8eee147 100644 --- a/src/data/reduction/occurence_exclusion.py +++ b/src/data/reduction/occurence_exclusion.py @@ -22,18 +22,11 @@ def __init__(self, features: List[str], num_features: int): self._num_features = int(self._nb_features - num_features) def _fit(self, ds: Dataset) -> Preprocessor: - def get_occurences(batch): - batch = batch[TENSOR_COLUMN_NAME] - return {'occurences' : np.count_nonzero(batch, axis = 0)} - # Nb of occurences occurences = np.zeros(self._nb_features) - # occur = ds.map_batches(get_occurences, batch_format = 'numpy') for batch in ds.iter_batches(batch_format = 'numpy'): batch = batch[TENSOR_COLUMN_NAME] occurences += np.count_nonzero(batch, axis = 0) - # for row in occur.iter_rows(): - # occurences += row['occurences'] # Include / Exclude by sorted position cols_keep = pd.Series(occurences, index = self.features) diff --git a/src/data/reduction/truncated_svd_reduction.py b/src/data/reduction/truncated_svd_decomposition.py similarity index 56% rename from src/data/reduction/truncated_svd_reduction.py rename to src/data/reduction/truncated_svd_decomposition.py index ed653cb..6d4aa6f 100644 --- a/src/data/reduction/truncated_svd_reduction.py +++ b/src/data/reduction/truncated_svd_decomposition.py @@ -13,9 +13,9 @@ TENSOR_COLUMN_NAME = '__value__' -class TensorTruncatedSVDReduction(Preprocessor): +class TensorTruncatedSVDDecomposition(Preprocessor): """ - Custom class for using a mix of TruncatedSVD inspired by sklearn.decomposition.TruncatedSVD and applying a batched strategy inspired by sklearn.decomposition.IncrementalPCA to process batches in parallel. + Custom class for using a mix of TruncatedSVD inspired by sklearn.decomposition.TruncatedSVD and applying a batched strategy inspired by sklearn.decomposition.IncrementalPCA to process batches sequentially. This makes it possible to use the class as a Ray preprocessor in a features reduction strategy. TruncatedSVD performs linear dimensionality reduction by means of truncated singular value decomposition (SVD). When it is applied following the TF-IDF normalisation, it becomes a latent semantic analysis (LSA). 
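As an aside, the TF-IDF weighting followed by truncated SVD that these patches wire together (i.e. LSA) can be illustrated outside of Ray in a few lines. The sketch below is not the Caribou implementation: the count matrix, its shape and the number of components are invented for the example, the idf formula is the one used by TensorTfIdfTransformer (log(n_samples / occurrences) + 1), and a small guard against empty columns is added that only matters for the toy data.

# Minimal LSA sketch: TF-IDF-like weighting, then a truncated SVD projection.
# Toy data only; in Caribou both steps run batch-wise over a Ray Dataset.
import numpy as np
from sklearn.utils.extmath import randomized_svd

rng = np.random.default_rng(0)
X = rng.integers(0, 5, size=(100, 512)).astype(float)  # fake k-mer counts (samples x features)

# idf = log(n_samples / nb_samples_containing_feature) + 1
occurrences = np.count_nonzero(X, axis=0)
idf = np.log(X.shape[0] / np.maximum(occurrences, 1)) + 1  # maximum() only guards the toy example
X_tfidf = X * idf

# Truncated SVD of the weighted matrix; VT spans the reduced feature space
U, Sigma, VT = randomized_svd(X_tfidf, n_components=50, n_iter=2,
                              power_iteration_normalizer='LU')

# Project samples into the reduced space, mirroring np.dot(tensor_col, components.T)
X_reduced = X_tfidf @ VT.T
print(X_reduced.shape)  # (100, 50)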
@@ -23,7 +23,6 @@ class TensorTruncatedSVDReduction(Preprocessor): https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html#sklearn.decomposition.TruncatedSVD https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.IncrementalPCA.html#sklearn.decomposition.IncrementalPCA """ - def __init__(self, features: List[str], nb_components: int = 10000): # Parameters self.features = features @@ -34,36 +33,65 @@ def __init__(self, features: List[str], nb_components: int = 10000): self._var = 0.0 def _fit(self, ds: Dataset) -> Preprocessor: - # Parallel + # Parallel MiniBatchPCA """ - # TODO: implement parallel computation for svd - # https://docs.scipy.org/doc/scipy/reference/generated/scipy.linalg.svd.html#scipy.linalg.svd - # https://github.com/scipy/scipy/blob/v1.11.3/scipy/linalg/_decomp_svd.py#L13-L138 + Possibilities for parallel TruncatedSVD + * sklearn minibatch PCA -> PCA / SVD mostly equivalent + * implement parallel based on other library + * dask-ml has a truncated svd + * tf has a svd function """ - # Incremental - components = None - singular_values = None - if self._nb_features > self._nb_components: - for batch in tqdm(ds.iter_batches(batch_format = 'numpy')): - batch = batch[TENSOR_COLUMN_NAME] - batch = _unwrap_ndarray_object_type_if_needed(batch) - if components is not None: - # Build matrix of previous computations - batch = np.vstack( - ( - singular_values.reshape((-1, 1)) * components, - batch, - ) - ) - - U, Sigma, VT = randomized_svd( + """ + Option to implement parallel computation for SVD + 1- Sparse Dictionnary Learning -> encode data to sparse representation by sample + 2- Sparse PCA (sparse SVD?) -> construct a PCA from sparsely encoded data + It is possible to parallelize batches computation by applying the logic from MiniBatchDictionaryLearning and MiniBatchSparsePCA + """ + components = [] + def batch_svd(batch): + batch = batch[TENSOR_COLUMN_NAME] + batch = _unwrap_ndarray_object_type_if_needed(batch) + + U, Sigma, VT = randomized_svd( batch, n_components = self._nb_components, n_iter = 1, power_iteration_normalizer = 'LU', ) - components = VT - singular_values = Sigma + return {'VT' : VT} + + if self._nb_features > self._nb_components: + svd = ds.map_batches(batch_svd, batch_format = 'numpy') + for row in svd.iter_rows(): + components.append(row['VT']) + components = np.sum(components, axis = 0) + + # Incremental + # components = None + # singular_values = None + # if self._nb_features > self._nb_components: + # for batch in tqdm(ds.iter_batches(batch_format = 'numpy')): + # batch = batch[TENSOR_COLUMN_NAME] + # batch = _unwrap_ndarray_object_type_if_needed(batch) + # if components is not None: + # # Build matrix of previous computations + # batch = np.vstack( + # ( + # singular_values.reshape((-1, 1)) * components, + # batch, + # ) + # ) + # # U : (1000, 100), S : (100,), V : (100, 1024) + # # S.reshape : (100, 1), S.reshape * components : (100, 1024) + # # batch : (1000, 1024), vstack : (1100, 1024) + # U, Sigma, VT = randomized_svd( + # batch, + # n_components = self._nb_components, + # n_iter = 1, + # power_iteration_normalizer = 'LU', + # ) + # components = VT + # singular_values = Sigma self.stats_ = {'components' : components} else: diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index cf1936d..a6bf191 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -11,7 +11,7 @@ # Dimensions reduction from models.preprocessors.tfidf_transformer import 
TensorTfIdfTransformer from data.reduction.rdf_features_selection import TensorRDFFeaturesSelection -from data.reduction.truncated_svd_reduction import TensorTruncatedSVDReduction +from data.reduction.truncated_svd_decomposition import TensorTruncatedSVDDecomposition # Preprocessing from ray.data.preprocessors import LabelEncoder, Chain @@ -161,7 +161,7 @@ def preprocess(self, ds): self._encoder.fit(ds) ds = self._scaler.fit_transform(ds) # ds = self._preprocessor.fit_transform(ds) - self._reductor = TensorTruncatedSVDReduction(self.kmers) + self._reductor = TensorTruncatedSVDDecomposition(self.kmers) self._reductor.fit(ds) # Labels mapping if self._nb_classes == 2: diff --git a/src/models/preprocessors/tfidf_transformer.py b/src/models/preprocessors/tfidf_transformer.py index 188ae0b..88d899c 100644 --- a/src/models/preprocessors/tfidf_transformer.py +++ b/src/models/preprocessors/tfidf_transformer.py @@ -26,19 +26,11 @@ def __init__(self, features): def _fit(self, ds: Dataset) -> Preprocessor: nb_samples = ds.count() - # Nb of occurences - def get_occurences(batch): - batch = batch[TENSOR_COLUMN_NAME] - return {'occurences' : np.count_nonzero(batch, axis = 0)} - # Nb of occurences occurences = np.zeros(self._nb_features) - # occur = ds.map_batches(get_occurences, batch_format = 'numpy') for batch in ds.iter_batches(batch_format = 'numpy'): batch = batch[TENSOR_COLUMN_NAME] occurences += np.count_nonzero(batch, axis = 0) - # for row in occur.iter_rows(): - # occurences += row['occurences'] idf = np.log(nb_samples / occurences) + 1 diff --git a/src/models/sklearn/models.py b/src/models/sklearn/models.py index 04b38b2..0591372 100644 --- a/src/models/sklearn/models.py +++ b/src/models/sklearn/models.py @@ -10,7 +10,7 @@ # Dimensions reduction from models.preprocessors.tfidf_transformer import TensorTfIdfTransformer from data.reduction.rdf_features_selection import TensorRDFFeaturesSelection -from data.reduction.truncated_svd_reduction import TensorTruncatedSVDReduction +from data.reduction.truncated_svd_decomposition import TensorTruncatedSVDDecomposition # Preprocessing from ray.data.preprocessors import Chain @@ -117,7 +117,7 @@ def preprocess(self, ds): ds = self._scaler.fit_transform(ds) # ds = self._preprocessor.fit_transform(ds) # self.kmers = self._preprocessor.preprocessors[1].stats_['cols_keep'] - self._reductor = TensorTruncatedSVDReduction(self.kmers) + self._reductor = TensorTruncatedSVDDecomposition(self.kmers) self._reductor.fit(ds) # Labels mapping From e51e10bd72694223228731f0b8c031b8cce7680e Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Tue, 14 Nov 2023 16:51:07 -0500 Subject: [PATCH 35/92] features reduction using CountHashing --- src/data/reduction/count_hashing.py | 2 +- .../reduction/truncated_svd_decomposition.py | 47 +++++++++++-------- src/models/classification.py | 4 +- src/models/kerasTF/models.py | 16 ++----- src/models/sklearn/models.py | 13 ++--- 5 files changed, 39 insertions(+), 43 deletions(-) diff --git a/src/data/reduction/count_hashing.py b/src/data/reduction/count_hashing.py index 1b6506e..d23e0cf 100644 --- a/src/data/reduction/count_hashing.py +++ b/src/data/reduction/count_hashing.py @@ -19,7 +19,7 @@ class TensorCountHashing(Preprocessor): """ _is_fittable = False - def __init__(self, features: List[str], num_features: int): + def __init__(self, features: List[str], num_features: int = 1000): self.features = features self.num_features = num_features diff --git a/src/data/reduction/truncated_svd_decomposition.py 
b/src/data/reduction/truncated_svd_decomposition.py index 6d4aa6f..3cc14c8 100644 --- a/src/data/reduction/truncated_svd_decomposition.py +++ b/src/data/reduction/truncated_svd_decomposition.py @@ -1,3 +1,5 @@ +import os + import numpy as np import pandas as pd @@ -5,6 +7,7 @@ from typing import List from warnings import warn from ray.data import Dataset +from utils import save_Xy_data, load_Xy_data from sklearn.utils.extmath import randomized_svd @@ -23,17 +26,15 @@ class TensorTruncatedSVDDecomposition(Preprocessor): https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html#sklearn.decomposition.TruncatedSVD https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.IncrementalPCA.html#sklearn.decomposition.IncrementalPCA """ - def __init__(self, features: List[str], nb_components: int = 10000): + def __init__(self, features: List[str], nb_components: int = 100, file: str = ''): # Parameters self.features = features self._nb_features = len(features) self._nb_components = nb_components - self._n_samples_seen = 0 - self._mean = 0.0 - self._var = 0.0 - + self._file = file + def _fit(self, ds: Dataset) -> Preprocessor: - # Parallel MiniBatchPCA + # Parallel """ Possibilities for parallel TruncatedSVD * sklearn minibatch PCA -> PCA / SVD mostly equivalent @@ -51,20 +52,28 @@ def _fit(self, ds: Dataset) -> Preprocessor: def batch_svd(batch): batch = batch[TENSOR_COLUMN_NAME] batch = _unwrap_ndarray_object_type_if_needed(batch) - - U, Sigma, VT = randomized_svd( - batch, - n_components = self._nb_components, - n_iter = 1, - power_iteration_normalizer = 'LU', - ) - return {'VT' : VT} + U, S, V = randomized_svd( + batch, + n_components = self._nb_components, + n_iter = 1, + power_iteration_normalizer = 'LU', + ) + print(V.shape) + return {'V' : V} if self._nb_features > self._nb_components: - svd = ds.map_batches(batch_svd, batch_format = 'numpy') - for row in svd.iter_rows(): - components.append(row['VT']) - components = np.sum(components, axis = 0) + if os.path.isfile(self._file): + components = np.array(load_Xy_data(self._file)) + else: + # sampl = ds.random_sample(0.1) + # svd = sampl.map_batches(batch_svd, batch_format = 'numpy') + svd = ds.map_batches(batch_svd, batch_format = 'numpy') + print(svd.to_pandas()) + for row in svd.iter_rows(): + components.append(row['V']) + # components = np.vstack(components) + components = np.sum(components, axis = 0) + save_Xy_data(components, self._file) # Incremental # components = None @@ -108,7 +117,7 @@ def _transform_pandas(self, df: pd.DataFrame) -> pd.DataFrame: tensor_col = df[TENSOR_COLUMN_NAME] tensor_col = _unwrap_ndarray_object_type_if_needed(tensor_col) tensor_col = np.dot(tensor_col, components.T) - df[TENSOR_COLUMN_NAME] = pd.Series(list(tensor_col)) + df[TENSOR_COLUMN_NAME] = pd.Series(list(tensor_col)) return df diff --git a/src/models/classification.py b/src/models/classification.py index cbad2be..eaa682c 100644 --- a/src/models/classification.py +++ b/src/models/classification.py @@ -203,7 +203,7 @@ def _binary_training(self, datasets, taxa, file): taxa, self._database_data['kmers'] ) - model.preprocess(datasets[TRAINING_DATASET_NAME]) + model.preprocess(datasets[TRAINING_DATASET_NAME], os.path.join(self._outdirs['models_dir'], f'TruncatedSVD_components.npz')) model.fit(datasets) self._save_model(model, file) @@ -228,7 +228,7 @@ def _multiclass_training(self, datasets, taxa, file): taxa, self._database_data['kmers'] ) - model.preprocess(datasets[TRAINING_DATASET_NAME]) + 
model.preprocess(datasets[TRAINING_DATASET_NAME], os.path.join(self._outdirs['models_dir'], f'TruncatedSVD_components.npz')) model.fit(datasets) self._save_model(model, file) diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index a6bf191..a955274 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -9,6 +9,7 @@ from shutil import rmtree # Dimensions reduction +from data.reduction.count_hashing import TensorCountHashing from models.preprocessors.tfidf_transformer import TensorTfIdfTransformer from data.reduction.rdf_features_selection import TensorRDFFeaturesSelection from data.reduction.truncated_svd_decomposition import TensorTruncatedSVDDecomposition @@ -133,7 +134,7 @@ def __init__( elif self.classifier == 'widecnn': print('Training multiclass classifier based on Wide CNN Network') - def preprocess(self, ds): + def preprocess(self, ds, reductor_file): print('preprocess') labels = [] encoded = [] @@ -143,25 +144,16 @@ def preprocess(self, ds): if self._nb_classes == 2: self._encoder = ModelLabelEncoder(self.taxa) self._scaler = TensorTfIdfTransformer(self.kmers) - # self._preprocessor = Chain( - # TensorTfIdfTransformer(self.kmers), - # TensorRDFFeaturesSelection(self.kmers, self.taxa), - # ) else: self._encoder = Chain( LabelEncoder(self.taxa), OneHotTensorEncoder(self.taxa) ) self._scaler = TensorTfIdfTransformer(self.kmers) - # self._preprocessor = Chain( - # TensorTfIdfTransformer(self.kmers), - # TensorRDFFeaturesSelection(self.kmers, self.taxa), - # ) - + self._encoder.fit(ds) ds = self._scaler.fit_transform(ds) - # ds = self._preprocessor.fit_transform(ds) - self._reductor = TensorTruncatedSVDDecomposition(self.kmers) + self._reductor = TensorCountHashing(self.kmers, 10000) self._reductor.fit(ds) # Labels mapping if self._nb_classes == 2: diff --git a/src/models/sklearn/models.py b/src/models/sklearn/models.py index 0591372..650b62f 100644 --- a/src/models/sklearn/models.py +++ b/src/models/sklearn/models.py @@ -8,12 +8,12 @@ from shutil import rmtree # Dimensions reduction +from data.reduction.count_hashing import TensorCountHashing from models.preprocessors.tfidf_transformer import TensorTfIdfTransformer from data.reduction.rdf_features_selection import TensorRDFFeaturesSelection from data.reduction.truncated_svd_decomposition import TensorTruncatedSVDDecomposition # Preprocessing -from ray.data.preprocessors import Chain from models.encoders.model_label_encoder import ModelLabelEncoder from models.preprocessors.min_max_scaler import TensorMinMaxScaler from models.encoders.onesvm_label_encoder import OneClassSVMLabelEncoder @@ -98,7 +98,7 @@ def __init__( # Computes self._build() - def preprocess(self, ds): + def preprocess(self, ds, reductor_file): print('preprocess') if self.classifier == 'onesvm': self._encoder = OneClassSVMLabelEncoder(self.taxa) @@ -109,15 +109,10 @@ def preprocess(self, ds): self._scaler = TensorTfIdfTransformer(self.kmers) - # self._preprocessor = Chain( - # TensorTfIdfTransformer(self.kmers), - # TensorRDFFeaturesSelection(self.kmers, self.taxa), - # ) self._encoder.fit(ds) ds = self._scaler.fit_transform(ds) - # ds = self._preprocessor.fit_transform(ds) - # self.kmers = self._preprocessor.preprocessors[1].stats_['cols_keep'] - self._reductor = TensorTruncatedSVDDecomposition(self.kmers) + + self._reductor = TensorCountHashing(self.kmers, 10000) self._reductor.fit(ds) # Labels mapping From 4609ff97a22968dc24df35791bdacd2b16f62b6f Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Tue, 14 Nov 2023 
16:54:16 -0500 Subject: [PATCH 36/92] remove tfidf transform in preprocessing --- src/models/kerasTF/models.py | 2 +- src/models/sklearn/models.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index a955274..8d0d992 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -152,7 +152,7 @@ def preprocess(self, ds, reductor_file): self._scaler = TensorTfIdfTransformer(self.kmers) self._encoder.fit(ds) - ds = self._scaler.fit_transform(ds) + self._scaler.fit(ds) self._reductor = TensorCountHashing(self.kmers, 10000) self._reductor.fit(ds) # Labels mapping diff --git a/src/models/sklearn/models.py b/src/models/sklearn/models.py index 650b62f..f74fc8e 100644 --- a/src/models/sklearn/models.py +++ b/src/models/sklearn/models.py @@ -110,7 +110,7 @@ def preprocess(self, ds, reductor_file): self._scaler = TensorTfIdfTransformer(self.kmers) self._encoder.fit(ds) - ds = self._scaler.fit_transform(ds) + self._scaler.fit(ds) self._reductor = TensorCountHashing(self.kmers, 10000) self._reductor.fit(ds) From 20ad4875b36f1fdfef0be521f412d5deab712406 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Wed, 15 Nov 2023 18:13:27 -0500 Subject: [PATCH 37/92] TruncatedSVD + keras debugging --- src/data/reduction/count_hashing.py | 30 +++++++----- .../reduction/truncated_svd_decomposition.py | 49 ++++++++++++++----- src/models/kerasTF/build_neural_networks.py | 44 ++++++++--------- src/models/kerasTF/models.py | 12 +++-- src/models/models_utils.py | 2 +- src/models/sklearn/models.py | 7 +-- src/models/sklearn/partial_trainer.py | 11 +++-- 7 files changed, 95 insertions(+), 60 deletions(-) diff --git a/src/data/reduction/count_hashing.py b/src/data/reduction/count_hashing.py index d23e0cf..89f4c13 100644 --- a/src/data/reduction/count_hashing.py +++ b/src/data/reduction/count_hashing.py @@ -21,27 +21,33 @@ class TensorCountHashing(Preprocessor): def __init__(self, features: List[str], num_features: int = 1000): self.features = features - self.num_features = num_features + self._nb_features = len(features) + self._num_features = num_features def _transform_pandas(self, df: pd.DataFrame): def row_feature_hasher(row): hash_counts = collections.defaultdict(int) for feature in self.features: - hashed_value = simple_hash(feature, self.num_features) + hashed_value = simple_hash(feature, self._num_features) hash_counts[hashed_value] += row[feature] - return {f"hash_{i}": hash_counts[i] for i in range(self.num_features)} + return {f"hash_{i}": hash_counts[i] for i in range(self._num_features)} - tensor_col = df[TENSOR_COLUMN_NAME] - tensor_col = _unwrap_ndarray_object_type_if_needed(tensor_col) - tensor_col = pd.DataFrame(tensor_col, columns = self.features) + if self._nb_features > self._num_features: + tensor_col = df[TENSOR_COLUMN_NAME] + tensor_col = _unwrap_ndarray_object_type_if_needed(tensor_col) + tensor_col = pd.DataFrame(tensor_col, columns = self.features) - tensor_col = tensor_col.apply( - row_feature_hasher, axis=1, result_type="expand" - ) - - tensor_col = tensor_col.to_numpy() + tensor_col = tensor_col.apply( + row_feature_hasher, axis=1, result_type="expand" + ) + + self.stats_ = {'nb_features' : self._num_features} - df[TENSOR_COLUMN_NAME] = pd.Series(list(tensor_col)) + tensor_col = tensor_col.to_numpy() + + df[TENSOR_COLUMN_NAME] = pd.Series(list(tensor_col)) + + self.stats_ = {'nb_features' : self._nb_features} return df diff --git a/src/data/reduction/truncated_svd_decomposition.py 
b/src/data/reduction/truncated_svd_decomposition.py index 3cc14c8..e48c2c5 100644 --- a/src/data/reduction/truncated_svd_decomposition.py +++ b/src/data/reduction/truncated_svd_decomposition.py @@ -10,6 +10,7 @@ from utils import save_Xy_data, load_Xy_data from sklearn.utils.extmath import randomized_svd +from sklearn.decomposition import DictionaryLearning from ray.data.preprocessor import Preprocessor from ray.air.util.data_batch_conversion import _unwrap_ndarray_object_type_if_needed @@ -26,7 +27,7 @@ class TensorTruncatedSVDDecomposition(Preprocessor): https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html#sklearn.decomposition.TruncatedSVD https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.IncrementalPCA.html#sklearn.decomposition.IncrementalPCA """ - def __init__(self, features: List[str], nb_components: int = 100, file: str = ''): + def __init__(self, features: List[str], nb_components: int = 10000, file: str = ''): # Parameters self.features = features self._nb_features = len(features) @@ -48,33 +49,61 @@ def _fit(self, ds: Dataset) -> Preprocessor: 2- Sparse PCA (sparse SVD?) -> construct a PCA from sparsely encoded data It is possible to parallelize batches computation by applying the logic from MiniBatchDictionaryLearning and MiniBatchSparsePCA """ - components = [] + # Parallel + def batch_svd(batch): batch = batch[TENSOR_COLUMN_NAME] batch = _unwrap_ndarray_object_type_if_needed(batch) U, S, V = randomized_svd( batch, n_components = self._nb_components, - n_iter = 1, + n_iter = 2, power_iteration_normalizer = 'LU', ) - print(V.shape) return {'V' : V} + components = [] if self._nb_features > self._nb_components: if os.path.isfile(self._file): components = np.array(load_Xy_data(self._file)) else: # sampl = ds.random_sample(0.1) # svd = sampl.map_batches(batch_svd, batch_format = 'numpy') + svd = ds.map_batches(batch_svd, batch_size = 1, batch_format = 'numpy') + components = svd.random_shuffle().limit(self._nb_components).to_pandas()['V'] + components = _unwrap_ndarray_object_type_if_needed(components) + + save_Xy_data(components, self._file) + + self.stats_ = {'components' : components} + else: + warn('No features reduction to do because the number of features is already lower than the required number of components') + self.stats_ = {'components' : False} + """ + # Parallel multiple MiniBatchDictionaryLearning + def batch_svd(batch): + batch = batch[TENSOR_COLUMN_NAME] + batch = _unwrap_ndarray_object_type_if_needed(batch) + dict = DictionaryLearning( + n_components = self._nb_components, + max_iter = 10, + transform_algorithm = 'lasso_lars', + ) + dict.fit(batch) + return {'dictonnary' : [dict.components_]} + components = [] + if self._nb_features > self._nb_components: + if os.path.isfile(self._file): + components = np.array(load_Xy_data(self._file)) + else: svd = ds.map_batches(batch_svd, batch_format = 'numpy') print(svd.to_pandas()) for row in svd.iter_rows(): - components.append(row['V']) - # components = np.vstack(components) - components = np.sum(components, axis = 0) + components.append(row['dictonnary']) + components = np.mean(components, axis = 0) + print(components.shape) save_Xy_data(components, self._file) - + """ # Incremental # components = None # singular_values = None @@ -102,10 +131,6 @@ def batch_svd(batch): # components = VT # singular_values = Sigma - self.stats_ = {'components' : components} - else: - warn('No features reduction to do because the number of features is already lower than the required 
number of components') - self.stats_ = {'components' : False} return self diff --git a/src/models/kerasTF/build_neural_networks.py b/src/models/kerasTF/build_neural_networks.py index 81751dc..80bdc07 100644 --- a/src/models/kerasTF/build_neural_networks.py +++ b/src/models/kerasTF/build_neural_networks.py @@ -13,14 +13,14 @@ __all__ = ['build_attention','build_LSTM','build_deepLSTM','build_LSTM_attention','build_CNN','build_wideCNN'] # Self-aware binary classifier -def build_attention(nb_kmers): +def build_attention(nb_features): """ Function extracted from module virnet/NNClassifier.py of VirNet package [Abdelkareem et al. 2018] https://github.com/alyosama/virnet/blob/master/NNClassifier.py """ - inputs = Input(shape = (nb_kmers,)) - x = Embedding(nb_kmers, 128)(inputs) + inputs = Input(shape = (nb_features,)) + x = Embedding(nb_features, 128)(inputs) x = LSTM(128, return_sequences = True, dropout = 0.1, recurrent_dropout = 0.1 )(x) x = LSTM(128, return_sequences = True, dropout = 0.1, recurrent_dropout = 0.1 )(x) @@ -36,15 +36,15 @@ def build_attention(nb_kmers): return model # Recurrent binary classifier -def build_LSTM(nb_kmers): +def build_LSTM(nb_features): """ Function extracted from module seeker/train_model/train_model.py of Seeker package [Auslander et al. 2020] https://github.com/gussow/seeker/blob/master/train_model/train_model.py """ - inputs = Input(shape = (nb_kmers,)) - x = Embedding(nb_kmers, 128)(inputs) + inputs = Input(shape = (nb_features,)) + x = Embedding(nb_features, 128)(inputs) x = LSTM(128, recurrent_dropout = 0.1, dropout = 0.1)(x) @@ -56,16 +56,16 @@ def build_LSTM(nb_kmers): return model # Deep recurrent binary classifier -def build_deepLSTM(nb_kmers): +def build_deepLSTM(nb_features): """ Function adapted from module deeplasmid/classifier/dl/DL_Model.py of Deeplasmid package [Andreopoulos et al. 
2021] https://github.com/wandreopoulos/deeplasmid/blob/docker/classifier/dl/DL_Model.py """ - inputs = Input(shape=(nb_kmers,)) + inputs = Input(shape=(nb_features,)) - netA = Embedding(nb_kmers, 128)(inputs) + netA = Embedding(nb_features, 128)(inputs) netA = LSTM(40, activation='tanh',recurrent_dropout=0.05,dropout=0.1,name='A_%d'%40,return_sequences=True) (netA) netA = LSTM(40, activation='tanh',recurrent_dropout=0.05,dropout=0.1,name='B_%d'%40) (netA) @@ -87,7 +87,7 @@ def build_deepLSTM(nb_kmers): return model # Recurrent self-aware multiclass classifier -def build_LSTM_attention(nb_kmers, nb_classes): +def build_LSTM_attention(nb_features, nb_classes): """ Function adapted in keras from module DeepMicrobes/models/embed_lstm_attention.py and default values for layers in script DeepMicrobes/models/define_flags.py of @@ -95,12 +95,12 @@ def build_LSTM_attention(nb_kmers, nb_classes): https://github.com/MicrobeLab/DeepMicrobes/blob/master/models/embed_lstm_attention.py """ - inputs = Input(shape = (nb_kmers,)) - net = Embedding(nb_kmers, 100)(inputs) + inputs = Input(shape = (nb_features,)) + net = Embedding(nb_features, 100)(inputs) net = Bidirectional(LSTM(300, return_sequences=True))(net) net = Attention(dropout = 0.2)([net,net]) # MLP - net = Dense((nb_kmers * 300 * 2), activation = 'relu')(net) + net = Dense((nb_features * 300 * 2), activation = 'relu')(net) net = Dropout(0.2)(net) net = Dense(nb_classes, activation = 'relu')(net) net = Dropout(0.2)(net) @@ -113,7 +113,7 @@ def build_LSTM_attention(nb_kmers, nb_classes): return model # Convolutional multiclass classifier -def build_CNN(nb_kmers, nb_classes): +def build_CNN(nb_features, nb_classes): """ Function extracted from module MetagenomicDC/models/CNN.py of MetagenomicDC package [Fiannaca et al. 2018] @@ -121,7 +121,7 @@ def build_CNN(nb_kmers, nb_classes): """ model = Sequential() - model.add(Conv1D(5,5, input_shape = (nb_kmers, 1), padding = 'valid')) #input_dim + model.add(Conv1D(5,5, input_shape = (nb_features, 1), padding = 'valid')) #input_dim model.add(Activation('relu')) model.add(MaxPooling1D(pool_size = 2, padding = 'valid')) model.add(Conv1D(10, 5, padding = 'valid')) @@ -139,28 +139,28 @@ def build_CNN(nb_kmers, nb_classes): return model # Wide convolutional multiclass classifier -def build_wideCNN(nb_kmers, nb_classes): +def build_wideCNN(nb_features, nb_classes): """ Function adapted in keras from module CHEER/Classifier/model/Wcnn.py of CHEER package [Shang et al. 
2021] https://github.com/KennthShang/CHEER/blob/master/Classifier/model/Wcnn.py """ - inputs = Input(shape = (nb_kmers,)) + inputs = Input(shape = (nb_features,)) embed = Embedding(248, 100)(inputs) - embed = Reshape((nb_kmers, -1, 1))(embed) + embed = Reshape((nb_features, -1, 1))(embed) conv1 = Conv2D(256, 3, activation = 'relu')(embed) - conv1 = MaxPooling2D(pool_size = (1,1), strides = nb_kmers)(conv1) + conv1 = MaxPooling2D(pool_size = (1,1), strides = nb_features)(conv1) conv2 = Conv2D(256, 7, activation = 'relu')(embed) - conv2 = MaxPooling2D(pool_size = (1,1), strides = nb_kmers)(conv2) + conv2 = MaxPooling2D(pool_size = (1,1), strides = nb_features)(conv2) conv3 = Conv2D(256, 11, activation = 'relu')(embed) - conv3 = MaxPooling2D(pool_size = (1,1), strides = nb_kmers)(conv3) + conv3 = MaxPooling2D(pool_size = (1,1), strides = nb_features)(conv3) conv4 = Conv2D(256, 15, activation = 'relu')(embed) - conv4 = MaxPooling2D(pool_size = (1,1), strides = nb_kmers)(conv4) + conv4 = MaxPooling2D(pool_size = (1,1), strides = nb_features)(conv4) net = Concatenate(axis = 1)([conv1,conv2,conv3,conv4]) net = Flatten()(net) diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index 8d0d992..9187e69 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -152,8 +152,9 @@ def preprocess(self, ds, reductor_file): self._scaler = TensorTfIdfTransformer(self.kmers) self._encoder.fit(ds) - self._scaler.fit(ds) - self._reductor = TensorCountHashing(self.kmers, 10000) + ds = self._scaler.fit_transform(ds) + self._reductor = TensorTruncatedSVDDecomposition(self.kmers, 10000, reductor_file) + # self._reductor = TensorCountHashing(self.kmers, 10000) self._reductor.fit(ds) # Labels mapping if self._nb_classes == 2: @@ -182,6 +183,7 @@ def fit(self, datasets): ds = self._scaler.transform(ds) # ds = self._preprocessor.transform(ds) ds = self._reductor.transform(ds) + self._nb_features = self._reductor._nb_components # Trigger the preprocessing computations before ingest in trainer # Otherwise, it would be executed at each epoch ds = ds.materialize() @@ -191,7 +193,7 @@ def fit(self, datasets): self._train_params = { 'batch_size': self.batch_size, 'epochs': self._training_epochs, - 'size': self._nb_kmers, + 'size': self._nb_features, 'nb_cls': self._nb_classes, 'model': self.classifier } @@ -228,12 +230,12 @@ def predict(self, ds, threshold=0.8): # Preprocess ds = self._scaler.transform(ds) - # ds = self._preprocessor.transform(ds) + ds = self._reductor.transform(ds) self._predictor = BatchPredictor.from_checkpoint( self._model_ckpt, TensorflowPredictor, - model_definition = lambda: build_model(self.classifier, self._nb_classes, len(self.kmers)) + model_definition = lambda: build_model(self.classifier, self._nb_classes, self._nb_features) ) predictions = self._predictor.predict( data = ds, diff --git a/src/models/models_utils.py b/src/models/models_utils.py index 7e6e50f..2eeb0a9 100644 --- a/src/models/models_utils.py +++ b/src/models/models_utils.py @@ -68,12 +68,12 @@ def __init__( self._training_epochs = training_epochs # Initialize empty self._labels_map = None - # Initialize Ray variables self._clf = None self._encoder = None self._scaler = None self._preprocessor = None self._reductor = None + self._nb_features = None self._model_ckpt = None self._trainer = None self._train_params = {} diff --git a/src/models/sklearn/models.py b/src/models/sklearn/models.py index f74fc8e..9199c4e 100644 --- a/src/models/sklearn/models.py +++ b/src/models/sklearn/models.py @@ 
-110,9 +110,10 @@ def preprocess(self, ds, reductor_file): self._scaler = TensorTfIdfTransformer(self.kmers) self._encoder.fit(ds) - self._scaler.fit(ds) + ds = self._scaler.fit_transform(ds) - self._reductor = TensorCountHashing(self.kmers, 10000) + self._reductor = TensorTruncatedSVDDecomposition(self.kmers, 10000, reductor_file) + # self._reductor = TensorCountHashing(self.kmers, 10000) self._reductor.fit(ds) # Labels mapping @@ -179,6 +180,7 @@ def fit(self, datasets): ds = self._scaler.transform(ds) # ds = self._preprocessor.transform(ds) ds = self._reductor.transform(ds) + self._nb_features = self._reductor._nb_components # Trigger the preprocessing computations before ingest in trainer # Otherwise, it would be executed at each epoch ds = ds.materialize() @@ -218,7 +220,6 @@ def predict(self, ds, threshold = 0.8): print('predict') if ds.count() > 0: ds = self._scaler.transform(ds) - # ds = self._preprocessor.transform(ds) ds = self._reductor.transform(ds) predict_kwargs = {'features':self.kmers, 'num_estimator_cpus':-1} self._predictor = BatchPredictor.from_checkpoint(self._model_ckpt, SklearnTensorPredictor) diff --git a/src/models/sklearn/partial_trainer.py b/src/models/sklearn/partial_trainer.py index 021d9ce..046c88c 100644 --- a/src/models/sklearn/partial_trainer.py +++ b/src/models/sklearn/partial_trainer.py @@ -7,6 +7,7 @@ import numpy as np import pandas as pd +from tqdm import tqdm from joblib import parallel_backend from sklearn.metrics import check_scoring @@ -201,18 +202,18 @@ def training_loop(self): _set_cpu_params(self.estimator, num_cpus) - for epoch_X, epoch_y in zip(X_train.iter_epochs(), y_train.iter_epochs()): + for epoch_X, epoch_y in tqdm(zip(X_train.iter_epochs(), y_train.iter_epochs())): with parallel_backend("ray", n_jobs=num_cpus): start_time = time() for batch_X, batch_y in zip( epoch_X.iter_batches( - # batch_size = self._batch_size, - batch_size = 1, + batch_size = self._batch_size, + # batch_size = 1, batch_format = 'numpy' ), epoch_y.iter_batches( - # batch_size = self._batch_size, - batch_size = 1, + batch_size = self._batch_size, + # batch_size = 1, batch_format = 'numpy' ) ): From 23b6ddba6be8e67ed2b041354ac2d1db7131be12 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Thu, 16 Nov 2023 06:59:56 -0500 Subject: [PATCH 38/92] remove random sample in TruncatedSVD --- src/data/reduction/truncated_svd_decomposition.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/data/reduction/truncated_svd_decomposition.py b/src/data/reduction/truncated_svd_decomposition.py index e48c2c5..0a67cd0 100644 --- a/src/data/reduction/truncated_svd_decomposition.py +++ b/src/data/reduction/truncated_svd_decomposition.py @@ -69,8 +69,8 @@ def batch_svd(batch): else: # sampl = ds.random_sample(0.1) # svd = sampl.map_batches(batch_svd, batch_format = 'numpy') - svd = ds.map_batches(batch_svd, batch_size = 1, batch_format = 'numpy') - components = svd.random_shuffle().limit(self._nb_components).to_pandas()['V'] + svd = ds.map_batches(batch_svd, batch_format = 'numpy') + components = svd.limit(self._nb_components).to_pandas()['V'] components = _unwrap_ndarray_object_type_if_needed(components) save_Xy_data(components, self._file) From 40835bb7e46e45045fc684a73ccadaa213dc6dba Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Fri, 17 Nov 2023 20:49:16 -0500 Subject: [PATCH 39/92] req for CCDB + dataset name in classif --- requirements.txt | 18 ++++++++---------- src/Caribou_classification.py | 2 +- src/Caribou_classification_train_cv.py | 4 
++-- 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/requirements.txt b/requirements.txt index a8d4ad5..4a6ea4b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -26,8 +26,8 @@ google-auth-oauthlib==1.0.0 google-pasta==0.2.0 googleapis-common-protos==1.60.0 gpustat==1.1 -grpcio==1.48.2 -h5py==3.9.0 +grpcio==1.57.0 +h5py==3.8.0 idna==3.4 importlib-metadata==6.8.0 importlib-resources==6.0.0 @@ -41,7 +41,7 @@ Markdown==3.4.4 MarkupSafe==2.1.3 msgpack==1.0.5 multidict==6.0.4 -numpy==1.24.3 +numpy==1.25.2 nvidia-ml-py==12.535.77 oauthlib==3.2.2 opencensus==0.11.2 @@ -55,7 +55,7 @@ prometheus-client==0.13.1 protobuf==4.23.4 psutil==5.9.5 py-spy==0.3.14 -pyarrow==12.0.1 +pyarrow==12.0.0 pyasn1==0.5.0 pyasn1-modules==0.3.0 pydantic==1.10.12 @@ -67,7 +67,7 @@ ray==2.6.3 referencing==0.30.2 requests==2.31.0 requests-oauthlib==1.3.1 -rpds-py==0.9.2 +rpds-py==0.10.0 rsa==4.9 scikit-learn==1.3.0 scipy==1.10.1 @@ -77,9 +77,9 @@ tabulate==0.9.0 tensorboard==2.13.0 tensorboard-data-server==0.7.1 tensorboardX==2.6.2 -tensorflow==2.13.0 +tensorflow==2.14.0 tensorflow-estimator==2.13.0 -tensorflow-io-gcs-filesystem==0.33.0 +tensorflow-io-gcs-filesystem==0.32.0 termcolor==2.3.0 threadpoolctl==3.2.0 tqdm==4.65.0 @@ -92,6 +92,4 @@ wcwidth==0.2.6 Werkzeug==2.3.6 wrapt==1.15.0 yarl==1.9.2 -zipp==3.16.2 -xgboost==2.0.1 -xgboost_ray==0.1.18 \ No newline at end of file +zipp==3.16.2 \ No newline at end of file diff --git a/src/Caribou_classification.py b/src/Caribou_classification.py index 0c4b460..da9dbf7 100644 --- a/src/Caribou_classification.py +++ b/src/Caribou_classification.py @@ -47,7 +47,7 @@ def bacteria_classification(opt): if 'domain' in lst_taxas: lst_taxas.remove('domain') - val_ds, val_data = split_sim_dataset(db_ds, db_data, VALIDATION_DATASET_NAME) + val_ds, val_data = split_sim_dataset(db_ds, db_data, f"{VALIDATION_DATASET_NAME}_{opt['database_name']}") datasets = { TRAINING_DATASET_NAME : db_ds, diff --git a/src/Caribou_classification_train_cv.py b/src/Caribou_classification_train_cv.py index f6832a8..00fb97f 100644 --- a/src/Caribou_classification_train_cv.py +++ b/src/Caribou_classification_train_cv.py @@ -53,8 +53,8 @@ def bacteria_classification_train_cv(opt): for taxa in lst_taxas: - test_ds, test_data = split_sim_dataset(db_ds, db_data, TEST_DATASET_NAME) - val_ds, val_data = split_sim_dataset(db_ds, db_data, VALIDATION_DATASET_NAME) + test_ds, test_data = split_sim_dataset(db_ds, db_data, f"{TEST_DATASET_NAME}_{opt['database_name']}") + val_ds, val_data = split_sim_dataset(db_ds, db_data, f"{VALIDATION_DATASET_NAME}_{opt['database_name']}") datasets = { TRAINING_DATASET_NAME : db_ds, From 556c30268ea8d5c62159551f7240ee69419da4b0 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Sat, 18 Nov 2023 13:08:03 -0500 Subject: [PATCH 40/92] class weights + adjust requirements to be more fluid --- requirements copy.txt | 95 ++++++++++++++++++ requirements.txt | 97 ++----------------- src/models/classification.py | 8 +- src/models/encoders/model_label_encoder.py | 4 +- src/models/kerasTF/models.py | 21 ++-- src/models/models_utils.py | 1 + .../preprocessors/compute_class_weights.py | 49 ++++++++++ src/models/sklearn/models.py | 32 +++--- 8 files changed, 183 insertions(+), 124 deletions(-) create mode 100644 requirements copy.txt create mode 100644 src/models/preprocessors/compute_class_weights.py diff --git a/requirements copy.txt b/requirements copy.txt new file mode 100644 index 0000000..caf68a4 --- /dev/null +++ b/requirements copy.txt @@ -0,0 +1,95 @@ +absl-py==1.4.0 
+aiohttp==3.8.5 +aiohttp-cors==0.7.0 +aiosignal==1.3.1 +astunparse==1.6.3 +async-timeout==4.0.2 +attrs==23.1.0 +biopython==1.78 +blessed==1.20.0 +cachetools==5.3.1 +certifi==2023.7.22 +charset-normalizer==3.2.0 +click==8.1.6 +cloudpickle==2.2.1 +colorful==0.5.5 +Cython==3.0.0 +distlib==0.3.7 +filelock==3.12.2 +flatbuffers==23.5.26 +frozenlist==1.4.0 +future==0.18.3 +gast==0.4.0 +google-api-core==2.11.1 +google-auth==2.22.0 +google-auth-oauthlib==1.0.0 +google-pasta==0.2.0 +googleapis-common-protos==1.60.0 +gpustat==1.1 +grpcio==1.47.0 +h5py==3.8.0 +idna==3.4 +importlib-metadata==6.8.0 +importlib-resources==6.0.0 +InSilicoSeq==1.5.4 +joblib==1.3.1 +jsonschema==4.18.6 +jsonschema-specifications==2023.7.1 +keras==2.13.1 +libclang==16.0.6 +Markdown==3.4.4 +MarkupSafe==2.1.3 +msgpack==1.0.5 +multidict==6.0.4 +numpy==1.25.2 +nvidia-ml-py==12.535.77 +oauthlib==3.2.2 +opencensus==0.11.2 +opencensus-context==0.1.3 +opt-einsum==3.3.0 +packaging==23.1 +pandas==2.0.3 +pkgutil_resolve_name==1.3.10 +platformdirs==3.10.0 +prometheus-client==0.13.1 +protobuf==4.23.4 +psutil==5.9.5 +py-spy==0.3.14 +pyarrow==12.0.0 +pyasn1==0.5.0 +pyasn1-modules==0.3.0 +pydantic==1.10.12 +pysam==0.21.0 +python-dateutil==2.8.2 +pytz==2023.3 +PyYAML==6.0.1 +ray==2.6.3 +referencing==0.30.2 +requests==2.31.0 +requests-oauthlib==1.3.1 +rpds-py==0.10.0 +rsa==4.9 +scikit-learn==1.3.0 +scipy==1.10.1 +six==1.16.0 +smart-open==6.3.0 +tabulate==0.9.0 +tensorboard==2.13.0 +tensorboard-data-server==0.7.1 +tensorboardX==2.6.2 +tensorflow==2.14.0 +tensorflow-estimator==2.13.0 +tensorflow-io-gcs-filesystem==0.32.0 +termcolor==2.3.0 +threadpoolctl==3.2.0 +tqdm==4.65.0 +tune-sklearn==0.4.6 +typing_extensions==4.5.0 +tzdata==2023.3 +urllib3==1.26.16 +virtualenv==20.24.2 +wcwidth==0.2.6 +Werkzeug==2.3.6 +wrapt==1.15.0 +yarl==1.9.2 +zipp==3.16.2 \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 4a6ea4b..d409b51 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,95 +1,10 @@ -absl-py==1.4.0 -aiohttp==3.8.5 -aiohttp-cors==0.7.0 -aiosignal==1.3.1 -astunparse==1.6.3 -async-timeout==4.0.2 -attrs==23.1.0 biopython==1.78 -blessed==1.20.0 -cachetools==5.3.1 -certifi==2023.7.22 -charset-normalizer==3.2.0 -click==8.1.6 -cloudpickle==2.2.1 -colorful==0.5.5 -Cython==3.0.0 -distlib==0.3.7 -filelock==3.12.2 -flatbuffers==23.5.26 -frozenlist==1.4.0 -future==0.18.3 -gast==0.4.0 -google-api-core==2.11.1 -google-auth==2.22.0 -google-auth-oauthlib==1.0.0 -google-pasta==0.2.0 -googleapis-common-protos==1.60.0 -gpustat==1.1 -grpcio==1.57.0 -h5py==3.8.0 -idna==3.4 -importlib-metadata==6.8.0 -importlib-resources==6.0.0 +cloudpickle>=2.2.1 InSilicoSeq==1.5.4 -joblib==1.3.1 -jsonschema==4.18.6 -jsonschema-specifications==2023.7.1 -keras==2.13.1 -libclang==16.0.6 -Markdown==3.4.4 -MarkupSafe==2.1.3 -msgpack==1.0.5 -multidict==6.0.4 -numpy==1.25.2 -nvidia-ml-py==12.535.77 -oauthlib==3.2.2 -opencensus==0.11.2 -opencensus-context==0.1.3 -opt-einsum==3.3.0 -packaging==23.1 -pandas==2.0.3 -pkgutil_resolve_name==1.3.10 -platformdirs==3.10.0 -prometheus-client==0.13.1 -protobuf==4.23.4 -psutil==5.9.5 -py-spy==0.3.14 -pyarrow==12.0.0 -pyasn1==0.5.0 -pyasn1-modules==0.3.0 -pydantic==1.10.12 -pysam==0.21.0 -python-dateutil==2.8.2 -pytz==2023.3 -PyYAML==6.0.1 +keras==2.14 +numpy>=1.2 +pandas>=2.0 ray==2.6.3 -referencing==0.30.2 -requests==2.31.0 -requests-oauthlib==1.3.1 -rpds-py==0.10.0 -rsa==4.9 scikit-learn==1.3.0 -scipy==1.10.1 -six==1.16.0 -smart-open==6.3.0 -tabulate==0.9.0 -tensorboard==2.13.0 -tensorboard-data-server==0.7.1 
-tensorboardX==2.6.2 -tensorflow==2.14.0 -tensorflow-estimator==2.13.0 -tensorflow-io-gcs-filesystem==0.32.0 -termcolor==2.3.0 -threadpoolctl==3.2.0 -tqdm==4.65.0 -tune-sklearn==0.4.6 -typing_extensions==4.5.0 -tzdata==2023.3 -urllib3==1.26.16 -virtualenv==20.24.2 -wcwidth==0.2.6 -Werkzeug==2.3.6 -wrapt==1.15.0 -yarl==1.9.2 -zipp==3.16.2 \ No newline at end of file +tensorflow==2.14 +pyarrow==12.0 \ No newline at end of file diff --git a/src/models/classification.py b/src/models/classification.py index eaa682c..99a4a9b 100644 --- a/src/models/classification.py +++ b/src/models/classification.py @@ -103,19 +103,19 @@ def cross_validation(self, datasets): """ Public function to call the cross-validation method after validation of parameters Executes cross-validation of a model by fitting it and predicting over a test dataset - """ - + """ if isinstance(self._taxas, str): self._valid_assign_taxas() tax_map = self._verify_model_trained() test_ds = datasets.pop(TEST_DATASET_NAME) y_true, test_ds = self._get_true_classif(test_ds, self._taxas) - + self._fit(datasets, tax_map) model_mapping = self._verify_load_model() y_pred = self._cv_predict(test_ds, model_mapping) + cv_scores = self._score_cv(y_true, y_pred, self._taxas[0]) return cv_scores @@ -168,7 +168,7 @@ def _cv_predict(self, ds, model_map): """ mapping = {} for taxa, model in model_map.items(): - mapping[taxa] = model.predict(ds) # np.array + mapping[taxa] = model.predict(ds) # np.array return mapping # Private training secondary functions diff --git a/src/models/encoders/model_label_encoder.py b/src/models/encoders/model_label_encoder.py index 7084b2b..b635108 100644 --- a/src/models/encoders/model_label_encoder.py +++ b/src/models/encoders/model_label_encoder.py @@ -9,7 +9,7 @@ from ray.data import Dataset from ray.data.preprocessor import Preprocessor -from ray.data.preprocessors.encoder import _get_unique_value_indices, _validate_df +from ray.data.preprocessors.encoder import _get_unique_value_indices LABELS_COLUMN_NAME = 'labels' @@ -25,8 +25,6 @@ def _fit(self, dataset: Dataset) -> Preprocessor: return self def _transform_pandas(self, df: pd.DataFrame): - _validate_df(df, self.label_column) - def column_label_encoder(s: pd.Series): s_values = self.stats_[f"unique_values({s.name})"] return s.map(s_values) diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index 9187e69..08746c5 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -1,24 +1,18 @@ import os import gc -import ray import warnings import numpy as np import pandas as pd -from glob import glob -from shutil import rmtree - # Dimensions reduction -from data.reduction.count_hashing import TensorCountHashing from models.preprocessors.tfidf_transformer import TensorTfIdfTransformer -from data.reduction.rdf_features_selection import TensorRDFFeaturesSelection from data.reduction.truncated_svd_decomposition import TensorTruncatedSVDDecomposition # Preprocessing from ray.data.preprocessors import LabelEncoder, Chain -from models.preprocessors.min_max_scaler import TensorMinMaxScaler from models.encoders.model_label_encoder import ModelLabelEncoder from models.encoders.one_hot_tensor_encoder import OneHotTensorEncoder +from models.preprocessors.compute_class_weights import ComputeClassWeights # Parent class / models from models.models_utils import ModelsUtils @@ -27,11 +21,10 @@ # Training import tensorflow as tf from ray.air import session -from ray.train import DataConfig # from ray.air.integrations.keras import Callback +from 
ray.air.config import ScalingConfig from ray.air.integrations.keras import ReportCheckpointCallback -from ray.air.config import ScalingConfig #DatasetConfig -from ray.train.tensorflow import TensorflowTrainer, TensorflowCheckpoint, prepare_dataset_shard +from ray.train.tensorflow import TensorflowTrainer, TensorflowCheckpoint # Tuning from ray.air.config import RunConfig @@ -152,6 +145,11 @@ def preprocess(self, ds, reductor_file): self._scaler = TensorTfIdfTransformer(self.kmers) self._encoder.fit(ds) + + self._weights = ComputeClassWeights(LABELS_COLUMN_NAME) + self._weights.fit(ds) + self._weights = self._weights.stats_ + ds = self._scaler.fit_transform(ds) self._reductor = TensorTruncatedSVDDecomposition(self.kmers, 10000, reductor_file) # self._reductor = TensorCountHashing(self.kmers, 10000) @@ -183,7 +181,7 @@ def fit(self, datasets): ds = self._scaler.transform(ds) # ds = self._preprocessor.transform(ds) ds = self._reductor.transform(ds) - self._nb_features = self._reductor._nb_components + self._nb_features = self._reductor._nb_components if self._reductor._nb_components < self._nb_kmers else self._nb_kmers # Trigger the preprocessing computations before ingest in trainer # Otherwise, it would be executed at each epoch ds = ds.materialize() @@ -231,6 +229,7 @@ def predict(self, ds, threshold=0.8): # Preprocess ds = self._scaler.transform(ds) ds = self._reductor.transform(ds) + ds = ds.materialize() self._predictor = BatchPredictor.from_checkpoint( self._model_ckpt, diff --git a/src/models/models_utils.py b/src/models/models_utils.py index 2eeb0a9..6a97587 100644 --- a/src/models/models_utils.py +++ b/src/models/models_utils.py @@ -67,6 +67,7 @@ def __init__( self._nb_kmers = len(kmers_list) self._training_epochs = training_epochs # Initialize empty + self._weights = [] self._labels_map = None self._clf = None self._encoder = None diff --git a/src/models/preprocessors/compute_class_weights.py b/src/models/preprocessors/compute_class_weights.py new file mode 100644 index 0000000..43b4c5d --- /dev/null +++ b/src/models/preprocessors/compute_class_weights.py @@ -0,0 +1,49 @@ + +import numpy as np +import pandas as pd + +from ray.data.dataset import Dataset +from ray.data.preprocessor import Preprocessor + +TENSOR_COLUMN_NAME = '__value__' + +class ComputeClassWeights(Preprocessor): + """ + Custom implementation of Class Weight Computation inspired by sklearn.utils.class_weight.compute_class_weight to be used as a Ray preprocessor. + https://scikit-learn.org/stable/modules/generated/sklearn.utils.class_weight.compute_class_weight.html + This permits to estimate balanced class weights for an unbalanced dataset. 
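For reference, the 'balanced' rule this preprocessor reproduces is weight(c) = n_samples / (n_classes * count(c)), the same convention as sklearn.utils.class_weight.compute_class_weight. A minimal NumPy-only sketch, with invented labels and without the Ray aggregation, would be:

# Balanced class weights: weight_c = n_samples / (n_classes * count_c)
# Toy labels only; ComputeClassWeights derives the counts from the Ray dataset instead.
import numpy as np

labels = np.array(['bacteria'] * 90 + ['host'] * 10)
classes, counts = np.unique(labels, return_counts=True)
weights = len(labels) / (len(classes) * counts.astype(np.float64))
class_weight = dict(zip(classes, weights))
print(class_weight)  # {'bacteria': 0.555..., 'host': 5.0}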
+ """ + + def __init__(self, class_col): + # Parameters + self._col = class_col + self._cls = [] + self._counts_map = {} + + def _fit(self, ds: Dataset) -> Preprocessor: + def get_cls_counts(df): + mapping = {} + counts = df[self._col].value_counts() + for cls in self._cls: + if cls in counts.index: + mapping[str(cls)] = [counts[cls]] + else: + mapping[str(cls)] = [0] + return mapping + + self._cls = ds.unique(self._col) + + counts = ds.map_batches(get_cls_counts, batch_format = 'pandas') + + for cls in self._cls: + self._counts_map[str(cls)] = counts.sum(str(cls)) + + freqs = ds.count() / (len(self._cls) * np.array(list(self._counts_map.values())).astype(np.float64)) + + self.stats_ = {} + for i, cls in enumerate(self._cls): + self.stats_[cls] = freqs[i] + + return self + + diff --git a/src/models/sklearn/models.py b/src/models/sklearn/models.py index 9199c4e..4f7bc76 100644 --- a/src/models/sklearn/models.py +++ b/src/models/sklearn/models.py @@ -4,19 +4,14 @@ import numpy as np import pandas as pd -from glob import glob -from shutil import rmtree - # Dimensions reduction -from data.reduction.count_hashing import TensorCountHashing from models.preprocessors.tfidf_transformer import TensorTfIdfTransformer -from data.reduction.rdf_features_selection import TensorRDFFeaturesSelection from data.reduction.truncated_svd_decomposition import TensorTruncatedSVDDecomposition # Preprocessing from models.encoders.model_label_encoder import ModelLabelEncoder -from models.preprocessors.min_max_scaler import TensorMinMaxScaler from models.encoders.onesvm_label_encoder import OneClassSVMLabelEncoder +from models.preprocessors.compute_class_weights import ComputeClassWeights # Training from ray.air.config import ScalingConfig @@ -24,14 +19,13 @@ from sklearn.linear_model import SGDClassifier from models.sklearn.partial_trainer import SklearnPartialTrainer from models.sklearn.scoring_one_svm import ScoringSGDOneClassSVM -from models.sklearn.tensor_predictor import SklearnTensorPredictor # Tuning from ray.air.config import RunConfig # Predicting from ray.train.batch_predictor import BatchPredictor -from models.sklearn.probability_predictor import SklearnTensorProbaPredictor +from models.sklearn.tensor_predictor import SklearnTensorPredictor # Parent class from models.models_utils import ModelsUtils @@ -95,8 +89,6 @@ def __init__( ) # Parameters self._encoded = [] - # Computes - self._build() def preprocess(self, ds, reductor_file): print('preprocess') @@ -109,7 +101,12 @@ def preprocess(self, ds, reductor_file): self._scaler = TensorTfIdfTransformer(self.kmers) - self._encoder.fit(ds) + ds = self._encoder.fit_transform(ds) + + self._weights = ComputeClassWeights(LABELS_COLUMN_NAME) + self._weights.fit(ds) + self._weights = self._weights.stats_ + ds = self._scaler.fit_transform(ds) self._reductor = TensorTruncatedSVDDecomposition(self.kmers, 10000, reductor_file) @@ -123,7 +120,7 @@ def preprocess(self, ds, reductor_file): labels = np.append(labels, 'unknown') self._encoded = np.append(self._encoded, -1) self._labels_map = zip(labels, self._encoded) - + def _label_decode(self, predict): print('_label_decode') decoded = pd.Series(np.empty(len(predict), dtype=object)) @@ -151,6 +148,7 @@ def _build(self): 'penalty' : 'elasticnet', 'alpha' : 141.6146176, 'learning_rate' : 'adaptive', + 'class_weight' : self._weights, 'eta0' : 0.001, 'n_jobs' : -1 } @@ -162,7 +160,8 @@ def _build(self): 'alpha' : 173.5667373, 'learning_rate' : 'optimal', 'loss': 'modified_huber', - 'penalty' : 'l2' + 'penalty' : 'l2', + 
'class_weight' : self._weights, } elif self.classifier == 'mnb': print('Training multiclass Multinomial Naive Bayes classifier') @@ -174,17 +173,19 @@ def _build(self): def fit(self, datasets): print('_fit_model') + # Define model + self._build() for name, ds in datasets.items(): ds = ds.drop_columns(['id']) ds = self._encoder.transform(ds) ds = self._scaler.transform(ds) - # ds = self._preprocessor.transform(ds) ds = self._reductor.transform(ds) - self._nb_features = self._reductor._nb_components + self._nb_features = self._reductor._nb_components if self._reductor._nb_components < self._nb_kmers else self._nb_kmers # Trigger the preprocessing computations before ingest in trainer # Otherwise, it would be executed at each epoch ds = ds.materialize() datasets[name] = ray.put(ds) + try: training_labels = self._encoded.copy() training_labels = np.delete(training_labels, np.where(training_labels == -1)) @@ -221,6 +222,7 @@ def predict(self, ds, threshold = 0.8): if ds.count() > 0: ds = self._scaler.transform(ds) ds = self._reductor.transform(ds) + ds = ds.materialize() predict_kwargs = {'features':self.kmers, 'num_estimator_cpus':-1} self._predictor = BatchPredictor.from_checkpoint(self._model_ckpt, SklearnTensorPredictor) predictions = self._predictor.predict(ds, batch_size = self.batch_size, feature_columns = [TENSOR_COLUMN_NAME], **predict_kwargs) From 6608365320687f179d156139e7467c1e5da931f1 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Mon, 20 Nov 2023 18:28:49 -0500 Subject: [PATCH 41/92] cls weights + TruncatedSVD in separated step --- setup.cfg | 1 + src/Caribou_classification.py | 6 +- src/Caribou_classification_train_cv.py | 6 +- src/Caribou_dimensions_decomposition.py | 104 +++++++++++++++++++++ src/Caribou_extraction.py | 6 +- src/Caribou_extraction_train_cv.py | 6 +- src/Caribou_reduce_features.py | 56 +++++------ src/models/classification.py | 29 ++++-- src/models/encoders/model_label_encoder.py | 2 +- src/models/kerasTF/models.py | 53 +++++------ src/models/models_utils.py | 37 ++++++-- src/models/sklearn/models.py | 40 +++----- src/utils.py | 10 +- 13 files changed, 246 insertions(+), 110 deletions(-) create mode 100644 src/Caribou_dimensions_decomposition.py diff --git a/setup.cfg b/setup.cfg index fce114a..a9f82f6 100644 --- a/setup.cfg +++ b/setup.cfg @@ -35,6 +35,7 @@ scripts = src/Caribou_kmers.py src/Caribou_reduce_features.py src/Caribou_simulate_test_val.py + src/Caribou_dimensions_decomposition.py src/Caribou_extraction.py src/Caribou_classification.py src/Caribou_extraction_train_cv.py diff --git a/src/Caribou_classification.py b/src/Caribou_classification.py index da9dbf7..1e1da9c 100644 --- a/src/Caribou_classification.py +++ b/src/Caribou_classification.py @@ -47,6 +47,9 @@ def bacteria_classification(opt): if 'domain' in lst_taxas: lst_taxas.remove('domain') + # Verify need for scaling + scaling = verify_need_scaling(db_data) + val_ds, val_data = split_sim_dataset(db_ds, db_data, f"{VALIDATION_DATASET_NAME}_{opt['database_name']}") datasets = { @@ -66,7 +69,8 @@ def bacteria_classification(opt): clf_multiclass = opt['model_type'], taxa = 'domain', batch_size = opt['batch_size'], - training_epochs = opt['training_epochs'] + training_epochs = opt['training_epochs'], + scaling = scaling ) # Execution of bacteria taxonomic classification on metagenome + save results diff --git a/src/Caribou_classification_train_cv.py b/src/Caribou_classification_train_cv.py index 00fb97f..1f707b9 100644 --- a/src/Caribou_classification_train_cv.py +++ 
b/src/Caribou_classification_train_cv.py @@ -51,6 +51,9 @@ def bacteria_classification_train_cv(opt): if 'domain' in lst_taxas: lst_taxas.remove('domain') + # Verify need for scaling + scaling = verify_need_scaling(db_data) + for taxa in lst_taxas: test_ds, test_data = split_sim_dataset(db_ds, db_data, f"{TEST_DATASET_NAME}_{opt['database_name']}") @@ -72,7 +75,8 @@ def bacteria_classification_train_cv(opt): clf_multiclass = opt['model_type'], taxa = taxa, batch_size = opt['batch_size'], - training_epochs = opt['training_epochs'] + training_epochs = opt['training_epochs'], + scaling = scaling ) t_s = time() diff --git a/src/Caribou_dimensions_decomposition.py b/src/Caribou_dimensions_decomposition.py new file mode 100644 index 0000000..52ccff6 --- /dev/null +++ b/src/Caribou_dimensions_decomposition.py @@ -0,0 +1,104 @@ +#!/usr/bin python3 + +import ray +import os.path +import argparse + +import numpy as np + +from utils import * +from time import time +from glob import glob +from pathlib import Path + +from ray.data.preprocessors import Chain +from models.preprocessors.tfidf_transformer import TensorTfIdfTransformer +from data.reduction.truncated_svd_decomposition import TensorTruncatedSVDDecomposition + +__author__ = "Nicolas de Montigny" + +__all__ = ['dimensions_decomposition'] + +""" +This script computes dimensions decomposition via TruncatedSVD and saves a reduced version of the dataset. +""" + +# Initialisation / validation of parameters from CLI +################################################################################ +def dimensions_decomposition(opt): + + # Verify existence of files and load data + data = verify_load_data(opt['dataset']) + + # Verification of k length + k_length = len(data['kmers'][0]) + verify_file(opt['kmers_list']) + k_length, kmers = verify_kmers_list_length(k_length, opt['kmers_list']) + + outdirs = define_create_outdirs(opt['outdir']) + + # Initialize cluster + init_ray_cluster(opt['workdir']) + +# Dimensions decomposition +################################################################################ + + # Define new file + path, ext = os.path.splitext(opt['dataset']) + data_file = f'{path}_decomposed{ext}' + + if not os.path.exists(data_file): + if opt['nb_features'] < len(kmers): + # Load data + files_lst = glob(os.path.join(data['profile'],'*.parquet')) + ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + + reductor_file = os.path.join(outdirs['models_dir'], 'TruncatedSVD_components.npz') + + # Compute the decomposition + preprocessor = Chain( + TensorTfIdfTransformer( + features = kmers + ), + TensorTruncatedSVDDecomposition( + features = kmers, + nb_components = opt['nb_components'], + file = reductor_file + ) + ) + t_s = time() + ds = preprocessor.fit_transform(ds) + t_decomposition = time() - t_s + + # Save decomposed dataset + data['profile'] = f"{data['profile']}_decomposed" + data['kmers'] = [f'feature_{i}' for i in np.arange(preprocessor._nb_components)] + ds.write_parquet(data['profile']) + + # Save decomposed data + save_Xy_data(data, data_file) + + print(f"Caribou finished decomposing the features of {opt['dataset_name']} in {t_decomposition} seconds.") + else: + print('Caribou did not decompose the features because the number to extract is bigger than the actual number of features') + else: + print("Caribou did not decompose the features because the file already exists") + +# Argument parsing from CLI +################################################################################ + +if __name__ == 
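As a point of reference for the chained preprocessors this script builds, a rough in-memory scikit-learn equivalent of the TF-IDF scaling followed by TruncatedSVD decomposition looks as follows (toy data and sizes, not the distributed Ray implementation used above):

import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import TruncatedSVD

# Illustrative k-mer count matrix: 100 samples x 2000 features.
rng = np.random.default_rng(0)
X = rng.poisson(1.0, size=(100, 2000))

pipeline = make_pipeline(
    TfidfTransformer(),             # scale counts by inverse document frequency
    TruncatedSVD(n_components=50),  # project onto 50 latent components
)
X_decomposed = pipeline.fit_transform(X)
print(X_decomposed.shape)           # (100, 50)
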
"__main__": + parser = argparse.ArgumentParser(description='This script computes features reduction to a given K-mers dataset and then applies it.') + # Dataset + parser.add_argument('-db','--dataset', required=True, type=Path, help='PATH to a npz file containing the data corresponding to the k-mers profile for the bacteria database') + parser.add_argument('-dt','--dataset_name', default='dataset', help='Name of the dataset used to name files') + parser.add_argument('-l','--kmers_list', default=None, type=Path, help='PATH to a file containing a list of k-mers that will be reduced') + # Parameters + parser.add_argument('-n','--nb_components', default=1000, type=int, help='Number of components to decompose data into') + parser.add_argument('-o','--outdir', required=True, type=Path, help='PATH to a directory on file where outputs will be saved') + parser.add_argument('-wd','--workdir', default='/tmp/spill', type=Path, help='Optional. Path to a working directory where tuning data will be spilled') + args = parser.parse_args() + + opt = vars(args) + + dimensions_decomposition(opt) diff --git a/src/Caribou_extraction.py b/src/Caribou_extraction.py index 3876f2b..a2168fb 100644 --- a/src/Caribou_extraction.py +++ b/src/Caribou_extraction.py @@ -56,6 +56,9 @@ def bacteria_extraction(opt): db_data, db_ds = verify_load_db(opt['data_bacteria']) db_name = opt['dataset_name'] + # Verify need for scaling + scaling = verify_need_scaling(db_data) + datasets = { TRAINING_DATASET_NAME : db_ds, VALIDATION_DATASET_NAME : val_ds @@ -73,7 +76,8 @@ def bacteria_extraction(opt): clf_binary = opt['model_type'], taxa = 'domain', batch_size = opt['batch_size'], - training_epochs = opt['training_epochs'] + training_epochs = opt['training_epochs'], + scaling = scaling ) # Execution of bacteria extraction / host removal on metagenome + save results diff --git a/src/Caribou_extraction_train_cv.py b/src/Caribou_extraction_train_cv.py index 1c73cad..2547886 100644 --- a/src/Caribou_extraction_train_cv.py +++ b/src/Caribou_extraction_train_cv.py @@ -52,6 +52,9 @@ def bacteria_extraction_train_cv(opt): db_data, db_ds = verify_load_db(opt['data_bacteria']) db_name = opt['database_name'] + # Verify need for scaling + scaling = verify_need_scaling(db_data) + datasets = { TRAINING_DATASET_NAME : db_ds, TEST_DATASET_NAME : test_ds, @@ -69,7 +72,8 @@ def bacteria_extraction_train_cv(opt): clf_binary = opt['model_type'], taxa = 'domain', batch_size = opt['batch_size'], - training_epochs = opt['training_epochs'] + training_epochs = opt['training_epochs'], + scaling = scaling ) t_s = time() diff --git a/src/Caribou_reduce_features.py b/src/Caribou_reduce_features.py index 412b8bc..c95861d 100644 --- a/src/Caribou_reduce_features.py +++ b/src/Caribou_reduce_features.py @@ -58,35 +58,39 @@ def features_reduction(opt): 2. 
TruncatedSVD decomposition (map the features to 10 000 decomposed features if there is still more) """ - # Load data - files_lst = glob(os.path.join(data['profile'],'*.parquet')) - export_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - train_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - # Time the computation of transformations - t_start = time() - # Features scaling - train_ds = tfidf_transform(train_ds, kmers) - # Brute force features exclusion - train_ds, export_ds, kmers = occurence_exclusion(train_ds, export_ds, kmers) - train_ds, export_ds, kmers = low_var_selection(train_ds, export_ds, kmers) - # Statistical features selection - train_ds, export_ds, data['kmers'] = features_selection(train_ds, export_ds, kmers, opt['taxa']) - # Time the computation of transformations - t_end = time() - t_reduction = t_end - t_start - # Save reduced dataset - data['profile'] = f"{data['profile']}_reduced" - export_ds.write_parquet(data['profile']) - # Save reduced K-mers - with open(os.path.join(outdirs["data_dir"],'kmers_list_reduced.txt'),'w') as handle: - handle.writelines("%s\n" % item for item in data['kmers']) - # Save reduced data + # Define new file path, ext = os.path.splitext(opt['dataset']) data_file = f'{path}_reduced{ext}' - save_Xy_data(data, data_file) - - print(f"Caribou finished reducing k-mers features of {opt['dataset_name']} in {t_reduction} seconds.") + if not os.path.exists(data_file): + # Load data + files_lst = glob(os.path.join(data['profile'],'*.parquet')) + export_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + train_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + # Time the computation of transformations + t_start = time() + # Features scaling + train_ds = tfidf_transform(train_ds, kmers) + # Brute force features exclusion + train_ds, export_ds, kmers = occurence_exclusion(train_ds, export_ds, kmers) + train_ds, export_ds, kmers = low_var_selection(train_ds, export_ds, kmers) + # Statistical features selection + train_ds, export_ds, data['kmers'] = features_selection(train_ds, export_ds, kmers, opt['taxa']) + # Time the computation of transformations + t_end = time() + t_reduction = t_end - t_start + # Save reduced dataset + data['profile'] = f"{data['profile']}_reduced" + export_ds.write_parquet(data['profile']) + # Save reduced K-mers + with open(os.path.join(outdirs["data_dir"],'kmers_list_reduced.txt'),'w') as handle: + handle.writelines("%s\n" % item for item in data['kmers']) + # Save reduced data + save_Xy_data(data, data_file) + + print(f"Caribou finished reducing k-mers features of {opt['dataset_name']} in {t_reduction} seconds.") + else: + print("Caribou did not reduce features because the file already exists") # TF-IDF scaling of the features def tfidf_transform(ds, kmers): preprocessor = TensorTfIdfTransformer( diff --git a/src/models/classification.py b/src/models/classification.py index 99a4a9b..a9ce5b1 100644 --- a/src/models/classification.py +++ b/src/models/classification.py @@ -48,18 +48,20 @@ def __init__( clf_multiclass: str = None, taxa: [str, List] = None, batch_size: int = 32, - training_epochs: int = 100 + training_epochs: int = 100, + scaling = False ): # Parameters self._taxas = taxa self._outdirs = outdirs + self._scaling = scaling self._database = db_name self._database_data = db_data self._classifier_binary = clf_binary self._classifier_multiclass = clf_multiclass self._batch_size = batch_size self._training_epochs = training_epochs - 
# Init not fitted + # Init False self.is_fitted = False # Public functions @@ -130,6 +132,7 @@ def _fit(self, datasets, tax_map): """ Fit the given model to the training dataset """ + for taxa, file in tax_map.items(): if taxa in ['domain','bacteria','host']: self._binary_training(datasets, taxa, file) @@ -183,7 +186,8 @@ def _binary_training(self, datasets, taxa, file): self._batch_size, self._training_epochs, taxa, - self._database_data['kmers'] + self._database_data['kmers'], + self._database_data['csv'] ) elif self._classifier_binary == 'linearsvm': model = SklearnModel( @@ -192,7 +196,8 @@ def _binary_training(self, datasets, taxa, file): self._batch_size, self._training_epochs, taxa, - self._database_data['kmers'] + self._database_data['kmers'], + self._database_data['csv'] ) else: model = KerasTFModel( @@ -201,9 +206,10 @@ def _binary_training(self, datasets, taxa, file): self._batch_size, self._training_epochs, taxa, - self._database_data['kmers'] + self._database_data['kmers'], + self._database_data['csv'] ) - model.preprocess(datasets[TRAINING_DATASET_NAME], os.path.join(self._outdirs['models_dir'], f'TruncatedSVD_components.npz')) + model.preprocess(datasets[TRAINING_DATASET_NAME], self._scaling) model.fit(datasets) self._save_model(model, file) @@ -217,7 +223,8 @@ def _multiclass_training(self, datasets, taxa, file): self._batch_size, self._training_epochs, taxa, - self._database_data['kmers'] + self._database_data['kmers'], + self._database_data['csv'] ) else: model = KerasTFModel( @@ -226,9 +233,10 @@ def _multiclass_training(self, datasets, taxa, file): self._batch_size, self._training_epochs, taxa, - self._database_data['kmers'] + self._database_data['kmers'], + self._database_data['csv'] ) - model.preprocess(datasets[TRAINING_DATASET_NAME], os.path.join(self._outdirs['models_dir'], f'TruncatedSVD_components.npz')) + model.preprocess(datasets[TRAINING_DATASET_NAME], self._scaling) model.fit(datasets) self._save_model(model, file) @@ -424,4 +432,5 @@ def _save_dataset(self, ds, taxa): model = self._classifier_multiclass file = os.path.join(self._outdirs['results'], f'data_classified_{model}_{taxa}.parquet') ds.write_parquet(file) - return file \ No newline at end of file + return file + \ No newline at end of file diff --git a/src/models/encoders/model_label_encoder.py b/src/models/encoders/model_label_encoder.py index b635108..3317257 100644 --- a/src/models/encoders/model_label_encoder.py +++ b/src/models/encoders/model_label_encoder.py @@ -25,8 +25,8 @@ def _fit(self, dataset: Dataset) -> Preprocessor: return self def _transform_pandas(self, df: pd.DataFrame): + s_values = self.stats_ def column_label_encoder(s: pd.Series): - s_values = self.stats_[f"unique_values({s.name})"] return s.map(s_values) df[self.label_column] = df[self.label_column].transform(column_label_encoder) diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index 08746c5..827225f 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -4,15 +4,11 @@ import numpy as np import pandas as pd -# Dimensions reduction -from models.preprocessors.tfidf_transformer import TensorTfIdfTransformer -from data.reduction.truncated_svd_decomposition import TensorTruncatedSVDDecomposition - # Preprocessing from ray.data.preprocessors import LabelEncoder, Chain from models.encoders.model_label_encoder import ModelLabelEncoder from models.encoders.one_hot_tensor_encoder import OneHotTensorEncoder -from models.preprocessors.compute_class_weights import ComputeClassWeights +from 
models.preprocessors.tfidf_transformer import TensorTfIdfTransformer # Parent class / models from models.models_utils import ModelsUtils @@ -84,7 +80,8 @@ def __init__( batch_size, training_epochs, taxa, - kmers_list + kmers_list, + csv ): super().__init__( classifier, @@ -92,7 +89,8 @@ def __init__( batch_size, training_epochs, taxa, - kmers_list + kmers_list, + csv ) # Parameters # Initialize hidden @@ -127,7 +125,7 @@ def __init__( elif self.classifier == 'widecnn': print('Training multiclass classifier based on Wide CNN Network') - def preprocess(self, ds, reductor_file): + def preprocess(self, ds, scaling = False): print('preprocess') labels = [] encoded = [] @@ -136,24 +134,19 @@ def preprocess(self, ds, reductor_file): self._nb_classes = len(np.unique(labels)) if self._nb_classes == 2: self._encoder = ModelLabelEncoder(self.taxa) - self._scaler = TensorTfIdfTransformer(self.kmers) + if scaling: + self._scaler = TensorTfIdfTransformer(self.kmers) else: self._encoder = Chain( LabelEncoder(self.taxa), OneHotTensorEncoder(self.taxa) ) - self._scaler = TensorTfIdfTransformer(self.kmers) + if scaling: + self._scaler = TensorTfIdfTransformer(self.kmers) self._encoder.fit(ds) - - self._weights = ComputeClassWeights(LABELS_COLUMN_NAME) - self._weights.fit(ds) - self._weights = self._weights.stats_ - - ds = self._scaler.fit_transform(ds) - self._reductor = TensorTruncatedSVDDecomposition(self.kmers, 10000, reductor_file) - # self._reductor = TensorCountHashing(self.kmers, 10000) - self._reductor.fit(ds) + if scaling: + self._scaler.fit(ds) # Labels mapping if self._nb_classes == 2: labels = list(self._encoder.stats_[f'unique_values({self.taxa})'].keys()) @@ -163,6 +156,7 @@ def preprocess(self, ds, reductor_file): labels = np.append(labels, 'unknown') encoded = np.append(encoded, -1) self._labels_map = zip(labels, encoded) + self._compute_weights() def _label_decode(self, predict): print('_label_decode') @@ -178,10 +172,8 @@ def fit(self, datasets): for name, ds in datasets.items(): ds = ds.drop_columns(['id']) ds = self._encoder.transform(ds) - ds = self._scaler.transform(ds) - # ds = self._preprocessor.transform(ds) - ds = self._reductor.transform(ds) - self._nb_features = self._reductor._nb_components if self._reductor._nb_components < self._nb_kmers else self._nb_kmers + if self._scaler is not None: + ds = self._scaler.transform(ds) # Trigger the preprocessing computations before ingest in trainer # Otherwise, it would be executed at each epoch ds = ds.materialize() @@ -191,9 +183,10 @@ def fit(self, datasets): self._train_params = { 'batch_size': self.batch_size, 'epochs': self._training_epochs, - 'size': self._nb_features, + 'size': self._nb_kmers, 'nb_cls': self._nb_classes, - 'model': self.classifier + 'model': self.classifier, + 'weights': self._weights } # Define trainer / tuner @@ -227,14 +220,14 @@ def predict(self, ds, threshold=0.8): ds = ds.drop_columns(col_2_drop) # Preprocess - ds = self._scaler.transform(ds) - ds = self._reductor.transform(ds) + if self._scaler is not None: + ds = self._scaler.transform(ds) ds = ds.materialize() self._predictor = BatchPredictor.from_checkpoint( self._model_ckpt, TensorflowPredictor, - model_definition = lambda: build_model(self.classifier, self._nb_classes, self._nb_features) + model_definition = lambda: build_model(self.classifier, self._nb_classes, self._nb_kmers) ) predictions = self._predictor.predict( data = ds, @@ -308,8 +301,7 @@ def train_func(config): size = config.get('size') nb_cls = config.get('nb_cls') model = 
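The class weights carried in the train config feed Keras' standard cost-sensitive training hook: Model.fit(..., class_weight=...) expects a dict mapping each encoded class index to a multiplier applied to that class' loss. A self-contained toy example (model, data and weight values are made up for illustration):

import numpy as np
from tensorflow.keras import layers, models

# Imbalanced toy data: 90 samples of class 0, 10 samples of class 1.
X = np.random.rand(100, 20).astype('float32')
y = np.array([0] * 90 + [1] * 10)

toy_model = models.Sequential([
    layers.Dense(16, activation='relu', input_shape=(20,)),
    layers.Dense(1, activation='sigmoid'),
])
toy_model.compile(optimizer='adam', loss='binary_crossentropy')

# Up-weight the rare class so both classes contribute comparably to the loss.
toy_model.fit(X, y, epochs=2, batch_size=16,
              class_weight={0: 100 / (2 * 90), 1: 100 / (2 * 10)},
              verbose=0)
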
config.get('model') - - + weights = config.get('weights') # Model construction model = build_model(model, nb_cls, size) @@ -336,6 +328,7 @@ def train_func(config): x = batch_train, validation_data = batch_val, callbacks = [ReportCheckpointCallback()], + class_weight = weights, verbose = 0 ) session.report({ diff --git a/src/models/models_utils.py b/src/models/models_utils.py index 6a97587..3bf2e5e 100644 --- a/src/models/models_utils.py +++ b/src/models/models_utils.py @@ -1,10 +1,14 @@ import os import warnings +import numpy as np import pandas as pd # Class construction from abc import ABC, abstractmethod +# Class weights +from sklearn.utils.class_weight import compute_class_weight + __author__ = 'Nicolas de Montigny' __all__ = ['ModelsUtils'] @@ -56,7 +60,8 @@ def __init__( batch_size, training_epochs, taxa, - kmers_list + kmers_list, + csv ): # Parameters self.classifier = classifier @@ -64,21 +69,21 @@ def __init__( self.taxa = taxa self.kmers = kmers_list # Initialize hidden + self._csv = csv self._nb_kmers = len(kmers_list) self._training_epochs = training_epochs # Initialize empty - self._weights = [] - self._labels_map = None self._clf = None - self._encoder = None + self._weights = {} self._scaler = None - self._preprocessor = None + self._encoder = None + self._trainer = None self._reductor = None - self._nb_features = None + self._predictor = None + self._labels_map = None self._model_ckpt = None - self._trainer = None self._train_params = {} - self._predictor = None + self._preprocessor = None self._workdir = outdir_model @abstractmethod @@ -104,4 +109,18 @@ def _prob_2_cls(self): @abstractmethod def _label_decode(self): """ - """ \ No newline at end of file + """ + + def _compute_weights(self): + """ + Set class weights depending on their abundance in data-associated classes csv + """ + cls = pd.read_csv(self._csv) + classes = list(cls[self.taxa].unique()) + weights = compute_class_weight( + class_weight = 'balanced', + classes = classes, + y = cls[self.taxa] + ) + for lab, encoded in self._labels_map: + self._weights[encoded] = weights[classes.index(lab)] \ No newline at end of file diff --git a/src/models/sklearn/models.py b/src/models/sklearn/models.py index 4f7bc76..4d61f65 100644 --- a/src/models/sklearn/models.py +++ b/src/models/sklearn/models.py @@ -4,14 +4,10 @@ import numpy as np import pandas as pd -# Dimensions reduction -from models.preprocessors.tfidf_transformer import TensorTfIdfTransformer -from data.reduction.truncated_svd_decomposition import TensorTruncatedSVDDecomposition - # Preprocessing from models.encoders.model_label_encoder import ModelLabelEncoder from models.encoders.onesvm_label_encoder import OneClassSVMLabelEncoder -from models.preprocessors.compute_class_weights import ComputeClassWeights +from models.preprocessors.tfidf_transformer import TensorTfIdfTransformer # Training from ray.air.config import ScalingConfig @@ -77,7 +73,8 @@ def __init__( batch_size, training_epochs, taxa, - kmers_list + kmers_list, + csv ): super().__init__( classifier, @@ -85,12 +82,13 @@ def __init__( batch_size, training_epochs, taxa, - kmers_list + kmers_list, + csv ) # Parameters self._encoded = [] - def preprocess(self, ds, reductor_file): + def preprocess(self, ds, scaling = False): print('preprocess') if self.classifier == 'onesvm': self._encoder = OneClassSVMLabelEncoder(self.taxa) @@ -99,19 +97,11 @@ def preprocess(self, ds, reductor_file): else: self._encoder = ModelLabelEncoder(self.taxa) - self._scaler = TensorTfIdfTransformer(self.kmers) - - ds = 
self._encoder.fit_transform(ds) - - self._weights = ComputeClassWeights(LABELS_COLUMN_NAME) - self._weights.fit(ds) - self._weights = self._weights.stats_ - - ds = self._scaler.fit_transform(ds) + self._encoder.fit(ds) - self._reductor = TensorTruncatedSVDDecomposition(self.kmers, 10000, reductor_file) - # self._reductor = TensorCountHashing(self.kmers, 10000) - self._reductor.fit(ds) + if scaling: + self._scaler = TensorTfIdfTransformer(self.kmers) + self._scaler.fit(ds) # Labels mapping if self.classifier != 'onesvm': @@ -120,6 +110,7 @@ def preprocess(self, ds, reductor_file): labels = np.append(labels, 'unknown') self._encoded = np.append(self._encoded, -1) self._labels_map = zip(labels, self._encoded) + self._compute_weights() def _label_decode(self, predict): print('_label_decode') @@ -178,9 +169,8 @@ def fit(self, datasets): for name, ds in datasets.items(): ds = ds.drop_columns(['id']) ds = self._encoder.transform(ds) - ds = self._scaler.transform(ds) - ds = self._reductor.transform(ds) - self._nb_features = self._reductor._nb_components if self._reductor._nb_components < self._nb_kmers else self._nb_kmers + if self._scaler is not None: + ds = self._scaler.transform(ds) # Trigger the preprocessing computations before ingest in trainer # Otherwise, it would be executed at each epoch ds = ds.materialize() @@ -220,8 +210,8 @@ def fit(self, datasets): def predict(self, ds, threshold = 0.8): print('predict') if ds.count() > 0: - ds = self._scaler.transform(ds) - ds = self._reductor.transform(ds) + if self._scaler is not None: + ds = self._scaler.transform(ds) ds = ds.materialize() predict_kwargs = {'features':self.kmers, 'num_estimator_cpus':-1} self._predictor = BatchPredictor.from_checkpoint(self._model_ckpt, SklearnTensorPredictor) diff --git a/src/utils.py b/src/utils.py index 5e7924f..b45b0b1 100644 --- a/src/utils.py +++ b/src/utils.py @@ -33,6 +33,7 @@ 'verify_kmers_list_length', 'verify_load_data', 'verify_concordance_klength', + 'verify_need_scaling', 'verify_taxas', 'verify_load_preclassified', 'merge_save_data', @@ -162,6 +163,9 @@ def verify_concordance_klength(klen1 : int, klen2 : int): raise ValueError("K length between datasets is inconsistent ! 
Exiting\n" + f"K length of bacteria dataset is {klen1} while K length from host is {klen2}") +def verify_need_scaling(data : dict): + return False if 'decomposed' in data['profile'] else True + # Verif + handling ######################################################################################################### @@ -199,10 +203,6 @@ def verify_load_data(data_file: Path): verify_file(data_file) data = load_Xy_data(data_file) verify_data_path(data['profile']) - if not isinstance(data['ids'], list): - raise ValueError("Invalid data file !") - elif not isinstance(data['kmers'], list): - raise ValueError("Invalid data file !") return data def verify_taxas(taxas : str, db_taxas : list): @@ -249,7 +249,7 @@ def merge_classified_data( clf_ids.extend(clf_data['unknown_ids']) clf_data['unknown_ids'] = list(np.unique(clf_ids)) # classes - dct_diff = {k : v for k,v in db_data.items() if k not in clf_data.keys()} + dct_diff = {k : v for k, v in db_data.items() if k not in clf_data.keys()} clf_data = {**clf_data,**dct_diff} return clf_data From 59625e907e83671a19ad884993d74a618cff78b5 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Mon, 20 Nov 2023 18:33:07 -0500 Subject: [PATCH 42/92] correction in dimension decomp --- src/Caribou_dimensions_decomposition.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/Caribou_dimensions_decomposition.py b/src/Caribou_dimensions_decomposition.py index 52ccff6..627877b 100644 --- a/src/Caribou_dimensions_decomposition.py +++ b/src/Caribou_dimensions_decomposition.py @@ -78,7 +78,7 @@ def dimensions_decomposition(opt): # Save decomposed data save_Xy_data(data, data_file) - print(f"Caribou finished decomposing the features of {opt['dataset_name']} in {t_decomposition} seconds.") + print(f"Caribou finished decomposing the features in {t_decomposition} seconds.") else: print('Caribou did not decompose the features because the number to extract is bigger than the actual number of features') else: @@ -91,7 +91,6 @@ def dimensions_decomposition(opt): parser = argparse.ArgumentParser(description='This script computes features reduction to a given K-mers dataset and then applies it.') # Dataset parser.add_argument('-db','--dataset', required=True, type=Path, help='PATH to a npz file containing the data corresponding to the k-mers profile for the bacteria database') - parser.add_argument('-dt','--dataset_name', default='dataset', help='Name of the dataset used to name files') parser.add_argument('-l','--kmers_list', default=None, type=Path, help='PATH to a file containing a list of k-mers that will be reduced') # Parameters parser.add_argument('-n','--nb_components', default=1000, type=int, help='Number of components to decompose data into') From 799b39f956b65d086a44a2341e8f83949b2a130c Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Mon, 20 Nov 2023 18:51:21 -0500 Subject: [PATCH 43/92] dim decomp local debug --- src/Caribou_dimensions_decomposition.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Caribou_dimensions_decomposition.py b/src/Caribou_dimensions_decomposition.py index 627877b..286861d 100644 --- a/src/Caribou_dimensions_decomposition.py +++ b/src/Caribou_dimensions_decomposition.py @@ -48,7 +48,7 @@ def dimensions_decomposition(opt): data_file = f'{path}_decomposed{ext}' if not os.path.exists(data_file): - if opt['nb_features'] < len(kmers): + if opt['nb_components'] < len(kmers): # Load data files_lst = glob(os.path.join(data['profile'],'*.parquet')) ds = 
ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) @@ -72,7 +72,7 @@ def dimensions_decomposition(opt): # Save decomposed dataset data['profile'] = f"{data['profile']}_decomposed" - data['kmers'] = [f'feature_{i}' for i in np.arange(preprocessor._nb_components)] + data['kmers'] = [f'feature_{i}' for i in np.arange(preprocessor.preprocessors[1]._nb_components)] ds.write_parquet(data['profile']) # Save decomposed data @@ -88,7 +88,7 @@ def dimensions_decomposition(opt): ################################################################################ if __name__ == "__main__": - parser = argparse.ArgumentParser(description='This script computes features reduction to a given K-mers dataset and then applies it.') + parser = argparse.ArgumentParser(description='This script computes features decomposition to a given K-mers dataset and then applies it.') # Dataset parser.add_argument('-db','--dataset', required=True, type=Path, help='PATH to a npz file containing the data corresponding to the k-mers profile for the bacteria database') parser.add_argument('-l','--kmers_list', default=None, type=Path, help='PATH to a file containing a list of k-mers that will be reduced') From bb25d283c0fecc8f892075e4570d98a5170e27f2 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Mon, 20 Nov 2023 18:56:15 -0500 Subject: [PATCH 44/92] tf-idf unwrap batch to avoid 0 div --- src/models/preprocessors/tfidf_transformer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/models/preprocessors/tfidf_transformer.py b/src/models/preprocessors/tfidf_transformer.py index 88d899c..3732526 100644 --- a/src/models/preprocessors/tfidf_transformer.py +++ b/src/models/preprocessors/tfidf_transformer.py @@ -30,6 +30,7 @@ def _fit(self, ds: Dataset) -> Preprocessor: occurences = np.zeros(self._nb_features) for batch in ds.iter_batches(batch_format = 'numpy'): batch = batch[TENSOR_COLUMN_NAME] + batch = _unwrap_ndarray_object_type_if_needed(batch) occurences += np.count_nonzero(batch, axis = 0) idf = np.log(nb_samples / occurences) + 1 From 2246cd0c3cffb749cba86d40f634be3c2b433cb8 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Tue, 21 Nov 2023 09:44:44 -0500 Subject: [PATCH 45/92] decomposition script rectify + load tf-idf in prepro --- src/Caribou_dimensions_decomposition.py | 15 +++++- src/Caribou_reduce_features.py | 3 +- .../reduction/truncated_svd_decomposition.py | 6 +-- src/models/classification.py | 12 ++++- src/models/kerasTF/models.py | 6 +-- src/models/preprocessors/tfidf_transformer.py | 47 ++++++++++++------- src/models/sklearn/models.py | 4 +- 7 files changed, 62 insertions(+), 31 deletions(-) diff --git a/src/Caribou_dimensions_decomposition.py b/src/Caribou_dimensions_decomposition.py index 286861d..8c539b6 100644 --- a/src/Caribou_dimensions_decomposition.py +++ b/src/Caribou_dimensions_decomposition.py @@ -36,7 +36,7 @@ def dimensions_decomposition(opt): k_length, kmers = verify_kmers_list_length(k_length, opt['kmers_list']) outdirs = define_create_outdirs(opt['outdir']) - + # Initialize cluster init_ray_cluster(opt['workdir']) @@ -53,12 +53,14 @@ def dimensions_decomposition(opt): files_lst = glob(os.path.join(data['profile'],'*.parquet')) ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + scaler_file = os.path.join(outdirs['models_dir'], 'TF-IDF_diag.npz') reductor_file = os.path.join(outdirs['models_dir'], 'TruncatedSVD_components.npz') # Compute the decomposition preprocessor = Chain( TensorTfIdfTransformer( - features = kmers + features = kmers, + 
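One note on the unwrap call added to the TF-IDF transformer in the commit above: Ray can hand the tensor column back as a 1-D object array of per-row arrays, while per-feature statistics need a genuine 2-D numeric array. A small numpy illustration, with np.stack standing in for the Ray-internal helper:

import numpy as np

# A batch delivered as a list of per-row tensors...
rows = [np.array([0, 2, 0]), np.array([1, 0, 3])]

# ...is first stacked into a real 2-D array so that per-feature statistics,
# such as the document counts feeding the IDF, can be taken along axis 0.
dense = np.stack(rows)
doc_counts = np.count_nonzero(dense, axis=0)
print(doc_counts)   # [1 1 1]
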
file = scaler_file ), TensorTruncatedSVDDecomposition( features = kmers, @@ -101,3 +103,12 @@ def dimensions_decomposition(opt): opt = vars(args) dimensions_decomposition(opt) + +# Test params +opt = { + 'dataset':'/home/nicdemon/results/data/Xy_genome_cucurbita_data_K10.npz', + 'kmers_list':'/home/nicdemon/results/data/kmers_list_reduced.txt', + 'nb_components':10000, + 'outdir':'/home/nicdemon/results/', + 'workdir':'/home/nicdemon/ray/', +} diff --git a/src/Caribou_reduce_features.py b/src/Caribou_reduce_features.py index c95861d..5fe02cd 100644 --- a/src/Caribou_reduce_features.py +++ b/src/Caribou_reduce_features.py @@ -75,7 +75,7 @@ def features_reduction(opt): train_ds, export_ds, kmers = occurence_exclusion(train_ds, export_ds, kmers) train_ds, export_ds, kmers = low_var_selection(train_ds, export_ds, kmers) # Statistical features selection - train_ds, export_ds, data['kmers'] = features_selection(train_ds, export_ds, kmers, opt['taxa']) + train_ds, export_ds, kmers = features_selection(train_ds, export_ds, kmers, opt['taxa']) # Time the computation of transformations t_end = time() t_reduction = t_end - t_start @@ -83,6 +83,7 @@ def features_reduction(opt): data['profile'] = f"{data['profile']}_reduced" export_ds.write_parquet(data['profile']) # Save reduced K-mers + data['kmers'] = kmers with open(os.path.join(outdirs["data_dir"],'kmers_list_reduced.txt'),'w') as handle: handle.writelines("%s\n" % item for item in data['kmers']) # Save reduced data diff --git a/src/data/reduction/truncated_svd_decomposition.py b/src/data/reduction/truncated_svd_decomposition.py index 0a67cd0..8c27ac5 100644 --- a/src/data/reduction/truncated_svd_decomposition.py +++ b/src/data/reduction/truncated_svd_decomposition.py @@ -1,4 +1,3 @@ -import os import numpy as np import pandas as pd @@ -6,6 +5,7 @@ from tqdm import tqdm from typing import List from warnings import warn +from os.path import isfile from ray.data import Dataset from utils import save_Xy_data, load_Xy_data @@ -64,7 +64,7 @@ def batch_svd(batch): components = [] if self._nb_features > self._nb_components: - if os.path.isfile(self._file): + if isfile(self._file): components = np.array(load_Xy_data(self._file)) else: # sampl = ds.random_sample(0.1) @@ -147,7 +147,7 @@ def _transform_pandas(self, df: pd.DataFrame) -> pd.DataFrame: return df def __repr__(self): - return (f"{self.__class__.__name__}(features={self._nb_features!r}, taxa={self.taxa!r}, threshold={self.threshold!r})") + return (f"{self.__class__.__name__}(features={self._nb_features!r}, file={self._file!r})") def _validate_df(df: pd.DataFrame, column: str, nb_features: int) -> None: if len(df.loc[0, column]) != nb_features: diff --git a/src/models/classification.py b/src/models/classification.py index a9ce5b1..30747a4 100644 --- a/src/models/classification.py +++ b/src/models/classification.py @@ -209,7 +209,11 @@ def _binary_training(self, datasets, taxa, file): self._database_data['kmers'], self._database_data['csv'] ) - model.preprocess(datasets[TRAINING_DATASET_NAME], self._scaling) + model.preprocess( + datasets[TRAINING_DATASET_NAME], + self._scaling, + os.path.join(self._outdirs['models_dir'], 'TF-IDF_diag.npz') + ) model.fit(datasets) self._save_model(model, file) @@ -236,7 +240,11 @@ def _multiclass_training(self, datasets, taxa, file): self._database_data['kmers'], self._database_data['csv'] ) - model.preprocess(datasets[TRAINING_DATASET_NAME], self._scaling) + model.preprocess( + datasets[TRAINING_DATASET_NAME], + self._scaling, + 
os.path.join(self._outdirs['models_dir'], 'TF-IDF_diag.npz') + ) model.fit(datasets) self._save_model(model, file) diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index 827225f..28a5767 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -125,7 +125,7 @@ def __init__( elif self.classifier == 'widecnn': print('Training multiclass classifier based on Wide CNN Network') - def preprocess(self, ds, scaling = False): + def preprocess(self, ds, scaling = False, scaler_file = None): print('preprocess') labels = [] encoded = [] @@ -135,14 +135,14 @@ def preprocess(self, ds, scaling = False): if self._nb_classes == 2: self._encoder = ModelLabelEncoder(self.taxa) if scaling: - self._scaler = TensorTfIdfTransformer(self.kmers) + self._scaler = TensorTfIdfTransformer(self.kmers, scaler_file) else: self._encoder = Chain( LabelEncoder(self.taxa), OneHotTensorEncoder(self.taxa) ) if scaling: - self._scaler = TensorTfIdfTransformer(self.kmers) + self._scaler = TensorTfIdfTransformer(self.kmers, scaler_file) self._encoder.fit(ds) if scaling: diff --git a/src/models/preprocessors/tfidf_transformer.py b/src/models/preprocessors/tfidf_transformer.py index 3732526..de5c2ec 100644 --- a/src/models/preprocessors/tfidf_transformer.py +++ b/src/models/preprocessors/tfidf_transformer.py @@ -4,8 +4,10 @@ import scipy.sparse as sp +from os.path import isfile from ray.data.dataset import Dataset from sklearn.preprocessing import normalize +from utils import save_Xy_data, load_Xy_data from ray.data.preprocessor import Preprocessor from ray.air.util.data_batch_conversion import _unwrap_ndarray_object_type_if_needed @@ -18,31 +20,37 @@ class TensorTfIdfTransformer(Preprocessor): TF-IDF transformation is used to scale down the impact of tokens that occur very frequently and scale up the impact of those that occur very rarely. 
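The state this transformer caches is the IDF vector stored as a sparse diagonal matrix, so a batch can be scaled with a single matrix product and then L2-normalized. The core computation, reduced to plain numpy/scipy on a toy count matrix (values are illustrative only):

import numpy as np
import scipy.sparse as sp
from sklearn.preprocessing import normalize

X = np.array([[3, 0, 1],
              [2, 2, 0],
              [0, 1, 0]], dtype=np.float64)      # 3 samples x 3 features

n_samples = X.shape[0]
df = np.count_nonzero(X, axis=0)                 # document frequency per feature
idf = np.log(n_samples / df) + 1                 # same form as in the preprocessor

idf_diag = sp.diags(idf, offsets=0, format='csr')
X_tfidf = normalize(sp.csr_matrix(X) @ idf_diag, norm='l2')
print(X_tfidf.toarray())
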
""" - def __init__(self, features): + def __init__(self, features, file: str = ''): # Parameters self._features = features self._nb_features = len(features) + self._file = file def _fit(self, ds: Dataset) -> Preprocessor: - nb_samples = ds.count() + if isfile(self._file): + idf_diag = load_Xy_data(self._file) + else: + nb_samples = ds.count() - # Nb of occurences - occurences = np.zeros(self._nb_features) - for batch in ds.iter_batches(batch_format = 'numpy'): - batch = batch[TENSOR_COLUMN_NAME] - batch = _unwrap_ndarray_object_type_if_needed(batch) - occurences += np.count_nonzero(batch, axis = 0) + # Nb of occurences + occurences = np.zeros(self._nb_features) + for batch in ds.iter_batches(batch_format = 'numpy'): + batch = batch[TENSOR_COLUMN_NAME] + batch = _unwrap_ndarray_object_type_if_needed(batch) + occurences += np.count_nonzero(batch, axis = 0) - idf = np.log(nb_samples / occurences) + 1 - - idf_diag = sp.diags( - idf, - offsets=0, - shape=(self._nb_features, self._nb_features), - format="csr", - dtype=np.float64, - ) - + idf = np.log(nb_samples / occurences) + 1 + + idf_diag = sp.diags( + idf, + offsets=0, + shape=(self._nb_features, self._nb_features), + format="csr", + dtype=np.float64, + ) + + save_Xy_data(idf_diag, self._file) + self.stats_ = {'idf_diag' : idf_diag} return self @@ -62,6 +70,9 @@ def _transform_pandas(self, batch: pd.DataFrame) -> pd.DataFrame: return batch + def __repr__(self): + return (f"{self.__class__.__name__}(features={self._nb_features!r}, file={self._file!r})") + def _validate_df(df: pd.DataFrame, column: str, nb_features: int) -> None: if len(df.loc[0, column]) != nb_features: raise ValueError('Discordant number of features in the tensor column with the one from the dataframe used for fitting') \ No newline at end of file diff --git a/src/models/sklearn/models.py b/src/models/sklearn/models.py index 4d61f65..6a7ef3f 100644 --- a/src/models/sklearn/models.py +++ b/src/models/sklearn/models.py @@ -88,7 +88,7 @@ def __init__( # Parameters self._encoded = [] - def preprocess(self, ds, scaling = False): + def preprocess(self, ds, scaling = False, scaler_file = None): print('preprocess') if self.classifier == 'onesvm': self._encoder = OneClassSVMLabelEncoder(self.taxa) @@ -100,7 +100,7 @@ def preprocess(self, ds, scaling = False): self._encoder.fit(ds) if scaling: - self._scaler = TensorTfIdfTransformer(self.kmers) + self._scaler = TensorTfIdfTransformer(self.kmers, scaler_file) self._scaler.fit(ds) # Labels mapping From f7cc7a11645f1bbad34c5593155299b9ceb999ce Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Wed, 22 Nov 2023 18:08:50 -0500 Subject: [PATCH 46/92] NMF for decomposition + debug weights --- ...ements copy.txt => frozen_requirements.txt | 0 requirements.txt | 6 +- setup.cfg | 27 +++--- src/Caribou_classification.py | 9 +- src/Caribou_classification_train_cv.py | 15 +++- src/Caribou_dimensions_decomposition.py | 8 +- src/Caribou_extraction.py | 23 ++++- src/Caribou_extraction_train_cv.py | 36 ++++++-- .../reduction/dictionnary_decomposition.py | 83 +++++++++++++++++++ src/data/reduction/nmf_decomposition.py | 81 ++++++++++++++++++ .../reduction/truncated_svd_decomposition.py | 3 +- src/models/classification.py | 5 +- src/models/encoders/model_label_encoder.py | 2 +- src/models/kerasTF/models.py | 11 ++- src/models/models_utils.py | 13 ++- src/models/reads_simulation.py | 8 +- src/models/sklearn/models.py | 12 +-- src/models/sklearn/partial_trainer.py | 3 +- src/utils.py | 13 +-- 19 files changed, 285 insertions(+), 73 deletions(-) 
rename requirements copy.txt => frozen_requirements.txt (100%) create mode 100644 src/data/reduction/dictionnary_decomposition.py create mode 100644 src/data/reduction/nmf_decomposition.py diff --git a/requirements copy.txt b/frozen_requirements.txt similarity index 100% rename from requirements copy.txt rename to frozen_requirements.txt diff --git a/requirements.txt b/requirements.txt index d409b51..b51fcee 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,10 +1,10 @@ -biopython==1.78 +biopython>=1.79 cloudpickle>=2.2.1 -InSilicoSeq==1.5.4 +InSilicoSeq==1.6.0 keras==2.14 numpy>=1.2 pandas>=2.0 ray==2.6.3 scikit-learn==1.3.0 tensorflow==2.14 -pyarrow==12.0 \ No newline at end of file +pyarrow==12.0.1 \ No newline at end of file diff --git a/setup.cfg b/setup.cfg index a9f82f6..fe79d4b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -11,23 +11,16 @@ long_description = file: README.md [options] install_requires = - setuptools - wheel - grpcio==1.48.2 - ray[default]==2.6.3 - pydantic<2 - pyarrow>=6.0.1,!=7 - keras>=2.0.0 - tensorflow>=2.0.0 - numpy>=1.16 - pandas>=1.3.0 - scikit-learn>=1.1.2 - scipy - insilicoseq - biopython==1.78 - tqdm - cloudpickle - tune-sklearn + ray==2.6.3 + numpy>=1.2 + pandas>=2.0 + pyarrow==12.0.1 + cloudpickle>=2.2.1 + keras==2.14 + tensorflow==2.14 + scikit-learn==1.3.0 + biopython>=1.79 + InSilicoSeq==1.6.0 include_package_data = True python_requires = >=3.8 scripts = diff --git a/src/Caribou_classification.py b/src/Caribou_classification.py index 1e1da9c..9a9473c 100644 --- a/src/Caribou_classification.py +++ b/src/Caribou_classification.py @@ -50,7 +50,10 @@ def bacteria_classification(opt): # Verify need for scaling scaling = verify_need_scaling(db_data) - val_ds, val_data = split_sim_dataset(db_ds, db_data, f"{VALIDATION_DATASET_NAME}_{opt['database_name']}") + if opt['validation'] is not None: + val_data, val_ds = verify_load_metagenome(opt['validation']) + else: + val_data, val_ds = split_sim_dataset(db_ds, db_data, f"{VALIDATION_DATASET_NAME}_{opt['database_name']}") datasets = { TRAINING_DATASET_NAME : db_ds, @@ -102,9 +105,11 @@ def bacteria_classification(opt): # Dataset parser.add_argument('-mg','--data_metagenome', required=True, type=Path, help='PATH to a npz file containing the data corresponding to the k-mers profile for the metagenome to classify') parser.add_argument('-mn','--metagenome_name', required=True, help='Name of the metagenome to classify used to name files') + # Optional datasets + parser.add_argument('-v','--validation', default=None, type=Path, help='PATH to a npz file containing the k-mers profile for the validation dataset') # Parameters parser.add_argument('-model','--model_type', default='lstm_attention', choices=['sgd','mnb','lstm_attention','cnn','widecnn'], help='The type of model to train') - parser.add_argument('-t','--taxa', default=None, help='The taxonomic level to use for the classification, defaults to species. Can be one level or a list of levels separated by commas.') + parser.add_argument('-tx','--taxa', default=None, help='The taxonomic level to use for the classification, defaults to species. 
Can be one level or a list of levels separated by commas.') parser.add_argument('-bs','--batch_size', default=32, type=int, help='Size of the batch size to use, defaults to 32') parser.add_argument('-e','--training_epochs', default=100, type=int, help='The number of training iterations for the neural networks models if one ise chosen, defaults to 100') parser.add_argument('-o','--outdir', required=True, type=Path, help='PATH to a directory on file where outputs will be saved') diff --git a/src/Caribou_classification_train_cv.py b/src/Caribou_classification_train_cv.py index 1f707b9..f6d1422 100644 --- a/src/Caribou_classification_train_cv.py +++ b/src/Caribou_classification_train_cv.py @@ -56,8 +56,14 @@ def bacteria_classification_train_cv(opt): for taxa in lst_taxas: - test_ds, test_data = split_sim_dataset(db_ds, db_data, f"{TEST_DATASET_NAME}_{opt['database_name']}") - val_ds, val_data = split_sim_dataset(db_ds, db_data, f"{VALIDATION_DATASET_NAME}_{opt['database_name']}") + if opt['test'] is not None: + test_data, test_ds = verify_load_metagenome(opt['test']) + else: + test_data, test_ds = split_sim_dataset(db_ds, db_data, f"{TEST_DATASET_NAME}_{opt['database_name']}") + if opt['validation'] is not None: + val_data, val_ds = verify_load_metagenome(opt['validation']) + else: + val_data, val_ds = split_sim_dataset(db_ds, db_data, f"{VALIDATION_DATASET_NAME}_{opt['database_name']}") datasets = { TRAINING_DATASET_NAME : db_ds, @@ -94,9 +100,12 @@ def bacteria_classification_train_cv(opt): # Database parser.add_argument('-db','--data_bacteria', required=True, type=Path, help='PATH to a npz file containing the data corresponding to the k-mers profile for the bacteria database') parser.add_argument('-dn','--database_name', required=True, help='Name of the bacteria database used to name files') + # Optional datasets + parser.add_argument('-v','--validation', default=None, type=Path, help='PATH to a npz file containing the k-mers profile for the validation dataset') + parser.add_argument('-t','--test', default=None, type=Path, help='PATH to a npz file containing the k-mers profile for the test dataset') # Parameters parser.add_argument('-model','--model_type', default='lstm_attention', choices=['sgd','mnb','lstm_attention','cnn','widecnn'], help='The type of model to train') - parser.add_argument('-t','--taxa', default=None, help='The taxonomic level to use for the classification, defaults to None. Can be one level or a list of levels separated by commas.') + parser.add_argument('-tx','--taxa', default=None, help='The taxonomic level to use for the classification, defaults to None. 
Can be one level or a list of levels separated by commas.') parser.add_argument('-bs','--batch_size', default=32, type=int, help='Size of the batch size to use, defaults to 32') parser.add_argument('-e','--training_epochs', default=100, type=int, help='The number of training iterations for the neural networks models if one ise chosen, defaults to 100') parser.add_argument('-o','--outdir', required=True, type=Path, help='PATH to a directory on file where outputs will be saved') diff --git a/src/Caribou_dimensions_decomposition.py b/src/Caribou_dimensions_decomposition.py index 8c539b6..0e9b75d 100644 --- a/src/Caribou_dimensions_decomposition.py +++ b/src/Caribou_dimensions_decomposition.py @@ -12,7 +12,9 @@ from pathlib import Path from ray.data.preprocessors import Chain +from data.reduction.nmf_decomposition import TensorNMFDecomposition from models.preprocessors.tfidf_transformer import TensorTfIdfTransformer +from data.reduction.dictionnary_decomposition import TensorDictionnaryDecomposition from data.reduction.truncated_svd_decomposition import TensorTruncatedSVDDecomposition __author__ = "Nicolas de Montigny" @@ -54,7 +56,7 @@ def dimensions_decomposition(opt): ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) scaler_file = os.path.join(outdirs['models_dir'], 'TF-IDF_diag.npz') - reductor_file = os.path.join(outdirs['models_dir'], 'TruncatedSVD_components.npz') + reductor_file = os.path.join(outdirs['models_dir'], 'decomposed_components.npz') # Compute the decomposition preprocessor = Chain( @@ -62,7 +64,7 @@ def dimensions_decomposition(opt): features = kmers, file = scaler_file ), - TensorTruncatedSVDDecomposition( + TensorNMFDecomposition( features = kmers, nb_components = opt['nb_components'], file = reductor_file @@ -70,12 +72,12 @@ def dimensions_decomposition(opt): ) t_s = time() ds = preprocessor.fit_transform(ds) - t_decomposition = time() - t_s # Save decomposed dataset data['profile'] = f"{data['profile']}_decomposed" data['kmers'] = [f'feature_{i}' for i in np.arange(preprocessor.preprocessors[1]._nb_components)] ds.write_parquet(data['profile']) + t_decomposition = time() - t_s # Save decomposed data save_Xy_data(data, data_file) diff --git a/src/Caribou_extraction.py b/src/Caribou_extraction.py index a2168fb..6228230 100644 --- a/src/Caribou_extraction.py +++ b/src/Caribou_extraction.py @@ -40,18 +40,30 @@ def bacteria_extraction(opt): if opt['model_type'] != 'onesvm': if opt['data_host'] is not None: - db_data, db_ds = verify_load_host_merge(opt['data_bacteria'], opt['data_host']) + if opt['merged'] is not None: + db_data, db_ds = verify_load_db(opt['merged']) + else: + db_data, db_ds = verify_load_host_merge(opt['data_bacteria'], opt['data_host']) db_name = 'host_merged' else: db_data, db_ds = verify_load_db(opt['data_bacteria']) db_name = opt['dataset_name'] - val_ds, val_data = split_sim_dataset(db_ds, db_data, f'{VALIDATION_DATASET_NAME}_{db_name}') + if opt['validation'] is not None: + val_data, val_ds = verify_load_db(opt['validation']) + else: + val_data, val_ds = split_sim_dataset(db_ds, db_data, f'{VALIDATION_DATASET_NAME}_{db_name}') else: - db_data, db_ds = verify_load_host_merge(opt['data_bacteria'], opt['data_host']) + if opt['merged'] is not None: + db_data, db_ds = verify_load_db(opt['merged']) + else: + db_data, db_ds = verify_load_host_merge(opt['data_bacteria'], opt['data_host']) db_name = 'host_merged' - val_ds, val_data = split_sim_dataset(db_ds, db_data, f'{VALIDATION_DATASET_NAME}_{db_name}') + if opt['validation'] is not 
None: + val_data, val_ds = verify_load_db(opt['validation']) + else: + val_data, val_ds = split_sim_dataset(db_ds, db_data, f'{VALIDATION_DATASET_NAME}_{db_name}') db_data, db_ds = verify_load_db(opt['data_bacteria']) db_name = opt['dataset_name'] @@ -111,6 +123,9 @@ def bacteria_extraction(opt): # Dataset parser.add_argument('-dm','--data_metagenome', required=True, type=Path, help='PATH to a npz file containing the data corresponding to the k-mers profile for the metagenome to classify') parser.add_argument('-mn','--metagenome_name', required=True, help='Name of the metagenome to classify used to name files') + # Optional datasets + parser.add_argument('-m','--merged', default=None, type=Path, help='PATH to a npz file containing the k-mers profile for the merged bacteria and host databases') + parser.add_argument('-v','--validation', default=None, type=Path, help='PATH to a npz file containing the k-mers profile for the validation dataset') # Parameters parser.add_argument('-model','--model_type', default=None, choices=[None,'onesvm','linearsvm','attention','lstm','deeplstm'], help='The type of model to train') parser.add_argument('-bs','--batch_size', default=32, type=int, help='Size of the batch size to use, defaults to 32') diff --git a/src/Caribou_extraction_train_cv.py b/src/Caribou_extraction_train_cv.py index 2547886..8576535 100644 --- a/src/Caribou_extraction_train_cv.py +++ b/src/Caribou_extraction_train_cv.py @@ -34,20 +34,38 @@ def bacteria_extraction_train_cv(opt): if opt['model_type'] != 'onesvm': if opt['data_host'] is not None: - db_data, db_ds = verify_load_host_merge(opt['data_bacteria'], opt['data_host']) + if opt['merged'] is not None: + db_data, db_ds = verify_load_db(opt['merged']) + else: + db_data, db_ds = verify_load_host_merge(opt['data_bacteria'], opt['data_host']) db_name = 'host_merged' else: db_data, db_ds = verify_load_db(opt['data_bacteria']) db_name = opt['database_name'] - test_ds, test_data = split_sim_dataset(db_ds, db_data, f'{TEST_DATASET_NAME}_{db_name}') - val_ds, val_data = split_sim_dataset(db_ds, db_data, f'{VALIDATION_DATASET_NAME}_{db_name}') + if opt['test'] is not None: + test_data, test_ds = verify_load_db(opt['test']) + else: + test_data, test_ds = split_sim_dataset(db_ds, db_data, f'{TEST_DATASET_NAME}_{db_name}') + if opt['validation'] is not None: + val_data, val_ds = verify_load_db(opt['validation']) + else: + val_data, val_ds = split_sim_dataset(db_ds, db_data, f'{VALIDATION_DATASET_NAME}_{db_name}') else: - db_data, db_ds = verify_load_host_merge(opt['data_bacteria'], opt['data_host']) + if opt['merged'] is not None: + db_data, db_ds = verify_load_db(opt['merged']) + else: + db_data, db_ds = verify_load_host_merge(opt['data_bacteria'], opt['data_host']) db_name = 'host_merged' - test_ds, test_data = split_sim_dataset(db_ds, db_data, f'{TEST_DATASET_NAME}_{db_name}') - val_ds, val_data = split_sim_dataset(db_ds, db_data, f'{VALIDATION_DATASET_NAME}_{db_name}') + if opt['test'] is not None: + test_data, test_ds = verify_load_db(opt['test']) + else: + test_data, test_ds = split_sim_dataset(db_ds, db_data, f'{TEST_DATASET_NAME}_{db_name}') + if opt['validation'] is not None: + val_data, val_ds = verify_load_db(opt['validation']) + else: + val_data, val_ds = split_sim_dataset(db_ds, db_data, f'{VALIDATION_DATASET_NAME}_{db_name}') db_data, db_ds = verify_load_db(opt['data_bacteria']) db_name = opt['database_name'] @@ -92,8 +110,12 @@ def bacteria_extraction_train_cv(opt): parser.add_argument('-dh','--data_host', default=None, type=Path, 
help='PATH to a npz file containing the data corresponding to the k-mers profile for the host') parser.add_argument('-dn','--database_name', required=True, help='Name of the bacteria database used to name files') parser.add_argument('-hn','--host_name', default=None, help='Name of the host database used to name files') + # Optional datasets + parser.add_argument('-m','--merged', default=None, type=Path, help='PATH to a npz file containing the k-mers profile for the merged bacteria and host databases') + parser.add_argument('-v','--validation', default=None, type=Path, help='PATH to a npz file containing the k-mers profile for the validation dataset') + parser.add_argument('-t','--test', default=None, type=Path, help='PATH to a npz file containing the k-mers profile for the test dataset') # Parameters - parser.add_argument('-model','--model_type', required = True, choices=['onesvm','linearsvm','attention','lstm','deeplstm'], help='The type of model to train') + parser.add_argument('-model','--model_type', required=True, choices=['onesvm','linearsvm','attention','lstm','deeplstm'], help='The type of model to train') parser.add_argument('-bs','--batch_size', default=32, type=int, help='Size of the batch size to use, defaults to 32') parser.add_argument('-e','--training_epochs', default=100, type=int, help='The number of training iterations for the neural networks models if one is chosen, defaults to 100') parser.add_argument('-o','--outdir', required=True, type=Path, help='PATH to a directory on file where outputs will be saved') diff --git a/src/data/reduction/dictionnary_decomposition.py b/src/data/reduction/dictionnary_decomposition.py new file mode 100644 index 0000000..5bbc4b5 --- /dev/null +++ b/src/data/reduction/dictionnary_decomposition.py @@ -0,0 +1,83 @@ + +import numpy as np +import pandas as pd + +from typing import List +from warnings import warn +from os.path import isfile +from ray.data import Dataset +from utils import save_Xy_data, load_Xy_data + +from sklearn.utils.extmath import randomized_svd +from sklearn.decomposition import DictionaryLearning +from sklearn.decomposition._dict_learning import _sparse_encode + +from ray.data.preprocessor import Preprocessor +from ray.air.util.data_batch_conversion import _unwrap_ndarray_object_type_if_needed + +TENSOR_COLUMN_NAME = '__value__' + +class TensorDictionnaryDecomposition(Preprocessor): + """ + Custom class for using Mini-Batch Dictionnary Learning as a Ray preprocessor. + This is inspired by sklearn.decomposition.DictionaryLearning and is fitted on batches before keeping the consensus components matrix. + Consensus components matrix is attained following the logic from sklearn.decomposition.MiniBatchDictionnaryLearning. 
+ https://scikit-learn.org/stable/modules/decomposition.html#nmf + https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.NMF.html + https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.MiniBatchNMF.html + """ + def __init__(self, features: List[str], nb_components: int = 10000, file: str = ''): + # Parameters + self.features = features + self._nb_features = len(features) + self._nb_components = nb_components + self._file = file + + def _fit(self, ds: Dataset) -> Preprocessor: + def batch_dict(batch): + batch = batch[TENSOR_COLUMN_NAME] + batch = _unwrap_ndarray_object_type_if_needed(batch) + dict = DictionaryLearning( + n_components = self._nb_components, + max_iter = 10, + transform_algorithm = 'cd', + ) + dict.fit(batch) + return {'components' : [dict.components_]} + + components = [] + if self._nb_features > self._nb_components: + if isfile(self._file): + components = np.array(load_Xy_data(self._file)) + else: + dct = ds.map_batches(batch_dict, batch_format = 'numpy') + + for row in dct.iter_rows(): + components.append(row['components']) + components = np.mean(components, axis = 0) + + save_Xy_data(components, self._file) + + self.stats_ = {'components' : components} + else: + warn('No features reduction to do because the number of features is already lower than the required number of components') + self.stats_ = {'components' : False} + + def _transform_pandas(self, df: pd.DataFrame) -> pd.DataFrame: + # _validate_df(df, TENSOR_COLUMN_NAME, self._nb_features) + components = self.stats_['components'] + + if components is not False: + tensor_col = df[TENSOR_COLUMN_NAME] + tensor_col = _unwrap_ndarray_object_type_if_needed(tensor_col) + tensor_col = np.dot(tensor_col, components.T) + df[TENSOR_COLUMN_NAME] = pd.Series(list(tensor_col)) + + return df + + def __repr__(self): + return (f"{self.__class__.__name__}(features={self._nb_features!r}, file={self._file!r})") + +def _validate_df(df: pd.DataFrame, column: str, nb_features: int) -> None: + if len(df.loc[0, column]) != nb_features: + raise ValueError('Discordant number of features in the tensor column with the one from the dataframe used for fitting') diff --git a/src/data/reduction/nmf_decomposition.py b/src/data/reduction/nmf_decomposition.py new file mode 100644 index 0000000..85abe03 --- /dev/null +++ b/src/data/reduction/nmf_decomposition.py @@ -0,0 +1,81 @@ + +import numpy as np +import pandas as pd + +from typing import List +from warnings import warn +from os.path import isfile +from ray.data import Dataset +from utils import save_Xy_data, load_Xy_data + +from sklearn.utils.extmath import randomized_svd +from sklearn.decomposition import DictionaryLearning, NMF, MiniBatchNMF + +from ray.data.preprocessor import Preprocessor +from ray.air.util.data_batch_conversion import _unwrap_ndarray_object_type_if_needed + +TENSOR_COLUMN_NAME = '__value__' + +class TensorNMFDecomposition(Preprocessor): + """ + Custom class for using Mini-Batch Non-Negative Matrix Factorization (NMF) as a Ray preprocessor. + This is inspired by sklearn.decomposition.NMF and is fitted on batches before keeping the consensus components matrix. + Consensus components matrix is attained following the logic from sklearn.decomposition.MiniBatchNMF. 
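The consensus strategy this docstring describes can be sketched with plain scikit-learn: fit an NMF on every batch, average the resulting components_ matrices, then project data onto the averaged basis. A toy approximation (random non-negative data and hypothetical sizes):

import numpy as np
from sklearn.decomposition import NMF

rng = np.random.default_rng(42)
batches = [rng.random((50, 200)) for _ in range(3)]   # 3 toy batches, 200 features
n_components = 10

# Fit one NMF per batch and keep each basis matrix H (components_).
components = [
    NMF(n_components=n_components, init='random', max_iter=200, random_state=0)
    .fit(batch).components_
    for batch in batches
]

# "Consensus" basis: element-wise mean of the per-batch bases.
H_consensus = np.mean(components, axis=0)             # shape (10, 200)

# New samples are reduced by projecting onto the transposed basis.
X_new = rng.random((5, 200))
X_reduced = X_new @ H_consensus.T                     # shape (5, 10)
print(X_reduced.shape)

Averaging independently fitted bases is a heuristic: component order is not guaranteed to line up across batches, so the result only approximates a single full-data factorization.
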
+ https://scikit-learn.org/stable/modules/decomposition.html#nmf + https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.NMF.html + https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.MiniBatchNMF.html + """ + def __init__(self, features: List[str], nb_components: int = 10000, file: str = ''): + # Parameters + self.features = features + self._nb_features = len(features) + self._nb_components = nb_components + self._file = file + + def _fit(self, ds: Dataset) -> Preprocessor: + def batch_nmf(batch): + batch = batch[TENSOR_COLUMN_NAME] + batch = _unwrap_ndarray_object_type_if_needed(batch) + model = NMF( + n_components = self._nb_components, + init = 'random' + ) + model.fit(batch) + return {'components' : [model.components_]} + + components = [] + if self._nb_features > self._nb_components: + if isfile(self._file): + components = np.array(load_Xy_data(self._file)) + else: + nmf = ds.map_batches(batch_nmf, batch_format = 'numpy') + + for row in nmf.iter_rows(): + components.append(row['components']) + components = np.mean(components, axis = 0) + + save_Xy_data(components, self._file) + + self.stats_ = {'components' : components} + else: + warn('No features reduction to do because the number of features is already lower than the required number of components') + self.stats_ = {'components' : False} + + def _transform_pandas(self, df: pd.DataFrame) -> pd.DataFrame: + # _validate_df(df, TENSOR_COLUMN_NAME, self._nb_features) + components = self.stats_['components'] + + if components is not False: + tensor_col = df[TENSOR_COLUMN_NAME] + tensor_col = _unwrap_ndarray_object_type_if_needed(tensor_col) + tensor_col = np.dot(tensor_col, components.T) + df[TENSOR_COLUMN_NAME] = pd.Series(list(tensor_col)) + + return df + + def __repr__(self): + return (f"{self.__class__.__name__}(features={self._nb_features!r}, file={self._file!r})") + +def _validate_df(df: pd.DataFrame, column: str, nb_features: int) -> None: + if len(df.loc[0, column]) != nb_features: + raise ValueError('Discordant number of features in the tensor column with the one from the dataframe used for fitting') diff --git a/src/data/reduction/truncated_svd_decomposition.py b/src/data/reduction/truncated_svd_decomposition.py index 8c27ac5..ca0eed4 100644 --- a/src/data/reduction/truncated_svd_decomposition.py +++ b/src/data/reduction/truncated_svd_decomposition.py @@ -35,7 +35,6 @@ def __init__(self, features: List[str], nb_components: int = 10000, file: str = self._file = file def _fit(self, ds: Dataset) -> Preprocessor: - # Parallel """ Possibilities for parallel TruncatedSVD * sklearn minibatch PCA -> PCA / SVD mostly equivalent @@ -93,7 +92,7 @@ def batch_svd(batch): return {'dictonnary' : [dict.components_]} components = [] if self._nb_features > self._nb_components: - if os.path.isfile(self._file): + if isfile(self._file): components = np.array(load_Xy_data(self._file)) else: svd = ds.map_batches(batch_svd, batch_format = 'numpy') diff --git a/src/models/classification.py b/src/models/classification.py index 30747a4..5da3f9c 100644 --- a/src/models/classification.py +++ b/src/models/classification.py @@ -280,7 +280,7 @@ def _get_true_classif(self, ds, taxas): Extract the true classification of the dataset used for cross-validation """ classif = {taxa : [] for taxa in taxas} - + cols2drop = [col for col in ds.schema().names if col not in ['id', taxas[0]]] classif_ds = ds.drop_columns(cols2drop) @@ -304,13 +304,10 @@ def _score_cv(self, y_true, y_pred, taxa): cv_csv = 
os.path.join(self._outdirs['results_dir'],f'{self._database}_{model}_{taxa}_cv_scores.csv') - y_compare = pd.DataFrame({ 'y_true': y_true[taxa], 'y_pred': y_pred[taxa] }) - y_compare['y_true'] = y_compare['y_true'].str.lower() - y_compare['y_pred'] = y_compare['y_pred'].str.lower() y_compare.to_csv(os.path.join(self._outdirs['models_dir'], f'y_compare_{self._database}_{model}_{taxa}.csv')) support = precision_recall_fscore_support( diff --git a/src/models/encoders/model_label_encoder.py b/src/models/encoders/model_label_encoder.py index 3317257..b635108 100644 --- a/src/models/encoders/model_label_encoder.py +++ b/src/models/encoders/model_label_encoder.py @@ -25,8 +25,8 @@ def _fit(self, dataset: Dataset) -> Preprocessor: return self def _transform_pandas(self, df: pd.DataFrame): - s_values = self.stats_ def column_label_encoder(s: pd.Series): + s_values = self.stats_[f"unique_values({s.name})"] return s.map(s_values) df[self.label_column] = df[self.label_column].transform(column_label_encoder) diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index 28a5767..62538f4 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -127,8 +127,6 @@ def __init__( def preprocess(self, ds, scaling = False, scaler_file = None): print('preprocess') - labels = [] - encoded = [] for row in ds.iter_rows(): labels.append(row[self.taxa]) self._nb_classes = len(np.unique(labels)) @@ -152,16 +150,17 @@ def preprocess(self, ds, scaling = False, scaler_file = None): labels = list(self._encoder.stats_[f'unique_values({self.taxa})'].keys()) else: labels = list(self._encoder.preprocessors[0].stats_[f'unique_values({self.taxa})'].keys()) - encoded = np.arange(len(labels)) + self._encoded = np.arange(len(labels)) labels = np.append(labels, 'unknown') - encoded = np.append(encoded, -1) - self._labels_map = zip(labels, encoded) + self._encoded = np.append(self._encoded, -1) + for (label, encoded) in zip(labels, self._encoded): + self._labels_map[label] = encoded self._compute_weights() def _label_decode(self, predict): print('_label_decode') decoded = pd.Series(np.empty(len(predict), dtype=object)) - for label, encoded in self._labels_map: + for label, encoded in self._labels_map.items(): decoded[predict == encoded] = label return np.array(decoded) diff --git a/src/models/models_utils.py b/src/models/models_utils.py index 3bf2e5e..a7fbdb7 100644 --- a/src/models/models_utils.py +++ b/src/models/models_utils.py @@ -76,11 +76,12 @@ def __init__( self._clf = None self._weights = {} self._scaler = None + self._encoded = [] self._encoder = None self._trainer = None self._reductor = None self._predictor = None - self._labels_map = None + self._labels_map = {} self._model_ckpt = None self._train_params = {} self._preprocessor = None @@ -115,12 +116,18 @@ def _compute_weights(self): """ Set class weights depending on their abundance in data-associated classes csv """ + if isinstance(self._csv, tuple): + cls = pd.concat([pd.read_csv(self._csv[0]),pd.read_csv(self._csv[1])], axis = 0, join = 'inner', ignore_index = True) cls = pd.read_csv(self._csv) + if self.taxa == 'domain': + cls.loc[cls['domain'].str.lower() == 'archaea', 'domain'] = 'Bacteria' classes = list(cls[self.taxa].unique()) weights = compute_class_weight( class_weight = 'balanced', classes = classes, y = cls[self.taxa] ) - for lab, encoded in self._labels_map: - self._weights[encoded] = weights[classes.index(lab)] \ No newline at end of file + + for lab, encoded in self._labels_map.items(): + if lab != 'unknown': + 
self._weights[encoded] = weights[classes.index(lab)] \ No newline at end of file diff --git a/src/models/reads_simulation.py b/src/models/reads_simulation.py index 463c077..197a3f4 100644 --- a/src/models/reads_simulation.py +++ b/src/models/reads_simulation.py @@ -218,7 +218,7 @@ def _verify_sim_arguments(self, k, kmers_list): def split_sim_dataset(ds, data, name): splitted_path = os.path.join(os.path.dirname(data['profile']), f'Xy_genome_simulation_{name}_data_K{len(data["kmers"][0])}.npz') - if os.path.exists(splitted_path): + if os.path.isfile(splitted_path): warnings.warn(f'The {name} dataset already exists, skipping simulation and loading the dataset') splitted_data = load_Xy_data(splitted_path) files_lst = glob(os.path.join(splitted_data['profile'],'*.parquet')) @@ -229,8 +229,8 @@ def split_sim_dataset(ds, data, name): if splitted_ds.count() == 0: nb_samples = round(ds.count() * 0.1) splitted_ds = ds.random_shuffle().limit(nb_samples) - splitted_ds, splitted_data = sim_dataset(splitted_ds, data, name) - return splitted_ds, splitted_data + splitted_data, splitted_ds = sim_dataset(splitted_ds, data, name) + return splitted_data, splitted_ds def sim_dataset(ds, data, name): """ @@ -247,4 +247,4 @@ def sim_dataset(ds, data, name): sim_data = cv_sim.simulation(k, data['kmers']) files_lst = glob(os.path.join(sim_data['profile'], '*.parquet')) sim_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - return sim_ds, sim_data \ No newline at end of file + return sim_data, sim_ds \ No newline at end of file diff --git a/src/models/sklearn/models.py b/src/models/sklearn/models.py index 6a7ef3f..39af3de 100644 --- a/src/models/sklearn/models.py +++ b/src/models/sklearn/models.py @@ -85,9 +85,7 @@ def __init__( kmers_list, csv ) - # Parameters - self._encoded = [] - + def preprocess(self, ds, scaling = False, scaler_file = None): print('preprocess') if self.classifier == 'onesvm': @@ -109,13 +107,15 @@ def preprocess(self, ds, scaling = False, scaler_file = None): self._encoded = np.arange(len(labels)) labels = np.append(labels, 'unknown') self._encoded = np.append(self._encoded, -1) - self._labels_map = zip(labels, self._encoded) - self._compute_weights() + for (label, encoded) in zip(labels, self._encoded): + self._labels_map[label] = encoded + if self.classifier != 'onesvm': + self._compute_weights() def _label_decode(self, predict): print('_label_decode') decoded = pd.Series(np.empty(len(predict), dtype=object)) - for label, encoded in self._labels_map: + for label, encoded in self._labels_map.items(): decoded[predict == encoded] = label return np.array(decoded) diff --git a/src/models/sklearn/partial_trainer.py b/src/models/sklearn/partial_trainer.py index 046c88c..9545581 100644 --- a/src/models/sklearn/partial_trainer.py +++ b/src/models/sklearn/partial_trainer.py @@ -7,7 +7,6 @@ import numpy as np import pandas as pd -from tqdm import tqdm from joblib import parallel_backend from sklearn.metrics import check_scoring @@ -202,7 +201,7 @@ def training_loop(self): _set_cpu_params(self.estimator, num_cpus) - for epoch_X, epoch_y in tqdm(zip(X_train.iter_epochs(), y_train.iter_epochs())): + for epoch_X, epoch_y in zip(X_train.iter_epochs(), y_train.iter_epochs()): with parallel_backend("ray", n_jobs=num_cpus): start_time = time() for batch_X, batch_y in zip( diff --git a/src/utils.py b/src/utils.py index b45b0b1..9194f63 100644 --- a/src/utils.py +++ b/src/utils.py @@ -86,20 +86,20 @@ def save_Xy_data(data, Xy_file): 
######################################################################################################### def verify_file(file : Path): - if file is not None and not os.path.exists(file): + if file is not None and not os.path.isfile(file): raise ValueError(f'Cannot find file {file} !') def verify_fasta(file : Path): - if not os.path.isfile(file) and not os.path.isdir(file): + if not os.path.exists(file): raise ValueError('Fasta must be an interleaved fasta file or a directory containing fasta files.') def verify_data_path(dir : Path): - if not os.path.exists(dir): + if not os.path.isdir(dir): raise ValueError(f"Cannot find data folder {dir} ! Exiting") def verify_saving_path(dir : Path): path, folder = os.path.split(dir) - if not os.path.exists(path): + if not os.path.isdir(path): raise ValueError("Cannot find where to create output folder !") def verify_host(host : str): @@ -341,7 +341,7 @@ def merge_db_host(db_data, host_data): merged_db_host = {} merged_db_host_file = f"{db_data['profile']}_host_merged.npz" - if os.path.exists(merged_db_host_file): + if os.path.isfile(merged_db_host_file): merged_db_host = load_Xy_data(merged_db_host_file) files_lst = glob(os.path.join(merged_db_host['profile'], '*.parquet')) merged_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) @@ -366,7 +366,8 @@ def merge_db_host(db_data, host_data): merged_db_host['kmers'] = db_data['kmers'] # Features merged_db_host['taxas'] = ['domain'] # Known taxas for classification merged_db_host['fasta'] = (db_data['fasta'], host_data['fasta']) # Fasta file needed for reads simulation - + merged_db_host['csv'] = (db_data['csv'], host_data['csv']) # csv file needed for classes weights + save_Xy_data(merged_db_host, merged_db_host_file) return merged_db_host, merged_ds From 7511e7561a3b80773ff54edfd3e9fa8edeba39d5 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Thu, 23 Nov 2023 06:14:25 -0500 Subject: [PATCH 47/92] decomposition revert back to TruncatedSVD --- src/Caribou_dimensions_decomposition.py | 4 ++-- src/data/reduction/dictionnary_decomposition.py | 12 +++++++----- src/data/reduction/nmf_decomposition.py | 8 ++++---- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/src/Caribou_dimensions_decomposition.py b/src/Caribou_dimensions_decomposition.py index 0e9b75d..0141abb 100644 --- a/src/Caribou_dimensions_decomposition.py +++ b/src/Caribou_dimensions_decomposition.py @@ -56,7 +56,7 @@ def dimensions_decomposition(opt): ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) scaler_file = os.path.join(outdirs['models_dir'], 'TF-IDF_diag.npz') - reductor_file = os.path.join(outdirs['models_dir'], 'decomposed_components.npz') + reductor_file = os.path.join(outdirs['models_dir'], 'TruncatedSVD_components.npz') # Compute the decomposition preprocessor = Chain( @@ -64,7 +64,7 @@ def dimensions_decomposition(opt): features = kmers, file = scaler_file ), - TensorNMFDecomposition( + TensorTruncatedSVDDecomposition( features = kmers, nb_components = opt['nb_components'], file = reductor_file diff --git a/src/data/reduction/dictionnary_decomposition.py b/src/data/reduction/dictionnary_decomposition.py index 5bbc4b5..27e9f67 100644 --- a/src/data/reduction/dictionnary_decomposition.py +++ b/src/data/reduction/dictionnary_decomposition.py @@ -8,9 +8,7 @@ from ray.data import Dataset from utils import save_Xy_data, load_Xy_data -from sklearn.utils.extmath import randomized_svd -from sklearn.decomposition import DictionaryLearning -from 
sklearn.decomposition._dict_learning import _sparse_encode +from sklearn.decomposition import DictionaryLearning, MiniBatchDictionaryLearning from ray.data.preprocessor import Preprocessor from ray.air.util.data_batch_conversion import _unwrap_ndarray_object_type_if_needed @@ -37,10 +35,14 @@ def _fit(self, ds: Dataset) -> Preprocessor: def batch_dict(batch): batch = batch[TENSOR_COLUMN_NAME] batch = _unwrap_ndarray_object_type_if_needed(batch) - dict = DictionaryLearning( + dict = MiniBatchDictionaryLearning( n_components = self._nb_components, max_iter = 10, - transform_algorithm = 'cd', + fit_algorithm = 'cd', + transform_algorithm = 'lars', + positive_code = True, + positive_dict = True, + batch_size = 10 ) dict.fit(batch) return {'components' : [dict.components_]} diff --git a/src/data/reduction/nmf_decomposition.py b/src/data/reduction/nmf_decomposition.py index 85abe03..6048cbb 100644 --- a/src/data/reduction/nmf_decomposition.py +++ b/src/data/reduction/nmf_decomposition.py @@ -8,8 +8,7 @@ from ray.data import Dataset from utils import save_Xy_data, load_Xy_data -from sklearn.utils.extmath import randomized_svd -from sklearn.decomposition import DictionaryLearning, NMF, MiniBatchNMF +from sklearn.decomposition import NMF, MiniBatchNMF from ray.data.preprocessor import Preprocessor from ray.air.util.data_batch_conversion import _unwrap_ndarray_object_type_if_needed @@ -36,9 +35,10 @@ def _fit(self, ds: Dataset) -> Preprocessor: def batch_nmf(batch): batch = batch[TENSOR_COLUMN_NAME] batch = _unwrap_ndarray_object_type_if_needed(batch) - model = NMF( + model = MiniBatchNMF( n_components = self._nb_components, - init = 'random' + init = 'random', + batch_size = 10 ) model.fit(batch) return {'components' : [model.components_]} From fa5495095d5b70f88bc5289461635b28c0557186 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Thu, 23 Nov 2023 13:43:16 -0500 Subject: [PATCH 48/92] debug onesvm labels encoding --- src/models/classification.py | 5 ++++- src/models/encoders/onesvm_label_encoder.py | 4 ++-- src/models/sklearn/models.py | 4 ++-- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/models/classification.py b/src/models/classification.py index 5da3f9c..1d0e3c8 100644 --- a/src/models/classification.py +++ b/src/models/classification.py @@ -289,7 +289,10 @@ def _get_true_classif(self, ds, taxas): for row in classif_ds.iter_rows(): for taxa in taxas: - classif[taxa].append(row[taxa]) + if self._classifier_binary == 'onesvm' and row[taxa] not in ['Bacteria','bacteria','bact']: + classif[taxa].append('Unknown') + else: + classif[taxa].append(row[taxa]) return classif, ds diff --git a/src/models/encoders/onesvm_label_encoder.py b/src/models/encoders/onesvm_label_encoder.py index 1743f95..23b121d 100644 --- a/src/models/encoders/onesvm_label_encoder.py +++ b/src/models/encoders/onesvm_label_encoder.py @@ -20,14 +20,14 @@ def __init__(self, label_column: str): def _fit(self, dataset : Dataset) -> Preprocessor: self.stats_ = OrderedDict() self.stats_[f"unique_values({self.label_column})"] = { - 'bacteria' : 1, + 'Bacteria' : 1 } return self def _transform_pandas(self, df: pd.DataFrame): _validate_df(df, self.label_column) mapping = self.stats_[f"unique_values({self.label_column})"] - df[self.label_column] = df[self.label_column].str.lower() + df[self.label_column] = df[self.label_column] df[self.label_column] = df[self.label_column].map(mapping) df[self.label_column] = df[self.label_column].fillna(-1) diff --git a/src/models/sklearn/models.py 
b/src/models/sklearn/models.py index 39af3de..1be9a15 100644 --- a/src/models/sklearn/models.py +++ b/src/models/sklearn/models.py @@ -91,7 +91,7 @@ def preprocess(self, ds, scaling = False, scaler_file = None): if self.classifier == 'onesvm': self._encoder = OneClassSVMLabelEncoder(self.taxa) self._encoded = np.array([1,-1], dtype = np.int32) - labels = np.array(['bacteria', 'unknown'], dtype = object) + labels = np.array(['Bacteria', 'Unknown'], dtype = object) else: self._encoder = ModelLabelEncoder(self.taxa) @@ -105,7 +105,7 @@ def preprocess(self, ds, scaling = False, scaler_file = None): if self.classifier != 'onesvm': labels = list(self._encoder.stats_[f'unique_values({self.taxa})'].keys()) self._encoded = np.arange(len(labels)) - labels = np.append(labels, 'unknown') + labels = np.append(labels, 'Unknown') self._encoded = np.append(self._encoded, -1) for (label, encoded) in zip(labels, self._encoded): self._labels_map[label] = encoded From 3fdaf17e05c18e2dedbd0e9a5fdf0de57726f61f Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Sat, 25 Nov 2023 12:21:15 -0500 Subject: [PATCH 49/92] handle import error for parquet reading --- src/Caribou_dimensions_decomposition.py | 5 ++-- src/Caribou_reduce_features.py | 5 ++-- src/models/models_utils.py | 2 +- src/models/reads_simulation.py | 6 ++--- src/utils.py | 31 +++++++++++++++++-------- 5 files changed, 28 insertions(+), 21 deletions(-) diff --git a/src/Caribou_dimensions_decomposition.py b/src/Caribou_dimensions_decomposition.py index 0141abb..3c4a343 100644 --- a/src/Caribou_dimensions_decomposition.py +++ b/src/Caribou_dimensions_decomposition.py @@ -51,9 +51,8 @@ def dimensions_decomposition(opt): if not os.path.exists(data_file): if opt['nb_components'] < len(kmers): - # Load data - files_lst = glob(os.path.join(data['profile'],'*.parquet')) - ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + # Load data + ds = read_parquet_files(data['profile']) scaler_file = os.path.join(outdirs['models_dir'], 'TF-IDF_diag.npz') reductor_file = os.path.join(outdirs['models_dir'], 'TruncatedSVD_components.npz') diff --git a/src/Caribou_reduce_features.py b/src/Caribou_reduce_features.py index 5fe02cd..efe88db 100644 --- a/src/Caribou_reduce_features.py +++ b/src/Caribou_reduce_features.py @@ -64,9 +64,8 @@ def features_reduction(opt): if not os.path.exists(data_file): # Load data - files_lst = glob(os.path.join(data['profile'],'*.parquet')) - export_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - train_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + export_ds = read_parquet_files(data['profile']) + train_ds = read_parquet_files(data['profile']) # Time the computation of transformations t_start = time() # Features scaling diff --git a/src/models/models_utils.py b/src/models/models_utils.py index a7fbdb7..c9990bd 100644 --- a/src/models/models_utils.py +++ b/src/models/models_utils.py @@ -129,5 +129,5 @@ def _compute_weights(self): ) for lab, encoded in self._labels_map.items(): - if lab != 'unknown': + if lab.lower() != 'unknown': self._weights[encoded] = weights[classes.index(lab)] \ No newline at end of file diff --git a/src/models/reads_simulation.py b/src/models/reads_simulation.py index 197a3f4..cf4c8eb 100644 --- a/src/models/reads_simulation.py +++ b/src/models/reads_simulation.py @@ -221,8 +221,7 @@ def split_sim_dataset(ds, data, name): if os.path.isfile(splitted_path): warnings.warn(f'The {name} dataset already exists, skipping simulation and loading the 
dataset') splitted_data = load_Xy_data(splitted_path) - files_lst = glob(os.path.join(splitted_data['profile'],'*.parquet')) - splitted_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + splitted_ds = read_parquet_files(splitted_data['profile']) return splitted_ds, splitted_data else: splitted_ds = ds.random_sample(0.1) @@ -245,6 +244,5 @@ def sim_dataset(ds, data, name): sim_outdir = os.path.dirname(data['profile']) cv_sim = readsSimulation(data['fasta'], cls, list(cls['id']), 'miseq', sim_outdir, name) sim_data = cv_sim.simulation(k, data['kmers']) - files_lst = glob(os.path.join(sim_data['profile'], '*.parquet')) - sim_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + sim_ds = read_parquet_files(sim_data['profile']) return sim_data, sim_ds \ No newline at end of file diff --git a/src/utils.py b/src/utils.py index 9194f63..83e84b2 100644 --- a/src/utils.py +++ b/src/utils.py @@ -5,18 +5,21 @@ import numpy as np import pandas as pd import pyarrow as pa +import pyarrow.parquet as pq from glob import glob from pathlib import Path from warnings import warn from psutil import virtual_memory + __author__ = "Nicolas de Montigny" __all__ = [ 'init_ray_cluster', 'load_Xy_data', 'save_Xy_data', + 'read_parquet_files', 'verify_file', 'verify_fasta', 'verify_data_path', @@ -82,6 +85,19 @@ def load_Xy_data(Xy_file): def save_Xy_data(data, Xy_file): np.savez(Xy_file, data = data) +# Read parquet files and handle FileSystem build ImportError +def read_parquet_files(profile): + files_lst = glob(os.path.join(profile, '*.parquet')) + try: + ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + except ImportError: + tables_lst = [] + for file in files_lst: + tables_lst.append(pq.read_table(file)) + ds = ray.data.from_arrow(tables_lst) + + return ds + # User arguments verification ######################################################################################################### @@ -306,8 +322,7 @@ def verify_load_metagenome(data): Wrapper function for verifying and loading the metagenome dataset """ data = verify_load_data(data) - files_lst = glob(os.path.join(data['profile'], '*.parquet')) - ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + ds = read_parquet_files(data['profile']) return data, ds @@ -317,8 +332,7 @@ def verify_load_db(db_data): Wrapper function for verifying and loading the db dataset """ db_data = verify_load_data(db_data) - files_lst = glob(os.path.join(db_data['profile'], '*.parquet')) - db_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + db_ds = read_parquet_files(db_data['profile']) db_ds = db_ds.map_batches(convert_archaea_bacteria, batch_format = 'pandas') return db_data, db_ds @@ -343,14 +357,11 @@ def merge_db_host(db_data, host_data): if os.path.isfile(merged_db_host_file): merged_db_host = load_Xy_data(merged_db_host_file) - files_lst = glob(os.path.join(merged_db_host['profile'], '*.parquet')) - merged_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + merged_ds = read_parquet_files(merge_db_host['profile']) else: merged_db_host['profile'] = f"{db_data['profile']}_host_merged" - files_lst = glob(os.path.join(db_data['profile'], '*.parquet')) - db_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - files_lst = glob(os.path.join(host_data['profile'], '*.parquet')) - host_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + db_ds = read_parquet_files(db_data['profile']) + host_ds = 
read_parquet_files(host_data['profile']) cols2drop = [col for col in db_ds.schema().names if col not in ['id','domain',TENSOR_COLUMN_NAME]] db_ds = db_ds.drop_columns(cols2drop) From 3964306d4c1f3397b19d80c01f837367de050984 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Sat, 25 Nov 2023 20:55:09 -0500 Subject: [PATCH 50/92] debug keras for cv --- src/models/classification.py | 2 + src/models/kerasTF/build_neural_networks.py | 52 +++++++++---------- src/models/kerasTF/models.py | 56 ++++++++++----------- src/models/models_utils.py | 2 +- src/utils.py | 51 +++++++++++++------ 5 files changed, 92 insertions(+), 71 deletions(-) diff --git a/src/models/classification.py b/src/models/classification.py index 1d0e3c8..94455ad 100644 --- a/src/models/classification.py +++ b/src/models/classification.py @@ -172,6 +172,7 @@ def _cv_predict(self, ds, model_map): mapping = {} for taxa, model in model_map.items(): mapping[taxa] = model.predict(ds) # np.array + return mapping # Private training secondary functions @@ -311,6 +312,7 @@ def _score_cv(self, y_true, y_pred, taxa): 'y_true': y_true[taxa], 'y_pred': y_pred[taxa] }) + y_compare.to_csv(os.path.join(self._outdirs['models_dir'], f'y_compare_{self._database}_{model}_{taxa}.csv')) support = precision_recall_fscore_support( diff --git a/src/models/kerasTF/build_neural_networks.py b/src/models/kerasTF/build_neural_networks.py index 80bdc07..b037d24 100644 --- a/src/models/kerasTF/build_neural_networks.py +++ b/src/models/kerasTF/build_neural_networks.py @@ -19,19 +19,19 @@ def build_attention(nb_features): VirNet package [Abdelkareem et al. 2018] https://github.com/alyosama/virnet/blob/master/NNClassifier.py """ - inputs = Input(shape = (nb_features,)) - x = Embedding(nb_features, 128)(inputs) + inputs = Input(shape = (nb_features,1)) + # x = Embedding(nb_features, 128)(inputs) - x = LSTM(128, return_sequences = True, dropout = 0.1, recurrent_dropout = 0.1 )(x) + x = LSTM(128, return_sequences = True, dropout = 0.1, recurrent_dropout = 0.1 )(inputs) x = LSTM(128, return_sequences = True, dropout = 0.1, recurrent_dropout = 0.1 )(x) x = AttentionWeightedAverage()(x) x = Dense(128, activation = "relu")(x) x = Dropout(0.1)(x) - x = Dense(1, activation = "tanh")(x) + x = Dense(1, activation = "sigmoid")(x) model = Model(inputs = inputs, outputs = x) - model.compile(loss = BinaryCrossentropy(from_logits = False), optimizer = 'adam', metrics = ['accuracy'], jit_compile = True) + model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'], jit_compile = True) return model @@ -43,15 +43,15 @@ def build_LSTM(nb_features): https://github.com/gussow/seeker/blob/master/train_model/train_model.py """ - inputs = Input(shape = (nb_features,)) - x = Embedding(nb_features, 128)(inputs) + inputs = Input(shape = (nb_features,1)) + # x = Embedding(nb_features, 128)(inputs) - x = LSTM(128, recurrent_dropout = 0.1, dropout = 0.1)(x) + x = LSTM(128, recurrent_dropout = 0.1, dropout = 0.1)(inputs) x = Dense(1, activation = 'tanh')(x) model = Model(inputs = inputs, outputs = x) - model.compile(loss=BinaryCrossentropy(from_logits = False), optimizer='adam', metrics=['accuracy'], jit_compile = True) + model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'], jit_compile = True) return model @@ -63,10 +63,10 @@ def build_deepLSTM(nb_features): https://github.com/wandreopoulos/deeplasmid/blob/docker/classifier/dl/DL_Model.py """ - inputs = Input(shape=(nb_features,)) + inputs = Input(shape=(nb_features,1)) - netA = 
Embedding(nb_features, 128)(inputs) - netA = LSTM(40, activation='tanh',recurrent_dropout=0.05,dropout=0.1,name='A_%d'%40,return_sequences=True) (netA) + # netA = Embedding(nb_features, 128)(inputs) + netA = LSTM(40, activation='tanh',recurrent_dropout=0.05,dropout=0.1,name='A_%d'%40,return_sequences=True) (inputs) netA = LSTM(40, activation='tanh',recurrent_dropout=0.05,dropout=0.1,name='B_%d'%40) (netA) netB = Dense(100, activation='tanh',name='G_%d'%40) (inputs) @@ -82,7 +82,7 @@ def build_deepLSTM(nb_features): outputs = Dense(1, activation='sigmoid', name='score')(net) model = Model(inputs=inputs, outputs=outputs) - model.compile(loss=BinaryCrossentropy(from_logits = False), optimizer='adam', metrics=['accuracy'], jit_compile = True) + model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'], jit_compile = True) return model @@ -95,9 +95,9 @@ def build_LSTM_attention(nb_features, nb_classes): https://github.com/MicrobeLab/DeepMicrobes/blob/master/models/embed_lstm_attention.py """ - inputs = Input(shape = (nb_features,)) - net = Embedding(nb_features, 100)(inputs) - net = Bidirectional(LSTM(300, return_sequences=True))(net) + inputs = Input(shape = (nb_features,1)) + # net = Embedding(nb_features, 100)(inputs) + net = Bidirectional(LSTM(300, return_sequences=True))(inputs) net = Attention(dropout = 0.2)([net,net]) # MLP net = Dense((nb_features * 300 * 2), activation = 'relu')(net) @@ -108,7 +108,7 @@ def build_LSTM_attention(nb_features, nb_classes): net = Dense(nb_classes)(net) outputs = Activation('softmax')(net) model = Model(inputs = inputs, outputs = outputs) - model.compile(loss=CategoricalCrossentropy(), optimizer='adam', metrics=['accuracy'], jit_compile = True) + model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'], jit_compile = True) return model @@ -134,7 +134,7 @@ def build_CNN(nb_features, nb_classes): model.add(Dropout(0.5)) model.add(Dense(nb_classes)) model.add(Activation('softmax')) - model.compile(loss=CategoricalCrossentropy(), optimizer='adam', metrics=['accuracy'], jit_compile = True) + model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'], jit_compile = True) return model @@ -146,20 +146,20 @@ def build_wideCNN(nb_features, nb_classes): https://github.com/KennthShang/CHEER/blob/master/Classifier/model/Wcnn.py """ - inputs = Input(shape = (nb_features,)) - embed = Embedding(248, 100)(inputs) - embed = Reshape((nb_features, -1, 1))(embed) + inputs = Input(shape = (nb_features,1)) + # embed = Embedding(248, 100)(inputs) + # embed = Reshape((nb_features, -1, 1))(embed) - conv1 = Conv2D(256, 3, activation = 'relu')(embed) + conv1 = Conv2D(256, 3, activation = 'relu')(inputs) conv1 = MaxPooling2D(pool_size = (1,1), strides = nb_features)(conv1) - conv2 = Conv2D(256, 7, activation = 'relu')(embed) + conv2 = Conv2D(256, 7, activation = 'relu')(inputs) conv2 = MaxPooling2D(pool_size = (1,1), strides = nb_features)(conv2) - conv3 = Conv2D(256, 11, activation = 'relu')(embed) + conv3 = Conv2D(256, 11, activation = 'relu')(inputs) conv3 = MaxPooling2D(pool_size = (1,1), strides = nb_features)(conv3) - conv4 = Conv2D(256, 15, activation = 'relu')(embed) + conv4 = Conv2D(256, 15, activation = 'relu')(inputs) conv4 = MaxPooling2D(pool_size = (1,1), strides = nb_features)(conv4) net = Concatenate(axis = 1)([conv1,conv2,conv3,conv4]) @@ -172,6 +172,6 @@ def build_wideCNN(nb_features, nb_classes): net = Dense(nb_classes)(net) outputs = Activation('softmax')(net) model = Model(inputs = 
inputs, outputs = outputs) - model.compile(loss=CategoricalCrossentropy(), optimizer='adam', metrics=['accuracy'], jit_compile = True) + model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'], jit_compile = True) return model diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index 62538f4..ebb37f2 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -127,31 +127,22 @@ def __init__( def preprocess(self, ds, scaling = False, scaler_file = None): print('preprocess') - for row in ds.iter_rows(): - labels.append(row[self.taxa]) - self._nb_classes = len(np.unique(labels)) - if self._nb_classes == 2: - self._encoder = ModelLabelEncoder(self.taxa) - if scaling: - self._scaler = TensorTfIdfTransformer(self.kmers, scaler_file) - else: + self._encoder = ModelLabelEncoder(self.taxa) + if scaling: + self._scaler = TensorTfIdfTransformer(self.kmers, scaler_file) + self._scaler.fit(ds) + self._encoder.fit(ds) + labels = list(self._encoder.stats_[f'unique_values({self.taxa})'].keys()) + self._nb_classes = len(self._encoder.stats_[f'unique_values({self.taxa})']) + if self._nb_classes > 2 : self._encoder = Chain( - LabelEncoder(self.taxa), + self._encoder, OneHotTensorEncoder(self.taxa) ) - if scaling: - self._scaler = TensorTfIdfTransformer(self.kmers, scaler_file) - - self._encoder.fit(ds) - if scaling: - self._scaler.fit(ds) - # Labels mapping - if self._nb_classes == 2: - labels = list(self._encoder.stats_[f'unique_values({self.taxa})'].keys()) - else: - labels = list(self._encoder.preprocessors[0].stats_[f'unique_values({self.taxa})'].keys()) + self._encoder.fit(ds) + self._encoded = np.arange(len(labels)) - labels = np.append(labels, 'unknown') + labels = np.append(labels, 'Unknown') self._encoded = np.append(self._encoded, -1) for (label, encoded) in zip(labels, self._encoded): self._labels_map[label] = encoded @@ -161,6 +152,7 @@ def _label_decode(self, predict): print('_label_decode') decoded = pd.Series(np.empty(len(predict), dtype=object)) for label, encoded in self._labels_map.items(): + print(predict == encoded) decoded[predict == encoded] = label return np.array(decoded) @@ -207,7 +199,7 @@ def fit(self, datasets): ), datasets=datasets, ) - + training_result = self._trainer.fit() self._model_ckpt = training_result.best_checkpoints[0][0] @@ -218,6 +210,7 @@ def predict(self, ds, threshold=0.8): col_2_drop = [col for col in ds.schema().names if col != TENSOR_COLUMN_NAME] ds = ds.drop_columns(col_2_drop) + # Preprocess if self._scaler is not None: ds = self._scaler.transform(ds) @@ -230,12 +223,15 @@ def predict(self, ds, threshold=0.8): ) predictions = self._predictor.predict( data = ds, - batch_size = self.batch_size + feature_columns = [TENSOR_COLUMN_NAME], + batch_size = self.batch_size, + num_cpus_per_worker = self._nb_CPU_per_worker, + num_gpus_per_worker = self._nb_GPU_per_worker ) # Convert predictions to labels predictions = self._prob_2_cls(predictions, threshold) - + return self._label_decode(predictions) else: raise ValueError('No data to predict') @@ -245,8 +241,8 @@ def _prob_2_cls(self, predictions, threshold): print('_prob_2_cls') def map_predicted_label_binary(ds, threshold): ds = np.ravel(ds['predictions']) - lower_threshold = 0.5 - (threshold * 0.5) - upper_threshold = 0.5 + (threshold * 0.5) + lower_threshold = 0.5 #- (threshold * 0.5) + upper_threshold = 0.5 #+ (threshold * 0.5) predict = pd.DataFrame({ 'proba': ds, 'predicted_label': np.full(len(ds), -1) @@ -265,12 +261,12 @@ def 
map_predicted_label_multiclass(ds, threshold): return {'predictions' : pred['predicted_label'].to_numpy(dtype = np.int32)} - if self._nb_classes == 2: - print('map_predicted_label_binary') - fn = map_predicted_label_binary - else: + if self._nb_classes > 2: print('map_predicted_label_multiclass') fn = map_predicted_label_multiclass + else: + print('map_predicted_label_binary') + fn = map_predicted_label_binary predict = [] predictions = predictions.map_batches( diff --git a/src/models/models_utils.py b/src/models/models_utils.py index c9990bd..17d5c44 100644 --- a/src/models/models_utils.py +++ b/src/models/models_utils.py @@ -130,4 +130,4 @@ def _compute_weights(self): for lab, encoded in self._labels_map.items(): if lab.lower() != 'unknown': - self._weights[encoded] = weights[classes.index(lab)] \ No newline at end of file + self._weights[int(encoded)] = weights[classes.index(lab)] \ No newline at end of file diff --git a/src/utils.py b/src/utils.py index 83e84b2..74ca704 100644 --- a/src/utils.py +++ b/src/utils.py @@ -11,7 +11,7 @@ from pathlib import Path from warnings import warn from psutil import virtual_memory - +from tensorflow.config import list_physical_devices __author__ = "Nicolas de Montigny" @@ -49,6 +49,8 @@ 'merge_db_host' ] +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' + # Constants ######################################################################################################### @@ -59,19 +61,40 @@ # Initialize ray cluster def init_ray_cluster(workdir): - mem = virtual_memory().total - frac = 0.8 - while not ray.is_initialized(): - try: - ray.init( - object_store_memory = mem * frac, - _temp_dir = str(workdir), - ) - logging.getLogger("ray").setLevel(logging.WARNING) - ray.data.DataContext.get_current().execution_options.verbose_progress = True - except ValueError : - ray.shutdown() - frac -= 0.05 + """ + 1. Get physical material available + Number of available CPUs and GPUs + 2. Get host IP from OS + Defaults to 172.24.94.34 + 3. 
Start the ray cluster at OS level + """ + nb_CPU = os.cpu_count() + nb_GPU = len(list_physical_devices('GPU')) + + try: + host_ip = os.environ['HOST_IP'] + except KeyError: + host_ip = '172.24.94.34' + + cmd = f'ray start --head --node-ip-address {host_ip} --port 34567 --num-cpus {nb_CPU} --num-gpus {nb_GPU} --temp-dir {workdir}' + os.system(cmd) + + ray.init() + logging.getLogger("ray").setLevel(logging.WARNING) + ray.data.DataContext.get_current().execution_options.verbose_progress = True + # mem = virtual_memory().total + # frac = 0.8 + # while not ray.is_initialized(): + # try: + # ray.init( + # object_store_memory = mem * frac, + # _temp_dir = str(workdir), + # ) + # logging.getLogger("ray").setLevel(logging.WARNING) + # ray.data.DataContext.get_current().execution_options.verbose_progress = True + # except ValueError : + # ray.shutdown() + # frac -= 0.05 # Data I/O ######################################################################################################### From 8dcfdf3ef083544bef42851e57bc90e434e37069 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Thu, 30 Nov 2023 12:13:37 -0500 Subject: [PATCH 51/92] NN debug for CCDB --- .../reduction/truncated_svd_decomposition.py | 3 + src/models/classification.py | 19 +- src/models/kerasTF/binary_models.py | 73 ++++++ src/models/kerasTF/build_neural_networks.py | 2 +- src/models/kerasTF/models.py | 87 +++++-- src/models/kerasTF/multiclass_models.py | 73 ++++++ src/models/models_utils.py | 10 +- src/models/multiclass_utils.py | 127 +++++++++ src/models/sklearn/binary_models.py | 216 ++++++++++++++++ src/models/sklearn/models.py | 200 +++------------ src/models/sklearn/multiclass_models.py | 240 ++++++++++++++++++ 11 files changed, 856 insertions(+), 194 deletions(-) create mode 100644 src/models/kerasTF/binary_models.py create mode 100644 src/models/kerasTF/multiclass_models.py create mode 100644 src/models/multiclass_utils.py create mode 100644 src/models/sklearn/binary_models.py create mode 100644 src/models/sklearn/multiclass_models.py diff --git a/src/data/reduction/truncated_svd_decomposition.py b/src/data/reduction/truncated_svd_decomposition.py index ca0eed4..74a4a0b 100644 --- a/src/data/reduction/truncated_svd_decomposition.py +++ b/src/data/reduction/truncated_svd_decomposition.py @@ -36,6 +36,9 @@ def __init__(self, features: List[str], nb_components: int = 10000, file: str = def _fit(self, ds: Dataset) -> Preprocessor: """ + TODO: adapt by using the metho from PySpark SVD + https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.mllib.linalg.distributed.SingularValueDecomposition.html?highlight=svd + Possibilities for parallel TruncatedSVD * sklearn minibatch PCA -> PCA / SVD mostly equivalent * implement parallel based on other library diff --git a/src/models/classification.py b/src/models/classification.py index 94455ad..3c62ddf 100644 --- a/src/models/classification.py +++ b/src/models/classification.py @@ -7,8 +7,11 @@ from warnings import warn from typing import Dict, List -from models.sklearn.models import SklearnModel -from models.kerasTF.models import KerasTFModel +from models.kerasTF.models import KerasTFModels +from models.sklearn.binary_models import SklearnBinaryModels +# from models.kerasTF.binary_models import KerasTFBinaryModels +from models.sklearn.multiclass_models import SklearnMulticlassModels +# from models.kerasTF.multiclass_models import KerasTFMulticlassModels # CV metrics from sklearn.metrics import precision_recall_fscore_support @@ -149,7 +152,7 @@ def _predict(self, ds, 
model_map): if self.is_fitted: try: for taxa, model in model_map.items(): - predictions = model.predict(ds) # np.array + predictions = model.predict_proba(ds) # np.array ds, predictions, ids = self._remove_unknown(ds, predictions) file = self._save_dataset(ds, taxa) mapping[taxa] = { @@ -181,7 +184,7 @@ def _cv_predict(self, ds, model_map): def _binary_training(self, datasets, taxa, file): print('_binary_training') if self._classifier_binary == 'onesvm': - model = SklearnModel( + model = SklearnBinaryModels( self._classifier_binary, self._outdirs['models_dir'], self._batch_size, @@ -191,7 +194,7 @@ def _binary_training(self, datasets, taxa, file): self._database_data['csv'] ) elif self._classifier_binary == 'linearsvm': - model = SklearnModel( + model = SklearnBinaryModels( self._classifier_binary, self._outdirs['models_dir'], self._batch_size, @@ -201,7 +204,7 @@ def _binary_training(self, datasets, taxa, file): self._database_data['csv'] ) else: - model = KerasTFModel( + model = KerasTFModels( self._classifier_binary, self._outdirs['models_dir'], self._batch_size, @@ -222,7 +225,7 @@ def _binary_training(self, datasets, taxa, file): def _multiclass_training(self, datasets, taxa, file): print('_multiclass_training') if self._classifier_multiclass in ['sgd','mnb']: - model = SklearnModel( + model = SklearnMulticlassModels( self._classifier_multiclass, self._outdirs['models_dir'], self._batch_size, @@ -232,7 +235,7 @@ def _multiclass_training(self, datasets, taxa, file): self._database_data['csv'] ) else: - model = KerasTFModel( + model = KerasTFModels( self._classifier_multiclass, self._outdirs['models_dir'], self._batch_size, diff --git a/src/models/kerasTF/binary_models.py b/src/models/kerasTF/binary_models.py new file mode 100644 index 0000000..bc79f35 --- /dev/null +++ b/src/models/kerasTF/binary_models.py @@ -0,0 +1,73 @@ +import os +import gc +import warnings +import numpy as np +import pandas as pd + +# Preprocessing +from ray.data.preprocessors import LabelEncoder, Chain +from models.encoders.model_label_encoder import ModelLabelEncoder +from models.encoders.one_hot_tensor_encoder import OneHotTensorEncoder +from models.preprocessors.tfidf_transformer import TensorTfIdfTransformer + +# Parent class / models +from models.kerasTF.models import KerasTFModels +from models.kerasTF.build_neural_networks import * + +# Training +import tensorflow as tf +from ray.air import session +# from ray.air.integrations.keras import Callback +from ray.air.config import ScalingConfig +from ray.air.integrations.keras import ReportCheckpointCallback +from ray.train.tensorflow import TensorflowTrainer, TensorflowCheckpoint + +# Tuning +from ray.air.config import RunConfig + +# Predicting +from ray.train.tensorflow import TensorflowPredictor +from ray.train.batch_predictor import BatchPredictor + +__author__ = 'Nicolas de Montigny' + +__all__ = ['KerasTFModel'] + +TENSOR_COLUMN_NAME = '__value__' +LABELS_COLUMN_NAME = 'labels' + +# Ignore warnings to have a more comprehensible output on stdout +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' +warnings.filterwarnings('ignore') + +class KerasTFBinaryModels(KerasTFModels): + """ + Class used to build, train and predict models using Ray with Keras Tensorflow backend + + ---------- + Attributes + ---------- + + clf_file : string + Path to a file containing the trained model for this object + + nb_classes : int + Number of classes for learning + + ---------- + Methods + ---------- + + preprocess : preprocess the data before training and splitting the original dataset 
in case of cross-validation + + train : train a model using the given datasets + + predict : predict the classes of a dataset + ds : ray.data.Dataset + Dataset containing K-mers profiles of sequences to be classified + + threshold : float + Minimum percentage of probability to effectively classify. + Sequences will be classified as 'unknown' if the probability is under this threshold. + Defaults to 80% + """ \ No newline at end of file diff --git a/src/models/kerasTF/build_neural_networks.py b/src/models/kerasTF/build_neural_networks.py index b037d24..8294110 100644 --- a/src/models/kerasTF/build_neural_networks.py +++ b/src/models/kerasTF/build_neural_networks.py @@ -72,7 +72,7 @@ def build_deepLSTM(nb_features): netB = Dense(100, activation='tanh',name='G_%d'%40) (inputs) netB = Dense(40, activation='tanh',name='H_%d'%40) (netB) - net = Concatenate()([netA,netB]) + net = Concatenate()([netA,netB]) # A `Concatenate` layer requires inputs with matching shapes except for the concatenation axis. Received: input_shape=[(None, 40), (None, 1000, 40)] net = Dense(200, activation='relu', name='C_%d'%(10*2))(net) net = Dropout(0.1,name='fr_%.1f'%0.1)(net) diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index ebb37f2..989f934 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -4,6 +4,9 @@ import numpy as np import pandas as pd +# Class construction +from abc import ABC, abstractmethod + # Preprocessing from ray.data.preprocessors import LabelEncoder, Chain from models.encoders.model_label_encoder import ModelLabelEncoder @@ -40,7 +43,7 @@ os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' warnings.filterwarnings('ignore') -class KerasTFModel(ModelsUtils): +class KerasTFModels(ModelsUtils): """ Class used to build, train and predict models using Ray with Keras Tensorflow backend @@ -70,7 +73,6 @@ class KerasTFModel(ModelsUtils): Minimum percentage of probability to effectively classify. Sequences will be classified as 'unknown' if the probability is under this threshold. 
Defaults to 80% - """ def __init__( @@ -152,7 +154,6 @@ def _label_decode(self, predict): print('_label_decode') decoded = pd.Series(np.empty(len(predict), dtype=object)) for label, encoded in self._labels_map.items(): - print(predict == encoded) decoded[predict == encoded] = label return np.array(decoded) @@ -203,14 +204,34 @@ def fit(self, datasets): training_result = self._trainer.fit() self._model_ckpt = training_result.best_checkpoints[0][0] - def predict(self, ds, threshold=0.8): + def predict(self, ds): print('predict') + # Predict with model + predictions = self._make_predictions(ds) + + # Convert predictions to labels for cross-validation of classification + predictions = self._get_abs_pred(predictions) + + # Return decoded labels + return self._label_decode(predictions) + + def predict_proba(self, ds, threshold = 0.8): + print('predict_proba') + # Predict with model + predictions = self._make_predictions(ds) + + # Convert predictions to labels with threshold for top-down classification + predictions = self._get_threshold_pred(predictions, threshold) + + # Return decoded labels + return self._label_decode(predictions) + + def _make_predictions(self, ds): if ds.count() > 0: if len(ds.schema().names) > 1: col_2_drop = [col for col in ds.schema().names if col != TENSOR_COLUMN_NAME] ds = ds.drop_columns(col_2_drop) - # Preprocess if self._scaler is not None: ds = self._scaler.transform(ds) @@ -228,21 +249,57 @@ def predict(self, ds, threshold=0.8): num_cpus_per_worker = self._nb_CPU_per_worker, num_gpus_per_worker = self._nb_GPU_per_worker ) - - # Convert predictions to labels - predictions = self._prob_2_cls(predictions, threshold) - - return self._label_decode(predictions) + return predictions else: raise ValueError('No data to predict') + + def _get_abs_pred(self, predictions): + print('_get_abs_pred') + def map_predicted_label_binary(ds): + ds = np.ravel(ds['predictions']) + lower_threshold = 0.5 + upper_threshold = 0.5 + predict = pd.DataFrame({ + 'proba': ds, + 'predicted_label': np.full(len(ds), -1) + }) + predict.loc[predict['proba'] >= upper_threshold, 'predicted_label'] = 1 + predict.loc[predict['proba'] <= lower_threshold, 'predicted_label'] = 0 + return {'predictions' : predict['predicted_label'].to_numpy(dtype = np.int32)} + + def map_predicted_label_multiclass(ds): + ds = ds['predictions'] + pred = pd.DataFrame({ + 'best_proba': [np.max(arr) for arr in ds], + 'predicted_label' : [np.argmax(arr) for arr in ds] + }) + + return {'predictions' : pred['predicted_label'].to_numpy(dtype = np.int32)} + + if self._nb_classes > 2: + print('map_predicted_label_multiclass') + fn = map_predicted_label_multiclass + else: + print('map_predicted_label_binary') + fn = map_predicted_label_binary + + predict = [] + predictions = predictions.map_batches( + lambda batch : fn(batch), + batch_format = 'numpy', + batch_size = self.batch_size + ) + for row in predictions.iter_rows(): + predict.append(row['predictions']) + + return predict - # Iterate over batches of predictions to transform probabilities to labels without mapping - def _prob_2_cls(self, predictions, threshold): - print('_prob_2_cls') + def _get_threshold_pred(self, predictions, threshold): + print('_get_threshold_pred') def map_predicted_label_binary(ds, threshold): ds = np.ravel(ds['predictions']) - lower_threshold = 0.5 #- (threshold * 0.5) - upper_threshold = 0.5 #+ (threshold * 0.5) + lower_threshold = 0.5 - (threshold * 0.5) + upper_threshold = 0.5 + (threshold * 0.5) predict = pd.DataFrame({ 'proba': ds, 'predicted_label': 
np.full(len(ds), -1) diff --git a/src/models/kerasTF/multiclass_models.py b/src/models/kerasTF/multiclass_models.py new file mode 100644 index 0000000..b422ff8 --- /dev/null +++ b/src/models/kerasTF/multiclass_models.py @@ -0,0 +1,73 @@ +import os +import gc +import warnings +import numpy as np +import pandas as pd + +# Preprocessing +from ray.data.preprocessors import LabelEncoder, Chain +from models.encoders.model_label_encoder import ModelLabelEncoder +from models.encoders.one_hot_tensor_encoder import OneHotTensorEncoder +from models.preprocessors.tfidf_transformer import TensorTfIdfTransformer + +# Parent class / models +from models.models_utils import ModelsUtils +from models.kerasTF.build_neural_networks import * + +# Training +import tensorflow as tf +from ray.air import session +# from ray.air.integrations.keras import Callback +from ray.air.config import ScalingConfig +from ray.air.integrations.keras import ReportCheckpointCallback +from ray.train.tensorflow import TensorflowTrainer, TensorflowCheckpoint + +# Tuning +from ray.air.config import RunConfig + +# Predicting +from ray.train.tensorflow import TensorflowPredictor +from ray.train.batch_predictor import BatchPredictor + +__author__ = 'Nicolas de Montigny' + +__all__ = ['KerasTFModel'] + +TENSOR_COLUMN_NAME = '__value__' +LABELS_COLUMN_NAME = 'labels' + +# Ignore warnings to have a more comprehensible output on stdout +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' +warnings.filterwarnings('ignore') + +class KerasTFMulticlassModels(ModelsUtils): + """ + Class used to build, train and predict models using Ray with Keras Tensorflow backend + + ---------- + Attributes + ---------- + + clf_file : string + Path to a file containing the trained model for this object + + nb_classes : int + Number of classes for learning + + ---------- + Methods + ---------- + + preprocess : preprocess the data before training and splitting the original dataset in case of cross-validation + + train : train a model using the given datasets + + predict : predict the classes of a dataset + ds : ray.data.Dataset + Dataset containing K-mers profiles of sequences to be classified + + threshold : float + Minimum percentage of probability to effectively classify. + Sequences will be classified as 'unknown' if the probability is under this threshold. + Defaults to 80% + """ \ No newline at end of file diff --git a/src/models/models_utils.py b/src/models/models_utils.py index 17d5c44..c665f0e 100644 --- a/src/models/models_utils.py +++ b/src/models/models_utils.py @@ -19,7 +19,7 @@ class ModelsUtils(ABC): """ - Utilities for preprocessing data and doing cross validation using ray + Abstract class for both frameworks to initialize their attributes. 
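[Editor's note] The probability-to-label mapping in `_get_abs_pred` / `_get_threshold_pred` a few hunks above can be hard to follow inside the `map_batches` plumbing. The sketch below reduces it to two plain NumPy functions: the binary branch keeps only calls far enough from the 0.5 decision point, the multiclass branch keeps the argmax only when its probability clears the threshold. The multiclass thresholding reflects my reading of the intended behaviour and is not a verbatim copy of the patch.

    import numpy as np

    def threshold_binary(probs: np.ndarray, threshold: float = 0.8) -> np.ndarray:
        # Sigmoid outputs: anything between the two bounds stays 'unknown' (-1)
        lower, upper = 0.5 - threshold * 0.5, 0.5 + threshold * 0.5
        labels = np.full(len(probs), -1, dtype=np.int32)
        labels[probs >= upper] = 1
        labels[probs <= lower] = 0
        return labels

    def threshold_multiclass(probs: np.ndarray, threshold: float = 0.8) -> np.ndarray:
        # Softmax outputs: keep the argmax only when the best probability is confident enough
        best = probs.max(axis=1)
        labels = probs.argmax(axis=1).astype(np.int32)
        labels[best < threshold] = -1
        return labels

    print(threshold_binary(np.array([0.95, 0.5, 0.02])))              # [ 1 -1  0]
    print(threshold_multiclass(np.array([[0.9, 0.1], [0.6, 0.4]])))   # [ 0 -1]
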
---------- Attributes @@ -103,7 +103,7 @@ def predict(self): """ @abstractmethod - def _prob_2_cls(self): + def _get_threshold_pred(self): """ """ @@ -116,13 +116,14 @@ def _compute_weights(self): """ Set class weights depending on their abundance in data-associated classes csv """ + weights = {} if isinstance(self._csv, tuple): cls = pd.concat([pd.read_csv(self._csv[0]),pd.read_csv(self._csv[1])], axis = 0, join = 'inner', ignore_index = True) cls = pd.read_csv(self._csv) if self.taxa == 'domain': cls.loc[cls['domain'].str.lower() == 'archaea', 'domain'] = 'Bacteria' classes = list(cls[self.taxa].unique()) - weights = compute_class_weight( + cls_weights = compute_class_weight( class_weight = 'balanced', classes = classes, y = cls[self.taxa] @@ -130,4 +131,5 @@ def _compute_weights(self): for lab, encoded in self._labels_map.items(): if lab.lower() != 'unknown': - self._weights[int(encoded)] = weights[classes.index(lab)] \ No newline at end of file + weights[int(encoded)] = cls_weights[classes.index(lab)] + return weights \ No newline at end of file diff --git a/src/models/multiclass_utils.py b/src/models/multiclass_utils.py new file mode 100644 index 0000000..ad07191 --- /dev/null +++ b/src/models/multiclass_utils.py @@ -0,0 +1,127 @@ +import os +import ray +import warnings +import numpy as np +import pandas as pd + +# Class construction +from abc import ABC, abstractmethod + +from ray.air.util.data_batch_conversion import _unwrap_ndarray_object_type_if_needed + +__author__ = 'Nicolas de Montigny' + +__all__ = ['ModelsUtils'] + +TENSOR_COLUMN_NAME = '__value__' + +class MulticlassUtils(ABC): + """ + Abstract class to provide utilities for multiclass classification models. + These methods are meant to be used when decomposing data into taxonomic groups before training one model per group + ----------------------- + Mixture-of-Experts (MoE) + ----------------------- + 1. Train each expert on their task-associated data + * Split training data into 80/20% splits + * Train/val over multiple epochs + 2. Train a gating network on the whole task + * Perceptron NN for gating + * Train on whole training ds + * Validation on simulated reads ds + * CV on test simulated reads ds + https://medium.com/@bensalemh300/harnessing-the-best-of-both-worlds-how-mixture-of-experts-meets-pyspark-for-mnist-mastery-315f82e65a0e + https://machinelearningmastery.com/mixture-of-experts/ + + 1. Cluster Data Split: Data within each cluster is divided into training and testing sets. + 2. Decision Tree Classifiers: For clusters where there’s more than one unique class in the training data, we train Decision Tree classifiers. These classifiers can distinguish between different classes within the cluster. + 3. Storing Expert Models: Trained Decision Tree models are stored in a dictionary, where each expert corresponds to a specific cluster. + 4. Performance Evaluation: The performance of each expert model is assessed by evaluating its accuracy on the corresponding test data. 
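[Editor's note] As a concrete counterpart to steps 1-4 above, here is a minimal "one expert per group" sketch using plain pandas and scikit-learn on toy data. The column names ('order', 'family'), features and the use of `DecisionTreeClassifier` are illustrative assumptions, not the pipeline's actual models; the point is only the grouping-then-routing pattern.

    import pandas as pd
    from sklearn.tree import DecisionTreeClassifier

    df = pd.DataFrame({
        'feat_a': [0.1, 0.4, 0.8, 0.3, 0.9, 0.2],
        'feat_b': [1.0, 0.7, 0.2, 0.8, 0.1, 0.9],
        'order':  ['o1', 'o1', 'o1', 'o2', 'o2', 'o2'],  # previous taxonomic level (routing key)
        'family': ['f1', 'f2', 'f1', 'f3', 'f4', 'f3'],  # current level to predict
    })

    experts = {}
    for group, sub in df.groupby('order'):               # one expert per previous-level label
        if sub['family'].nunique() > 1:                   # only train where there is something to separate
            experts[group] = DecisionTreeClassifier().fit(sub[['feat_a', 'feat_b']], sub['family'])

    # At prediction time, the previous-level label routes each sample to its expert
    sample = df.iloc[[0]]
    print(experts[sample['order'].iloc[0]].predict(sample[['feat_a', 'feat_b']]))
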
+ + Sklearn LogisticRegression : https://github.com/zermelozf/esn-lm/blob/master/esnlm/readouts/smoe.py + Keras/TF : https://abdulkaderhelwan.medium.com/mixture-of-experts-introduction-39f244a4ff05 + Keras/TF on article 2018 : https://github.com/drawbridge/keras-mmoe + Keras/TF 2018 : https://github.com/eminorhan/mixture-of-experts + Detailed example : https://mattgorb.github.io/moe + Detailed example : https://towardsdatascience.com/how-to-build-a-wide-and-deep-model-using-keras-in-tensorflow-2-0-2f7a236b5a4b + Keras example : https://keras.io/examples/nlp/text_classification_with_switch_transformer/ + Keras example : https://stackoverflow.com/questions/77551865/how-to-extend-keras-gpt2-model-moe-example + FastMoE PyTorch : https://fastmoe.ai/ + Tutel PyTorch : https://www.microsoft.com/en-us/research/blog/tutel-an-efficient-mixture-of-experts-implementation-for-large-dnn-model-training/ + """ + + def _split_dataset(self, ds, taxa, csv): + """ + Splits the dataset's taxa column into a collection of smaller datasets according to the previous taxonomic level labels + + Makes assumption that classes are order specific -> broad in csv columns + + Ray data GroupBy https://www.anyscale.com/blog/training-one-million-machine-learning-models-in-record-time-with-ray#approach-2:-using-ray-data-(grouping-data-by-key) + 1. GroupBy previous taxa + 2. Fx for model training (train_fx) + 3. ds.map_groups(train_fx) to exec the training of models in parallel + 4. Write results to file / save models + """ + ds_collection = {} + # cls = pd.read_csv(csv) + # prev_tax = list(cls.columns) + # prev_tax = prev_tax[prev_tax.index(taxa) + 1] + # unique_labs = cls[prev_tax].unique() + + + # for lab in unique_labs: + + # def map_split(ds): + # logging.getLogger("ray").info(ds[ds[prev_tax] == lab]) + # return ds[ds[prev_tax] == lab] + + # test = ds.map(map_split) + + # partial_ds = ds.map_batches(map_split, batch_format = 'pandas') + # file = '/home/nick/github/test' + # partial_ds.write_parquet(file) + # ds_collection[lab] = partial_ds + + # for k, v in ds_collection.items(): + # # print(v.to_pandas()) + # print(v) + """ + for lab in unique_labs: + ds_collection[lab] = [] + + for batch in ds.iter_batches(batch_format = 'pandas'): + labs_batch = batch[prev_tax].unique() + for lab in labs_batch: + ds_collection[lab].append(batch[batch[prev_tax] == lab]) + + for lab in unique_labs: + ds_collection[lab] = pd.concat(ds_collection[lab]) + """ + return ds_collection + + def _predictions_cv(self, predictions): + """ + Brings back together the predictions made by multiple models trained on subclasses of the original dataset + + If multiple sub-models classify a sample with same probability, use a soft voting logic to determine which one to classify to + + ---------- + Cross-validation + ---------- + * We know the classes from the previous taxa, can make each model CV on their subpart + * Metrics for CV overall per taxa ~k-fold strategy (mean / mode) + """ + + + def _predictions_classif(self, predictions): + """ + Brings back together the predictions made by multiple models trained on subclasses of the original dataset + + If multiple sub-models classify a sample with same probability, use a soft voting logic to determine which one to classify to + + ---------- + Classification + ---------- + * Since we know the previous taxa classified per sequence, we can run this specific model to classify at the current level + * See multi-stage classification + """ diff --git a/src/models/sklearn/binary_models.py 
b/src/models/sklearn/binary_models.py new file mode 100644 index 0000000..90d3679 --- /dev/null +++ b/src/models/sklearn/binary_models.py @@ -0,0 +1,216 @@ +import os +import ray +import warnings +import numpy as np +import pandas as pd + +# Preprocessing +from models.encoders.model_label_encoder import ModelLabelEncoder +from models.encoders.onesvm_label_encoder import OneClassSVMLabelEncoder +from models.preprocessors.tfidf_transformer import TensorTfIdfTransformer + +# Training +from ray.air.config import ScalingConfig +from sklearn.naive_bayes import MultinomialNB +from sklearn.linear_model import SGDClassifier +from models.sklearn.partial_trainer import SklearnPartialTrainer +from models.sklearn.scoring_one_svm import ScoringSGDOneClassSVM + +# Tuning +from ray.air.config import RunConfig + +# Predicting +from ray.train.batch_predictor import BatchPredictor +from models.sklearn.tensor_predictor import SklearnTensorPredictor +from models.sklearn.probability_predictor import SklearnTensorProbaPredictor + +# Parent class +from models.sklearn.models import SklearnModels + +TENSOR_COLUMN_NAME = '__value__' +LABELS_COLUMN_NAME = 'labels' + +__author__ = 'Nicolas de Montigny' + +__all__ = ['SklearnModel'] + +# Ignore warnings to have a more comprehensible output on stdout +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' +warnings.filterwarnings('ignore') + +class SklearnBinaryModels(SklearnModels): + """ + Class used to build, train and predict binary models using Ray with Scikit-learn backend + + ---------- + Attributes + ---------- + + clf_file : string + Path to a file containing the trained model for this object + + ---------- + Methods + ---------- + + preprocess : preprocess the data before training and splitting the original dataset in case of cross-validation + + train : train a model using the given datasets + + predict : predict the classes of a dataset + ds : ray.data.Dataset + Dataset containing K-mers profiles of sequences to be classified + + threshold : float + Minimum percentage of probability to effectively classify. + Sequences will be classified as 'unknown' if the probability is under this threshold. 
+ Defaults to 80% + """ + def __init__( + self, + classifier, + outdir_model, + batch_size, + training_epochs, + taxa, + kmers_list, + csv + ): + super().__init__( + classifier, + outdir_model, + batch_size, + training_epochs, + taxa, + kmers_list, + csv + ) + + def preprocess(self, ds, scaling = False, scaler_file = None): + print('preprocess') + if self.classifier == 'onesvm': + self._encoder = OneClassSVMLabelEncoder(self.taxa) + self._encoded = np.array([1,-1], dtype = np.int32) + labels = np.array(['Bacteria', 'Unknown'], dtype = object) + self._encoder.fit(ds) + else: + self._encoder = ModelLabelEncoder(self.taxa) + self._encoder.fit(ds) + labels = list(self._encoder.stats_[f'unique_values({self.taxa})'].keys()) + self._encoded = np.arange(len(labels)) + labels = np.append(labels, 'Unknown') + self._encoded = np.append(self._encoded, -1) + self._weights = self._compute_weights() + + if scaling: + self._scaler = TensorTfIdfTransformer(self.kmers, scaler_file) + self._scaler.fit(ds) + + # Labels mapping + for (label, encoded) in zip(labels, self._encoded): + self._labels_map[label] = encoded + + def _build(self): + print('_build') + if self.classifier == 'onesvm': + print('Training bacterial extractor with One Class SVM') + self._clf = ScoringSGDOneClassSVM() + self._train_params = { + 'nu' : 0.026441491, + 'learning_rate' : 'constant', + 'tol' : 1e-3, + 'eta0' : 0.001 + } + else : + print('Training bacterial / host classifier with SGD') + self._clf = SGDClassifier() + self._train_params = { + 'loss' : 'hinge', + 'penalty' : 'elasticnet', + 'alpha' : 141.6146176, + 'learning_rate' : 'adaptive', + 'class_weight' : self._weights, + 'eta0' : 0.001, + 'n_jobs' : -1 + } + + def fit(self, datasets): + print('_fit_model') + # Define model + self._build() + for name, ds in datasets.items(): + ds = ds.drop_columns(['id']) + ds = self._encoder.transform(ds) + if self._scaler is not None: + ds = self._scaler.transform(ds) + # Trigger the preprocessing computations before ingest in trainer + # Otherwise, it would be executed at each epoch + ds = ds.materialize() + datasets[name] = ray.put(ds) + + try: + training_labels = self._encoded.copy() + training_labels = np.delete(training_labels, np.where(training_labels == -1)) + except: + pass + + # Define trainer + self._trainer = SklearnPartialTrainer( + estimator=self._clf, + labels_list=training_labels, + features_list=self.kmers, + params=self._train_params, + datasets=datasets, + batch_size=self.batch_size, + training_epochs=self._training_epochs, + set_estimator_cpus=True, + scaling_config=ScalingConfig( + trainer_resources={ + 'CPU': int(os.cpu_count()*0.6) + } + ), + run_config=RunConfig( + name=self.classifier, + local_dir=self._workdir + ), + ) + + # Training execution + training_result = self._trainer.fit() + self._model_ckpt = training_result.checkpoint + + def predict(self, ds): + print('predict') + if ds.count() > 0: + if self._scaler is not None: + ds = self._scaler.transform(ds) + ds = ds.materialize() + predict_kwargs = {'features':self.kmers, 'num_estimator_cpus':-1} + self._predictor = BatchPredictor.from_checkpoint(self._model_ckpt, SklearnTensorPredictor) + predictions = self._predictor.predict(ds, batch_size = self.batch_size, feature_columns = [TENSOR_COLUMN_NAME], **predict_kwargs) + predictions = np.array(predictions.to_pandas()).reshape(-1) + return self._label_decode(predictions) + else: + raise ValueError('No data to predict') + + def predict_proba(self, ds, threshold = 0.8): + print('predict_proba') + return self.predict(ds) 
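+    # The batched training wrapped by SklearnPartialTrainer above can be pictured, very
+    # roughly, as repeated calls to partial_fit; a hedged, Ray-free sketch, where
+    # `batch_iter` and `classes` are illustrative placeholders rather than project APIs:
+    #
+    #     from sklearn.linear_model import SGDClassifier
+    #
+    #     def incremental_sgd(batch_iter, classes, epochs=100):
+    #         # batch_iter: callable returning a fresh iterable of (X, y) numpy batches per epoch
+    #         clf = SGDClassifier(loss='hinge', penalty='elasticnet',
+    #                             learning_rate='adaptive', eta0=0.001)
+    #         first = True
+    #         for _ in range(epochs):
+    #             for X, y in batch_iter():
+    #                 if first:
+    #                     clf.partial_fit(X, y, classes=classes)  # classes needed on the first call only
+    #                     first = False
+    #                 else:
+    #                     clf.partial_fit(X, y)
+    #         return clf
+    #
+    # The hyperparameters echo some of the SGD settings used above (alpha and class
+    # weights omitted for brevity); whether they suit a given k-mer profile is left to tuning.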
+ + def _get_threshold_pred(self, predict, nb_cls, threshold): + print('_get_threshold_pred') + def map_predicted_label(ds : pd.DataFrame): + predict = pd.DataFrame({ + 'best_proba': [max(ds.iloc[i].values) for i in range(len(ds))], + 'predicted_label': [np.argmax(ds.iloc[i].values) for i in range(len(ds))] + }) + predict.loc[predict['best_proba'] < threshold, 'predicted_label'] = -1 + return pd.DataFrame(predict['predicted_label']) + + if nb_cls == 1: + predict = np.round(abs(np.concatenate(predict.to_pandas()['predictions']))) + else: + predict = predict.map_batches(map_predicted_label, batch_format = 'pandas') + predict = np.ravel(np.array(predict.to_pandas())) + + return predict \ No newline at end of file diff --git a/src/models/sklearn/models.py b/src/models/sklearn/models.py index 1be9a15..eb379b0 100644 --- a/src/models/sklearn/models.py +++ b/src/models/sklearn/models.py @@ -1,27 +1,11 @@ import os -import ray import warnings + import numpy as np import pandas as pd -# Preprocessing -from models.encoders.model_label_encoder import ModelLabelEncoder -from models.encoders.onesvm_label_encoder import OneClassSVMLabelEncoder -from models.preprocessors.tfidf_transformer import TensorTfIdfTransformer - -# Training -from ray.air.config import ScalingConfig -from sklearn.naive_bayes import MultinomialNB -from sklearn.linear_model import SGDClassifier -from models.sklearn.partial_trainer import SklearnPartialTrainer -from models.sklearn.scoring_one_svm import ScoringSGDOneClassSVM - -# Tuning -from ray.air.config import RunConfig - -# Predicting -from ray.train.batch_predictor import BatchPredictor -from models.sklearn.tensor_predictor import SklearnTensorPredictor +# Class construction +from abc import ABC, abstractmethod # Parent class from models.models_utils import ModelsUtils @@ -37,7 +21,7 @@ os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' warnings.filterwarnings('ignore') -class SklearnModel(ModelsUtils): +class SklearnModels(ModelsUtils, ABC): """ Class used to build, train and predict models using Ray with Scikit-learn backend @@ -64,7 +48,6 @@ class SklearnModel(ModelsUtils): Minimum percentage of probability to effectively classify. Sequences will be classified as 'unknown' if the probability is under this threshold. 
Defaults to 80% - """ def __init__( self, @@ -86,155 +69,40 @@ def __init__( csv ) - def preprocess(self, ds, scaling = False, scaler_file = None): - print('preprocess') - if self.classifier == 'onesvm': - self._encoder = OneClassSVMLabelEncoder(self.taxa) - self._encoded = np.array([1,-1], dtype = np.int32) - labels = np.array(['Bacteria', 'Unknown'], dtype = object) - else: - self._encoder = ModelLabelEncoder(self.taxa) - - self._encoder.fit(ds) - - if scaling: - self._scaler = TensorTfIdfTransformer(self.kmers, scaler_file) - self._scaler.fit(ds) - - # Labels mapping - if self.classifier != 'onesvm': - labels = list(self._encoder.stats_[f'unique_values({self.taxa})'].keys()) - self._encoded = np.arange(len(labels)) - labels = np.append(labels, 'Unknown') - self._encoded = np.append(self._encoded, -1) - for (label, encoded) in zip(labels, self._encoded): - self._labels_map[label] = encoded - if self.classifier != 'onesvm': - self._compute_weights() + @abstractmethod + def preprocess(self): + """ + """ + @abstractmethod + def _build(self): + """ + """ + + @abstractmethod + def fit(self, datasets): + """ + """ + + @abstractmethod + def predict(self, ds): + """ + """ + + @abstractmethod + def predict_proba(self): + """ + """ + + @abstractmethod + def _get_threshold_pred(self): + """ + """ + def _label_decode(self, predict): print('_label_decode') decoded = pd.Series(np.empty(len(predict), dtype=object)) for label, encoded in self._labels_map.items(): decoded[predict == encoded] = label - return np.array(decoded) - - def _build(self): - print('_build') - if self.classifier == 'onesvm': - print('Training bacterial extractor with One Class SVM') - self._clf = ScoringSGDOneClassSVM() - self._train_params = { - 'nu' : 0.026441491, - 'learning_rate' : 'constant', - 'tol' : 1e-3, - 'eta0' : 0.001 - } - elif self.classifier == 'linearsvm': - print('Training bacterial / host classifier with SGD') - self._clf = SGDClassifier() - self._train_params = { - 'loss' : 'hinge', - 'penalty' : 'elasticnet', - 'alpha' : 141.6146176, - 'learning_rate' : 'adaptive', - 'class_weight' : self._weights, - 'eta0' : 0.001, - 'n_jobs' : -1 - } -# TODO: Test performances for classifiers, if need more accuracy -> sklearn.multiclass.OneVsRestClassifier for multiple binary problems - elif self.classifier == 'sgd': - print('Training multiclass SGD classifier') - self._clf = SGDClassifier() - self._train_params = { - 'alpha' : 173.5667373, - 'learning_rate' : 'optimal', - 'loss': 'modified_huber', - 'penalty' : 'l2', - 'class_weight' : self._weights, - } - elif self.classifier == 'mnb': - print('Training multiclass Multinomial Naive Bayes classifier') - self._clf = MultinomialNB() - self._train_params = { - 'alpha' : 0.243340248, - 'fit_prior' : True - } - - def fit(self, datasets): - print('_fit_model') - # Define model - self._build() - for name, ds in datasets.items(): - ds = ds.drop_columns(['id']) - ds = self._encoder.transform(ds) - if self._scaler is not None: - ds = self._scaler.transform(ds) - # Trigger the preprocessing computations before ingest in trainer - # Otherwise, it would be executed at each epoch - ds = ds.materialize() - datasets[name] = ray.put(ds) - - try: - training_labels = self._encoded.copy() - training_labels = np.delete(training_labels, np.where(training_labels == -1)) - except: - pass - - # Define trainer - self._trainer = SklearnPartialTrainer( - estimator=self._clf, - labels_list=training_labels, - features_list=self.kmers, - params=self._train_params, - datasets=datasets, - 
batch_size=self.batch_size, - training_epochs=self._training_epochs, - set_estimator_cpus=True, - scaling_config=ScalingConfig( - trainer_resources={ - 'CPU': int(os.cpu_count()*0.6) - } - ), - run_config=RunConfig( - name=self.classifier, - local_dir=self._workdir - ), - ) - - # Training execution - training_result = self._trainer.fit() - self._model_ckpt = training_result.checkpoint - - def predict(self, ds, threshold = 0.8): - print('predict') - if ds.count() > 0: - if self._scaler is not None: - ds = self._scaler.transform(ds) - ds = ds.materialize() - predict_kwargs = {'features':self.kmers, 'num_estimator_cpus':-1} - self._predictor = BatchPredictor.from_checkpoint(self._model_ckpt, SklearnTensorPredictor) - predictions = self._predictor.predict(ds, batch_size = self.batch_size, feature_columns = [TENSOR_COLUMN_NAME], **predict_kwargs) - predictions = np.array(predictions.to_pandas()).reshape(-1) - return self._label_decode(predictions) - else: - raise ValueError('No data to predict') - - def _prob_2_cls(self, predict, nb_cls, threshold): - print('_prob_2_cls') - def map_predicted_label(ds : pd.DataFrame): - predict = pd.DataFrame({ - 'best_proba': [max(ds.iloc[i].values) for i in range(len(ds))], - 'predicted_label': [np.argmax(ds.iloc[i].values) for i in range(len(ds))] - }) - predict.loc[predict['best_proba'] < threshold, 'predicted_label'] = -1 - return pd.DataFrame(predict['predicted_label']) - - if nb_cls == 1: - predict = np.round(abs(np.concatenate(predict.to_pandas()['predictions']))) - else: - predict = predict.map_batches(map_predicted_label, batch_format = 'pandas') - predict = np.ravel(np.array(predict.to_pandas())) - - return predict \ No newline at end of file + return np.array(decoded) \ No newline at end of file diff --git a/src/models/sklearn/multiclass_models.py b/src/models/sklearn/multiclass_models.py new file mode 100644 index 0000000..59926f8 --- /dev/null +++ b/src/models/sklearn/multiclass_models.py @@ -0,0 +1,240 @@ +import os +import ray +import warnings +import numpy as np +import pandas as pd + +# Preprocessing +from models.encoders.model_label_encoder import ModelLabelEncoder +from models.encoders.onesvm_label_encoder import OneClassSVMLabelEncoder +from models.preprocessors.tfidf_transformer import TensorTfIdfTransformer + +# Training +from ray.air.config import ScalingConfig +from sklearn.naive_bayes import MultinomialNB +from sklearn.linear_model import SGDClassifier +from models.sklearn.partial_trainer import SklearnPartialTrainer +from models.sklearn.scoring_one_svm import ScoringSGDOneClassSVM + +# Tuning +from ray.air.config import RunConfig + +# Predicting +from ray.train.batch_predictor import BatchPredictor +from models.sklearn.tensor_predictor import SklearnTensorPredictor +from models.sklearn.probability_predictor import SklearnTensorProbaPredictor + +# Parent classes +from models.sklearn.models import SklearnModels +from models.multiclass_utils import MulticlassUtils + +TENSOR_COLUMN_NAME = '__value__' +LABELS_COLUMN_NAME = 'labels' + +__author__ = 'Nicolas de Montigny' + +__all__ = ['SklearnModel'] + +# Ignore warnings to have a more comprehensible output on stdout +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' +warnings.filterwarnings('ignore') + +class SklearnMulticlassModels(SklearnModels, MulticlassUtils): + """ + Class used to build, train and predict multiclass models using Ray with Scikit-learn backend + + ---------- + Attributes + ---------- + + clf_file : string + Path to a file containing the trained model for this object + + 
---------- + Methods + ---------- + + preprocess : preprocess the data before training and splitting the original dataset in case of cross-validation + + train : train a model using the given datasets + + predict : predict the classes of a dataset + ds : ray.data.Dataset + Dataset containing K-mers profiles of sequences to be classified + + threshold : float + Minimum percentage of probability to effectively classify. + Sequences will be classified as 'unknown' if the probability is under this threshold. + Defaults to 80% + """ + def __init__( + self, + classifier, + outdir_model, + batch_size, + training_epochs, + taxa, + kmers_list, + csv + ): + super().__init__( + classifier, + outdir_model, + batch_size, + training_epochs, + taxa, + kmers_list, + csv + ) + self._training_collection = {} + self._encoder = {} + self._trainer = {} + self._model_ckpt = {} + self._predictor = {} + + def preprocess(self, ds, scaling = False, scaler_file = None): + print('preprocess') + + if scaling: + self._scaler = TensorTfIdfTransformer(self.kmers, scaler_file) + self._scaler.fit(ds) + + self._training_collection = self._split_dataset(ds, self.taxa, self._csv) + + for prev_taxa, ds in self._training_collection.items(): + self._encoder[prev_taxa] = ModelLabelEncoder(self.taxa) + self._encoder[prev_taxa].fit(ds) + + # Labels mapping + labels = list(self._encoder[prev_taxa].stats_[f'unique_values({self.taxa})'].keys()) + encoded = np.arange(len(labels)) + labels = np.append(labels, 'Unknown') + encoded = np.append(encoded, -1) + + self._labels_map[prev_taxa] = {} + for (label, encode) in zip(labels, encoded): + self._labels_map[prev_taxa][label] = encode + + # self._weights[prev_taxa] = self._compute_weights() + + def _build(self): + print('_build') +# TODO: Test performances for classifiers, if need more accuracy -> sklearn.multiclass.OneVsRestClassifier for multiple binary problems + # if self.classifier == 'sgd': + print('Training multiclass SGD classifier') + self._clf = SGDClassifier() + self._train_params = { + 'alpha' : 173.5667373, + 'learning_rate' : 'optimal', + 'loss': 'modified_huber', + 'penalty' : 'l2', + # 'class_weight' : self._weights, + } + # elif self.classifier == 'mnb': + # print('Training multiclass Multinomial Naive Bayes classifier') + # self._clf = MultinomialNB() + # self._train_params = { + # 'alpha' : 0.243340248, + # 'fit_prior' : True + # } + + def fit(self, datasets): + print('_fit_model') + # Define model + self._build() + training_result = {} + for prev_taxa, ds in self._training_collection.items(): + ds = ds.drop_columns(['id']) + ds = self._encoder.transform(ds) + if self._scaler is not None: + ds = self._scaler.transform(ds) + # Trigger the preprocessing computations before ingest in trainer + # Otherwise, it would be executed at each epoch + ds = ds.materialize() + datasets['train'] = ray.put(ds) + + try: + training_labels = list(self._labels_map[prev_taxa].values()) + training_labels = np.delete(training_labels, np.where(training_labels == -1)) + except: + pass + + # Define trainer + self._trainer[prev_taxa] = SklearnPartialTrainer( + estimator=self._clf, + labels_list=training_labels, + features_list=self.kmers, + params=self._train_params, + datasets=datasets, + batch_size=self.batch_size, + training_epochs=self._training_epochs, + set_estimator_cpus=True, + scaling_config=ScalingConfig( + trainer_resources={ + 'CPU': int(os.cpu_count()*0.6) + } + ), + run_config=RunConfig( + name=self.classifier, + local_dir=self._workdir + ), + ) + + # Training execution + 
training_result[prev_taxa] = self._trainer.fit() + self._model_ckpt[prev_taxa] = training_result[prev_taxa].checkpoint + + def predict(self, ds): + print('predict') + if ds.count() > 0: + if self._scaler is not None: + ds = self._scaler.transform(ds) + + ds = ds.materialize() + predict_kwargs = {'features':self.kmers, 'num_estimator_cpus':-1} + + for prev_taxa, ckpt in self._model_ckpt.items(): + self._predictor[prev_taxa] = BatchPredictor.from_checkpoint(ckpt, SklearnTensorProbaPredictor) + predictions = self._predictor[prev_taxa].predict(ds, batch_size = self.batch_size, feature_columns = [TENSOR_COLUMN_NAME], **predict_kwargs) + predictions = self._predictions_grouping(predictions) + return self._label_decode(predictions) + else: + raise ValueError('No data to predict') + + def predict_proba(self, ds, threshold = 0.8): + print('predict_proba') + print('predict') + if ds.count() > 0: + if self._scaler is not None: + ds = self._scaler.transform(ds) + ds = ds.materialize() + predict_kwargs = {'features':self.kmers, 'num_estimator_cpus':-1} + self._predictor = BatchPredictor.from_checkpoint(self._model_ckpt, SklearnTensorProbaPredictor) + predictions = self._predictor.predict(ds, batch_size = self.batch_size, feature_columns = [TENSOR_COLUMN_NAME], **predict_kwargs) + predictions = np.array(predictions.to_pandas()).reshape(-1) + return self._label_decode(predictions) + else: + raise ValueError('No data to predict') + + def _get_threshold_pred(self, predict, threshold): + print('_get_threshold_pred') + def map_predicted_label(ds : pd.DataFrame): + predict = pd.DataFrame({ + 'best_proba': [max(ds.iloc[i].values) for i in range(len(ds))], + 'predicted_label': [np.argmax(ds.iloc[i].values) for i in range(len(ds))] + }) + predict.loc[predict['best_proba'] < threshold, 'predicted_label'] = -1 + return pd.DataFrame(predict['predicted_label']) + + predict = predict.map_batches(map_predicted_label, batch_format = 'pandas') + predict = np.ravel(np.array(predict.to_pandas())) + + return predict + + def _label_decode(self, predict): + print('_label_decode') + decoded = pd.Series(np.empty(len(predict), dtype=object)) + for label, encoded in self._labels_map.items(): + decoded[predict == encoded] = label + + return np.array(decoded) \ No newline at end of file From 3b2e1506f21729a8c36f1c9d15e56d8502a9ce59 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Sat, 2 Dec 2023 19:23:51 -0500 Subject: [PATCH 52/92] sklearn multiclass mini-models + bagging strategy --- src/models/models_utils.py | 3 + src/models/multiclass_utils.py | 78 ++++---- src/models/sklearn/models.py | 5 - src/models/sklearn/multiclass_models.py | 241 +++++++++++++----------- 4 files changed, 166 insertions(+), 161 deletions(-) diff --git a/src/models/models_utils.py b/src/models/models_utils.py index c665f0e..55c1d12 100644 --- a/src/models/models_utils.py +++ b/src/models/models_utils.py @@ -73,6 +73,7 @@ def __init__( self._nb_kmers = len(kmers_list) self._training_epochs = training_epochs # Initialize empty + # TODO: remove the variable that are not required to be kept throughout the classes self._clf = None self._weights = {} self._scaler = None @@ -87,6 +88,8 @@ def __init__( self._preprocessor = None self._workdir = outdir_model + + @abstractmethod def preprocess(self, ds): """ diff --git a/src/models/multiclass_utils.py b/src/models/multiclass_utils.py index ad07191..d0db7f6 100644 --- a/src/models/multiclass_utils.py +++ b/src/models/multiclass_utils.py @@ -18,7 +18,18 @@ class MulticlassUtils(ABC): """ Abstract class to 
provide utilities for multiclass classification models. + These methods are meant to be used when decomposing data into taxonomic groups before training one model per group + + ----------------------- + Ray data GroupBy + ----------------------- + https://www.anyscale.com/blog/training-one-million-machine-learning-models-in-record-time-with-ray#approach-2:-using-ray-data-(grouping-data-by-key) + 1. GroupBy previous taxa + 2. Fx for model training (train_fx) + 3. ds.map_groups(train_fx) to exec the training of models in parallel + 4. Write results to file / save models + ----------------------- Mixture-of-Experts (MoE) ----------------------- @@ -50,54 +61,36 @@ class MulticlassUtils(ABC): Tutel PyTorch : https://www.microsoft.com/en-us/research/blog/tutel-an-efficient-mixture-of-experts-implementation-for-large-dnn-model-training/ """ - def _split_dataset(self, ds, taxa, csv): + def _get_count_previous_taxa(self, taxa, csv): """ - Splits the dataset's taxa column into a collection of smaller datasets according to the previous taxonomic level labels - - Makes assumption that classes are order specific -> broad in csv columns + Fetch the previous taxa and computes the number of classes in it - Ray data GroupBy https://www.anyscale.com/blog/training-one-million-machine-learning-models-in-record-time-with-ray#approach-2:-using-ray-data-(grouping-data-by-key) - 1. GroupBy previous taxa - 2. Fx for model training (train_fx) - 3. ds.map_groups(train_fx) to exec the training of models in parallel - 4. Write results to file / save models + Makes assumption that classes are ordered ``specific -> broad`` in csv columns + + Used to determine if the dataset should be splitted according to the previous taxonomic level labels """ - ds_collection = {} - # cls = pd.read_csv(csv) - # prev_tax = list(cls.columns) - # prev_tax = prev_tax[prev_tax.index(taxa) + 1] - # unique_labs = cls[prev_tax].unique() - - - # for lab in unique_labs: - - # def map_split(ds): - # logging.getLogger("ray").info(ds[ds[prev_tax] == lab]) - # return ds[ds[prev_tax] == lab] + prev_taxa = None + cls = pd.read_csv(csv) + cols = list(cls.columns) + prev_taxa = cols[cols.index(taxa) + 1] - # test = ds.map(map_split) + return prev_taxa, len(cls[prev_taxa].unique()) - # partial_ds = ds.map_batches(map_split, batch_format = 'pandas') - # file = '/home/nick/github/test' - # partial_ds.write_parquet(file) - # ds_collection[lab] = partial_ds - - # for k, v in ds_collection.items(): - # # print(v.to_pandas()) - # print(v) + def _prev_taxa_split_dataset(self, ds, prev_taxa): """ - for lab in unique_labs: - ds_collection[lab] = [] - - for batch in ds.iter_batches(batch_format = 'pandas'): - labs_batch = batch[prev_tax].unique() - for lab in labs_batch: - ds_collection[lab].append(batch[batch[prev_tax] == lab]) - - for lab in unique_labs: - ds_collection[lab] = pd.concat(ds_collection[lab]) + Splits the dataset's taxa column into a collection of smaller datasets according to the previous taxonomic level labels """ - return ds_collection + return ds.groupby(prev_taxa) + + def _random_split_dataset(self, ds): + """ + Assigns random numbers to a new column and group samples by it to form a collection of smaller random datasets + + Used when there is not enough labels in previous taxa for splitting according to the previous taxonomic level labels + """ + nb_clusters = int(ds.count() / 10) + ds = ds.repartition(nb_clusters).add_column('cluster', lambda df: df.index % nb_clusters) + return ds.groupby('cluster') def _predictions_cv(self, predictions): 
""" @@ -110,8 +103,8 @@ def _predictions_cv(self, predictions): ---------- * We know the classes from the previous taxa, can make each model CV on their subpart * Metrics for CV overall per taxa ~k-fold strategy (mean / mode) + TODO : WRITE THE CONCATENATION METHODS AND TEST ALL STAGES \W ds.GroupBy() """ - def _predictions_classif(self, predictions): """ @@ -124,4 +117,5 @@ def _predictions_classif(self, predictions): ---------- * Since we know the previous taxa classified per sequence, we can run this specific model to classify at the current level * See multi-stage classification + TODO : WRITE THE CONCATENATION METHODS AND TEST ALL STAGES \W ds.GroupBy() """ diff --git a/src/models/sklearn/models.py b/src/models/sklearn/models.py index eb379b0..386f684 100644 --- a/src/models/sklearn/models.py +++ b/src/models/sklearn/models.py @@ -74,11 +74,6 @@ def preprocess(self): """ """ - @abstractmethod - def _build(self): - """ - """ - @abstractmethod def fit(self, datasets): """ diff --git a/src/models/sklearn/multiclass_models.py b/src/models/sklearn/multiclass_models.py index 59926f8..7a7e634 100644 --- a/src/models/sklearn/multiclass_models.py +++ b/src/models/sklearn/multiclass_models.py @@ -10,6 +10,7 @@ from models.preprocessors.tfidf_transformer import TensorTfIdfTransformer # Training +import ray.cloudpickle as cpickle from ray.air.config import ScalingConfig from sklearn.naive_bayes import MultinomialNB from sklearn.linear_model import SGDClassifier @@ -28,6 +29,9 @@ from models.sklearn.models import SklearnModels from models.multiclass_utils import MulticlassUtils +# Data +from ray.air.util.data_batch_conversion import _unwrap_ndarray_object_type_if_needed + TENSOR_COLUMN_NAME = '__value__' LABELS_COLUMN_NAME = 'labels' @@ -99,137 +103,146 @@ def preprocess(self, ds, scaling = False, scaler_file = None): self._scaler = TensorTfIdfTransformer(self.kmers, scaler_file) self._scaler.fit(ds) - self._training_collection = self._split_dataset(ds, self.taxa, self._csv) + self._encoder = ModelLabelEncoder(self.taxa) + self._encoder.fit(ds) + + # Labels mapping + labels = list(self._encoder.stats_[f'unique_values({self.taxa})'].keys()) + encoded = np.arange(len(labels)) + labels = np.append(labels, 'Unknown') + encoded = np.append(encoded, -1) - for prev_taxa, ds in self._training_collection.items(): - self._encoder[prev_taxa] = ModelLabelEncoder(self.taxa) - self._encoder[prev_taxa].fit(ds) - - # Labels mapping - labels = list(self._encoder[prev_taxa].stats_[f'unique_values({self.taxa})'].keys()) - encoded = np.arange(len(labels)) - labels = np.append(labels, 'Unknown') - encoded = np.append(encoded, -1) - - self._labels_map[prev_taxa] = {} - for (label, encode) in zip(labels, encoded): - self._labels_map[prev_taxa][label] = encode - - # self._weights[prev_taxa] = self._compute_weights() - - def _build(self): - print('_build') -# TODO: Test performances for classifiers, if need more accuracy -> sklearn.multiclass.OneVsRestClassifier for multiple binary problems - # if self.classifier == 'sgd': - print('Training multiclass SGD classifier') - self._clf = SGDClassifier() - self._train_params = { - 'alpha' : 173.5667373, - 'learning_rate' : 'optimal', - 'loss': 'modified_huber', - 'penalty' : 'l2', - # 'class_weight' : self._weights, - } - # elif self.classifier == 'mnb': - # print('Training multiclass Multinomial Naive Bayes classifier') - # self._clf = MultinomialNB() - # self._train_params = { - # 'alpha' : 0.243340248, - # 'fit_prior' : True - # } + self._labels_map = {} + for (label, encode) 
in zip(labels, encoded): + self._labels_map[label] = encode + + # self._weights = self._compute_weights() def fit(self, datasets): - print('_fit_model') - # Define model - self._build() - training_result = {} - for prev_taxa, ds in self._training_collection.items(): - ds = ds.drop_columns(['id']) - ds = self._encoder.transform(ds) - if self._scaler is not None: - ds = self._scaler.transform(ds) - # Trigger the preprocessing computations before ingest in trainer - # Otherwise, it would be executed at each epoch - ds = ds.materialize() - datasets['train'] = ray.put(ds) - - try: - training_labels = list(self._labels_map[prev_taxa].values()) - training_labels = np.delete(training_labels, np.where(training_labels == -1)) - except: - pass - - # Define trainer - self._trainer[prev_taxa] = SklearnPartialTrainer( - estimator=self._clf, - labels_list=training_labels, - features_list=self.kmers, - params=self._train_params, - datasets=datasets, - batch_size=self.batch_size, - training_epochs=self._training_epochs, - set_estimator_cpus=True, - scaling_config=ScalingConfig( - trainer_resources={ - 'CPU': int(os.cpu_count()*0.6) - } - ), - run_config=RunConfig( - name=self.classifier, - local_dir=self._workdir - ), + print('fit') + # TODO: remove validation from datasets + # train / val on training ds, CV on test ds + ds = datasets['train'] + ds = ds.drop_columns(['id']) + ds = self._encoder.transform(ds) + if self._scaler is not None: + ds = self._scaler.transform(ds) + + # One sub-model per artificial cluster of samples + ds = self._random_split_dataset(ds) + # checkpointing directory + model_dir = os.path.join(self._workdir, self.classifier) + if not os.path.isdir(model_dir): + os.mkdir(model_dir) + + # Model-specific training functions + def build_fit_sgd(data): + X = data[TENSOR_COLUMN_NAME] + y = data[LABELS_COLUMN_NAME] + prev_label = data['cluster'][0] + model = SGDClassifier( + alpha = 173.5667373, + learning_rate = 'optimal', + loss = 'modified_huber', + penalty = 'l2', + # 'class_weight' : self._weights, + ) + model.fit(X, y) + + model_file = os.path.join(model_dir, f'{prev_label}.pkl') + + with open(model_file, "wb") as file: + cpickle.dump(model, file) + + return { + 'cluster' : [prev_label], + 'file' : [model_file] + } + + def build_fit_mnb(data): + X = data[TENSOR_COLUMN_NAME] + y = data[LABELS_COLUMN_NAME] + prev_label = data['cluster'][0] + model = SGDClassifier( + alpha = 173.5667373, + learning_rate = 'optimal', + loss = 'modified_huber', + penalty = 'l2', + # 'class_weight' : self._weights, ) + model.fit(X, y) - # Training execution - training_result[prev_taxa] = self._trainer.fit() - self._model_ckpt[prev_taxa] = training_result[prev_taxa].checkpoint + model_file = os.path.join(model_dir, f'{prev_label}.pkl') + + with open(model_file, "wb") as file: + cpickle.dump(model, file) + + return { + 'cluster' : [prev_label], + 'file' : [model_file] + } + + if self.classifier == 'sgd': + print('Training multiclass SGD classifier') + training_result = ds.map_groups(build_fit_sgd, batch_format = 'numpy') + elif self.classifier == 'mnb': + print('Training multiclass Multinomial Naive Bayes classifier') + training_result = ds.map_groups(build_fit_mnb, batch_format = 'numpy') + + training_result = training_result.to_pandas().to_dict('records') + for record in training_result: + self._model_ckpt[record['cluster']] = record['file'] def predict(self, ds): print('predict') - if ds.count() > 0: - if self._scaler is not None: - ds = self._scaler.transform(ds) - - ds = ds.materialize() - predict_kwargs 
= {'features':self.kmers, 'num_estimator_cpus':-1} - - for prev_taxa, ckpt in self._model_ckpt.items(): - self._predictor[prev_taxa] = BatchPredictor.from_checkpoint(ckpt, SklearnTensorProbaPredictor) - predictions = self._predictor[prev_taxa].predict(ds, batch_size = self.batch_size, feature_columns = [TENSOR_COLUMN_NAME], **predict_kwargs) - predictions = self._predictions_grouping(predictions) - return self._label_decode(predictions) - else: - raise ValueError('No data to predict') + probabilities = self._predict_proba(ds) + predictions = np.argmax(probabilities, axis = 1) + predictions = self._label_decode(predictions) + return predictions def predict_proba(self, ds, threshold = 0.8): print('predict_proba') - print('predict') + probabilities = self._predict_proba(ds) + predictions = self._get_threshold_pred(probabilities, threshold) + return self._label_decode(predictions) + + def _predict_proba(self, ds): if ds.count() > 0: if self._scaler is not None: ds = self._scaler.transform(ds) - ds = ds.materialize() - predict_kwargs = {'features':self.kmers, 'num_estimator_cpus':-1} - self._predictor = BatchPredictor.from_checkpoint(self._model_ckpt, SklearnTensorProbaPredictor) - predictions = self._predictor.predict(ds, batch_size = self.batch_size, feature_columns = [TENSOR_COLUMN_NAME], **predict_kwargs) - predictions = np.array(predictions.to_pandas()).reshape(-1) - return self._label_decode(predictions) - else: - raise ValueError('No data to predict') + # ds = ds.materialize() + + def predict_func(data): + X = _unwrap_ndarray_object_type_if_needed(data[TENSOR_COLUMN_NAME]) + pred = np.zeros((len(X), len(self._labels_map))) + for cluster, model_file in self._model_ckpt.items(): + with open(model_file, 'rb') as file: + model = cpickle.load(file) + proba = model.predict_proba(X) + for i, cls in enumerate(model.classes_): + pred[:, cls] += proba[:, i] + pred = pred / len(self._model_ckpt) + return {'predictions' : pred} + + probabilities = ds.map_batches(predict_func, batch_format = 'numpy') + probabilities = _unwrap_ndarray_object_type_if_needed(probabilities.to_pandas()['predictions']) + + return probabilities def _get_threshold_pred(self, predict, threshold): print('_get_threshold_pred') - def map_predicted_label(ds : pd.DataFrame): - predict = pd.DataFrame({ - 'best_proba': [max(ds.iloc[i].values) for i in range(len(ds))], - 'predicted_label': [np.argmax(ds.iloc[i].values) for i in range(len(ds))] - }) - predict.loc[predict['best_proba'] < threshold, 'predicted_label'] = -1 - return pd.DataFrame(predict['predicted_label']) - - predict = predict.map_batches(map_predicted_label, batch_format = 'pandas') - predict = np.ravel(np.array(predict.to_pandas())) + proba_predict = { + 'best_proba' : [], + 'predicted_label' : [] + } + for line in predict: + proba_predict['best_proba'].append(line[np.argmax(line)]), + proba_predict['predicted_label'].append(np.argmax(line)) + + proba_predict = pd.DataFrame(proba_predict) + proba_predict.loc[proba_predict['best_proba'] < threshold, 'predicted_label'] = -1 - return predict + return proba_predict['predicted_label'] def _label_decode(self, predict): print('_label_decode') @@ -237,4 +250,4 @@ def _label_decode(self, predict): for label, encoded in self._labels_map.items(): decoded[predict == encoded] = label - return np.array(decoded) \ No newline at end of file + return np.array(decoded) From e0809661e404b2d245f6add277b188059da1eba4 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Sat, 2 Dec 2023 19:40:54 -0500 Subject: [PATCH 53/92] debug ray 
cluster start --- src/utils.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/utils.py b/src/utils.py index 74ca704..b025361 100644 --- a/src/utils.py +++ b/src/utils.py @@ -71,12 +71,12 @@ def init_ray_cluster(workdir): nb_CPU = os.cpu_count() nb_GPU = len(list_physical_devices('GPU')) - try: - host_ip = os.environ['HOST_IP'] - except KeyError: - host_ip = '172.24.94.34' + # try: + # host_ip = os.environ['HOST_IP'] + # except KeyError: + # host_ip = '$(hostname -i)' - cmd = f'ray start --head --node-ip-address {host_ip} --port 34567 --num-cpus {nb_CPU} --num-gpus {nb_GPU} --temp-dir {workdir}' + cmd = f'ray start --head --node-ip-address $(hostname -i) --num-cpus {nb_CPU} --num-gpus {nb_GPU} --temp-dir {workdir}' os.system(cmd) ray.init() From fe0381e4b72b8532d2a6c7ec4d686e10a4d23372 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Tue, 5 Dec 2023 11:03:29 -0500 Subject: [PATCH 54/92] bigger batches for sub-models --- src/models/models_utils.py | 1 + src/models/multiclass_utils.py | 37 +++++-------------------- src/models/sklearn/multiclass_models.py | 6 ++-- 3 files changed, 12 insertions(+), 32 deletions(-) diff --git a/src/models/models_utils.py b/src/models/models_utils.py index 55c1d12..9931d55 100644 --- a/src/models/models_utils.py +++ b/src/models/models_utils.py @@ -135,4 +135,5 @@ def _compute_weights(self): for lab, encoded in self._labels_map.items(): if lab.lower() != 'unknown': weights[int(encoded)] = cls_weights[classes.index(lab)] + print(weights) return weights \ No newline at end of file diff --git a/src/models/multiclass_utils.py b/src/models/multiclass_utils.py index d0db7f6..f098ef3 100644 --- a/src/models/multiclass_utils.py +++ b/src/models/multiclass_utils.py @@ -6,6 +6,7 @@ # Class construction from abc import ABC, abstractmethod +from models.models_utils import ModelsUtils from ray.air.util.data_batch_conversion import _unwrap_ndarray_object_type_if_needed @@ -15,7 +16,7 @@ TENSOR_COLUMN_NAME = '__value__' -class MulticlassUtils(ABC): +class MulticlassUtils(ModelsUtils, ABC): """ Abstract class to provide utilities for multiclass classification models. 
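    The Ray Data group-by-key pattern this class builds on can be exercised end to end on a toy
    dataset, roughly as follows (toy column names and values; assumes a Ray 2.x installation with
    ray.data available):

        import numpy as np
        import ray

        # Toy dataset: 30 rows spread over 3 clusters (ray.init() is triggered on first use)
        ds = ray.data.from_items(
            [{'cluster': i % 3, 'x': float(i), 'label': i % 2} for i in range(30)]
        )

        def train_one_group(batch):
            # `batch` holds every row sharing one 'cluster' value (numpy batch format)
            cluster = int(batch['cluster'][0])
            # ... a sub-model would be fit and persisted on batch['x'], batch['label'] here ...
            return {'cluster': np.array([cluster]), 'n_rows': np.array([len(batch['x'])])}

        results = ds.groupby('cluster').map_groups(train_one_group, batch_format='numpy')
        print(results.to_pandas())

    Each group is handed to train_one_group as a whole, which is what allows one sub-model to be
    trained per previous-taxa label or per artificial cluster.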
@@ -76,10 +77,12 @@ def _get_count_previous_taxa(self, taxa, csv): return prev_taxa, len(cls[prev_taxa].unique()) - def _prev_taxa_split_dataset(self, ds, prev_taxa): + def _prev_taxa_split_dataset(self, ds, prev_taxa = None): """ Splits the dataset's taxa column into a collection of smaller datasets according to the previous taxonomic level labels """ + if prev_taxa is None: + prev_taxa, nb_classes = self._get_count_previous_taxa(self.taxa,self._csv) return ds.groupby(prev_taxa) def _random_split_dataset(self, ds): @@ -88,34 +91,8 @@ def _random_split_dataset(self, ds): Used when there is not enough labels in previous taxa for splitting according to the previous taxonomic level labels """ - nb_clusters = int(ds.count() / 10) + nb_clusters = int(ds.count() / 100) ds = ds.repartition(nb_clusters).add_column('cluster', lambda df: df.index % nb_clusters) return ds.groupby('cluster') - - def _predictions_cv(self, predictions): - """ - Brings back together the predictions made by multiple models trained on subclasses of the original dataset - - If multiple sub-models classify a sample with same probability, use a soft voting logic to determine which one to classify to - - ---------- - Cross-validation - ---------- - * We know the classes from the previous taxa, can make each model CV on their subpart - * Metrics for CV overall per taxa ~k-fold strategy (mean / mode) - TODO : WRITE THE CONCATENATION METHODS AND TEST ALL STAGES \W ds.GroupBy() - """ - def _predictions_classif(self, predictions): - """ - Brings back together the predictions made by multiple models trained on subclasses of the original dataset - - If multiple sub-models classify a sample with same probability, use a soft voting logic to determine which one to classify to - - ---------- - Classification - ---------- - * Since we know the previous taxa classified per sequence, we can run this specific model to classify at the current level - * See multi-stage classification - TODO : WRITE THE CONCATENATION METHODS AND TEST ALL STAGES \W ds.GroupBy() - """ + \ No newline at end of file diff --git a/src/models/sklearn/multiclass_models.py b/src/models/sklearn/multiclass_models.py index 7a7e634..e479f27 100644 --- a/src/models/sklearn/multiclass_models.py +++ b/src/models/sklearn/multiclass_models.py @@ -145,7 +145,7 @@ def build_fit_sgd(data): learning_rate = 'optimal', loss = 'modified_huber', penalty = 'l2', - # 'class_weight' : self._weights, + # class_weight = self._weights, ) model.fit(X, y) @@ -168,7 +168,7 @@ def build_fit_mnb(data): learning_rate = 'optimal', loss = 'modified_huber', penalty = 'l2', - # 'class_weight' : self._weights, + # class_weight = self._weights, ) model.fit(X, y) @@ -226,6 +226,8 @@ def predict_func(data): probabilities = ds.map_batches(predict_func, batch_format = 'numpy') probabilities = _unwrap_ndarray_object_type_if_needed(probabilities.to_pandas()['predictions']) + else: + raise ValueError('Empty dataset, cannot execute predictions!') return probabilities From 0197d6d03dcad587e4450ca35a56a852e857139b Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Tue, 5 Dec 2023 12:30:56 -0500 Subject: [PATCH 55/92] rectify ray cluster init --- src/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/utils.py b/src/utils.py index b025361..8c21dbd 100644 --- a/src/utils.py +++ b/src/utils.py @@ -76,10 +76,10 @@ def init_ray_cluster(workdir): # except KeyError: # host_ip = '$(hostname -i)' - cmd = f'ray start --head --node-ip-address $(hostname -i) --num-cpus {nb_CPU} --num-gpus 
{nb_GPU} --temp-dir {workdir}' + cmd = f'ray start --head --node-ip-address $(hostname -i) --num-cpus {nb_CPU} --num-gpus {nb_GPU} --temp-dir {workdir} --storage {workdir}' os.system(cmd) - ray.init() + ray.init(_temp_dir = str(workdir)) logging.getLogger("ray").setLevel(logging.WARNING) ray.data.DataContext.get_current().execution_options.verbose_progress = True # mem = virtual_memory().total From 1dbcd18998c3065a2348e67337e2138267aa6caa Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Tue, 5 Dec 2023 18:51:20 -0500 Subject: [PATCH 56/92] ray cluster init me + storage management --- src/utils.py | 52 +++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 43 insertions(+), 9 deletions(-) diff --git a/src/utils.py b/src/utils.py index 8c21dbd..89c74ca 100644 --- a/src/utils.py +++ b/src/utils.py @@ -1,5 +1,6 @@ import os import ray +import json import logging import numpy as np @@ -71,15 +72,48 @@ def init_ray_cluster(workdir): nb_CPU = os.cpu_count() nb_GPU = len(list_physical_devices('GPU')) - # try: - # host_ip = os.environ['HOST_IP'] - # except KeyError: - # host_ip = '$(hostname -i)' - - cmd = f'ray start --head --node-ip-address $(hostname -i) --num-cpus {nb_CPU} --num-gpus {nb_GPU} --temp-dir {workdir} --storage {workdir}' - os.system(cmd) - - ray.init(_temp_dir = str(workdir)) + mem = ray._private.utils.get_shared_memory_bytes() - 10 + + workdir='/home/nicdemon/ray/' + + if 'HOST_IP' in list(os.environ.keys()): + ray.init( + _node_ip_address = os.environ['HOST_IP'], + num_cpus = nb_CPU, + num_gpus = nb_GPU, + _temp_dir = str(workdir), + object_store_memory = mem, + _system_config={ + "object_spilling_config": json.dumps({ + "type": "filesystem", + "params": { + "directory_path": str(workdir) + }, + }) + }, + ) + # cmd = f"ray start --head --node-ip-address {os.environ['HOST_IP']} --num-cpus {nb_CPU} --num-gpus {nb_GPU} --temp-dir {workdir} --object-store-memory {mem}" + else: + ray.init( + num_cpus = nb_CPU, + num_gpus = nb_GPU, + _temp_dir = str(workdir), + object_store_memory = mem, + _system_config={ + "object_spilling_config": json.dumps({ + "type": "filesystem", + "params": { + "directory_path": str(workdir) + }, + }) + }, + ) + + # cmd = f"ray start --head --num-cpus {nb_CPU} --num-gpus {nb_GPU} --temp-dir {workdir} --object-store-memory {mem}" + + # os.system(cmd) + + # ray.init() logging.getLogger("ray").setLevel(logging.WARNING) ray.data.DataContext.get_current().execution_options.verbose_progress = True # mem = virtual_memory().total From e2cb50065a60a65ef00e80bbe07e53d6bc017a5e Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Wed, 6 Dec 2023 18:22:07 -0500 Subject: [PATCH 57/92] debug cluster + use weights for sklearn multiclass --- src/models/models_utils.py | 2 +- src/models/sklearn/multiclass_models.py | 11 ++++++----- src/utils.py | 2 -- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/src/models/models_utils.py b/src/models/models_utils.py index 9931d55..3f42d9e 100644 --- a/src/models/models_utils.py +++ b/src/models/models_utils.py @@ -135,5 +135,5 @@ def _compute_weights(self): for lab, encoded in self._labels_map.items(): if lab.lower() != 'unknown': weights[int(encoded)] = cls_weights[classes.index(lab)] - print(weights) + return weights \ No newline at end of file diff --git a/src/models/sklearn/multiclass_models.py b/src/models/sklearn/multiclass_models.py index e479f27..c9a393b 100644 --- a/src/models/sklearn/multiclass_models.py +++ b/src/models/sklearn/multiclass_models.py @@ -116,7 +116,7 @@ def preprocess(self, 
ds, scaling = False, scaler_file = None): for (label, encode) in zip(labels, encoded): self._labels_map[label] = encode - # self._weights = self._compute_weights() + self._weights = self._compute_weights() def fit(self, datasets): print('fit') @@ -145,7 +145,7 @@ def build_fit_sgd(data): learning_rate = 'optimal', loss = 'modified_huber', penalty = 'l2', - # class_weight = self._weights, + class_weight = self._weights, ) model.fit(X, y) @@ -168,7 +168,7 @@ def build_fit_mnb(data): learning_rate = 'optimal', loss = 'modified_huber', penalty = 'l2', - # class_weight = self._weights, + class_weight = self._weights, ) model.fit(X, y) @@ -221,15 +221,16 @@ def predict_func(data): proba = model.predict_proba(X) for i, cls in enumerate(model.classes_): pred[:, cls] += proba[:, i] - pred = pred / len(self._model_ckpt) + # pred = pred / len(self._model_ckpt) return {'predictions' : pred} probabilities = ds.map_batches(predict_func, batch_format = 'numpy') probabilities = _unwrap_ndarray_object_type_if_needed(probabilities.to_pandas()['predictions']) + + return probabilities else: raise ValueError('Empty dataset, cannot execute predictions!') - return probabilities def _get_threshold_pred(self, predict, threshold): print('_get_threshold_pred') diff --git a/src/utils.py b/src/utils.py index 89c74ca..3f1c5fe 100644 --- a/src/utils.py +++ b/src/utils.py @@ -74,8 +74,6 @@ def init_ray_cluster(workdir): mem = ray._private.utils.get_shared_memory_bytes() - 10 - workdir='/home/nicdemon/ray/' - if 'HOST_IP' in list(os.environ.keys()): ray.init( _node_ip_address = os.environ['HOST_IP'], From 2e35a0bdb476da6b659f440c0eac33cf40758c25 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Thu, 7 Dec 2023 17:47:25 -0500 Subject: [PATCH 58/92] rectify sgd classif --- src/models/multiclass_utils.py | 15 ++++++++++---- src/models/sklearn/multiclass_models.py | 26 +++++++++++-------------- 2 files changed, 22 insertions(+), 19 deletions(-) diff --git a/src/models/multiclass_utils.py b/src/models/multiclass_utils.py index f098ef3..dd6f362 100644 --- a/src/models/multiclass_utils.py +++ b/src/models/multiclass_utils.py @@ -3,6 +3,7 @@ import warnings import numpy as np import pandas as pd +import pyarrow as pa # Class construction from abc import ABC, abstractmethod @@ -91,8 +92,14 @@ def _random_split_dataset(self, ds): Used when there is not enough labels in previous taxa for splitting according to the previous taxonomic level labels """ + def map_clusters(batch): + clusters = np.arange(len(batch)) + batch['cluster'] = clusters + return batch + nb_clusters = int(ds.count() / 100) - ds = ds.repartition(nb_clusters).add_column('cluster', lambda df: df.index % nb_clusters) - return ds.groupby('cluster') - - \ No newline at end of file + + ds = ds.repartition(100) + ds = ds.map_batches(map_clusters, batch_size = nb_clusters, batch_format = 'pandas') + + return ds.groupby('cluster') \ No newline at end of file diff --git a/src/models/sklearn/multiclass_models.py b/src/models/sklearn/multiclass_models.py index c9a393b..1414675 100644 --- a/src/models/sklearn/multiclass_models.py +++ b/src/models/sklearn/multiclass_models.py @@ -128,10 +128,12 @@ def fit(self, datasets): if self._scaler is not None: ds = self._scaler.transform(ds) + # One sub-model per artificial cluster of samples ds = self._random_split_dataset(ds) + # checkpointing directory - model_dir = os.path.join(self._workdir, self.classifier) + model_dir = os.path.join(self._workdir, f'{self.classifier}_{self.taxa}') if not os.path.isdir(model_dir): 
os.mkdir(model_dir) @@ -139,9 +141,9 @@ def fit(self, datasets): def build_fit_sgd(data): X = data[TENSOR_COLUMN_NAME] y = data[LABELS_COLUMN_NAME] - prev_label = data['cluster'][0] + cluster = data['cluster'][0] model = SGDClassifier( - alpha = 173.5667373, + # alpha = 173.5667373, learning_rate = 'optimal', loss = 'modified_huber', penalty = 'l2', @@ -149,36 +151,30 @@ def build_fit_sgd(data): ) model.fit(X, y) - model_file = os.path.join(model_dir, f'{prev_label}.pkl') + model_file = os.path.join(model_dir, f'{cluster}.pkl') with open(model_file, "wb") as file: cpickle.dump(model, file) return { - 'cluster' : [prev_label], + 'cluster' : [cluster], 'file' : [model_file] } def build_fit_mnb(data): X = data[TENSOR_COLUMN_NAME] y = data[LABELS_COLUMN_NAME] - prev_label = data['cluster'][0] - model = SGDClassifier( - alpha = 173.5667373, - learning_rate = 'optimal', - loss = 'modified_huber', - penalty = 'l2', - class_weight = self._weights, - ) + cluster = data['cluster'][0] + model = MultinomialNB() model.fit(X, y) - model_file = os.path.join(model_dir, f'{prev_label}.pkl') + model_file = os.path.join(model_dir, f'{cluster}.pkl') with open(model_file, "wb") as file: cpickle.dump(model, file) return { - 'cluster' : [prev_label], + 'cluster' : [cluster], 'file' : [model_file] } From 5576aa89500bb386907bfc8457cdc89f87577de1 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Fri, 8 Dec 2023 15:41:59 -0500 Subject: [PATCH 59/92] sklearn calibrated classifier --- src/models/kerasTF/binary_models.py | 224 +++++++++++++++++++- src/models/kerasTF/models.py | 260 ++---------------------- src/models/kerasTF/multiclass_models.py | 218 +++++++++++++++++++- src/models/models_utils.py | 10 +- src/models/multiclass_utils.py | 33 +-- src/models/sklearn/binary_models.py | 68 ++++--- src/models/sklearn/models.py | 10 +- src/models/sklearn/multiclass_models.py | 94 ++++++--- src/utils.py | 10 +- 9 files changed, 578 insertions(+), 349 deletions(-) diff --git a/src/models/kerasTF/binary_models.py b/src/models/kerasTF/binary_models.py index bc79f35..603434b 100644 --- a/src/models/kerasTF/binary_models.py +++ b/src/models/kerasTF/binary_models.py @@ -19,6 +19,7 @@ from ray.air import session # from ray.air.integrations.keras import Callback from ray.air.config import ScalingConfig +from models.kerasTF.models import train_func, build_model from ray.air.integrations.keras import ReportCheckpointCallback from ray.train.tensorflow import TensorflowTrainer, TensorflowCheckpoint @@ -70,4 +71,225 @@ class KerasTFBinaryModels(KerasTFModels): Minimum percentage of probability to effectively classify. Sequences will be classified as 'unknown' if the probability is under this threshold. 
Defaults to 80% - """ \ No newline at end of file + """ + + def __init__( + self, + classifier, + outdir_model, + batch_size, + training_epochs, + taxa, + kmers_list, + csv + ): + super().__init__( + classifier, + outdir_model, + batch_size, + training_epochs, + taxa, + kmers_list, + csv + ) + # Parameters + # Initialize hidden + self._nb_CPU_data = int(os.cpu_count() * 0.2) + self._nb_CPU_training = int(os.cpu_count() - self._nb_CPU_data) + self._nb_GPU = len(tf.config.list_physical_devices('GPU')) + # Initialize empty + self._nb_classes = 2 + self._nb_CPU_per_worker = 0 + self._nb_GPU_per_worker = 0 + # Computing variables + if self._nb_GPU > 0: + self._use_gpu = True + self._n_workers = self._nb_GPU + self._nb_CPU_per_worker = int(self._nb_CPU_training / self._n_workers) + self._nb_GPU_per_worker = 1 + else: + self._use_gpu = False + self._n_workers = int(self._nb_CPU_training * 0.2) + self._nb_CPU_per_worker = int(int(self._nb_CPU_training * 0.8) / self._n_workers) + + if self.classifier == 'attention': + print('Training bacterial / host classifier based on Attention Weighted Neural Network') + elif self.classifier == 'lstm': + print('Training bacterial / host classifier based on Shallow LSTM Neural Network') + elif self.classifier == 'deeplstm': + print('Training bacterial / host classifier based on Deep LSTM Neural Network') + + # Data preprocessing + ######################################################################################################### + + def preprocess(self, ds, scaling = False, scaler_file = None): + print('preprocess') + # Labels encoding + self._encoder = ModelLabelEncoder(self.taxa) + self._encoder.fit(ds) + + # Labels mapping + labels = list(self._encoder.stats_[f'unique_values({self.taxa})'].keys()) + self._nb_classes = len(labels) + self._encoded = np.arange(len(labels)) + labels = np.append(labels, 'Unknown') + self._encoded = np.append(self._encoded, -1) + + for (label, encoded) in zip(labels, self._encoded): + self._labels_map[label] = encoded + + # Class weights + self._weights = self._compute_weights() + + # Scaling + if scaling: + self._scaler = TensorTfIdfTransformer(self.kmers, scaler_file) + self._scaler.fit(ds) + + # Model training + ######################################################################################################### + + def fit(self, datasets): + print('fit') + # Preprocessing loop + for name, ds in datasets.items(): + # ds = ds.drop_columns(['id']) + ds = self._encoder.transform(ds) + if self._scaler is not None: + ds = self._scaler.transform(ds) + ds = ds.materialize() + datasets[name] = ds + + # Training parameters + train_params = { + 'batch_size': self.batch_size, + 'epochs': self._training_epochs, + 'size': self._nb_kmers, + 'nb_cls': self._nb_classes, + 'model': self.classifier, + 'weights': self._weights + } + + # Define trainer / tuner + self._trainer = TensorflowTrainer( + train_loop_per_worker=train_func, + train_loop_config=train_params, + scaling_config=ScalingConfig( + trainer_resources={'CPU': self._nb_CPU_data}, + num_workers=self._n_workers, + use_gpu=self._use_gpu, + resources_per_worker={ + 'CPU': self._nb_CPU_per_worker, + 'GPU': self._nb_GPU_per_worker + } + ), + run_config=RunConfig( + name=self.classifier, + local_dir=self._workdir, + ), + datasets=datasets, + ) + + training_result = self._trainer.fit() + self._model_ckpt = training_result.best_checkpoints[0][0] + + # Model predicting + ######################################################################################################### + + def 
predict(self, ds): + print('predict') + # Predict with model + predictions = self._predict_proba(ds) + + # Convert predictions to labels + predictions = self._get_abs_pred(predictions) + + # Return decoded labels + return self._label_decode(predictions) + + def predict_proba(self, ds, threshold = 0.8): + print('predict_proba') + # Predict with model + predictions = self._predict_proba(ds) + + # Convert predictions to labels with threshold + predictions = self._get_threshold_pred(predictions, threshold) + + # Return decoded labels + return self._label_decode(predictions) + + def _predict_proba(self, ds): + if ds.count() > 0: + if len(ds.schema().names) > 1: + col_2_drop = [col for col in ds.schema().names if col != TENSOR_COLUMN_NAME] + ds = ds.drop_columns(col_2_drop) + + # Preprocess + if self._scaler is not None: + ds = self._scaler.transform(ds) + ds = ds.materialize() + + self._predictor = BatchPredictor.from_checkpoint( + self._model_ckpt, + TensorflowPredictor, + model_definition = lambda: build_model(self.classifier, self._nb_classes, self._nb_kmers) + ) + predictions = self._predictor.predict( + data = ds, + feature_columns = [TENSOR_COLUMN_NAME], + batch_size = self.batch_size, + num_cpus_per_worker = self._nb_CPU_per_worker, + num_gpus_per_worker = self._nb_GPU_per_worker + ) + return predictions + else: + raise ValueError('No data to predict') + + def _get_abs_pred(self, predictions): + print('_get_abs_pred') + def map_predicted_label(ds): + ds = np.ravel(ds['predictions']) + threshold = 0.5 + predict = pd.DataFrame({ + 'proba': ds, + 'predicted_label': np.full(len(ds), -1) + }) + predict.loc[predict['proba'] > threshold, 'predicted_label'] = 1 + predict.loc[predict['proba'] < threshold, 'predicted_label'] = 0 + return {'predictions' : predict['predicted_label'].to_numpy(dtype = np.int32)} + + predict = [] + predictions = predictions.map_batches( + lambda batch : map_predicted_label(batch), + batch_format = 'numpy', + batch_size = self.batch_size + ) + for row in predictions.iter_rows(): + predict.append(row['predictions']) + + return predict + + def _get_threshold_pred(self, predictions, threshold): + print('_get_threshold_pred') + def map_predicted_label(ds, threshold): + ds = np.ravel(ds['predictions']) + lower_threshold = 0.5 - (threshold * 0.5) + upper_threshold = 0.5 + (threshold * 0.5) + predict = pd.DataFrame({ + 'proba': ds, + 'predicted_label': np.full(len(ds), -1) + }) + predict.loc[predict['proba'] >= upper_threshold, 'predicted_label'] = 1 + predict.loc[predict['proba'] <= lower_threshold, 'predicted_label'] = 0 + return {'predictions' : predict['predicted_label'].to_numpy(dtype = np.int32)} + + predict = [] + predictions = predictions.map_batches( + lambda batch : map_predicted_label(batch, threshold), + batch_format = 'numpy', + batch_size = self.batch_size + ) + for row in predictions.iter_rows(): + predict.append(row['predictions']) + + return predict \ No newline at end of file diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index 989f934..ba1c3a4 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -43,7 +43,7 @@ os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' warnings.filterwarnings('ignore') -class KerasTFModels(ModelsUtils): +class KerasTFModels(ModelsUtils, ABC): """ Class used to build, train and predict models using Ray with Keras Tensorflow backend @@ -94,247 +94,31 @@ def __init__( kmers_list, csv ) - # Parameters - # Initialize hidden - self._nb_CPU_data = int(os.cpu_count() * 0.2) - self._nb_CPU_training = 
int(os.cpu_count() - self._nb_CPU_data) - self._nb_GPU = len(tf.config.list_physical_devices('GPU')) - # Initialize empty - self._nb_classes = None - self._nb_CPU_per_worker = 0 - self._nb_GPU_per_worker = 0 - # Computing variables - if self._nb_GPU > 0: - self._use_gpu = True - self._n_workers = self._nb_GPU - self._nb_CPU_per_worker = int(self._nb_CPU_training / self._n_workers) - self._nb_GPU_per_worker = 1 - else: - self._use_gpu = False - self._n_workers = int(self._nb_CPU_training * 0.2) - self._nb_CPU_per_worker = int(int(self._nb_CPU_training * 0.8) / self._n_workers) - - if self.classifier == 'attention': - print('Training bacterial / host classifier based on Attention Weighted Neural Network') - elif self.classifier == 'lstm': - print('Training bacterial / host classifier based on Shallow LSTM Neural Network') - elif self.classifier == 'deeplstm': - print('Training bacterial / host classifier based on Deep LSTM Neural Network') - elif self.classifier == 'lstm_attention': - print('Training multiclass classifier based on Deep Neural Network hybrid between LSTM and Attention') - elif self.classifier == 'cnn': - print('Training multiclass classifier based on CNN Neural Network') - elif self.classifier == 'widecnn': - print('Training multiclass classifier based on Wide CNN Network') - - def preprocess(self, ds, scaling = False, scaler_file = None): - print('preprocess') - self._encoder = ModelLabelEncoder(self.taxa) - if scaling: - self._scaler = TensorTfIdfTransformer(self.kmers, scaler_file) - self._scaler.fit(ds) - self._encoder.fit(ds) - labels = list(self._encoder.stats_[f'unique_values({self.taxa})'].keys()) - self._nb_classes = len(self._encoder.stats_[f'unique_values({self.taxa})']) - if self._nb_classes > 2 : - self._encoder = Chain( - self._encoder, - OneHotTensorEncoder(self.taxa) - ) - self._encoder.fit(ds) - self._encoded = np.arange(len(labels)) - labels = np.append(labels, 'Unknown') - self._encoded = np.append(self._encoded, -1) - for (label, encoded) in zip(labels, self._encoded): - self._labels_map[label] = encoded - self._compute_weights() - - def _label_decode(self, predict): - print('_label_decode') - decoded = pd.Series(np.empty(len(predict), dtype=object)) - for label, encoded in self._labels_map.items(): - decoded[predict == encoded] = label - - return np.array(decoded) - + @abstractmethod + def preprocess(self): + """ + """ + + @abstractmethod def fit(self, datasets): - print('fit') - # Preprocessing loop - for name, ds in datasets.items(): - ds = ds.drop_columns(['id']) - ds = self._encoder.transform(ds) - if self._scaler is not None: - ds = self._scaler.transform(ds) - # Trigger the preprocessing computations before ingest in trainer - # Otherwise, it would be executed at each epoch - ds = ds.materialize() - datasets[name] = ds - - # Training parameters - self._train_params = { - 'batch_size': self.batch_size, - 'epochs': self._training_epochs, - 'size': self._nb_kmers, - 'nb_cls': self._nb_classes, - 'model': self.classifier, - 'weights': self._weights - } - - # Define trainer / tuner - self._trainer = TensorflowTrainer( - train_loop_per_worker=train_func, - train_loop_config=self._train_params, - scaling_config=ScalingConfig( - trainer_resources={'CPU': self._nb_CPU_data}, - num_workers=self._n_workers, - use_gpu=self._use_gpu, - resources_per_worker={ - 'CPU': self._nb_CPU_per_worker, - 'GPU': self._nb_GPU_per_worker - } - ), - run_config=RunConfig( - name=self.classifier, - local_dir=self._workdir, - ), - datasets=datasets, - ) - - training_result = 
self._trainer.fit() - self._model_ckpt = training_result.best_checkpoints[0][0] + """ + """ + @abstractmethod def predict(self, ds): - print('predict') - # Predict with model - predictions = self._make_predictions(ds) - - # Convert predictions to labels for cross-validation of classification - predictions = self._get_abs_pred(predictions) - - # Return decoded labels - return self._label_decode(predictions) - - def predict_proba(self, ds, threshold = 0.8): - print('predict_proba') - # Predict with model - predictions = self._make_predictions(ds) - - # Convert predictions to labels with threshold for top-down classification - predictions = self._get_threshold_pred(predictions, threshold) - - # Return decoded labels - return self._label_decode(predictions) - - def _make_predictions(self, ds): - if ds.count() > 0: - if len(ds.schema().names) > 1: - col_2_drop = [col for col in ds.schema().names if col != TENSOR_COLUMN_NAME] - ds = ds.drop_columns(col_2_drop) - - # Preprocess - if self._scaler is not None: - ds = self._scaler.transform(ds) - ds = ds.materialize() - - self._predictor = BatchPredictor.from_checkpoint( - self._model_ckpt, - TensorflowPredictor, - model_definition = lambda: build_model(self.classifier, self._nb_classes, self._nb_kmers) - ) - predictions = self._predictor.predict( - data = ds, - feature_columns = [TENSOR_COLUMN_NAME], - batch_size = self.batch_size, - num_cpus_per_worker = self._nb_CPU_per_worker, - num_gpus_per_worker = self._nb_GPU_per_worker - ) - return predictions - else: - raise ValueError('No data to predict') - - def _get_abs_pred(self, predictions): - print('_get_abs_pred') - def map_predicted_label_binary(ds): - ds = np.ravel(ds['predictions']) - lower_threshold = 0.5 - upper_threshold = 0.5 - predict = pd.DataFrame({ - 'proba': ds, - 'predicted_label': np.full(len(ds), -1) - }) - predict.loc[predict['proba'] >= upper_threshold, 'predicted_label'] = 1 - predict.loc[predict['proba'] <= lower_threshold, 'predicted_label'] = 0 - return {'predictions' : predict['predicted_label'].to_numpy(dtype = np.int32)} - - def map_predicted_label_multiclass(ds): - ds = ds['predictions'] - pred = pd.DataFrame({ - 'best_proba': [np.max(arr) for arr in ds], - 'predicted_label' : [np.argmax(arr) for arr in ds] - }) - - return {'predictions' : pred['predicted_label'].to_numpy(dtype = np.int32)} - - if self._nb_classes > 2: - print('map_predicted_label_multiclass') - fn = map_predicted_label_multiclass - else: - print('map_predicted_label_binary') - fn = map_predicted_label_binary - - predict = [] - predictions = predictions.map_batches( - lambda batch : fn(batch), - batch_format = 'numpy', - batch_size = self.batch_size - ) - for row in predictions.iter_rows(): - predict.append(row['predictions']) - - return predict - - def _get_threshold_pred(self, predictions, threshold): - print('_get_threshold_pred') - def map_predicted_label_binary(ds, threshold): - ds = np.ravel(ds['predictions']) - lower_threshold = 0.5 - (threshold * 0.5) - upper_threshold = 0.5 + (threshold * 0.5) - predict = pd.DataFrame({ - 'proba': ds, - 'predicted_label': np.full(len(ds), -1) - }) - predict.loc[predict['proba'] >= upper_threshold, 'predicted_label'] = 1 - predict.loc[predict['proba'] <= lower_threshold, 'predicted_label'] = 0 - return {'predictions' : predict['predicted_label'].to_numpy(dtype = np.int32)} - - def map_predicted_label_multiclass(ds, threshold): - ds = ds['predictions'] - pred = pd.DataFrame({ - 'best_proba': [np.max(arr) for arr in ds], - 'predicted_label' : [np.argmax(arr) for arr 
in ds] - }) - pred.loc[pred['best_proba'] < threshold, 'predicted_label'] = -1 - - return {'predictions' : pred['predicted_label'].to_numpy(dtype = np.int32)} - - if self._nb_classes > 2: - print('map_predicted_label_multiclass') - fn = map_predicted_label_multiclass - else: - print('map_predicted_label_binary') - fn = map_predicted_label_binary - - predict = [] - predictions = predictions.map_batches( - lambda batch : fn(batch, threshold), - batch_format = 'numpy', - batch_size = self.batch_size - ) - for row in predictions.iter_rows(): - predict.append(row['predictions']) - - return predict + """ + """ + + @abstractmethod + def predict_proba(self): + """ + """ + + @abstractmethod + def _get_threshold_pred(self): + """ + """ # Training/building function outside of the class as mentioned on the Ray discussion diff --git a/src/models/kerasTF/multiclass_models.py b/src/models/kerasTF/multiclass_models.py index b422ff8..3b0902a 100644 --- a/src/models/kerasTF/multiclass_models.py +++ b/src/models/kerasTF/multiclass_models.py @@ -11,14 +11,16 @@ from models.preprocessors.tfidf_transformer import TensorTfIdfTransformer # Parent class / models -from models.models_utils import ModelsUtils +from models.kerasTF.models import KerasTFModels from models.kerasTF.build_neural_networks import * +from models.multiclass_utils import MulticlassUtils # Training import tensorflow as tf from ray.air import session # from ray.air.integrations.keras import Callback from ray.air.config import ScalingConfig +from models.kerasTF.models import train_func from ray.air.integrations.keras import ReportCheckpointCallback from ray.train.tensorflow import TensorflowTrainer, TensorflowCheckpoint @@ -40,7 +42,7 @@ os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' warnings.filterwarnings('ignore') -class KerasTFMulticlassModels(ModelsUtils): +class KerasTFMulticlassModels(KerasTFModels, MulticlassUtils): """ Class used to build, train and predict models using Ray with Keras Tensorflow backend @@ -70,4 +72,216 @@ class KerasTFMulticlassModels(ModelsUtils): Minimum percentage of probability to effectively classify. Sequences will be classified as 'unknown' if the probability is under this threshold. 
Defaults to 80% + """ + + def __init__( + self, + classifier, + outdir_model, + batch_size, + training_epochs, + taxa, + kmers_list, + csv + ): + super().__init__( + classifier, + outdir_model, + batch_size, + training_epochs, + taxa, + kmers_list, + csv + ) + # Parameters + # Initialize hidden + self._nb_CPU_data = int(os.cpu_count() * 0.2) + self._nb_CPU_training = int(os.cpu_count() - self._nb_CPU_data) + self._nb_GPU = len(tf.config.list_physical_devices('GPU')) + # Initialize empty + self._nb_classes = None + self._nb_CPU_per_worker = 0 + self._nb_GPU_per_worker = 0 + # Computing variables + if self._nb_GPU > 0: + self._use_gpu = True + self._n_workers = self._nb_GPU + self._nb_CPU_per_worker = int(self._nb_CPU_training / self._n_workers) + self._nb_GPU_per_worker = 1 + else: + self._use_gpu = False + self._n_workers = int(self._nb_CPU_training * 0.2) + self._nb_CPU_per_worker = int(int(self._nb_CPU_training * 0.8) / self._n_workers) + + # Data preprocessing + ######################################################################################################### + + def preprocess(self, ds, scaling = False, scaler_file = None): + print('preprocess') + # Labels encoding + self._encoder = Chain( + ModelLabelEncoder(self.taxa), + OneHotTensorEncoder(LABELS_COLUMN_NAME) + ) + self._encoder.fit(ds) + + # Labels mapping + labels = list(self._encoder.preprocessors[0].stats_[f'unique_values({self.taxa})'].keys()) + self._nb_classes = len(labels) + self._encoded = np.arange(len(labels)) + labels = np.append(labels, 'Unknown') + self._encoded = np.append(self._encoded, -1) + + for (label, encoded) in zip(labels, self._encoded): + self._labels_map[label] = encoded + + # Class weights + self._weights = self._compute_weights() + + # Scaling + if scaling: + self._scaler = TensorTfIdfTransformer(self.kmers, scaler_file) + self._scaler.fit(ds) + + # Models training + ######################################################################################################### + + def fit(self, datasets): + print('fit') + # Preprocessing loop + for name, ds in datasets.items(): + # ds = ds.drop_columns(['id']) + ds = self._encoder.transform(ds) + if self._scaler is not None: + ds = self._scaler.transform(ds) + ds = ds.materialize() + datasets[name] = ds + + # One sub-model per artificial cluster of samples + ds['train'] = self._random_split_dataset(ds['train']) + + # Checkpointing directory + model_dir = os.path.join(self._workdir, f'{self.classifier}_{self.taxa}') + if not os.path.isdir(model_dir): + os.mkdir(model_dir) + +# TODO: train_func per model +# TODO: Confirm how it works in Jupyter Notebook + # Distributed building & training + if self.classifier == 'lstm_attention': + print('Training multiclass classifier based on Deep Neural Network hybrid between LSTM and Attention') + training_result = ds.map_groups(build_fit_lstm_attention, batch_format = 'numpy') + elif self.classifier == 'cnn': + print('Training multiclass classifier based on CNN Neural Network') + training_result = ds.map_groups(build_fit_cnn, batch_format = 'numpy') + elif self.classifier == 'widecnn': + print('Training multiclass classifier based on Wide CNN Network') + training_result = ds.map_groups(build_fit_widecnn, batch_format = 'numpy') + + training_result = training_result.to_pandas().to_dict('records') + for record in training_result: + self._model_ckpt[record['cluster']] = record['file'] + + # Models predicting + ######################################################################################################### + + 
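The fit method above trains one sub-model per artificial cluster by grouping the Ray Dataset on the 'cluster' column and mapping a training function over each group, then keeps the resulting checkpoint paths in a dict. A minimal, self-contained sketch of that pattern follows; the column names ('features', 'labels'), the SGDClassifier choice and the temporary checkpoint directory are illustrative assumptions, not the project's exact code.

# Minimal sketch: one sub-model per cluster with Ray Data groupby/map_groups.
# Column names, model choice and paths are assumptions for illustration only.
import os, pickle, tempfile
import numpy as np
import ray
from sklearn.linear_model import SGDClassifier

ray.init(ignore_reinit_error=True)
model_dir = tempfile.mkdtemp()

rows = [{'features': np.random.rand(10),
         'labels': np.random.randint(0, 3),
         'cluster': i % 4} for i in range(200)]
ds = ray.data.from_items(rows)

def train_one_cluster(batch):
    # batch holds every row of a single cluster (numpy batch format)
    X = np.vstack(batch['features'])
    y = batch['labels']
    cluster = int(batch['cluster'][0])
    model = SGDClassifier(loss='modified_huber').fit(X, y)
    file = os.path.join(model_dir, f'{cluster}.pkl')
    with open(file, 'wb') as handle:
        pickle.dump(model, handle)
    # one record per cluster: where the sub-model checkpoint was written
    return {'cluster': [cluster], 'file': [file]}

results = ds.groupby('cluster').map_groups(train_one_cluster, batch_format='numpy')
model_ckpt = {r['cluster']: r['file'] for r in results.to_pandas().to_dict('records')}
print(model_ckpt)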
def predict(self, ds): + print('predict') + probabilities = self._predict_proba(ds) + predictions = np.argmax(probabilities, axis = 1) + predictions = self._label_decode(predictions) + return predictions + + def predict_proba(self, ds, threshold = 0.8): + print('predict_proba') + probabilities = self._predict_proba(ds) + predictions = self._get_threshold_pred(probabilities, threshold) + return self._label_decode(predictions) + +# TODO: Confirm how it works in Jupyter Notebook + def _predict_proba(self, ds): + print('_predict_proba') + if ds.count() > 0: + if self._scaler is not None: + ds = self._scaler.transform(ds) + # ds = ds.materialize() + + def predict_func(data): + X = _unwrap_ndarray_object_type_if_needed(data[TENSOR_COLUMN_NAME]) + pred = np.zeros((len(X), len(self._labels_map))) + for cluster, model_file in self._model_ckpt.items(): + with open(model_file, 'rb') as file: + model = cpickle.load(file) + proba = model.predict_proba(X) + for i, cls in enumerate(model.classes_): + pred[:, cls] += proba[:, i] + # pred = pred / len(self._model_ckpt) + return {'predictions' : pred} + + probabilities = ds.map_batches(predict_func, batch_format = 'numpy') + probabilities = _unwrap_ndarray_object_type_if_needed(probabilities.to_pandas()['predictions']) + + return probabilities + else: + raise ValueError('Empty dataset, cannot execute predictions!') + + def _get_abs_pred(self, predictions): + print('_get_abs_pred') + def map_predicted_label(ds): + ds = ds['predictions'] + pred = pd.DataFrame({ + 'best_proba': [np.max(arr) for arr in ds], + 'predicted_label' : [np.argmax(arr) for arr in ds] + }) + + return {'predictions' : pred['predicted_label'].to_numpy(dtype = np.int32)} + + predict = [] + predictions = predictions.map_batches( + lambda batch : map_predicted_label(batch), + batch_format = 'numpy', + batch_size = self.batch_size + ) + for row in predictions.iter_rows(): + predict.append(row['predictions']) + + return predict + + def _get_threshold_pred(self, predictions, threshold): + print('_get_threshold_pred') + def map_predicted_label(ds, threshold): + ds = ds['predictions'] + pred = pd.DataFrame({ + 'best_proba': [np.max(arr) for arr in ds], + 'predicted_label' : [np.argmax(arr) for arr in ds] + }) + pred.loc[pred['best_proba'] < threshold, 'predicted_label'] = -1 + + return {'predictions' : pred['predicted_label'].to_numpy(dtype = np.int32)} + + predict = [] + predictions = predictions.map_batches( + lambda batch : map_predicted_label(batch, threshold), + batch_format = 'numpy', + batch_size = self.batch_size + ) + for row in predictions.iter_rows(): + predict.append(row['predictions']) + + return predict + +# TODO: Confirm how it works in Jupyter Notebook +def build_fit_lstm_attention(data): + """ + LSTM-Attention NN training function + """ + +def build_fit_cnn(data): + """ + Convolution NN training function + """ + +def build_fit_widecnn(data): + """ + Wide Convolution NN training function """ \ No newline at end of file diff --git a/src/models/models_utils.py b/src/models/models_utils.py index 3f42d9e..08d7c79 100644 --- a/src/models/models_utils.py +++ b/src/models/models_utils.py @@ -136,4 +136,12 @@ def _compute_weights(self): if lab.lower() != 'unknown': weights[int(encoded)] = cls_weights[classes.index(lab)] - return weights \ No newline at end of file + return weights + + def _label_decode(self, predict): + print('_label_decode') + decoded = pd.Series(np.empty(len(predict), dtype=object)) + for label, encoded in self._labels_map.items(): + decoded[predict == encoded] = label 
+ + return np.array(decoded) \ No newline at end of file diff --git a/src/models/multiclass_utils.py b/src/models/multiclass_utils.py index dd6f362..a142ca3 100644 --- a/src/models/multiclass_utils.py +++ b/src/models/multiclass_utils.py @@ -24,43 +24,14 @@ class MulticlassUtils(ModelsUtils, ABC): These methods are meant to be used when decomposing data into taxonomic groups before training one model per group ----------------------- - Ray data GroupBy + Ray data GroupBy + Bagging meta-estimator ----------------------- https://www.anyscale.com/blog/training-one-million-machine-learning-models-in-record-time-with-ray#approach-2:-using-ray-data-(grouping-data-by-key) + https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html#sklearn.ensemble.BaggingClassifier 1. GroupBy previous taxa 2. Fx for model training (train_fx) 3. ds.map_groups(train_fx) to exec the training of models in parallel 4. Write results to file / save models - - ----------------------- - Mixture-of-Experts (MoE) - ----------------------- - 1. Train each expert on their task-associated data - * Split training data into 80/20% splits - * Train/val over multiple epochs - 2. Train a gating network on the whole task - * Perceptron NN for gating - * Train on whole training ds - * Validation on simulated reads ds - * CV on test simulated reads ds - https://medium.com/@bensalemh300/harnessing-the-best-of-both-worlds-how-mixture-of-experts-meets-pyspark-for-mnist-mastery-315f82e65a0e - https://machinelearningmastery.com/mixture-of-experts/ - - 1. Cluster Data Split: Data within each cluster is divided into training and testing sets. - 2. Decision Tree Classifiers: For clusters where there’s more than one unique class in the training data, we train Decision Tree classifiers. These classifiers can distinguish between different classes within the cluster. - 3. Storing Expert Models: Trained Decision Tree models are stored in a dictionary, where each expert corresponds to a specific cluster. - 4. Performance Evaluation: The performance of each expert model is assessed by evaluating its accuracy on the corresponding test data. 
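At prediction time the per-cluster checkpoints collected by fit are combined by summing each sub-model's class probabilities, in the spirit of the bagging/soft-voting meta-estimator referenced in this docstring. A rough sketch of that aggregation, assuming a model_ckpt dict like the one above and estimators exposing predict_proba with a classes_ attribute (pickle is used here instead of cloudpickle):

# Sketch of soft-voting over per-cluster sub-models (assumes model_ckpt as above).
import pickle
import numpy as np

def ensemble_predict(X, model_ckpt, nb_classes, threshold=0.8):
    pred = np.zeros((len(X), nb_classes))
    for cluster, model_file in model_ckpt.items():
        with open(model_file, 'rb') as handle:
            model = pickle.load(handle)
        proba = model.predict_proba(X)
        # accumulate probabilities in the right columns; a sub-model may not
        # have seen every class, hence the mapping through model.classes_
        for i, cls in enumerate(model.classes_):
            pred[:, cls] += proba[:, i]
    pred /= len(model_ckpt)
    labels = np.argmax(pred, axis=1)
    # below the confidence threshold, fall back to the 'unknown' label (-1)
    labels[np.max(pred, axis=1) < threshold] = -1
    return labels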
- - Sklearn LogisticRegression : https://github.com/zermelozf/esn-lm/blob/master/esnlm/readouts/smoe.py - Keras/TF : https://abdulkaderhelwan.medium.com/mixture-of-experts-introduction-39f244a4ff05 - Keras/TF on article 2018 : https://github.com/drawbridge/keras-mmoe - Keras/TF 2018 : https://github.com/eminorhan/mixture-of-experts - Detailed example : https://mattgorb.github.io/moe - Detailed example : https://towardsdatascience.com/how-to-build-a-wide-and-deep-model-using-keras-in-tensorflow-2-0-2f7a236b5a4b - Keras example : https://keras.io/examples/nlp/text_classification_with_switch_transformer/ - Keras example : https://stackoverflow.com/questions/77551865/how-to-extend-keras-gpt2-model-moe-example - FastMoE PyTorch : https://fastmoe.ai/ - Tutel PyTorch : https://www.microsoft.com/en-us/research/blog/tutel-an-efficient-mixture-of-experts-implementation-for-large-dnn-model-training/ """ def _get_count_previous_taxa(self, taxa, csv): diff --git a/src/models/sklearn/binary_models.py b/src/models/sklearn/binary_models.py index 90d3679..61d0656 100644 --- a/src/models/sklearn/binary_models.py +++ b/src/models/sklearn/binary_models.py @@ -86,8 +86,12 @@ def __init__( csv ) + # Data preprocessing + ######################################################################################################### + def preprocess(self, ds, scaling = False, scaler_file = None): print('preprocess') + # Labels encoding + mapping if self.classifier == 'onesvm': self._encoder = OneClassSVMLabelEncoder(self.taxa) self._encoded = np.array([1,-1], dtype = np.int32) @@ -100,52 +104,31 @@ def preprocess(self, ds, scaling = False, scaler_file = None): self._encoded = np.arange(len(labels)) labels = np.append(labels, 'Unknown') self._encoded = np.append(self._encoded, -1) + # Class weights self._weights = self._compute_weights() + # Labels mapping + for (label, encoded) in zip(labels, self._encoded): + self._labels_map[label] = encoded + + # Scaling if scaling: self._scaler = TensorTfIdfTransformer(self.kmers, scaler_file) self._scaler.fit(ds) - # Labels mapping - for (label, encoded) in zip(labels, self._encoded): - self._labels_map[label] = encoded - def _build(self): - print('_build') - if self.classifier == 'onesvm': - print('Training bacterial extractor with One Class SVM') - self._clf = ScoringSGDOneClassSVM() - self._train_params = { - 'nu' : 0.026441491, - 'learning_rate' : 'constant', - 'tol' : 1e-3, - 'eta0' : 0.001 - } - else : - print('Training bacterial / host classifier with SGD') - self._clf = SGDClassifier() - self._train_params = { - 'loss' : 'hinge', - 'penalty' : 'elasticnet', - 'alpha' : 141.6146176, - 'learning_rate' : 'adaptive', - 'class_weight' : self._weights, - 'eta0' : 0.001, - 'n_jobs' : -1 - } - + # Model training + ######################################################################################################### + def fit(self, datasets): print('_fit_model') # Define model self._build() for name, ds in datasets.items(): - ds = ds.drop_columns(['id']) + # ds = ds.drop_columns(['id']) ds = self._encoder.transform(ds) if self._scaler is not None: ds = self._scaler.transform(ds) - # Trigger the preprocessing computations before ingest in trainer - # Otherwise, it would be executed at each epoch - ds = ds.materialize() datasets[name] = ray.put(ds) try: @@ -179,6 +162,28 @@ def fit(self, datasets): training_result = self._trainer.fit() self._model_ckpt = training_result.checkpoint + def _build(self): + print('_build') + if self.classifier == 'onesvm': + print('Training bacterial 
extractor with One Class SVM') + self._clf = ScoringSGDOneClassSVM() + self._train_params = { + 'learning_rate' : 'optimal' + } + else : + print('Training bacterial / host classifier with SGD') + self._clf = SGDClassifier() + self._train_params = { + 'loss' : 'hinge', + 'penalty' : 'elasticnet', + 'learning_rate' : 'optimal', + 'class_weight' : self._weights, + 'n_jobs' : -1 + } + + # Model predicting + ######################################################################################################### + def predict(self, ds): print('predict') if ds.count() > 0: @@ -195,6 +200,7 @@ def predict(self, ds): def predict_proba(self, ds, threshold = 0.8): print('predict_proba') + # No predict_proba methods implemented for these models return self.predict(ds) def _get_threshold_pred(self, predict, nb_cls, threshold): diff --git a/src/models/sklearn/models.py b/src/models/sklearn/models.py index 386f684..3cba9c3 100644 --- a/src/models/sklearn/models.py +++ b/src/models/sklearn/models.py @@ -92,12 +92,4 @@ def predict_proba(self): @abstractmethod def _get_threshold_pred(self): """ - """ - - def _label_decode(self, predict): - print('_label_decode') - decoded = pd.Series(np.empty(len(predict), dtype=object)) - for label, encoded in self._labels_map.items(): - decoded[predict == encoded] = label - - return np.array(decoded) \ No newline at end of file + """ \ No newline at end of file diff --git a/src/models/sklearn/multiclass_models.py b/src/models/sklearn/multiclass_models.py index 1414675..159c672 100644 --- a/src/models/sklearn/multiclass_models.py +++ b/src/models/sklearn/multiclass_models.py @@ -14,6 +14,7 @@ from ray.air.config import ScalingConfig from sklearn.naive_bayes import MultinomialNB from sklearn.linear_model import SGDClassifier +from sklearn.calibration import CalibratedClassifierCV from models.sklearn.partial_trainer import SklearnPartialTrainer from models.sklearn.scoring_one_svm import ScoringSGDOneClassSVM @@ -96,13 +97,12 @@ def __init__( self._model_ckpt = {} self._predictor = {} + # Data preprocessing + ######################################################################################################### + def preprocess(self, ds, scaling = False, scaler_file = None): print('preprocess') - - if scaling: - self._scaler = TensorTfIdfTransformer(self.kmers, scaler_file) - self._scaler.fit(ds) - + # Labels encoding self._encoder = ModelLabelEncoder(self.taxa) self._encoder.fit(ds) @@ -112,49 +112,70 @@ def preprocess(self, ds, scaling = False, scaler_file = None): labels = np.append(labels, 'Unknown') encoded = np.append(encoded, -1) - self._labels_map = {} for (label, encode) in zip(labels, encoded): self._labels_map[label] = encode + # Class weights self._weights = self._compute_weights() + + # Scaling + if scaling: + self._scaler = TensorTfIdfTransformer(self.kmers, scaler_file) + self._scaler.fit(ds) + + # Models training + ######################################################################################################### def fit(self, datasets): print('fit') - # TODO: remove validation from datasets - # train / val on training ds, CV on test ds - ds = datasets['train'] - ds = ds.drop_columns(['id']) - ds = self._encoder.transform(ds) - if self._scaler is not None: - ds = self._scaler.transform(ds) - + for name, ds in datasets.items(): + # ds = ds.drop_columns(['id']) + ds = self._encoder.transform(ds) + if self._scaler is not None: + ds = self._scaler.transform(ds) + datasets[name] = ds # One sub-model per artificial cluster of samples - ds = 
self._random_split_dataset(ds) + ds_train = self._random_split_dataset(datasets['train']) + ds_val = datasets['validation'] - # checkpointing directory + # Checkpointing directory model_dir = os.path.join(self._workdir, f'{self.classifier}_{self.taxa}') if not os.path.isdir(model_dir): os.mkdir(model_dir) # Model-specific training functions def build_fit_sgd(data): - X = data[TENSOR_COLUMN_NAME] - y = data[LABELS_COLUMN_NAME] + # Training data + X_train = _unwrap_ndarray_object_type_if_needed(data[TENSOR_COLUMN_NAME]) + y_train = np.array(data[LABELS_COLUMN_NAME]) + # Validation data + X_val = ds_val.to_pandas()[TENSOR_COLUMN_NAME] + y_val = ds_val.to_pandas()[LABELS_COLUMN_NAME] + msk_val = y_val.isin(np.unique(y_train)) + X_val = _unwrap_ndarray_object_type_if_needed(X_val[msk_val]) + y_val = np.array(y_val[msk_val]) cluster = data['cluster'][0] model = SGDClassifier( - # alpha = 173.5667373, learning_rate = 'optimal', loss = 'modified_huber', penalty = 'l2', class_weight = self._weights, ) - model.fit(X, y) + model.fit(X_train, y_train) + + calibrator = CalibratedClassifierCV( + estimator = model, + method = 'isotonic', + cv = 'prefit', + ) + + calibrator.fit(X_val,y_val) model_file = os.path.join(model_dir, f'{cluster}.pkl') with open(model_file, "wb") as file: - cpickle.dump(model, file) + cpickle.dump(calibrator, file) return { 'cluster' : [cluster], @@ -162,16 +183,31 @@ def build_fit_sgd(data): } def build_fit_mnb(data): - X = data[TENSOR_COLUMN_NAME] - y = data[LABELS_COLUMN_NAME] + # Training data + X_train = _unwrap_ndarray_object_type_if_needed(data[TENSOR_COLUMN_NAME]) + y_train = np.array(data[LABELS_COLUMN_NAME]) + # Validation data + X_val = ds_val.to_pandas()[TENSOR_COLUMN_NAME] + y_val = ds_val.to_pandas()[LABELS_COLUMN_NAME] + msk_val = y_val.isin(np.unique(y_train)) + X_val = _unwrap_ndarray_object_type_if_needed(X_val[msk_val]) + y_val = np.array(y_val[msk_val]) cluster = data['cluster'][0] model = MultinomialNB() - model.fit(X, y) + model.fit(X_train, y_train) model_file = os.path.join(model_dir, f'{cluster}.pkl') + calibrator = CalibratedClassifierCV( + estimator = model, + method = 'isotonic', + cv = 'prefit', + ) + + calibrator.fit(X_val,y_val) + with open(model_file, "wb") as file: - cpickle.dump(model, file) + cpickle.dump(calibrator, file) return { 'cluster' : [cluster], @@ -180,15 +216,18 @@ def build_fit_mnb(data): if self.classifier == 'sgd': print('Training multiclass SGD classifier') - training_result = ds.map_groups(build_fit_sgd, batch_format = 'numpy') + training_result = ds_train.map_groups(build_fit_sgd, batch_format = 'numpy') elif self.classifier == 'mnb': print('Training multiclass Multinomial Naive Bayes classifier') - training_result = ds.map_groups(build_fit_mnb, batch_format = 'numpy') + training_result = ds_train.map_groups(build_fit_mnb, batch_format = 'numpy') training_result = training_result.to_pandas().to_dict('records') for record in training_result: self._model_ckpt[record['cluster']] = record['file'] + # Models predicting + ######################################################################################################### + def predict(self, ds): print('predict') probabilities = self._predict_proba(ds) @@ -227,7 +266,6 @@ def predict_func(data): else: raise ValueError('Empty dataset, cannot execute predictions!') - def _get_threshold_pred(self, predict, threshold): print('_get_threshold_pred') proba_predict = { diff --git a/src/utils.py b/src/utils.py index 3f1c5fe..4f48bfa 100644 --- a/src/utils.py +++ b/src/utils.py @@ -65,9 
+65,9 @@ def init_ray_cluster(workdir): """ 1. Get physical material available Number of available CPUs and GPUs - 2. Get host IP from OS + 2. Get host IP from OS if available Defaults to 172.24.94.34 - 3. Start the ray cluster at OS level + 3. Start the ray cluster with parameters """ nb_CPU = os.cpu_count() nb_GPU = len(list_physical_devices('GPU')) @@ -90,7 +90,6 @@ def init_ray_cluster(workdir): }) }, ) - # cmd = f"ray start --head --node-ip-address {os.environ['HOST_IP']} --num-cpus {nb_CPU} --num-gpus {nb_GPU} --temp-dir {workdir} --object-store-memory {mem}" else: ray.init( num_cpus = nb_CPU, @@ -107,11 +106,6 @@ def init_ray_cluster(workdir): }, ) - # cmd = f"ray start --head --num-cpus {nb_CPU} --num-gpus {nb_GPU} --temp-dir {workdir} --object-store-memory {mem}" - - # os.system(cmd) - - # ray.init() logging.getLogger("ray").setLevel(logging.WARNING) ray.data.DataContext.get_current().execution_options.verbose_progress = True # mem = virtual_memory().total From 7b9fddf2c472fef5ac8c3d961f176dc1b8ae495f Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Fri, 8 Dec 2023 17:17:06 -0500 Subject: [PATCH 60/92] val ds to pandas outside of training func --- src/models/kerasTF/build_neural_networks.py | 6 ++-- src/models/kerasTF/multiclass_models.py | 11 +++---- src/models/sklearn/multiclass_models.py | 32 ++++++++++----------- 3 files changed, 25 insertions(+), 24 deletions(-) diff --git a/src/models/kerasTF/build_neural_networks.py b/src/models/kerasTF/build_neural_networks.py index 8294110..cdcf08f 100644 --- a/src/models/kerasTF/build_neural_networks.py +++ b/src/models/kerasTF/build_neural_networks.py @@ -108,7 +108,7 @@ def build_LSTM_attention(nb_features, nb_classes): net = Dense(nb_classes)(net) outputs = Activation('softmax')(net) model = Model(inputs = inputs, outputs = outputs) - model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'], jit_compile = True) + model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'], jit_compile = True) return model @@ -134,7 +134,7 @@ def build_CNN(nb_features, nb_classes): model.add(Dropout(0.5)) model.add(Dense(nb_classes)) model.add(Activation('softmax')) - model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'], jit_compile = True) + model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'], jit_compile = True) return model @@ -172,6 +172,6 @@ def build_wideCNN(nb_features, nb_classes): net = Dense(nb_classes)(net) outputs = Activation('softmax')(net) model = Model(inputs = inputs, outputs = outputs) - model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'], jit_compile = True) + model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'], jit_compile = True) return model diff --git a/src/models/kerasTF/multiclass_models.py b/src/models/kerasTF/multiclass_models.py index 3b0902a..a5c48a0 100644 --- a/src/models/kerasTF/multiclass_models.py +++ b/src/models/kerasTF/multiclass_models.py @@ -119,14 +119,15 @@ def __init__( def preprocess(self, ds, scaling = False, scaler_file = None): print('preprocess') # Labels encoding - self._encoder = Chain( - ModelLabelEncoder(self.taxa), - OneHotTensorEncoder(LABELS_COLUMN_NAME) - ) + # self._encoder = Chain( + # ModelLabelEncoder(self.taxa), + # OneHotTensorEncoder(LABELS_COLUMN_NAME) + # ) + self._encoder = ModelLabelEncoder(self.taxa) self._encoder.fit(ds) # Labels mapping - labels = 
list(self._encoder.preprocessors[0].stats_[f'unique_values({self.taxa})'].keys()) + labels = list(self._encoder.stats_[f'unique_values({self.taxa})'].keys()) self._nb_classes = len(labels) self._encoded = np.arange(len(labels)) labels = np.append(labels, 'Unknown') diff --git a/src/models/sklearn/multiclass_models.py b/src/models/sklearn/multiclass_models.py index 159c672..87e9575 100644 --- a/src/models/sklearn/multiclass_models.py +++ b/src/models/sklearn/multiclass_models.py @@ -136,8 +136,8 @@ def fit(self, datasets): datasets[name] = ds # One sub-model per artificial cluster of samples - ds_train = self._random_split_dataset(datasets['train']) - ds_val = datasets['validation'] + train_ds = self._random_split_dataset(datasets['train']) + val_ds = datasets['validation'].to_pandas() # Checkpointing directory model_dir = os.path.join(self._workdir, f'{self.classifier}_{self.taxa}') @@ -145,17 +145,17 @@ def fit(self, datasets): os.mkdir(model_dir) # Model-specific training functions - def build_fit_sgd(data): + def build_fit_sgd(train_data, val_data): # Training data - X_train = _unwrap_ndarray_object_type_if_needed(data[TENSOR_COLUMN_NAME]) - y_train = np.array(data[LABELS_COLUMN_NAME]) + X_train = _unwrap_ndarray_object_type_if_needed(train_data[TENSOR_COLUMN_NAME]) + y_train = np.array(train_data[LABELS_COLUMN_NAME]) # Validation data - X_val = ds_val.to_pandas()[TENSOR_COLUMN_NAME] - y_val = ds_val.to_pandas()[LABELS_COLUMN_NAME] + X_val = val_data[TENSOR_COLUMN_NAME] + y_val = val_data[LABELS_COLUMN_NAME] msk_val = y_val.isin(np.unique(y_train)) X_val = _unwrap_ndarray_object_type_if_needed(X_val[msk_val]) y_val = np.array(y_val[msk_val]) - cluster = data['cluster'][0] + cluster = train_data['cluster'][0] model = SGDClassifier( learning_rate = 'optimal', loss = 'modified_huber', @@ -182,17 +182,17 @@ def build_fit_sgd(data): 'file' : [model_file] } - def build_fit_mnb(data): + def build_fit_mnb(train_data, val_data): # Training data - X_train = _unwrap_ndarray_object_type_if_needed(data[TENSOR_COLUMN_NAME]) - y_train = np.array(data[LABELS_COLUMN_NAME]) + X_train = _unwrap_ndarray_object_type_if_needed(train_data[TENSOR_COLUMN_NAME]) + y_train = np.array(train_data[LABELS_COLUMN_NAME]) # Validation data - X_val = ds_val.to_pandas()[TENSOR_COLUMN_NAME] - y_val = ds_val.to_pandas()[LABELS_COLUMN_NAME] + X_val = val_data[TENSOR_COLUMN_NAME] + y_val = val_data[LABELS_COLUMN_NAME] msk_val = y_val.isin(np.unique(y_train)) X_val = _unwrap_ndarray_object_type_if_needed(X_val[msk_val]) y_val = np.array(y_val[msk_val]) - cluster = data['cluster'][0] + cluster = train_data['cluster'][0] model = MultinomialNB() model.fit(X_train, y_train) @@ -216,10 +216,10 @@ def build_fit_mnb(data): if self.classifier == 'sgd': print('Training multiclass SGD classifier') - training_result = ds_train.map_groups(build_fit_sgd, batch_format = 'numpy') + training_result = train_ds.map_groups(lambda ds: build_fit_sgd(ds, val_ds), batch_format = 'numpy') elif self.classifier == 'mnb': print('Training multiclass Multinomial Naive Bayes classifier') - training_result = ds_train.map_groups(build_fit_mnb, batch_format = 'numpy') + training_result = train_ds.map_groups(lambda ds: build_fit_mnb(ds, val_ds), batch_format = 'numpy') training_result = training_result.to_pandas().to_dict('records') for record in training_result: From e3cc5c068d56d55ea0b30ba051c2a8130271b362 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Sat, 9 Dec 2023 09:08:30 -0500 Subject: [PATCH 61/92] sklearn remove clibrated classifier --- 
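The calibration step introduced above (and reverted in this commit) wraps an already-fitted estimator with CalibratedClassifierCV(cv='prefit') so that a held-out validation split maps decision scores to probabilities. A minimal standalone illustration of that pattern on synthetic data; the split sizes and model settings are assumptions, and the estimator= keyword follows the diff (scikit-learn 1.2+ naming):

# Standalone sketch of prefit calibration on a held-out validation split.
import numpy as np
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split

X = np.random.rand(500, 20)
y = np.random.randint(0, 3, 500)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)

model = SGDClassifier(loss='modified_huber', learning_rate='optimal')
model.fit(X_train, y_train)

# keep only validation samples whose class the model has actually seen,
# mirroring the masking done in fit() above
mask = np.isin(y_val, np.unique(y_train))
calibrator = CalibratedClassifierCV(estimator=model, method='isotonic', cv='prefit')
calibrator.fit(X_val[mask], y_val[mask])

print(calibrator.predict_proba(X_val[:5]))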
src/models/classification.py | 9 +- src/models/kerasTF/binary_models.py | 4 - src/models/kerasTF/build_neural_networks.py | 5 +- src/models/kerasTF/multiclass_models.py | 101 ++++++++++++++++++-- src/models/models_utils.py | 3 +- src/models/multiclass_utils.py | 4 +- src/models/sklearn/multiclass_models.py | 73 +++++++------- 7 files changed, 144 insertions(+), 55 deletions(-) diff --git a/src/models/classification.py b/src/models/classification.py index 3c62ddf..b1f76e6 100644 --- a/src/models/classification.py +++ b/src/models/classification.py @@ -7,11 +7,10 @@ from warnings import warn from typing import Dict, List -from models.kerasTF.models import KerasTFModels from models.sklearn.binary_models import SklearnBinaryModels -# from models.kerasTF.binary_models import KerasTFBinaryModels +from models.kerasTF.binary_models import KerasTFBinaryModels from models.sklearn.multiclass_models import SklearnMulticlassModels -# from models.kerasTF.multiclass_models import KerasTFMulticlassModels +from models.kerasTF.multiclass_models import KerasTFMulticlassModels # CV metrics from sklearn.metrics import precision_recall_fscore_support @@ -204,7 +203,7 @@ def _binary_training(self, datasets, taxa, file): self._database_data['csv'] ) else: - model = KerasTFModels( + model = KerasTFBinaryModels( self._classifier_binary, self._outdirs['models_dir'], self._batch_size, @@ -235,7 +234,7 @@ def _multiclass_training(self, datasets, taxa, file): self._database_data['csv'] ) else: - model = KerasTFModels( + model = KerasTFMulticlassModels( self._classifier_multiclass, self._outdirs['models_dir'], self._batch_size, diff --git a/src/models/kerasTF/binary_models.py b/src/models/kerasTF/binary_models.py index 603434b..0b76a69 100644 --- a/src/models/kerasTF/binary_models.py +++ b/src/models/kerasTF/binary_models.py @@ -200,10 +200,8 @@ def predict(self, ds): print('predict') # Predict with model predictions = self._predict_proba(ds) - # Convert predictions to labels predictions = self._get_abs_pred(predictions) - # Return decoded labels return self._label_decode(predictions) @@ -211,10 +209,8 @@ def predict_proba(self, ds, threshold = 0.8): print('predict_proba') # Predict with model predictions = self._predict_proba(ds) - # Convert predictions to labels with threshold predictions = self._get_threshold_pred(predictions, threshold) - # Return decoded labels return self._label_decode(predictions) diff --git a/src/models/kerasTF/build_neural_networks.py b/src/models/kerasTF/build_neural_networks.py index cdcf08f..7d62eca 100644 --- a/src/models/kerasTF/build_neural_networks.py +++ b/src/models/kerasTF/build_neural_networks.py @@ -72,7 +72,10 @@ def build_deepLSTM(nb_features): netB = Dense(100, activation='tanh',name='G_%d'%40) (inputs) netB = Dense(40, activation='tanh',name='H_%d'%40) (netB) - net = Concatenate()([netA,netB]) # A `Concatenate` layer requires inputs with matching shapes except for the concatenation axis. Received: input_shape=[(None, 40), (None, 1000, 40)] + # TODO: Debug error caught in local and on Narval + # TODO: Finish testing NNs + # A `Concatenate` layer requires inputs with matching shapes except for the concatenation axis. 
Received: input_shape=[(None, 40), (None, 100, 40)] + net = Concatenate()([netA,netB]) net = Dense(200, activation='relu', name='C_%d'%(10*2))(net) net = Dropout(0.1,name='fr_%.1f'%0.1)(net) diff --git a/src/models/kerasTF/multiclass_models.py b/src/models/kerasTF/multiclass_models.py index a5c48a0..1052d33 100644 --- a/src/models/kerasTF/multiclass_models.py +++ b/src/models/kerasTF/multiclass_models.py @@ -20,7 +20,7 @@ from ray.air import session # from ray.air.integrations.keras import Callback from ray.air.config import ScalingConfig -from models.kerasTF.models import train_func +from models.kerasTF.models import train_func, build_model from ray.air.integrations.keras import ReportCheckpointCallback from ray.train.tensorflow import TensorflowTrainer, TensorflowCheckpoint @@ -119,10 +119,6 @@ def __init__( def preprocess(self, ds, scaling = False, scaler_file = None): print('preprocess') # Labels encoding - # self._encoder = Chain( - # ModelLabelEncoder(self.taxa), - # OneHotTensorEncoder(LABELS_COLUMN_NAME) - # ) self._encoder = ModelLabelEncoder(self.taxa) self._encoder.fit(ds) @@ -149,6 +145,10 @@ def preprocess(self, ds, scaling = False, scaler_file = None): def fit(self, datasets): print('fit') + """ + TODO: If Ray AIR training is too long, try using the datasets groupby / Tune for multimodel training + TODO: train_func per model + TODO: Confirm how it works in Jupyter Notebook # Preprocessing loop for name, ds in datasets.items(): # ds = ds.drop_columns(['id']) @@ -166,8 +166,6 @@ def fit(self, datasets): if not os.path.isdir(model_dir): os.mkdir(model_dir) -# TODO: train_func per model -# TODO: Confirm how it works in Jupyter Notebook # Distributed building & training if self.classifier == 'lstm_attention': print('Training multiclass classifier based on Deep Neural Network hybrid between LSTM and Attention') @@ -182,26 +180,89 @@ def fit(self, datasets): training_result = training_result.to_pandas().to_dict('records') for record in training_result: self._model_ckpt[record['cluster']] = record['file'] + """ + + # Preprocessing loop + for name, ds in datasets.items(): + # ds = ds.drop_columns(['id']) + ds = self._encoder.transform(ds) + if self._scaler is not None: + ds = self._scaler.transform(ds) + ds = ds.materialize() + datasets[name] = ds + + # Training parameters + train_params = { + 'batch_size': self.batch_size, + 'epochs': self._training_epochs, + 'size': self._nb_kmers, + 'nb_cls': self._nb_classes, + 'model': self.classifier, + 'weights': self._weights + } + + # Define trainer / tuner + self._trainer = TensorflowTrainer( + train_loop_per_worker=train_func, + train_loop_config=train_params, + scaling_config=ScalingConfig( + trainer_resources={'CPU': self._nb_CPU_data}, + num_workers=self._n_workers, + use_gpu=self._use_gpu, + resources_per_worker={ + 'CPU': self._nb_CPU_per_worker, + 'GPU': self._nb_GPU_per_worker + } + ), + run_config=RunConfig( + name=self.classifier, + local_dir=self._workdir, + ), + datasets=datasets, + ) + + training_result = self._trainer.fit() + self._model_ckpt = training_result.best_checkpoints[0][0] # Models predicting ######################################################################################################### def predict(self, ds): print('predict') + """ + TODO: If Ray AIR training is too long, try using the datasets groupby / Tune for multimodel training probabilities = self._predict_proba(ds) predictions = np.argmax(probabilities, axis = 1) predictions = self._label_decode(predictions) return predictions + """ + # Predict with 
model + predictions = self._predict_proba(ds) + # Convert predictions to labels + predictions = self._get_abs_pred(predictions) + # Return decoded labels + return self._label_decode(predictions) def predict_proba(self, ds, threshold = 0.8): print('predict_proba') + """ + TODO: If Ray AIR training is too long, try using the datasets groupby / Tune for multimodel training probabilities = self._predict_proba(ds) predictions = self._get_threshold_pred(probabilities, threshold) return self._label_decode(predictions) + """ + # Predict with model + predictions = self._predict_proba(ds) + # Convert predictions to labels with threshold + predictions = self._get_threshold_pred(predictions, threshold) + # Return decoded labels + return self._label_decode(predictions) # TODO: Confirm how it works in Jupyter Notebook def _predict_proba(self, ds): print('_predict_proba') + """ + TODO: If Ray AIR training is too long, try using the datasets groupby / Tune for multimodel training if ds.count() > 0: if self._scaler is not None: ds = self._scaler.transform(ds) @@ -225,6 +286,32 @@ def predict_func(data): return probabilities else: raise ValueError('Empty dataset, cannot execute predictions!') + """ + if ds.count() > 0: + if len(ds.schema().names) > 1: + col_2_drop = [col for col in ds.schema().names if col != TENSOR_COLUMN_NAME] + ds = ds.drop_columns(col_2_drop) + + # Preprocess + if self._scaler is not None: + ds = self._scaler.transform(ds) + ds = ds.materialize() + + self._predictor = BatchPredictor.from_checkpoint( + self._model_ckpt, + TensorflowPredictor, + model_definition = lambda: build_model(self.classifier, self._nb_classes, self._nb_kmers) + ) + predictions = self._predictor.predict( + data = ds, + feature_columns = [TENSOR_COLUMN_NAME], + batch_size = self.batch_size, + num_cpus_per_worker = self._nb_CPU_per_worker, + num_gpus_per_worker = self._nb_GPU_per_worker + ) + return predictions + else: + raise ValueError('No data to predict') def _get_abs_pred(self, predictions): print('_get_abs_pred') diff --git a/src/models/models_utils.py b/src/models/models_utils.py index 08d7c79..da4061b 100644 --- a/src/models/models_utils.py +++ b/src/models/models_utils.py @@ -122,7 +122,8 @@ def _compute_weights(self): weights = {} if isinstance(self._csv, tuple): cls = pd.concat([pd.read_csv(self._csv[0]),pd.read_csv(self._csv[1])], axis = 0, join = 'inner', ignore_index = True) - cls = pd.read_csv(self._csv) + else: + cls = pd.read_csv(self._csv) if self.taxa == 'domain': cls.loc[cls['domain'].str.lower() == 'archaea', 'domain'] = 'Bacteria' classes = list(cls[self.taxa].unique()) diff --git a/src/models/multiclass_utils.py b/src/models/multiclass_utils.py index a142ca3..9534670 100644 --- a/src/models/multiclass_utils.py +++ b/src/models/multiclass_utils.py @@ -68,9 +68,9 @@ def map_clusters(batch): batch['cluster'] = clusters return batch - nb_clusters = int(ds.count() / 100) + nb_clusters = int(ds.count() / self.batch_size) - ds = ds.repartition(100) + ds = ds.repartition(self.batch_size) ds = ds.map_batches(map_clusters, batch_size = nb_clusters, batch_format = 'pandas') return ds.groupby('cluster') \ No newline at end of file diff --git a/src/models/sklearn/multiclass_models.py b/src/models/sklearn/multiclass_models.py index 87e9575..3473a75 100644 --- a/src/models/sklearn/multiclass_models.py +++ b/src/models/sklearn/multiclass_models.py @@ -128,16 +128,17 @@ def preprocess(self, ds, scaling = False, scaler_file = None): def fit(self, datasets): print('fit') - for name, ds in datasets.items(): + # 
for name, ds in datasets.items(): # ds = ds.drop_columns(['id']) - ds = self._encoder.transform(ds) - if self._scaler is not None: - ds = self._scaler.transform(ds) - datasets[name] = ds + train_ds = datasets['train'] + train_ds = self._encoder.transform(train_ds) + if self._scaler is not None: + train_ds = self._scaler.transform(train_ds) + # datasets[name] = ds # One sub-model per artificial cluster of samples - train_ds = self._random_split_dataset(datasets['train']) - val_ds = datasets['validation'].to_pandas() + train_ds = self._random_split_dataset(train_ds) + # val_ds = datasets['validation'].to_pandas() # Checkpointing directory model_dir = os.path.join(self._workdir, f'{self.classifier}_{self.taxa}') @@ -145,16 +146,16 @@ def fit(self, datasets): os.mkdir(model_dir) # Model-specific training functions - def build_fit_sgd(train_data, val_data): + def build_fit_sgd(train_data):#, val_data): # Training data X_train = _unwrap_ndarray_object_type_if_needed(train_data[TENSOR_COLUMN_NAME]) y_train = np.array(train_data[LABELS_COLUMN_NAME]) # Validation data - X_val = val_data[TENSOR_COLUMN_NAME] - y_val = val_data[LABELS_COLUMN_NAME] - msk_val = y_val.isin(np.unique(y_train)) - X_val = _unwrap_ndarray_object_type_if_needed(X_val[msk_val]) - y_val = np.array(y_val[msk_val]) + # X_val = val_data[TENSOR_COLUMN_NAME] + # y_val = val_data[LABELS_COLUMN_NAME] + # msk_val = y_val.isin(np.unique(y_train)) + # X_val = _unwrap_ndarray_object_type_if_needed(X_val[msk_val]) + # y_val = np.array(y_val[msk_val]) cluster = train_data['cluster'][0] model = SGDClassifier( learning_rate = 'optimal', @@ -164,50 +165,50 @@ def build_fit_sgd(train_data, val_data): ) model.fit(X_train, y_train) - calibrator = CalibratedClassifierCV( - estimator = model, - method = 'isotonic', - cv = 'prefit', - ) + # calibrator = CalibratedClassifierCV( + # estimator = model, + # method = 'isotonic', + # cv = 'prefit', + # ) - calibrator.fit(X_val,y_val) + # calibrator.fit(X_val,y_val) model_file = os.path.join(model_dir, f'{cluster}.pkl') with open(model_file, "wb") as file: - cpickle.dump(calibrator, file) + cpickle.dump(model, file) return { 'cluster' : [cluster], 'file' : [model_file] } - def build_fit_mnb(train_data, val_data): + def build_fit_mnb(train_data):#, val_data): # Training data X_train = _unwrap_ndarray_object_type_if_needed(train_data[TENSOR_COLUMN_NAME]) y_train = np.array(train_data[LABELS_COLUMN_NAME]) # Validation data - X_val = val_data[TENSOR_COLUMN_NAME] - y_val = val_data[LABELS_COLUMN_NAME] - msk_val = y_val.isin(np.unique(y_train)) - X_val = _unwrap_ndarray_object_type_if_needed(X_val[msk_val]) - y_val = np.array(y_val[msk_val]) + # X_val = val_data[TENSOR_COLUMN_NAME] + # y_val = val_data[LABELS_COLUMN_NAME] + # msk_val = y_val.isin(np.unique(y_train)) + # X_val = _unwrap_ndarray_object_type_if_needed(X_val[msk_val]) + # y_val = np.array(y_val[msk_val]) cluster = train_data['cluster'][0] model = MultinomialNB() model.fit(X_train, y_train) model_file = os.path.join(model_dir, f'{cluster}.pkl') - calibrator = CalibratedClassifierCV( - estimator = model, - method = 'isotonic', - cv = 'prefit', - ) + # calibrator = CalibratedClassifierCV( + # estimator = model, + # method = 'isotonic', + # cv = 'prefit', + # ) - calibrator.fit(X_val,y_val) + # calibrator.fit(X_val,y_val) with open(model_file, "wb") as file: - cpickle.dump(calibrator, file) + cpickle.dump(model, file) return { 'cluster' : [cluster], @@ -216,10 +217,12 @@ def build_fit_mnb(train_data, val_data): if self.classifier == 'sgd': 
print('Training multiclass SGD classifier') - training_result = train_ds.map_groups(lambda ds: build_fit_sgd(ds, val_ds), batch_format = 'numpy') + training_result = train_ds.map_groups(build_fit_sgd, batch_format = 'numpy') + # training_result = train_ds.map_groups(lambda ds: build_fit_sgd(ds, val_ds), batch_format = 'numpy') elif self.classifier == 'mnb': print('Training multiclass Multinomial Naive Bayes classifier') - training_result = train_ds.map_groups(lambda ds: build_fit_mnb(ds, val_ds), batch_format = 'numpy') + training_result = train_ds.map_groups(build_fit_mnb, batch_format = 'numpy') + # training_result = train_ds.map_groups(lambda ds: build_fit_mnb(ds, val_ds), batch_format = 'numpy') training_result = training_result.to_pandas().to_dict('records') for record in training_result: From 31750a81a55f709792761b2c210ea4cb8a8dda30 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Sat, 9 Dec 2023 11:14:33 -0500 Subject: [PATCH 62/92] NN architectures debug --- src/models/kerasTF/build_neural_networks.py | 29 ++++++++++----------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/src/models/kerasTF/build_neural_networks.py b/src/models/kerasTF/build_neural_networks.py index 7d62eca..e69747a 100644 --- a/src/models/kerasTF/build_neural_networks.py +++ b/src/models/kerasTF/build_neural_networks.py @@ -2,7 +2,7 @@ from keras.models import Model, Sequential from tensorflow.keras import mixed_precision from tensorflow.keras.losses import BinaryCrossentropy, CategoricalCrossentropy -from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Conv1D, Conv2D, MaxPooling1D, MaxPooling2D, Concatenate, Flatten, Attention, Activation, Bidirectional, Reshape +from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Conv1D, Conv2D, MaxPooling1D, MaxPooling2D, Concatenate, Flatten, Attention, Activation, Bidirectional, Reshape, AveragePooling1D @@ -70,11 +70,10 @@ def build_deepLSTM(nb_features): netA = LSTM(40, activation='tanh',recurrent_dropout=0.05,dropout=0.1,name='B_%d'%40) (netA) netB = Dense(100, activation='tanh',name='G_%d'%40) (inputs) - netB = Dense(40, activation='tanh',name='H_%d'%40) (netB) + netB = Dense(100, activation='tanh',name='H_%d'%40) (netB) + netB = AveragePooling1D(100) (netB) + netB = Flatten() (netB) - # TODO: Debug error caught in local and on Narval - # TODO: Finish testing NNs - # A `Concatenate` layer requires inputs with matching shapes except for the concatenation axis. 
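The Concatenate error noted in the comment being removed here came from joining a 2-D LSTM output with a 3-D Dense-over-timesteps output; the fix pools and flattens the second branch so both are 2-D before concatenation. A toy reproduction of the repaired branch shapes, with arbitrary layer sizes rather than the project's tuned values:

# Toy check that the two branches agree on rank before Concatenate.
from keras.layers import Input, LSTM, Dense, AveragePooling1D, Flatten, Concatenate
from keras.models import Model

nb_features = 1000
inputs = Input(shape=(nb_features, 1))

netA = LSTM(40, activation='tanh')(inputs)    # -> (None, 40)
netB = Dense(100, activation='tanh')(inputs)  # -> (None, nb_features, 100)
netB = AveragePooling1D(100)(netB)            # -> (None, nb_features // 100, 100)
netB = Flatten()(netB)                        # -> (None, (nb_features // 100) * 100)

net = Concatenate()([netA, netB])             # both branches are now 2-D
model = Model(inputs=inputs, outputs=Dense(1, activation='sigmoid')(net))
model.summary()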
Received: input_shape=[(None, 40), (None, 100, 40)] net = Concatenate()([netA,netB]) net = Dense(200, activation='relu', name='C_%d'%(10*2))(net) @@ -149,21 +148,21 @@ def build_wideCNN(nb_features, nb_classes): https://github.com/KennthShang/CHEER/blob/master/Classifier/model/Wcnn.py """ - inputs = Input(shape = (nb_features,1)) + inputs = Input(shape = (nb_features, 1)) # embed = Embedding(248, 100)(inputs) - # embed = Reshape((nb_features, -1, 1))(embed) + # inputs = Reshape((nb_features, -1, 1))(inputs) - conv1 = Conv2D(256, 3, activation = 'relu')(inputs) - conv1 = MaxPooling2D(pool_size = (1,1), strides = nb_features)(conv1) + conv1 = Conv1D(256, 3, activation = 'relu')(inputs) + conv1 = MaxPooling1D(pool_size = 1, strides = nb_features)(conv1) - conv2 = Conv2D(256, 7, activation = 'relu')(inputs) - conv2 = MaxPooling2D(pool_size = (1,1), strides = nb_features)(conv2) + conv2 = Conv1D(256, 7, activation = 'relu')(inputs) + conv2 = MaxPooling1D(pool_size = 1, strides = nb_features)(conv2) - conv3 = Conv2D(256, 11, activation = 'relu')(inputs) - conv3 = MaxPooling2D(pool_size = (1,1), strides = nb_features)(conv3) + conv3 = Conv1D(256, 11, activation = 'relu')(inputs) + conv3 = MaxPooling1D(pool_size = 1, strides = nb_features)(conv3) - conv4 = Conv2D(256, 15, activation = 'relu')(inputs) - conv4 = MaxPooling2D(pool_size = (1,1), strides = nb_features)(conv4) + conv4 = Conv1D(256, 15, activation = 'relu')(inputs) + conv4 = MaxPooling1D(pool_size = 1, strides = nb_features)(conv4) net = Concatenate(axis = 1)([conv1,conv2,conv3,conv4]) net = Flatten()(net) From 301e2d6bd185ed263dd4d8e9aebd41864a11dd78 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Mon, 11 Dec 2023 11:01:48 -0500 Subject: [PATCH 63/92] min-max scaling for values 0-1 --- src/models/kerasTF/multiclass_models.py | 12 ++--- .../preprocessors/compute_class_weights.py | 49 ------------------- src/models/preprocessors/min_max_scaler.py | 15 +++--- src/models/sklearn/multiclass_models.py | 14 +++--- 4 files changed, 18 insertions(+), 72 deletions(-) delete mode 100644 src/models/preprocessors/compute_class_weights.py diff --git a/src/models/kerasTF/multiclass_models.py b/src/models/kerasTF/multiclass_models.py index 1052d33..23b1cb4 100644 --- a/src/models/kerasTF/multiclass_models.py +++ b/src/models/kerasTF/multiclass_models.py @@ -7,6 +7,7 @@ # Preprocessing from ray.data.preprocessors import LabelEncoder, Chain from models.encoders.model_label_encoder import ModelLabelEncoder +from models.preprocessors.min_max_scaler import TensorMinMaxScaler from models.encoders.one_hot_tensor_encoder import OneHotTensorEncoder from models.preprocessors.tfidf_transformer import TensorTfIdfTransformer @@ -136,9 +137,8 @@ def preprocess(self, ds, scaling = False, scaler_file = None): self._weights = self._compute_weights() # Scaling - if scaling: - self._scaler = TensorTfIdfTransformer(self.kmers, scaler_file) - self._scaler.fit(ds) + self._scaler = TensorMinMaxScaler(self._nb_kmers) + self._scaler.fit(ds) # Models training ######################################################################################################### @@ -186,8 +186,7 @@ def fit(self, datasets): for name, ds in datasets.items(): # ds = ds.drop_columns(['id']) ds = self._encoder.transform(ds) - if self._scaler is not None: - ds = self._scaler.transform(ds) + ds = self._scaler.transform(ds) ds = ds.materialize() datasets[name] = ds @@ -293,8 +292,7 @@ def predict_func(data): ds = ds.drop_columns(col_2_drop) # Preprocess - if self._scaler is not None: - ds = 
self._scaler.transform(ds) + ds = self._scaler.transform(ds) ds = ds.materialize() self._predictor = BatchPredictor.from_checkpoint( diff --git a/src/models/preprocessors/compute_class_weights.py b/src/models/preprocessors/compute_class_weights.py deleted file mode 100644 index 43b4c5d..0000000 --- a/src/models/preprocessors/compute_class_weights.py +++ /dev/null @@ -1,49 +0,0 @@ - -import numpy as np -import pandas as pd - -from ray.data.dataset import Dataset -from ray.data.preprocessor import Preprocessor - -TENSOR_COLUMN_NAME = '__value__' - -class ComputeClassWeights(Preprocessor): - """ - Custom implementation of Class Weight Computation inspired by sklearn.utils.class_weight.compute_class_weight to be used as a Ray preprocessor. - https://scikit-learn.org/stable/modules/generated/sklearn.utils.class_weight.compute_class_weight.html - This permits to estimate balanced class weights for an unbalanced dataset. - """ - - def __init__(self, class_col): - # Parameters - self._col = class_col - self._cls = [] - self._counts_map = {} - - def _fit(self, ds: Dataset) -> Preprocessor: - def get_cls_counts(df): - mapping = {} - counts = df[self._col].value_counts() - for cls in self._cls: - if cls in counts.index: - mapping[str(cls)] = [counts[cls]] - else: - mapping[str(cls)] = [0] - return mapping - - self._cls = ds.unique(self._col) - - counts = ds.map_batches(get_cls_counts, batch_format = 'pandas') - - for cls in self._cls: - self._counts_map[str(cls)] = counts.sum(str(cls)) - - freqs = ds.count() / (len(self._cls) * np.array(list(self._counts_map.values())).astype(np.float64)) - - self.stats_ = {} - for i, cls in enumerate(self._cls): - self.stats_[cls] = freqs[i] - - return self - - diff --git a/src/models/preprocessors/min_max_scaler.py b/src/models/preprocessors/min_max_scaler.py index 1cb6aa0..0f672a6 100644 --- a/src/models/preprocessors/min_max_scaler.py +++ b/src/models/preprocessors/min_max_scaler.py @@ -13,9 +13,9 @@ class TensorMinMaxScaler(Preprocessor): Custom implementation of Ray's MinMax Scaler for usage with tensor column in ray.data.dataset.Dataset. 
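The scaler fits per-feature minima and maxima by streaming over dataset batches, then rescales the tensor column to the [0, 1] range. The same idea in plain NumPy, assuming an iterable of 2-D batches and leaving aside the max == min edge case:

# Plain-NumPy sketch of the streaming min-max fit/transform idea.
import numpy as np

def fit_min_max(batches):
    mins, maxs = None, None
    for batch in batches:  # batch: (n_rows, nb_features) array
        b_min, b_max = batch.min(axis=0), batch.max(axis=0)
        mins = b_min if mins is None else np.minimum(mins, b_min)
        maxs = b_max if maxs is None else np.maximum(maxs, b_max)
    return mins, maxs

def transform_min_max(batch, mins, maxs):
    # values end up in [0, 1]; columns with max == min would need special care
    return (batch - mins) / (maxs - mins)

batches = [np.random.rand(32, 5) * 10 for _ in range(4)]
mins, maxs = fit_min_max(batches)
scaled = transform_min_max(batches[0], mins, maxs)
print(scaled.min(), scaled.max())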
""" - def __init__(self, features): + def __init__(self, nb_features): # Parameters - self._features = features + self.__nb_features = nb_features def _fit(self, ds: Dataset) -> Preprocessor: """ @@ -23,16 +23,15 @@ def _fit(self, ds: Dataset) -> Preprocessor: """ min = [] max = [] - nb_features = len(self._features) def Min(dct): arr = dct[TENSOR_COLUMN_NAME] - min = np.array([arr[:,i].min() for i in range(nb_features)]) + min = np.array([arr[:,i].min() for i in range(self.__nb_features)]) return min def Max(dct): arr = dct[TENSOR_COLUMN_NAME] - max = np.array([arr[:,i].max() for i in range(nb_features)]) + max = np.array([arr[:,i].max() for i in range(self.__nb_features)]) return max for batch in ds.iter_batches(batch_format = 'numpy'): @@ -42,8 +41,8 @@ def Max(dct): min = np.array(min) max = np.array(max) - min = np.array([min[:,i].min() for i in range(nb_features)]) - max = np.array([max[:,i].max() for i in range(nb_features)]) + min = np.array([min[:,i].min() for i in range(self.__nb_features)]) + max = np.array([max[:,i].max() for i in range(self.__nb_features)]) self.stats_ = {'min' : min, 'max' : max} @@ -80,4 +79,4 @@ def _transform_numpy(self, batch: dict): return batch def __repr__(self): - return f"{self.__class__.__name__}(columns={self._features_list!r})" + return f"{self.__class__.__name__}(columns={self._nb_features!r})" diff --git a/src/models/sklearn/multiclass_models.py b/src/models/sklearn/multiclass_models.py index 3473a75..d449e64 100644 --- a/src/models/sklearn/multiclass_models.py +++ b/src/models/sklearn/multiclass_models.py @@ -6,6 +6,7 @@ # Preprocessing from models.encoders.model_label_encoder import ModelLabelEncoder +from models.preprocessors.min_max_scaler import TensorMinMaxScaler from models.encoders.onesvm_label_encoder import OneClassSVMLabelEncoder from models.preprocessors.tfidf_transformer import TensorTfIdfTransformer @@ -119,10 +120,9 @@ def preprocess(self, ds, scaling = False, scaler_file = None): self._weights = self._compute_weights() # Scaling - if scaling: - self._scaler = TensorTfIdfTransformer(self.kmers, scaler_file) - self._scaler.fit(ds) - + self._scaler = TensorMinMaxScaler(self._nb_kmers) + self._scaler.fit(ds) + # Models training ######################################################################################################### @@ -132,8 +132,7 @@ def fit(self, datasets): # ds = ds.drop_columns(['id']) train_ds = datasets['train'] train_ds = self._encoder.transform(train_ds) - if self._scaler is not None: - train_ds = self._scaler.transform(train_ds) + train_ds = self._scaler.transform(train_ds) # datasets[name] = ds # One sub-model per artificial cluster of samples @@ -246,8 +245,7 @@ def predict_proba(self, ds, threshold = 0.8): def _predict_proba(self, ds): if ds.count() > 0: - if self._scaler is not None: - ds = self._scaler.transform(ds) + ds = self._scaler.transform(ds) # ds = ds.materialize() def predict_func(data): From fe9fd362f993c8c6d58ce4a80135df9e555d847a Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Mon, 11 Dec 2023 11:04:21 -0500 Subject: [PATCH 64/92] MinMax scaling for binary models --- src/models/kerasTF/binary_models.py | 14 ++++++-------- src/models/sklearn/binary_models.py | 12 +++++------- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/src/models/kerasTF/binary_models.py b/src/models/kerasTF/binary_models.py index 0b76a69..d7a9ed4 100644 --- a/src/models/kerasTF/binary_models.py +++ b/src/models/kerasTF/binary_models.py @@ -7,6 +7,7 @@ # Preprocessing from ray.data.preprocessors 
import LabelEncoder, Chain from models.encoders.model_label_encoder import ModelLabelEncoder +from models.preprocessors.min_max_scaler import TensorMinMaxScaler from models.encoders.one_hot_tensor_encoder import OneHotTensorEncoder from models.preprocessors.tfidf_transformer import TensorTfIdfTransformer @@ -142,10 +143,9 @@ def preprocess(self, ds, scaling = False, scaler_file = None): self._weights = self._compute_weights() # Scaling - if scaling: - self._scaler = TensorTfIdfTransformer(self.kmers, scaler_file) - self._scaler.fit(ds) - + self._scaler = TensorMinMaxScaler(self._nb_kmers) + self._scaler.fit(ds) + # Model training ######################################################################################################### @@ -155,8 +155,7 @@ def fit(self, datasets): for name, ds in datasets.items(): # ds = ds.drop_columns(['id']) ds = self._encoder.transform(ds) - if self._scaler is not None: - ds = self._scaler.transform(ds) + ds = self._scaler.transform(ds) ds = ds.materialize() datasets[name] = ds @@ -221,8 +220,7 @@ def _predict_proba(self, ds): ds = ds.drop_columns(col_2_drop) # Preprocess - if self._scaler is not None: - ds = self._scaler.transform(ds) + ds = self._scaler.transform(ds) ds = ds.materialize() self._predictor = BatchPredictor.from_checkpoint( diff --git a/src/models/sklearn/binary_models.py b/src/models/sklearn/binary_models.py index 61d0656..ca137d0 100644 --- a/src/models/sklearn/binary_models.py +++ b/src/models/sklearn/binary_models.py @@ -6,6 +6,7 @@ # Preprocessing from models.encoders.model_label_encoder import ModelLabelEncoder +from models.preprocessors.min_max_scaler import TensorMinMaxScaler from models.encoders.onesvm_label_encoder import OneClassSVMLabelEncoder from models.preprocessors.tfidf_transformer import TensorTfIdfTransformer @@ -112,9 +113,8 @@ def preprocess(self, ds, scaling = False, scaler_file = None): self._labels_map[label] = encoded # Scaling - if scaling: - self._scaler = TensorTfIdfTransformer(self.kmers, scaler_file) - self._scaler.fit(ds) + self._scaler = TensorMinMaxScaler(self._nb_kmers) + self._scaler.fit(ds) # Model training @@ -127,8 +127,7 @@ def fit(self, datasets): for name, ds in datasets.items(): # ds = ds.drop_columns(['id']) ds = self._encoder.transform(ds) - if self._scaler is not None: - ds = self._scaler.transform(ds) + ds = self._scaler.transform(ds) datasets[name] = ray.put(ds) try: @@ -187,8 +186,7 @@ def _build(self): def predict(self, ds): print('predict') if ds.count() > 0: - if self._scaler is not None: - ds = self._scaler.transform(ds) + ds = self._scaler.transform(ds) ds = ds.materialize() predict_kwargs = {'features':self.kmers, 'num_estimator_cpus':-1} self._predictor = BatchPredictor.from_checkpoint(self._model_ckpt, SklearnTensorPredictor) From b80f9f43fd1bc9dff364374d7d36111068aed9bd Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Mon, 11 Dec 2023 17:51:16 -0500 Subject: [PATCH 65/92] ray cluster init --- src/utils.py | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/src/utils.py b/src/utils.py index 4f48bfa..44386c4 100644 --- a/src/utils.py +++ b/src/utils.py @@ -75,21 +75,7 @@ def init_ray_cluster(workdir): mem = ray._private.utils.get_shared_memory_bytes() - 10 if 'HOST_IP' in list(os.environ.keys()): - ray.init( - _node_ip_address = os.environ['HOST_IP'], - num_cpus = nb_CPU, - num_gpus = nb_GPU, - _temp_dir = str(workdir), - object_store_memory = mem, - _system_config={ - "object_spilling_config": json.dumps({ - "type": "filesystem", - "params": { - 
"directory_path": str(workdir) - }, - }) - }, - ) + ray.init(address = f"{os.environ['HOST_IP']}:{os.environ['RAY_PORT']}", _node_ip_address = os.environ['HOST_IP']) else: ray.init( num_cpus = nb_CPU, From 275f6d21ba031cc0926037d2e328cff1dafaf9df Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Tue, 12 Dec 2023 17:37:34 -0500 Subject: [PATCH 66/92] resources for BatchPredictor --- src/models/kerasTF/binary_models.py | 21 ++++++++++++++------- src/models/kerasTF/multiclass_models.py | 22 ++++++++++++++-------- 2 files changed, 28 insertions(+), 15 deletions(-) diff --git a/src/models/kerasTF/binary_models.py b/src/models/kerasTF/binary_models.py index d7a9ed4..2d48cec 100644 --- a/src/models/kerasTF/binary_models.py +++ b/src/models/kerasTF/binary_models.py @@ -228,13 +228,20 @@ def _predict_proba(self, ds): TensorflowPredictor, model_definition = lambda: build_model(self.classifier, self._nb_classes, self._nb_kmers) ) - predictions = self._predictor.predict( - data = ds, - feature_columns = [TENSOR_COLUMN_NAME], - batch_size = self.batch_size, - num_cpus_per_worker = self._nb_CPU_per_worker, - num_gpus_per_worker = self._nb_GPU_per_worker - ) + if self._nb_GPU > 0: + predictions = self._predictor.predict( + data = ds, + feature_columns = [TENSOR_COLUMN_NAME], + batch_size = self.batch_size, + num_gpus_per_worker = self._nb_GPU_per_worker + ) + else: + predictions = self._predictor.predict( + data = ds, + feature_columns = [TENSOR_COLUMN_NAME], + batch_size = self.batch_size, + num_cpus_per_worker = self._nb_CPU_per_worker + ) return predictions else: raise ValueError('No data to predict') diff --git a/src/models/kerasTF/multiclass_models.py b/src/models/kerasTF/multiclass_models.py index 23b1cb4..259ac83 100644 --- a/src/models/kerasTF/multiclass_models.py +++ b/src/models/kerasTF/multiclass_models.py @@ -257,7 +257,6 @@ def predict_proba(self, ds, threshold = 0.8): # Return decoded labels return self._label_decode(predictions) -# TODO: Confirm how it works in Jupyter Notebook def _predict_proba(self, ds): print('_predict_proba') """ @@ -300,13 +299,20 @@ def predict_func(data): TensorflowPredictor, model_definition = lambda: build_model(self.classifier, self._nb_classes, self._nb_kmers) ) - predictions = self._predictor.predict( - data = ds, - feature_columns = [TENSOR_COLUMN_NAME], - batch_size = self.batch_size, - num_cpus_per_worker = self._nb_CPU_per_worker, - num_gpus_per_worker = self._nb_GPU_per_worker - ) + if self._nb_GPU > 0: + predictions = self._predictor.predict( + data = ds, + feature_columns = [TENSOR_COLUMN_NAME], + batch_size = self.batch_size, + num_gpus_per_worker = self._nb_GPU_per_worker + ) + else: + predictions = self._predictor.predict( + data = ds, + feature_columns = [TENSOR_COLUMN_NAME], + batch_size = self.batch_size, + num_cpus_per_worker = self._nb_CPU_per_worker + ) return predictions else: raise ValueError('No data to predict') From 810de385565cfccfbe43e9efe4c694fd4d198b64 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Wed, 13 Dec 2023 17:48:28 -0500 Subject: [PATCH 67/92] tf-idf scaler instead of MinMax for usage with only reduced data --- environment.yml | 124 ------------------------ frozen_requirements.txt | 95 ------------------ src/models/kerasTF/binary_models.py | 6 +- src/models/kerasTF/multiclass_models.py | 6 +- src/models/sklearn/binary_models.py | 6 +- src/models/sklearn/multiclass_models.py | 7 +- 6 files changed, 20 insertions(+), 224 deletions(-) delete mode 100644 environment.yml delete mode 100644 frozen_requirements.txt 
diff --git a/environment.yml b/environment.yml deleted file mode 100644 index d48b75b..0000000 --- a/environment.yml +++ /dev/null @@ -1,124 +0,0 @@ -name: caribou -channels: - - defaults -dependencies: - - _libgcc_mutex=0.1=main - - _openmp_mutex=5.1=1_gnu - - ca-certificates=2022.10.11=h06a4308_0 - - certifi=2022.12.7=py38h06a4308_0 - - ld_impl_linux-64=2.38=h1181459_1 - - libffi=3.3=he6710b0_2 - - libgcc-ng=11.2.0=h1234567_1 - - libgomp=11.2.0=h1234567_1 - - libstdcxx-ng=11.2.0=h1234567_1 - - ncurses=6.3=h5eee18b_3 - - openssl=1.1.1s=h7f8727e_0 - - pip=22.3.1=py38h06a4308_0 - - python=3.8.10=h12debd9_8 - - readline=8.2=h5eee18b_0 - - setuptools=65.5.0=py38h06a4308_0 - - sqlite=3.40.0=h5082296_0 - - tk=8.6.12=h1ccaba5_0 - - wheel=0.37.1=pyhd3eb1b0_0 - - xz=5.2.8=h5eee18b_0 - - zlib=1.2.13=h5eee18b_0 - - pip: - - absl-py==1.3.0 - - aiohttp==3.8.3 - - aiohttp-cors==0.7.0 - - aiorwlock==1.3.0 - - aiosignal==1.3.1 - - anyio==3.6.2 - - astunparse==1.6.3 - - async-timeout==4.0.2 - - attrs==22.2.0 - - biopython==1.78 - - blessed==1.19.1 - - cachetools==5.2.0 - - charset-normalizer==2.1.1 - - click==8.1.3 - - cloudpickle==2.2.0 - - colorful==0.5.5 - - distlib==0.3.6 - - fastapi==0.88.0 - - filelock==3.8.2 - - flatbuffers==22.12.6 - - frozenlist==1.3.3 - - fsspec==2022.11.0 - - future==0.18.2 - - gast==0.4.0 - - google-api-core==2.11.0 - - google-auth==2.15.0 - - google-auth-oauthlib==0.4.6 - - google-pasta==0.2.0 - - googleapis-common-protos==1.57.0 - - gpustat==1.0.0 - - grpcio==1.51.1 - - h11==0.14.0 - - h5py==3.7.0 - - idna==3.4 - - importlib-metadata==5.2.0 - - importlib-resources==5.10.1 - - insilicoseq==1.5.4 - - joblib==1.2.0 - - jsonschema==4.17.3 - - keras==2.11.0 - - libclang==14.0.6 - - markdown==3.4.1 - - markupsafe==2.1.1 - - msgpack==1.0.4 - - multidict==6.0.3 - - numpy==1.23.4 - - nvidia-ml-py==11.495.46 - - oauthlib==3.2.2 - - opencensus==0.11.0 - - opencensus-context==0.1.3 - - opt-einsum==3.3.0 - - packaging==22.0 - - pandas==1.5.2 - - pkgutil-resolve-name==1.3.10 - - platformdirs==2.6.0 - - prometheus-client==0.13.1 - - protobuf==3.19.6 - - psutil==5.9.4 - - py-spy==0.3.14 - - pyarrow==6.0.1 - - pyasn1==0.4.8 - - pyasn1-modules==0.2.8 - - pydantic==1.10.2 - - pyrsistent==0.19.2 - - pysam==0.20.0 - - python-dateutil==2.8.2 - - pytz==2022.7 - - pyyaml==6.0 - - ray==2.2.0 - - requests==2.28.1 - - requests-oauthlib==1.3.1 - - rsa==4.9 - - scikit-learn==1.2.0 - - scipy==1.9.3 - - six==1.16.0 - - smart-open==6.3.0 - - sniffio==1.3.0 - - starlette==0.22.0 - - tabulate==0.9.0 - - tensorboard==2.11.0 - - tensorboard-data-server==0.6.1 - - tensorboard-plugin-wit==1.8.1 - - tensorboardx==2.5.1 - - tensorflow==2.11.0 - - tensorflow-estimator==2.11.0 - - tensorflow-io-gcs-filesystem==0.29.0 - - termcolor==2.1.1 - - threadpoolctl==3.1.0 - - tqdm==4.64.1 - - typing-extensions==4.4.0 - - urllib3==1.26.13 - - uvicorn==0.20.0 - - virtualenv==20.17.1 - - wcwidth==0.2.5 - - werkzeug==2.2.2 - - wrapt==1.14.1 - - yarl==1.8.2 - - zipp==3.11.0 -prefix: /root/anaconda3/envs/caribou diff --git a/frozen_requirements.txt b/frozen_requirements.txt deleted file mode 100644 index caf68a4..0000000 --- a/frozen_requirements.txt +++ /dev/null @@ -1,95 +0,0 @@ -absl-py==1.4.0 -aiohttp==3.8.5 -aiohttp-cors==0.7.0 -aiosignal==1.3.1 -astunparse==1.6.3 -async-timeout==4.0.2 -attrs==23.1.0 -biopython==1.78 -blessed==1.20.0 -cachetools==5.3.1 -certifi==2023.7.22 -charset-normalizer==3.2.0 -click==8.1.6 -cloudpickle==2.2.1 -colorful==0.5.5 -Cython==3.0.0 -distlib==0.3.7 -filelock==3.12.2 -flatbuffers==23.5.26 
-frozenlist==1.4.0 -future==0.18.3 -gast==0.4.0 -google-api-core==2.11.1 -google-auth==2.22.0 -google-auth-oauthlib==1.0.0 -google-pasta==0.2.0 -googleapis-common-protos==1.60.0 -gpustat==1.1 -grpcio==1.47.0 -h5py==3.8.0 -idna==3.4 -importlib-metadata==6.8.0 -importlib-resources==6.0.0 -InSilicoSeq==1.5.4 -joblib==1.3.1 -jsonschema==4.18.6 -jsonschema-specifications==2023.7.1 -keras==2.13.1 -libclang==16.0.6 -Markdown==3.4.4 -MarkupSafe==2.1.3 -msgpack==1.0.5 -multidict==6.0.4 -numpy==1.25.2 -nvidia-ml-py==12.535.77 -oauthlib==3.2.2 -opencensus==0.11.2 -opencensus-context==0.1.3 -opt-einsum==3.3.0 -packaging==23.1 -pandas==2.0.3 -pkgutil_resolve_name==1.3.10 -platformdirs==3.10.0 -prometheus-client==0.13.1 -protobuf==4.23.4 -psutil==5.9.5 -py-spy==0.3.14 -pyarrow==12.0.0 -pyasn1==0.5.0 -pyasn1-modules==0.3.0 -pydantic==1.10.12 -pysam==0.21.0 -python-dateutil==2.8.2 -pytz==2023.3 -PyYAML==6.0.1 -ray==2.6.3 -referencing==0.30.2 -requests==2.31.0 -requests-oauthlib==1.3.1 -rpds-py==0.10.0 -rsa==4.9 -scikit-learn==1.3.0 -scipy==1.10.1 -six==1.16.0 -smart-open==6.3.0 -tabulate==0.9.0 -tensorboard==2.13.0 -tensorboard-data-server==0.7.1 -tensorboardX==2.6.2 -tensorflow==2.14.0 -tensorflow-estimator==2.13.0 -tensorflow-io-gcs-filesystem==0.32.0 -termcolor==2.3.0 -threadpoolctl==3.2.0 -tqdm==4.65.0 -tune-sklearn==0.4.6 -typing_extensions==4.5.0 -tzdata==2023.3 -urllib3==1.26.16 -virtualenv==20.24.2 -wcwidth==0.2.6 -Werkzeug==2.3.6 -wrapt==1.15.0 -yarl==1.9.2 -zipp==3.16.2 \ No newline at end of file diff --git a/src/models/kerasTF/binary_models.py b/src/models/kerasTF/binary_models.py index 2d48cec..8eedd51 100644 --- a/src/models/kerasTF/binary_models.py +++ b/src/models/kerasTF/binary_models.py @@ -143,7 +143,11 @@ def preprocess(self, ds, scaling = False, scaler_file = None): self._weights = self._compute_weights() # Scaling - self._scaler = TensorMinMaxScaler(self._nb_kmers) + self._scaler = TensorTfIdfTransformer( + features = self.kmers, + file = scaler_file + ) + # self._scaler = TensorMinMaxScaler(self._nb_kmers) self._scaler.fit(ds) # Model training diff --git a/src/models/kerasTF/multiclass_models.py b/src/models/kerasTF/multiclass_models.py index 259ac83..d1d1651 100644 --- a/src/models/kerasTF/multiclass_models.py +++ b/src/models/kerasTF/multiclass_models.py @@ -137,7 +137,11 @@ def preprocess(self, ds, scaling = False, scaler_file = None): self._weights = self._compute_weights() # Scaling - self._scaler = TensorMinMaxScaler(self._nb_kmers) + self._scaler = TensorTfIdfTransformer( + features = self.kmers, + file = scaler_file + ) + # self._scaler = TensorMinMaxScaler(self._nb_kmers) self._scaler.fit(ds) # Models training diff --git a/src/models/sklearn/binary_models.py b/src/models/sklearn/binary_models.py index ca137d0..9846f47 100644 --- a/src/models/sklearn/binary_models.py +++ b/src/models/sklearn/binary_models.py @@ -113,7 +113,11 @@ def preprocess(self, ds, scaling = False, scaler_file = None): self._labels_map[label] = encoded # Scaling - self._scaler = TensorMinMaxScaler(self._nb_kmers) + self._scaler = TensorTfIdfTransformer( + features = self.kmers, + file = scaler_file + ) + # self._scaler = TensorMinMaxScaler(self._nb_kmers) self._scaler.fit(ds) diff --git a/src/models/sklearn/multiclass_models.py b/src/models/sklearn/multiclass_models.py index d449e64..c464d0d 100644 --- a/src/models/sklearn/multiclass_models.py +++ b/src/models/sklearn/multiclass_models.py @@ -7,7 +7,6 @@ # Preprocessing from models.encoders.model_label_encoder import ModelLabelEncoder from 
models.preprocessors.min_max_scaler import TensorMinMaxScaler -from models.encoders.onesvm_label_encoder import OneClassSVMLabelEncoder from models.preprocessors.tfidf_transformer import TensorTfIdfTransformer # Training @@ -120,7 +119,11 @@ def preprocess(self, ds, scaling = False, scaler_file = None): self._weights = self._compute_weights() # Scaling - self._scaler = TensorMinMaxScaler(self._nb_kmers) + self._scaler = TensorTfIdfTransformer( + features = self.kmers, + file = scaler_file + ) + # self._scaler = TensorMinMaxScaler(self._nb_kmers) self._scaler.fit(ds) # Models training From 291d5a952e6eeb978be029e97ba4cd438c33791f Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Thu, 14 Dec 2023 19:40:44 -0500 Subject: [PATCH 68/92] sklearn SGD default regularization --- src/models/sklearn/binary_models.py | 1 - src/models/sklearn/multiclass_models.py | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/models/sklearn/binary_models.py b/src/models/sklearn/binary_models.py index 9846f47..8a2af15 100644 --- a/src/models/sklearn/binary_models.py +++ b/src/models/sklearn/binary_models.py @@ -178,7 +178,6 @@ def _build(self): self._clf = SGDClassifier() self._train_params = { 'loss' : 'hinge', - 'penalty' : 'elasticnet', 'learning_rate' : 'optimal', 'class_weight' : self._weights, 'n_jobs' : -1 diff --git a/src/models/sklearn/multiclass_models.py b/src/models/sklearn/multiclass_models.py index c464d0d..ef2de2a 100644 --- a/src/models/sklearn/multiclass_models.py +++ b/src/models/sklearn/multiclass_models.py @@ -160,9 +160,8 @@ def build_fit_sgd(train_data):#, val_data): # y_val = np.array(y_val[msk_val]) cluster = train_data['cluster'][0] model = SGDClassifier( - learning_rate = 'optimal', loss = 'modified_huber', - penalty = 'l2', + learning_rate = 'optimal', class_weight = self._weights, ) model.fit(X_train, y_train) From eebbd743fed392fa3e4de9c359960796c05faede Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Thu, 14 Dec 2023 19:42:37 -0500 Subject: [PATCH 69/92] MinMax scaler for usage with decomposed data --- src/models/kerasTF/binary_models.py | 10 +++++----- src/models/kerasTF/multiclass_models.py | 10 +++++----- src/models/sklearn/binary_models.py | 10 +++++----- src/models/sklearn/multiclass_models.py | 10 +++++----- 4 files changed, 20 insertions(+), 20 deletions(-) diff --git a/src/models/kerasTF/binary_models.py b/src/models/kerasTF/binary_models.py index 8eedd51..fe7fb58 100644 --- a/src/models/kerasTF/binary_models.py +++ b/src/models/kerasTF/binary_models.py @@ -143,11 +143,11 @@ def preprocess(self, ds, scaling = False, scaler_file = None): self._weights = self._compute_weights() # Scaling - self._scaler = TensorTfIdfTransformer( - features = self.kmers, - file = scaler_file - ) - # self._scaler = TensorMinMaxScaler(self._nb_kmers) + # self._scaler = TensorTfIdfTransformer( + # features = self.kmers, + # file = scaler_file + # ) + self._scaler = TensorMinMaxScaler(self._nb_kmers) self._scaler.fit(ds) # Model training diff --git a/src/models/kerasTF/multiclass_models.py b/src/models/kerasTF/multiclass_models.py index d1d1651..bc1b963 100644 --- a/src/models/kerasTF/multiclass_models.py +++ b/src/models/kerasTF/multiclass_models.py @@ -137,11 +137,11 @@ def preprocess(self, ds, scaling = False, scaler_file = None): self._weights = self._compute_weights() # Scaling - self._scaler = TensorTfIdfTransformer( - features = self.kmers, - file = scaler_file - ) - # self._scaler = TensorMinMaxScaler(self._nb_kmers) + # self._scaler = 
TensorTfIdfTransformer( + # features = self.kmers, + # file = scaler_file + # ) + self._scaler = TensorMinMaxScaler(self._nb_kmers) self._scaler.fit(ds) # Models training diff --git a/src/models/sklearn/binary_models.py b/src/models/sklearn/binary_models.py index 8a2af15..0ffa104 100644 --- a/src/models/sklearn/binary_models.py +++ b/src/models/sklearn/binary_models.py @@ -113,11 +113,11 @@ def preprocess(self, ds, scaling = False, scaler_file = None): self._labels_map[label] = encoded # Scaling - self._scaler = TensorTfIdfTransformer( - features = self.kmers, - file = scaler_file - ) - # self._scaler = TensorMinMaxScaler(self._nb_kmers) + # self._scaler = TensorTfIdfTransformer( + # features = self.kmers, + # file = scaler_file + # ) + self._scaler = TensorMinMaxScaler(self._nb_kmers) self._scaler.fit(ds) diff --git a/src/models/sklearn/multiclass_models.py b/src/models/sklearn/multiclass_models.py index ef2de2a..d12724b 100644 --- a/src/models/sklearn/multiclass_models.py +++ b/src/models/sklearn/multiclass_models.py @@ -119,11 +119,11 @@ def preprocess(self, ds, scaling = False, scaler_file = None): self._weights = self._compute_weights() # Scaling - self._scaler = TensorTfIdfTransformer( - features = self.kmers, - file = scaler_file - ) - # self._scaler = TensorMinMaxScaler(self._nb_kmers) + # self._scaler = TensorTfIdfTransformer( + # features = self.kmers, + # file = scaler_file + # ) + self._scaler = TensorMinMaxScaler(self._nb_kmers) self._scaler.fit(ds) # Models training From aa888a73a48969e3d165194bf3afbe02e016aa84 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Sun, 17 Dec 2023 12:46:46 -0500 Subject: [PATCH 70/92] weighted multiclass sklearn predict_proba --- src/models/kerasTF/multiclass_models.py | 37 ------------------------- src/models/sklearn/multiclass_models.py | 8 +++++- 2 files changed, 7 insertions(+), 38 deletions(-) diff --git a/src/models/kerasTF/multiclass_models.py b/src/models/kerasTF/multiclass_models.py index bc1b963..55e7421 100644 --- a/src/models/kerasTF/multiclass_models.py +++ b/src/models/kerasTF/multiclass_models.py @@ -149,43 +149,6 @@ def preprocess(self, ds, scaling = False, scaler_file = None): def fit(self, datasets): print('fit') - """ - TODO: If Ray AIR training is too long, try using the datasets groupby / Tune for multimodel training - TODO: train_func per model - TODO: Confirm how it works in Jupyter Notebook - # Preprocessing loop - for name, ds in datasets.items(): - # ds = ds.drop_columns(['id']) - ds = self._encoder.transform(ds) - if self._scaler is not None: - ds = self._scaler.transform(ds) - ds = ds.materialize() - datasets[name] = ds - - # One sub-model per artificial cluster of samples - ds['train'] = self._random_split_dataset(ds['train']) - - # Checkpointing directory - model_dir = os.path.join(self._workdir, f'{self.classifier}_{self.taxa}') - if not os.path.isdir(model_dir): - os.mkdir(model_dir) - - # Distributed building & training - if self.classifier == 'lstm_attention': - print('Training multiclass classifier based on Deep Neural Network hybrid between LSTM and Attention') - training_result = ds.map_groups(build_fit_lstm_attention, batch_format = 'numpy') - elif self.classifier == 'cnn': - print('Training multiclass classifier based on CNN Neural Network') - training_result = ds.map_groups(build_fit_cnn, batch_format = 'numpy') - elif self.classifier == 'widecnn': - print('Training multiclass classifier based on Wide CNN Network') - training_result = ds.map_groups(build_fit_widecnn, batch_format = 'numpy') - - 
training_result = training_result.to_pandas().to_dict('records') - for record in training_result: - self._model_ckpt[record['cluster']] = record['file'] - """ - # Preprocessing loop for name, ds in datasets.items(): # ds = ds.drop_columns(['id']) diff --git a/src/models/sklearn/multiclass_models.py b/src/models/sklearn/multiclass_models.py index d12724b..f6fade1 100644 --- a/src/models/sklearn/multiclass_models.py +++ b/src/models/sklearn/multiclass_models.py @@ -259,11 +259,17 @@ def predict_func(data): proba = model.predict_proba(X) for i, cls in enumerate(model.classes_): pred[:, cls] += proba[:, i] - # pred = pred / len(self._model_ckpt) + pred = pred / len(self._model_ckpt) return {'predictions' : pred} probabilities = ds.map_batches(predict_func, batch_format = 'numpy') probabilities = _unwrap_ndarray_object_type_if_needed(probabilities.to_pandas()['predictions']) + + weights = np.zeros(len(self._weights)) + for encoded, w in self._weights.items(): + weights[encoded] = w + + probabilities = probabilities * weights return probabilities else: From 4cd603f74df5e0ad532f2cf49336cd4aecc74a8d Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Sun, 17 Dec 2023 16:27:01 -0500 Subject: [PATCH 71/92] debug multiclass weighted proba --- src/models/sklearn/multiclass_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/models/sklearn/multiclass_models.py b/src/models/sklearn/multiclass_models.py index f6fade1..0f185d4 100644 --- a/src/models/sklearn/multiclass_models.py +++ b/src/models/sklearn/multiclass_models.py @@ -252,7 +252,7 @@ def _predict_proba(self, ds): def predict_func(data): X = _unwrap_ndarray_object_type_if_needed(data[TENSOR_COLUMN_NAME]) - pred = np.zeros((len(X), len(self._labels_map))) + pred = np.zeros((len(X), len(self._labels_map)-1)) for cluster, model_file in self._model_ckpt.items(): with open(model_file, 'rb') as file: model = cpickle.load(file) From 1391c327c57f1f9d4c078dbfcc2d140752d2eb29 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Mon, 18 Dec 2023 14:45:04 -0500 Subject: [PATCH 72/92] predictions rectify --- src/Caribou_reduce_features.py | 2 +- src/models/kerasTF/multiclass_models.py | 85 ++-------------------- src/models/models_utils.py | 6 +- src/models/preprocessors/min_max_scaler.py | 10 +-- src/models/sklearn/multiclass_models.py | 15 ---- 5 files changed, 13 insertions(+), 105 deletions(-) diff --git a/src/Caribou_reduce_features.py b/src/Caribou_reduce_features.py index efe88db..1f2e5bb 100644 --- a/src/Caribou_reduce_features.py +++ b/src/Caribou_reduce_features.py @@ -151,7 +151,7 @@ def features_selection(train_ds, export_ds, kmers, taxa): parser.add_argument('-dt','--dataset_name', default='dataset', help='Name of the dataset used to name files') parser.add_argument('-l','--kmers_list', default=None, type=Path, help='PATH to a file containing a list of k-mers that will be reduced') # Parameters - parser.add_argument('-t','--taxa', default='phylum', help='The taxonomic level to use for the classification, defaults to Phylum.') + parser.add_argument('-t','--taxa', default='species', help='The taxonomic level to use for the classification, defaults to Phylum.') parser.add_argument('-o','--outdir', required=True, type=Path, help='PATH to a directory on file where outputs will be saved') parser.add_argument('-wd','--workdir', default='/tmp/spill', type=Path, help='Optional. 
Path to a working directory where tuning data will be spilled') args = parser.parse_args() diff --git a/src/models/kerasTF/multiclass_models.py b/src/models/kerasTF/multiclass_models.py index 55e7421..4980023 100644 --- a/src/models/kerasTF/multiclass_models.py +++ b/src/models/kerasTF/multiclass_models.py @@ -32,6 +32,9 @@ from ray.train.tensorflow import TensorflowPredictor from ray.train.batch_predictor import BatchPredictor +# Data +from ray.air.util.data_batch_conversion import _unwrap_ndarray_object_type_if_needed + __author__ = 'Nicolas de Montigny' __all__ = ['KerasTFModel'] @@ -195,28 +198,16 @@ def fit(self, datasets): def predict(self, ds): print('predict') - """ - TODO: If Ray AIR training is too long, try using the datasets groupby / Tune for multimodel training - probabilities = self._predict_proba(ds) - predictions = np.argmax(probabilities, axis = 1) - predictions = self._label_decode(predictions) - return predictions - """ # Predict with model - predictions = self._predict_proba(ds) + probabilities = self._predict_proba(ds) # Convert predictions to labels - predictions = self._get_abs_pred(predictions) + probabilities = _unwrap_ndarray_object_type_if_needed(probabilities.to_pandas()['predictions']) + predictions = np.argmax(probabilities, axis = 1) # Return decoded labels return self._label_decode(predictions) def predict_proba(self, ds, threshold = 0.8): print('predict_proba') - """ - TODO: If Ray AIR training is too long, try using the datasets groupby / Tune for multimodel training - probabilities = self._predict_proba(ds) - predictions = self._get_threshold_pred(probabilities, threshold) - return self._label_decode(predictions) - """ # Predict with model predictions = self._predict_proba(ds) # Convert predictions to labels with threshold @@ -226,32 +217,6 @@ def predict_proba(self, ds, threshold = 0.8): def _predict_proba(self, ds): print('_predict_proba') - """ - TODO: If Ray AIR training is too long, try using the datasets groupby / Tune for multimodel training - if ds.count() > 0: - if self._scaler is not None: - ds = self._scaler.transform(ds) - # ds = ds.materialize() - - def predict_func(data): - X = _unwrap_ndarray_object_type_if_needed(data[TENSOR_COLUMN_NAME]) - pred = np.zeros((len(X), len(self._labels_map))) - for cluster, model_file in self._model_ckpt.items(): - with open(model_file, 'rb') as file: - model = cpickle.load(file) - proba = model.predict_proba(X) - for i, cls in enumerate(model.classes_): - pred[:, cls] += proba[:, i] - # pred = pred / len(self._model_ckpt) - return {'predictions' : pred} - - probabilities = ds.map_batches(predict_func, batch_format = 'numpy') - probabilities = _unwrap_ndarray_object_type_if_needed(probabilities.to_pandas()['predictions']) - - return probabilities - else: - raise ValueError('Empty dataset, cannot execute predictions!') - """ if ds.count() > 0: if len(ds.schema().names) > 1: col_2_drop = [col for col in ds.schema().names if col != TENSOR_COLUMN_NAME] @@ -284,28 +249,6 @@ def predict_func(data): else: raise ValueError('No data to predict') - def _get_abs_pred(self, predictions): - print('_get_abs_pred') - def map_predicted_label(ds): - ds = ds['predictions'] - pred = pd.DataFrame({ - 'best_proba': [np.max(arr) for arr in ds], - 'predicted_label' : [np.argmax(arr) for arr in ds] - }) - - return {'predictions' : pred['predicted_label'].to_numpy(dtype = np.int32)} - - predict = [] - predictions = predictions.map_batches( - lambda batch : map_predicted_label(batch), - batch_format = 'numpy', - batch_size = 
self.batch_size - ) - for row in predictions.iter_rows(): - predict.append(row['predictions']) - - return predict - def _get_threshold_pred(self, predictions, threshold): print('_get_threshold_pred') def map_predicted_label(ds, threshold): @@ -328,19 +271,3 @@ def map_predicted_label(ds, threshold): predict.append(row['predictions']) return predict - -# TODO: Confirm how it works in Jupyter Notebook -def build_fit_lstm_attention(data): - """ - LSTM-Attention NN training function - """ - -def build_fit_cnn(data): - """ - Convolution NN training function - """ - -def build_fit_widecnn(data): - """ - Wide Convolution NN training function - """ \ No newline at end of file diff --git a/src/models/models_utils.py b/src/models/models_utils.py index da4061b..7b7fc6c 100644 --- a/src/models/models_utils.py +++ b/src/models/models_utils.py @@ -110,11 +110,6 @@ def _get_threshold_pred(self): """ """ - @abstractmethod - def _label_decode(self): - """ - """ - def _compute_weights(self): """ Set class weights depending on their abundance in data-associated classes csv @@ -141,6 +136,7 @@ def _compute_weights(self): def _label_decode(self, predict): print('_label_decode') + decoded = pd.Series(np.empty(len(predict), dtype=object)) for label, encoded in self._labels_map.items(): decoded[predict == encoded] = label diff --git a/src/models/preprocessors/min_max_scaler.py b/src/models/preprocessors/min_max_scaler.py index 0f672a6..a311b5e 100644 --- a/src/models/preprocessors/min_max_scaler.py +++ b/src/models/preprocessors/min_max_scaler.py @@ -15,7 +15,7 @@ class TensorMinMaxScaler(Preprocessor): def __init__(self, nb_features): # Parameters - self.__nb_features = nb_features + self._nb_features = nb_features def _fit(self, ds: Dataset) -> Preprocessor: """ @@ -26,12 +26,12 @@ def _fit(self, ds: Dataset) -> Preprocessor: def Min(dct): arr = dct[TENSOR_COLUMN_NAME] - min = np.array([arr[:,i].min() for i in range(self.__nb_features)]) + min = np.array([arr[:,i].min() for i in range(self._nb_features)]) return min def Max(dct): arr = dct[TENSOR_COLUMN_NAME] - max = np.array([arr[:,i].max() for i in range(self.__nb_features)]) + max = np.array([arr[:,i].max() for i in range(self._nb_features)]) return max for batch in ds.iter_batches(batch_format = 'numpy'): @@ -41,8 +41,8 @@ def Max(dct): min = np.array(min) max = np.array(max) - min = np.array([min[:,i].min() for i in range(self.__nb_features)]) - max = np.array([max[:,i].max() for i in range(self.__nb_features)]) + min = np.array([min[:,i].min() for i in range(self._nb_features)]) + max = np.array([max[:,i].max() for i in range(self._nb_features)]) self.stats_ = {'min' : min, 'max' : max} diff --git a/src/models/sklearn/multiclass_models.py b/src/models/sklearn/multiclass_models.py index 0f185d4..7a53787 100644 --- a/src/models/sklearn/multiclass_models.py +++ b/src/models/sklearn/multiclass_models.py @@ -248,7 +248,6 @@ def predict_proba(self, ds, threshold = 0.8): def _predict_proba(self, ds): if ds.count() > 0: ds = self._scaler.transform(ds) - # ds = ds.materialize() def predict_func(data): X = _unwrap_ndarray_object_type_if_needed(data[TENSOR_COLUMN_NAME]) @@ -264,12 +263,6 @@ def predict_func(data): probabilities = ds.map_batches(predict_func, batch_format = 'numpy') probabilities = _unwrap_ndarray_object_type_if_needed(probabilities.to_pandas()['predictions']) - - weights = np.zeros(len(self._weights)) - for encoded, w in self._weights.items(): - weights[encoded] = w - - probabilities = probabilities * weights return probabilities else: @@ -289,11 
+282,3 @@ def _get_threshold_pred(self, predict, threshold): proba_predict.loc[proba_predict['best_proba'] < threshold, 'predicted_label'] = -1 return proba_predict['predicted_label'] - - def _label_decode(self, predict): - print('_label_decode') - decoded = pd.Series(np.empty(len(predict), dtype=object)) - for label, encoded in self._labels_map.items(): - decoded[predict == encoded] = label - - return np.array(decoded) From 5eba525bc3252c3a6c19ba95da9d232dbdb44339 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Mon, 18 Dec 2023 16:54:15 -0500 Subject: [PATCH 73/92] performances tuning --- src/models/classification.py | 3 ++- src/models/kerasTF/binary_models.py | 8 ++++---- src/models/kerasTF/multiclass_models.py | 8 ++++---- src/models/sklearn/binary_models.py | 8 ++++---- src/models/sklearn/multiclass_models.py | 16 +++++++--------- 5 files changed, 21 insertions(+), 22 deletions(-) diff --git a/src/models/classification.py b/src/models/classification.py index b1f76e6..c164bf8 100644 --- a/src/models/classification.py +++ b/src/models/classification.py @@ -320,7 +320,8 @@ def _score_cv(self, y_true, y_pred, taxa): support = precision_recall_fscore_support( y_compare['y_true'], y_compare['y_pred'], - average = 'weighted' + average = 'weighted', + zero_division = np.nan ) scores = pd.DataFrame({ diff --git a/src/models/kerasTF/binary_models.py b/src/models/kerasTF/binary_models.py index fe7fb58..ae30644 100644 --- a/src/models/kerasTF/binary_models.py +++ b/src/models/kerasTF/binary_models.py @@ -147,8 +147,8 @@ def preprocess(self, ds, scaling = False, scaler_file = None): # features = self.kmers, # file = scaler_file # ) - self._scaler = TensorMinMaxScaler(self._nb_kmers) - self._scaler.fit(ds) + # self._scaler = TensorMinMaxScaler(self._nb_kmers) + # self._scaler.fit(ds) # Model training ######################################################################################################### @@ -159,7 +159,7 @@ def fit(self, datasets): for name, ds in datasets.items(): # ds = ds.drop_columns(['id']) ds = self._encoder.transform(ds) - ds = self._scaler.transform(ds) + # ds = self._scaler.transform(ds) ds = ds.materialize() datasets[name] = ds @@ -224,7 +224,7 @@ def _predict_proba(self, ds): ds = ds.drop_columns(col_2_drop) # Preprocess - ds = self._scaler.transform(ds) + # ds = self._scaler.transform(ds) ds = ds.materialize() self._predictor = BatchPredictor.from_checkpoint( diff --git a/src/models/kerasTF/multiclass_models.py b/src/models/kerasTF/multiclass_models.py index 4980023..9807556 100644 --- a/src/models/kerasTF/multiclass_models.py +++ b/src/models/kerasTF/multiclass_models.py @@ -144,8 +144,8 @@ def preprocess(self, ds, scaling = False, scaler_file = None): # features = self.kmers, # file = scaler_file # ) - self._scaler = TensorMinMaxScaler(self._nb_kmers) - self._scaler.fit(ds) + # self._scaler = TensorMinMaxScaler(self._nb_kmers) + # self._scaler.fit(ds) # Models training ######################################################################################################### @@ -156,7 +156,7 @@ def fit(self, datasets): for name, ds in datasets.items(): # ds = ds.drop_columns(['id']) ds = self._encoder.transform(ds) - ds = self._scaler.transform(ds) + # ds = self._scaler.transform(ds) ds = ds.materialize() datasets[name] = ds @@ -223,7 +223,7 @@ def _predict_proba(self, ds): ds = ds.drop_columns(col_2_drop) # Preprocess - ds = self._scaler.transform(ds) + # ds = self._scaler.transform(ds) ds = ds.materialize() self._predictor = BatchPredictor.from_checkpoint( 
diff --git a/src/models/sklearn/binary_models.py b/src/models/sklearn/binary_models.py index 0ffa104..cd5e6c0 100644 --- a/src/models/sklearn/binary_models.py +++ b/src/models/sklearn/binary_models.py @@ -117,8 +117,8 @@ def preprocess(self, ds, scaling = False, scaler_file = None): # features = self.kmers, # file = scaler_file # ) - self._scaler = TensorMinMaxScaler(self._nb_kmers) - self._scaler.fit(ds) + # self._scaler = TensorMinMaxScaler(self._nb_kmers) + # self._scaler.fit(ds) # Model training @@ -131,7 +131,7 @@ def fit(self, datasets): for name, ds in datasets.items(): # ds = ds.drop_columns(['id']) ds = self._encoder.transform(ds) - ds = self._scaler.transform(ds) + # ds = self._scaler.transform(ds) datasets[name] = ray.put(ds) try: @@ -189,7 +189,7 @@ def _build(self): def predict(self, ds): print('predict') if ds.count() > 0: - ds = self._scaler.transform(ds) + # ds = self._scaler.transform(ds) ds = ds.materialize() predict_kwargs = {'features':self.kmers, 'num_estimator_cpus':-1} self._predictor = BatchPredictor.from_checkpoint(self._model_ckpt, SklearnTensorPredictor) diff --git a/src/models/sklearn/multiclass_models.py b/src/models/sklearn/multiclass_models.py index 7a53787..f7e9f10 100644 --- a/src/models/sklearn/multiclass_models.py +++ b/src/models/sklearn/multiclass_models.py @@ -118,13 +118,9 @@ def preprocess(self, ds, scaling = False, scaler_file = None): # Class weights self._weights = self._compute_weights() - # Scaling - # self._scaler = TensorTfIdfTransformer( - # features = self.kmers, - # file = scaler_file - # ) - self._scaler = TensorMinMaxScaler(self._nb_kmers) - self._scaler.fit(ds) + if self.classifier == 'mnb': + self._scaler = TensorMinMaxScaler(self._nb_kmers) + self._scaler.fit(ds) # Models training ######################################################################################################### @@ -135,7 +131,8 @@ def fit(self, datasets): # ds = ds.drop_columns(['id']) train_ds = datasets['train'] train_ds = self._encoder.transform(train_ds) - train_ds = self._scaler.transform(train_ds) + if self.classifier == 'mnb': + train_ds = self._scaler.transform(train_ds) # datasets[name] = ds # One sub-model per artificial cluster of samples @@ -247,7 +244,8 @@ def predict_proba(self, ds, threshold = 0.8): def _predict_proba(self, ds): if ds.count() > 0: - ds = self._scaler.transform(ds) + if self.classifier == 'mnb': + ds = self._scaler.transform(ds) def predict_func(data): X = _unwrap_ndarray_object_type_if_needed(data[TENSOR_COLUMN_NAME]) From 747ae2e9326df5929aaff1a2be05c9ae47defb24 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Mon, 18 Dec 2023 17:09:23 -0500 Subject: [PATCH 74/92] LSTM params to be used with cuDNN --- src/models/kerasTF/build_neural_networks.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/models/kerasTF/build_neural_networks.py b/src/models/kerasTF/build_neural_networks.py index e69747a..12893f3 100644 --- a/src/models/kerasTF/build_neural_networks.py +++ b/src/models/kerasTF/build_neural_networks.py @@ -22,8 +22,8 @@ def build_attention(nb_features): inputs = Input(shape = (nb_features,1)) # x = Embedding(nb_features, 128)(inputs) - x = LSTM(128, return_sequences = True, dropout = 0.1, recurrent_dropout = 0.1 )(inputs) - x = LSTM(128, return_sequences = True, dropout = 0.1, recurrent_dropout = 0.1 )(x) + x = LSTM(128, return_sequences = True, dropout = 0.1)(inputs) + x = LSTM(128, return_sequences = True, dropout = 0.1)(x) x = AttentionWeightedAverage()(x) x = Dense(128, activation = 
"relu")(x) @@ -46,7 +46,7 @@ def build_LSTM(nb_features): inputs = Input(shape = (nb_features,1)) # x = Embedding(nb_features, 128)(inputs) - x = LSTM(128, recurrent_dropout = 0.1, dropout = 0.1)(inputs) + x = LSTM(128, dropout = 0.1)(inputs) x = Dense(1, activation = 'tanh')(x) @@ -66,8 +66,8 @@ def build_deepLSTM(nb_features): inputs = Input(shape=(nb_features,1)) # netA = Embedding(nb_features, 128)(inputs) - netA = LSTM(40, activation='tanh',recurrent_dropout=0.05,dropout=0.1,name='A_%d'%40,return_sequences=True) (inputs) - netA = LSTM(40, activation='tanh',recurrent_dropout=0.05,dropout=0.1,name='B_%d'%40) (netA) + netA = LSTM(40, activation='tanh',dropout=0.1,name='A_%d'%40,return_sequences=True) (inputs) + netA = LSTM(40, activation='tanh',dropout=0.1,name='B_%d'%40) (netA) netB = Dense(100, activation='tanh',name='G_%d'%40) (inputs) netB = Dense(100, activation='tanh',name='H_%d'%40) (netB) From d851ee0905fa87b946b1f6926c6fd10c05609a39 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Tue, 19 Dec 2023 16:49:07 -0500 Subject: [PATCH 75/92] keras debugged for GPU --- src/models/kerasTF/binary_models.py | 25 ++------ src/models/kerasTF/models.py | 78 +++++++++++++++++++++++-- src/models/kerasTF/multiclass_models.py | 25 ++------ 3 files changed, 86 insertions(+), 42 deletions(-) diff --git a/src/models/kerasTF/binary_models.py b/src/models/kerasTF/binary_models.py index ae30644..1a81eb9 100644 --- a/src/models/kerasTF/binary_models.py +++ b/src/models/kerasTF/binary_models.py @@ -20,7 +20,7 @@ from ray.air import session # from ray.air.integrations.keras import Callback from ray.air.config import ScalingConfig -from models.kerasTF.models import train_func, build_model +from models.kerasTF.models import train_func_CPU, train_func_GPU, build_model from ray.air.integrations.keras import ReportCheckpointCallback from ray.train.tensorflow import TensorflowTrainer, TensorflowCheckpoint @@ -93,25 +93,7 @@ def __init__( kmers_list, csv ) - # Parameters - # Initialize hidden - self._nb_CPU_data = int(os.cpu_count() * 0.2) - self._nb_CPU_training = int(os.cpu_count() - self._nb_CPU_data) - self._nb_GPU = len(tf.config.list_physical_devices('GPU')) - # Initialize empty self._nb_classes = 2 - self._nb_CPU_per_worker = 0 - self._nb_GPU_per_worker = 0 - # Computing variables - if self._nb_GPU > 0: - self._use_gpu = True - self._n_workers = self._nb_GPU - self._nb_CPU_per_worker = int(self._nb_CPU_training / self._n_workers) - self._nb_GPU_per_worker = 1 - else: - self._use_gpu = False - self._n_workers = int(self._nb_CPU_training * 0.2) - self._nb_CPU_per_worker = int(int(self._nb_CPU_training * 0.8) / self._n_workers) if self.classifier == 'attention': print('Training bacterial / host classifier based on Attention Weighted Neural Network') @@ -173,6 +155,11 @@ def fit(self, datasets): 'weights': self._weights } + if self._nb_GPU > 0: + train_func = train_func_GPU + else: + train_func = train_func_CPU + # Define trainer / tuner self._trainer = TensorflowTrainer( train_loop_per_worker=train_func, diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index ba1c3a4..d26b28f 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -94,7 +94,25 @@ def __init__( kmers_list, csv ) - + # Parameters + # Initialize hidden + self._nb_CPU_data = int(os.cpu_count() * 0.2) # 6 + self._nb_CPU_training = int(os.cpu_count() - self._nb_CPU_data) # 26 + self._nb_GPU = len(tf.config.list_physical_devices('GPU')) # 6 + # Initialize empty + self._nb_CPU_per_worker = 0 + 
self._nb_GPU_per_worker = 0 + # Computing variables + if self._nb_GPU > 0: + self._use_gpu = True + self._n_workers = self._nb_GPU #6 + self._nb_CPU_per_worker = int(self._nb_CPU_training / self._n_workers) # 4 + self._nb_GPU_per_worker = 1 + else: + self._use_gpu = False + self._n_workers = int(self._nb_CPU_training * 0.2) + self._nb_CPU_per_worker = int(int(self._nb_CPU_training * 0.8) / self._n_workers) + @abstractmethod def preprocess(self): """ @@ -120,7 +138,6 @@ def _get_threshold_pred(self): """ """ - # Training/building function outside of the class as mentioned on the Ray discussion # https://discuss.ray.io/t/statuscode-resource-exhausted/4379/16 ################################################################################ @@ -130,7 +147,7 @@ def _get_threshold_pred(self): # Smaller nb of workers + bigger nb CPU_per_worker + smaller batch_size to avoid memory overload # https://discuss.ray.io/t/ray-sgd-distributed-tensorflow/261/8 -def train_func(config): +def train_func_CPU(config): # Parameters batch_size = config.get('batch_size', 128) epochs = config.get('epochs', 10) @@ -139,7 +156,7 @@ def train_func(config): model = config.get('model') weights = config.get('weights') - # Model construction + # Model construction model = build_model(model, nb_cls, size) train_data = session.get_dataset_shard('train') @@ -181,6 +198,59 @@ def train_func(config): gc.collect() tf.keras.backend.clear_session() +def train_func_GPU(config): + # Parameters + batch_size = config.get('batch_size', 128) + epochs = config.get('epochs', 10) + size = config.get('size') + nb_cls = config.get('nb_cls') + model = config.get('model') + weights = config.get('weights') + + # Model construction + strategy = tf.distribute.MirroredStrategy() + with strategy.scope(): + model = build_model(model, nb_cls, size) + + train_data = session.get_dataset_shard('train') + val_data = session.get_dataset_shard('validation') + + for _ in range(epochs): + batch_train = train_data.to_tf( + feature_columns = TENSOR_COLUMN_NAME, + label_columns = LABELS_COLUMN_NAME, + batch_size = batch_size, + local_shuffle_buffer_size = batch_size, + local_shuffle_seed = int(np.random.randint(1,10000, size = 1)) + ) + batch_val = val_data.to_tf( + feature_columns = TENSOR_COLUMN_NAME, + label_columns = LABELS_COLUMN_NAME, + batch_size = batch_size, + local_shuffle_buffer_size = batch_size, + local_shuffle_seed = int(np.random.randint(1,10000, size = 1)) + ) + history = model.fit( + x = batch_train, + validation_data = batch_val, + callbacks = [ReportCheckpointCallback()], + class_weight = weights, + verbose = 0 + ) + session.report({ + 'accuracy': history.history['accuracy'][0], + 'loss': history.history['loss'][0], + 'val_accuracy': history.history['val_accuracy'][0], + 'val_loss': history.history['val_loss'][0], + }, + checkpoint=TensorflowCheckpoint.from_model(model) + ) + gc.collect() + tf.keras.backend.clear_session() + del model + gc.collect() + tf.keras.backend.clear_session() + def build_model(classifier, nb_cls, nb_kmers): if classifier == 'attention': model = build_attention(nb_kmers) diff --git a/src/models/kerasTF/multiclass_models.py b/src/models/kerasTF/multiclass_models.py index 9807556..68fe190 100644 --- a/src/models/kerasTF/multiclass_models.py +++ b/src/models/kerasTF/multiclass_models.py @@ -21,7 +21,7 @@ from ray.air import session # from ray.air.integrations.keras import Callback from ray.air.config import ScalingConfig -from models.kerasTF.models import train_func, build_model +from models.kerasTF.models import 
train_func_CPU, train_func_GPU, build_model from ray.air.integrations.keras import ReportCheckpointCallback from ray.train.tensorflow import TensorflowTrainer, TensorflowCheckpoint @@ -97,25 +97,7 @@ def __init__( kmers_list, csv ) - # Parameters - # Initialize hidden - self._nb_CPU_data = int(os.cpu_count() * 0.2) - self._nb_CPU_training = int(os.cpu_count() - self._nb_CPU_data) - self._nb_GPU = len(tf.config.list_physical_devices('GPU')) - # Initialize empty self._nb_classes = None - self._nb_CPU_per_worker = 0 - self._nb_GPU_per_worker = 0 - # Computing variables - if self._nb_GPU > 0: - self._use_gpu = True - self._n_workers = self._nb_GPU - self._nb_CPU_per_worker = int(self._nb_CPU_training / self._n_workers) - self._nb_GPU_per_worker = 1 - else: - self._use_gpu = False - self._n_workers = int(self._nb_CPU_training * 0.2) - self._nb_CPU_per_worker = int(int(self._nb_CPU_training * 0.8) / self._n_workers) # Data preprocessing ######################################################################################################### @@ -170,6 +152,11 @@ def fit(self, datasets): 'weights': self._weights } + if self._nb_GPU > 0: + train_func = train_func_GPU + else: + train_func = train_func_CPU + # Define trainer / tuner self._trainer = TensorflowTrainer( train_loop_per_worker=train_func, From 64bd8d8b08dd51b7763906083cec171bfe895011 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Tue, 19 Dec 2023 16:55:29 -0500 Subject: [PATCH 76/92] keras predictions all resources --- src/models/kerasTF/binary_models.py | 19 +++++-------------- src/models/kerasTF/multiclass_models.py | 19 +++++-------------- 2 files changed, 10 insertions(+), 28 deletions(-) diff --git a/src/models/kerasTF/binary_models.py b/src/models/kerasTF/binary_models.py index 1a81eb9..0679396 100644 --- a/src/models/kerasTF/binary_models.py +++ b/src/models/kerasTF/binary_models.py @@ -219,20 +219,11 @@ def _predict_proba(self, ds): TensorflowPredictor, model_definition = lambda: build_model(self.classifier, self._nb_classes, self._nb_kmers) ) - if self._nb_GPU > 0: - predictions = self._predictor.predict( - data = ds, - feature_columns = [TENSOR_COLUMN_NAME], - batch_size = self.batch_size, - num_gpus_per_worker = self._nb_GPU_per_worker - ) - else: - predictions = self._predictor.predict( - data = ds, - feature_columns = [TENSOR_COLUMN_NAME], - batch_size = self.batch_size, - num_cpus_per_worker = self._nb_CPU_per_worker - ) + predictions = self._predictor.predict( + data = ds, + feature_columns = [TENSOR_COLUMN_NAME], + batch_size = self.batch_size, + ) return predictions else: raise ValueError('No data to predict') diff --git a/src/models/kerasTF/multiclass_models.py b/src/models/kerasTF/multiclass_models.py index 68fe190..4ab1b9c 100644 --- a/src/models/kerasTF/multiclass_models.py +++ b/src/models/kerasTF/multiclass_models.py @@ -218,20 +218,11 @@ def _predict_proba(self, ds): TensorflowPredictor, model_definition = lambda: build_model(self.classifier, self._nb_classes, self._nb_kmers) ) - if self._nb_GPU > 0: - predictions = self._predictor.predict( - data = ds, - feature_columns = [TENSOR_COLUMN_NAME], - batch_size = self.batch_size, - num_gpus_per_worker = self._nb_GPU_per_worker - ) - else: - predictions = self._predictor.predict( - data = ds, - feature_columns = [TENSOR_COLUMN_NAME], - batch_size = self.batch_size, - num_cpus_per_worker = self._nb_CPU_per_worker - ) + predictions = self._predictor.predict( + data = ds, + feature_columns = [TENSOR_COLUMN_NAME], + batch_size = self.batch_size, + ) return 
predictions else: raise ValueError('No data to predict') From fb4fc1b354a7daa08af75d1d3cab10fba444d515 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Wed, 20 Dec 2023 12:34:14 -0500 Subject: [PATCH 77/92] keras CPU/GPU strategies --- src/models/kerasTF/binary_models.py | 148 +++++++------- src/models/kerasTF/models.py | 259 +++++++++++++++++++----- src/models/kerasTF/multiclass_models.py | 120 ++++++----- 3 files changed, 358 insertions(+), 169 deletions(-) diff --git a/src/models/kerasTF/binary_models.py b/src/models/kerasTF/binary_models.py index 0679396..b03165b 100644 --- a/src/models/kerasTF/binary_models.py +++ b/src/models/kerasTF/binary_models.py @@ -104,7 +104,7 @@ def __init__( # Data preprocessing ######################################################################################################### - + """ def preprocess(self, ds, scaling = False, scaler_file = None): print('preprocess') # Labels encoding @@ -131,10 +131,11 @@ def preprocess(self, ds, scaling = False, scaler_file = None): # ) # self._scaler = TensorMinMaxScaler(self._nb_kmers) # self._scaler.fit(ds) - + """ # Model training ######################################################################################################### + """ def fit(self, datasets): print('fit') # Preprocessing loop @@ -145,6 +146,12 @@ def fit(self, datasets): ds = ds.materialize() datasets[name] = ds + if self._nb_GPU > 0: + self._fit_GPU(datasets) + else: + self._fit_CPU(datasets) + + def _fit_CPU(self, datasets): # Training parameters train_params = { 'batch_size': self.batch_size, @@ -155,22 +162,16 @@ def fit(self, datasets): 'weights': self._weights } - if self._nb_GPU > 0: - train_func = train_func_GPU - else: - train_func = train_func_CPU - # Define trainer / tuner self._trainer = TensorflowTrainer( - train_loop_per_worker=train_func, + train_loop_per_worker=train_func_CPU, train_loop_config=train_params, scaling_config=ScalingConfig( trainer_resources={'CPU': self._nb_CPU_data}, num_workers=self._n_workers, use_gpu=self._use_gpu, resources_per_worker={ - 'CPU': self._nb_CPU_per_worker, - 'GPU': self._nb_GPU_per_worker + 'CPU': self._nb_CPU_per_worker } ), run_config=RunConfig( @@ -183,24 +184,40 @@ def fit(self, datasets): training_result = self._trainer.fit() self._model_ckpt = training_result.best_checkpoints[0][0] + def _fit_GPU(self, datasets): + # Training parameters + train_params = { + 'batch_size': self.batch_size, + 'epochs': self._training_epochs, + 'size': self._nb_kmers, + 'nb_cls': self._nb_classes, + 'taxa': self.taxa, + 'workdir':self._workdir, + 'model': self.classifier, + 'weights': self._weights + } + + self._model_ckpt = train_func_GPU(datasets, train_params) + """ # Model predicting ######################################################################################################### + """ def predict(self, ds): print('predict') # Predict with model - predictions = self._predict_proba(ds) + probabilities = self._predict_proba(ds) # Convert predictions to labels - predictions = self._get_abs_pred(predictions) + predictions = self._get_abs_pred(probabilities) # Return decoded labels return self._label_decode(predictions) def predict_proba(self, ds, threshold = 0.8): print('predict_proba') # Predict with model - predictions = self._predict_proba(ds) + probabilities = self._predict_proba(ds) # Convert predictions to labels with threshold - predictions = self._get_threshold_pred(predictions, threshold) + predictions = self._get_threshold_pred(probabilities, threshold) # Return decoded labels 
return self._label_decode(predictions) @@ -210,69 +227,64 @@ def _predict_proba(self, ds): col_2_drop = [col for col in ds.schema().names if col != TENSOR_COLUMN_NAME] ds = ds.drop_columns(col_2_drop) - # Preprocess - # ds = self._scaler.transform(ds) ds = ds.materialize() - self._predictor = BatchPredictor.from_checkpoint( - self._model_ckpt, - TensorflowPredictor, - model_definition = lambda: build_model(self.classifier, self._nb_classes, self._nb_kmers) - ) - predictions = self._predictor.predict( - data = ds, - feature_columns = [TENSOR_COLUMN_NAME], - batch_size = self.batch_size, - ) - return predictions + if self._nb_GPU > 0: + probabilities = self._predict_proba_GPU(ds) + else: + probabilities = self._predict_proba_CPU(ds) + + return probabilities + else: raise ValueError('No data to predict') - + def _predict_proba_CPU(self, ds): + print('_predict_proba_CPU') + self._predictor = BatchPredictor.from_checkpoint( + self._model_ckpt, + TensorflowPredictor, + model_definition = lambda: build_model(self.classifier, self._nb_classes, self._nb_kmers) + ) + predictions = self._predictor.predict( + data = ds, + feature_columns = [TENSOR_COLUMN_NAME], + batch_size = self.batch_size, + ) + + probabilities = _unwrap_ndarray_object_type_if_needed(probabilities.to_pandas()['predictions']) + + return predictions + + def _predict_proba_GPU(self, ds): + print('_predict_proba_GPU') + model = load_model(self._model_ckpt) + probabilities = [] + for batch in ds.iter_tf_batches(batch_size = self.batch_size): + probabilities.extend(model.predict(batch[TENSOR_COLUMN_NAME])) + """ def _get_abs_pred(self, predictions): print('_get_abs_pred') - def map_predicted_label(ds): - ds = np.ravel(ds['predictions']) - threshold = 0.5 - predict = pd.DataFrame({ - 'proba': ds, - 'predicted_label': np.full(len(ds), -1) - }) - predict.loc[predict['proba'] > threshold, 'predicted_label'] = 1 - predict.loc[predict['proba'] < threshold, 'predicted_label'] = 0 - return {'predictions' : predict['predicted_label'].to_numpy(dtype = np.int32)} - - predict = [] - predictions = predictions.map_batches( - lambda batch : map_predicted_label(batch), - batch_format = 'numpy', - batch_size = self.batch_size - ) - for row in predictions.iter_rows(): - predict.append(row['predictions']) + return np.round(np.ravel(predictions)) + # predict = pd.DataFrame({ + # 'proba': np.ravel(predictions), + # 'predicted_label' : np.full(len(predictions), -1) + # }) + # predict.loc[predict['proba'] > 0.5, 'predicted_label'] = 1 + # predict.loc[predict['proba'] < 0.5, 'predicted_label'] = 0 - return predict + # return predict def _get_threshold_pred(self, predictions, threshold): print('_get_threshold_pred') - def map_predicted_label(ds, threshold): - ds = np.ravel(ds['predictions']) - lower_threshold = 0.5 - (threshold * 0.5) - upper_threshold = 0.5 + (threshold * 0.5) - predict = pd.DataFrame({ - 'proba': ds, - 'predicted_label': np.full(len(ds), -1) - }) - predict.loc[predict['proba'] >= upper_threshold, 'predicted_label'] = 1 - predict.loc[predict['proba'] <= lower_threshold, 'predicted_label'] = 0 - return {'predictions' : predict['predicted_label'].to_numpy(dtype = np.int32)} + lower_threshold = 0.5 - (threshold * 0.5) + upper_threshold = 0.5 + (threshold * 0.5) - predict = [] - predictions = predictions.map_batches( - lambda batch : map_predicted_label(batch, threshold), - batch_format = 'numpy', - batch_size = self.batch_size - ) - for row in predictions.iter_rows(): - predict.append(row['predictions']) + predict = pd.DataFrame({ + 'proba': 
np.ravel(predictions), + 'label' : np.full(len(predictions), -1) + }) - return predict \ No newline at end of file + predict.loc[predict['proba'] >= upper_threshold, 'label'] = 1 + predict.loc[predict['proba'] <= lower_threshold, 'label'] = 0 + + return predict['label'].to_numpy(dtype = np.int32) \ No newline at end of file diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index d26b28f..93917f6 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -20,18 +20,22 @@ # Training import tensorflow as tf from ray.air import session -# from ray.air.integrations.keras import Callback from ray.air.config import ScalingConfig from ray.air.integrations.keras import ReportCheckpointCallback +from keras.callbacks import CSVLogger, ModelCheckpoint, EarlyStopping from ray.train.tensorflow import TensorflowTrainer, TensorflowCheckpoint # Tuning from ray.air.config import RunConfig # Predicting +from tensorflow.keras.models import load_model from ray.train.tensorflow import TensorflowPredictor from ray.train.batch_predictor import BatchPredictor +# Data +from ray.air.util.data_batch_conversion import _unwrap_ndarray_object_type_if_needed + __author__ = 'Nicolas de Montigny' __all__ = ['KerasTFModel'] @@ -113,31 +117,169 @@ def __init__( self._n_workers = int(self._nb_CPU_training * 0.2) self._nb_CPU_per_worker = int(int(self._nb_CPU_training * 0.8) / self._n_workers) - @abstractmethod - def preprocess(self): - """ - """ + # Data preprocessing + ######################################################################################################### + + def preprocess(self, ds, scaling = False, scaler_file = None): + print('preprocess') + # Labels encoding + self._encoder = ModelLabelEncoder(self.taxa) + self._encoder.fit(ds) + + # Labels mapping + labels = list(self._encoder.stats_[f'unique_values({self.taxa})'].keys()) + self._nb_classes = len(labels) + self._encoded = np.arange(len(labels)) + labels = np.append(labels, 'Unknown') + self._encoded = np.append(self._encoded, -1) + + for (label, encoded) in zip(labels, self._encoded): + self._labels_map[label] = encoded - @abstractmethod + # Class weights + self._weights = self._compute_weights() + + # Models training + ######################################################################################################### + def fit(self, datasets): - """ - """ + print('fit') + # Preprocessing loop + for name, ds in datasets.items(): + # ds = ds.drop_columns(['id']) + ds = self._encoder.transform(ds) + # ds = self._scaler.transform(ds) + ds = ds.materialize() + datasets[name] = ds + + if self._nb_GPU > 0: + self._fit_GPU(datasets) + else: + self._fit_CPU(datasets) + + def _fit_CPU(self, datasets): + # Training parameters + train_params = { + 'batch_size': self.batch_size, + 'epochs': self._training_epochs, + 'size': self._nb_kmers, + 'nb_cls': self._nb_classes, + 'model': self.classifier, + 'weights': self._weights + } + + # Define trainer / tuner + self._trainer = TensorflowTrainer( + train_loop_per_worker=train_func_CPU, + train_loop_config=train_params, + scaling_config=ScalingConfig( + trainer_resources={'CPU': self._nb_CPU_data}, + num_workers=self._n_workers, + use_gpu=self._use_gpu, + resources_per_worker={ + 'CPU': self._nb_CPU_per_worker + } + ), + run_config=RunConfig( + name=self.classifier, + local_dir=self._workdir, + ), + datasets=datasets, + ) + + training_result = self._trainer.fit() + self._model_ckpt = training_result.best_checkpoints[0][0] + + def _fit_GPU(self, datasets): + # Training 
parameters + train_params = { + 'batch_size': self.batch_size, + 'epochs': self._training_epochs, + 'size': self._nb_kmers, + 'nb_cls': self._nb_classes, + 'taxa': self.taxa, + 'workdir':self._workdir, + 'model': self.classifier, + 'weights': self._weights + } + + self._model_ckpt = train_func_GPU(datasets, train_params) + + # Models predicting + ######################################################################################################### - @abstractmethod def predict(self, ds): - """ - """ + print('predict') + # Predict with model + probabilities = self._predict_proba(ds) + # Convert predictions to labels + predictions = self._get_abs_pred(probabilities) + # Return decoded labels + return self._label_decode(predictions) + + def predict_proba(self, ds, threshold = 0.8): + print('predict_proba') + # Predict with model + probabilities = self._predict_proba(ds) + # Convert predictions to labels with threshold + predictions = self._get_threshold_pred(probabilities, threshold) + # Return decoded labels + return self._label_decode(predictions) + + def _predict_proba(self, ds): + print('_predict_proba') + if ds.count() > 0: + if len(ds.schema().names) > 1: + col_2_drop = [col for col in ds.schema().names if col != TENSOR_COLUMN_NAME] + ds = ds.drop_columns(col_2_drop) + + ds = ds.materialize() + + if self._nb_GPU > 0: + probabilities = self._predict_proba_GPU(ds) + else: + probabilities = self._predict_proba_CPU(ds) + + return probabilities + else: + raise ValueError('No data to predict') + + def _predict_proba_CPU(self, ds): + print('_predict_proba_CPU') + self._predictor = BatchPredictor.from_checkpoint( + self._model_ckpt, + TensorflowPredictor, + model_definition = lambda: build_model(self.classifier, self._nb_classes, self._nb_kmers) + ) + predictions = self._predictor.predict( + data = ds, + feature_columns = [TENSOR_COLUMN_NAME], + batch_size = self.batch_size, + ) + + probabilities = _unwrap_ndarray_object_type_if_needed(predictions.to_pandas()['predictions']) + + return probabilities + + def _predict_proba_GPU(self, ds): + print('_predict_proba_GPU') + model = load_model(self._model_ckpt) + probabilities = [] + for batch in ds.iter_tf_batches(batch_size = self.batch_size): + probabilities.extend(model.predict(batch[TENSOR_COLUMN_NAME])) + + return probabilities @abstractmethod - def predict_proba(self): + def _get_abs_pred(self): """ """ - + @abstractmethod def _get_threshold_pred(self): """ """ - + # Training/building function outside of the class as mentioned on the Ray discussion # https://discuss.ray.io/t/statuscode-resource-exhausted/4379/16 ################################################################################ @@ -159,6 +301,7 @@ def train_func_CPU(config): # Model construction model = build_model(model, nb_cls, size) + # Data train_data = session.get_dataset_shard('train') val_data = session.get_dataset_shard('validation') @@ -177,6 +320,7 @@ def train_func_CPU(config): local_shuffle_buffer_size = batch_size, local_shuffle_seed = int(np.random.randint(1,10000, size = 1)) ) + # Training history = model.fit( x = batch_train, validation_data = batch_val, @@ -184,6 +328,7 @@ def train_func_CPU(config): class_weight = weights, verbose = 0 ) + # Checkpointing session.report({ 'accuracy': history.history['accuracy'][0], 'loss': history.history['loss'][0], @@ -198,58 +343,62 @@ def train_func_CPU(config): gc.collect() tf.keras.backend.clear_session() -def train_func_GPU(config): +def train_func_GPU(datasets, config): # Parameters batch_size = config.get('batch_size', 
128) epochs = config.get('epochs', 10) size = config.get('size') nb_cls = config.get('nb_cls') + taxa = config.get('taxa') + workdir = config.get('workdir') model = config.get('model') weights = config.get('weights') + checkpoint = os.path.join(workdir, model) + + # Data + train_ds = datasets['train'] + val_ds = datasets['validation'] + + # Convert datasets to tensorflow ds & generator + train_ds = train_ds.iterator().to_tf( + feature_columns = TENSOR_COLUMN_NAME, + label_columns = LABELS_COLUMN_NAME, + batch_size = batch_size + ) + val_ds = val_ds.to_tf( + feature_columns = TENSOR_COLUMN_NAME, + label_columns = LABELS_COLUMN_NAME, + batch_size = batch_size + ) + # Model construction - strategy = tf.distribute.MirroredStrategy() - with strategy.scope(): - model = build_model(model, nb_cls, size) + model = build_model(model, nb_cls, size) - train_data = session.get_dataset_shard('train') - val_data = session.get_dataset_shard('validation') + # Callbacks + model_file = os.path.join(checkpoint, taxa, '{epoch:03d}.hdf5') + model_csv = os.path.join(checkpoint, taxa, 'training_log.csv') + modelckpt = ModelCheckpoint(filepath=model_file, monitor='val_loss', save_best_only=True, mode='auto') + early = EarlyStopping(monitor='val_loss', mode='auto', patience=10) + csv = CSVLogger(model_csv) + + # Training + hist = model.fit( + train_ds, + epochs = epochs, + validation_data = val_ds, + callbacks = [modelckpt, early, csv], + class_weight = weights, + verbose = 0 + ) + + # Checkpointing + best_model = np.argmin(hist.history['val_loss']) + 1 + best_model = f'{best_model:03d}.hdf5' + best_model = os.path.join(checkpoint, taxa, best_model) + + return best_model - for _ in range(epochs): - batch_train = train_data.to_tf( - feature_columns = TENSOR_COLUMN_NAME, - label_columns = LABELS_COLUMN_NAME, - batch_size = batch_size, - local_shuffle_buffer_size = batch_size, - local_shuffle_seed = int(np.random.randint(1,10000, size = 1)) - ) - batch_val = val_data.to_tf( - feature_columns = TENSOR_COLUMN_NAME, - label_columns = LABELS_COLUMN_NAME, - batch_size = batch_size, - local_shuffle_buffer_size = batch_size, - local_shuffle_seed = int(np.random.randint(1,10000, size = 1)) - ) - history = model.fit( - x = batch_train, - validation_data = batch_val, - callbacks = [ReportCheckpointCallback()], - class_weight = weights, - verbose = 0 - ) - session.report({ - 'accuracy': history.history['accuracy'][0], - 'loss': history.history['loss'][0], - 'val_accuracy': history.history['val_accuracy'][0], - 'val_loss': history.history['val_loss'][0], - }, - checkpoint=TensorflowCheckpoint.from_model(model) - ) - gc.collect() - tf.keras.backend.clear_session() - del model - gc.collect() - tf.keras.backend.clear_session() def build_model(classifier, nb_cls, nb_kmers): if classifier == 'attention': diff --git a/src/models/kerasTF/multiclass_models.py b/src/models/kerasTF/multiclass_models.py index 4ab1b9c..99bf6bd 100644 --- a/src/models/kerasTF/multiclass_models.py +++ b/src/models/kerasTF/multiclass_models.py @@ -21,7 +21,6 @@ from ray.air import session # from ray.air.integrations.keras import Callback from ray.air.config import ScalingConfig -from models.kerasTF.models import train_func_CPU, train_func_GPU, build_model from ray.air.integrations.keras import ReportCheckpointCallback from ray.train.tensorflow import TensorflowTrainer, TensorflowCheckpoint @@ -29,6 +28,7 @@ from ray.air.config import RunConfig # Predicting +from tensorflow.keras.models import load_model from ray.train.tensorflow import TensorflowPredictor from 
ray.train.batch_predictor import BatchPredictor @@ -102,6 +102,7 @@ def __init__( # Data preprocessing ######################################################################################################### + """ def preprocess(self, ds, scaling = False, scaler_file = None): print('preprocess') # Labels encoding @@ -128,10 +129,11 @@ def preprocess(self, ds, scaling = False, scaler_file = None): # ) # self._scaler = TensorMinMaxScaler(self._nb_kmers) # self._scaler.fit(ds) - + """ # Models training ######################################################################################################### + """ def fit(self, datasets): print('fit') # Preprocessing loop @@ -142,6 +144,12 @@ def fit(self, datasets): ds = ds.materialize() datasets[name] = ds + if self._nb_GPU > 0: + self._fit_GPU(datasets) + else: + self._fit_CPU(datasets) + + def _fit_CPU(self, datasets): # Training parameters train_params = { 'batch_size': self.batch_size, @@ -152,22 +160,16 @@ def fit(self, datasets): 'weights': self._weights } - if self._nb_GPU > 0: - train_func = train_func_GPU - else: - train_func = train_func_CPU - # Define trainer / tuner self._trainer = TensorflowTrainer( - train_loop_per_worker=train_func, + train_loop_per_worker=train_func_CPU, train_loop_config=train_params, scaling_config=ScalingConfig( trainer_resources={'CPU': self._nb_CPU_data}, num_workers=self._n_workers, use_gpu=self._use_gpu, resources_per_worker={ - 'CPU': self._nb_CPU_per_worker, - 'GPU': self._nb_GPU_per_worker + 'CPU': self._nb_CPU_per_worker } ), run_config=RunConfig( @@ -179,26 +181,41 @@ def fit(self, datasets): training_result = self._trainer.fit() self._model_ckpt = training_result.best_checkpoints[0][0] + + def _fit_GPU(self, datasets): + # Training parameters + train_params = { + 'batch_size': self.batch_size, + 'epochs': self._training_epochs, + 'size': self._nb_kmers, + 'nb_cls': self._nb_classes, + 'taxa': self.taxa, + 'workdir':self._workdir, + 'model': self.classifier, + 'weights': self._weights + } + self._model_ckpt = train_func_GPU(datasets, train_params) + """ # Models predicting ######################################################################################################### + """ def predict(self, ds): print('predict') # Predict with model probabilities = self._predict_proba(ds) # Convert predictions to labels - probabilities = _unwrap_ndarray_object_type_if_needed(probabilities.to_pandas()['predictions']) - predictions = np.argmax(probabilities, axis = 1) + predictions = self._get_abs_pred(probabilities) # Return decoded labels return self._label_decode(predictions) def predict_proba(self, ds, threshold = 0.8): print('predict_proba') # Predict with model - predictions = self._predict_proba(ds) + probabilities = self._predict_proba(ds) # Convert predictions to labels with threshold - predictions = self._get_threshold_pred(predictions, threshold) + predictions = self._get_threshold_pred(probabilities, threshold) # Return decoded labels return self._label_decode(predictions) @@ -209,43 +226,54 @@ def _predict_proba(self, ds): col_2_drop = [col for col in ds.schema().names if col != TENSOR_COLUMN_NAME] ds = ds.drop_columns(col_2_drop) - # Preprocess - # ds = self._scaler.transform(ds) ds = ds.materialize() - self._predictor = BatchPredictor.from_checkpoint( - self._model_ckpt, - TensorflowPredictor, - model_definition = lambda: build_model(self.classifier, self._nb_classes, self._nb_kmers) - ) - predictions = self._predictor.predict( - data = ds, - feature_columns = [TENSOR_COLUMN_NAME], - 
batch_size = self.batch_size, - ) - return predictions + if self._nb_GPU > 0: + probabilities = self._predict_proba_GPU(ds) + else: + probabilities = self._predict_proba_CPU(ds) + + return probabilities else: raise ValueError('No data to predict') + def _predict_proba_CPU(self, ds): + print('_predict_proba_CPU') + self._predictor = BatchPredictor.from_checkpoint( + self._model_ckpt, + TensorflowPredictor, + model_definition = lambda: build_model(self.classifier, self._nb_classes, self._nb_kmers) + ) + predictions = self._predictor.predict( + data = ds, + feature_columns = [TENSOR_COLUMN_NAME], + batch_size = self.batch_size, + ) + + probabilities = _unwrap_ndarray_object_type_if_needed(probabilities.to_pandas()['predictions']) + + return probabilities + + def _predict_proba_GPU(self, ds): + print('_predict_proba_GPU') + model = load_model(self._model_ckpt) + probabilities = [] + for batch in ds.iter_tf_batches(batch_size = self.batch_size): + probabilities.extend(model.predict(batch[TENSOR_COLUMN_NAME])) + + return probabilities + """ + def _get_abs_pred(self, predictions): + print('_get_abs_pred') + return np.argmax(predictions, axis = 1) + def _get_threshold_pred(self, predictions, threshold): print('_get_threshold_pred') - def map_predicted_label(ds, threshold): - ds = ds['predictions'] - pred = pd.DataFrame({ - 'best_proba': [np.max(arr) for arr in ds], - 'predicted_label' : [np.argmax(arr) for arr in ds] - }) - pred.loc[pred['best_proba'] < threshold, 'predicted_label'] = -1 - - return {'predictions' : pred['predicted_label'].to_numpy(dtype = np.int32)} - - predict = [] - predictions = predictions.map_batches( - lambda batch : map_predicted_label(batch, threshold), - batch_format = 'numpy', - batch_size = self.batch_size - ) - for row in predictions.iter_rows(): - predict.append(row['predictions']) + pred = pd.DataFrame({ + 'proba': [np.max(arr) for arr in predictions], + 'label' : [np.argmax(arr) for arr in predictions] + }) + pred.loc[pred['proba'] < threshold, 'label'] = -1 + + return pred['label'].to_numpy(dtype = np.int32) - return predict From 88a10529a95f496a769c841ebdbacf083b129425 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Wed, 20 Dec 2023 12:48:44 -0500 Subject: [PATCH 78/92] keras fit verbose --- src/models/kerasTF/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index 93917f6..8330925 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -389,7 +389,7 @@ def train_func_GPU(datasets, config): validation_data = val_ds, callbacks = [modelckpt, early, csv], class_weight = weights, - verbose = 0 + verbose = 1 ) # Checkpointing From 4b86498af3382bd64bc3c5b291ff36b5a989fae7 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Wed, 20 Dec 2023 15:24:33 -0500 Subject: [PATCH 79/92] TF-IDF scaling for smaller k experiments --- src/models/kerasTF/models.py | 173 ++++------ src/models/kerasTF/models_linear.py | 423 ++++++++++++++++++++++++ src/models/sklearn/binary_models.py | 15 +- src/models/sklearn/multiclass_models.py | 13 +- 4 files changed, 507 insertions(+), 117 deletions(-) create mode 100644 src/models/kerasTF/models_linear.py diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index 8330925..126a908 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -136,6 +136,10 @@ def preprocess(self, ds, scaling = False, scaler_file = None): for (label, encoded) in zip(labels, self._encoded): 
self._labels_map[label] = encoded + # Features scaling + self._scaler = TensorTfIdfTransformer(features = self.kmers, file = scaler_file) + self._scaler.fit(ds) + # Class weights self._weights = self._compute_weights() @@ -148,16 +152,10 @@ def fit(self, datasets): for name, ds in datasets.items(): # ds = ds.drop_columns(['id']) ds = self._encoder.transform(ds) - # ds = self._scaler.transform(ds) + ds = self._scaler.transform(ds) ds = ds.materialize() datasets[name] = ds - if self._nb_GPU > 0: - self._fit_GPU(datasets) - else: - self._fit_CPU(datasets) - - def _fit_CPU(self, datasets): # Training parameters train_params = { 'batch_size': self.batch_size, @@ -168,16 +166,22 @@ def _fit_CPU(self, datasets): 'weights': self._weights } + if self._nb_GPU > 0: + train_func = train_func_GPU + else: + train_func = train_func_CPU + # Define trainer / tuner self._trainer = TensorflowTrainer( - train_loop_per_worker=train_func_CPU, + train_loop_per_worker=train_func, train_loop_config=train_params, scaling_config=ScalingConfig( trainer_resources={'CPU': self._nb_CPU_data}, num_workers=self._n_workers, use_gpu=self._use_gpu, resources_per_worker={ - 'CPU': self._nb_CPU_per_worker + 'CPU': self._nb_CPU_per_worker, + 'GPU' : self._nb_GPU_per_worker } ), run_config=RunConfig( @@ -190,21 +194,6 @@ def _fit_CPU(self, datasets): training_result = self._trainer.fit() self._model_ckpt = training_result.best_checkpoints[0][0] - def _fit_GPU(self, datasets): - # Training parameters - train_params = { - 'batch_size': self.batch_size, - 'epochs': self._training_epochs, - 'size': self._nb_kmers, - 'nb_cls': self._nb_classes, - 'taxa': self.taxa, - 'workdir':self._workdir, - 'model': self.classifier, - 'weights': self._weights - } - - self._model_ckpt = train_func_GPU(datasets, train_params) - # Models predicting ######################################################################################################### @@ -233,43 +222,27 @@ def _predict_proba(self, ds): col_2_drop = [col for col in ds.schema().names if col != TENSOR_COLUMN_NAME] ds = ds.drop_columns(col_2_drop) + ds = self._scaler.transform(ds) + ds = ds.materialize() - if self._nb_GPU > 0: - probabilities = self._predict_proba_GPU(ds) - else: - probabilities = self._predict_proba_CPU(ds) + self._predictor = BatchPredictor.from_checkpoint( + self._model_ckpt, + TensorflowPredictor, + model_definition = lambda: build_model(self.classifier, self._nb_classes, self._nb_kmers) + ) + predictions = self._predictor.predict( + data = ds, + feature_columns = [TENSOR_COLUMN_NAME], + batch_size = self.batch_size, + ) + + probabilities = _unwrap_ndarray_object_type_if_needed(predictions.to_pandas()['predictions']) return probabilities else: raise ValueError('No data to predict') - def _predict_proba_CPU(self, ds): - print('_predict_proba_CPU') - self._predictor = BatchPredictor.from_checkpoint( - self._model_ckpt, - TensorflowPredictor, - model_definition = lambda: build_model(self.classifier, self._nb_classes, self._nb_kmers) - ) - predictions = self._predictor.predict( - data = ds, - feature_columns = [TENSOR_COLUMN_NAME], - batch_size = self.batch_size, - ) - - probabilities = _unwrap_ndarray_object_type_if_needed(predictions.to_pandas()['predictions']) - - return probabilities - - def _predict_proba_GPU(self, ds): - print('_predict_proba_GPU') - model = load_model(self._model_ckpt) - probabilities = [] - for batch in ds.iter_tf_batches(batch_size = self.batch_size): - probabilities.extend(model.predict(batch[TENSOR_COLUMN_NAME])) - - return probabilities - 
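# The TensorTfIdfTransformer fitted above is the project's own Ray preprocessor
# for k-mer count profiles; the weighting it applies is ordinary TF-IDF. A
# hedged sketch of the same idea with scikit-learn's TfidfTransformer standing
# in (the toy count matrix is an assumption, not project data):
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer

counts = np.array([                        # rows = sequences, columns = k-mers
    [3, 0, 1],
    [2, 0, 0],
    [0, 5, 2],
])
tfidf = TfidfTransformer()
tfidf.fit(counts)                          # fit on the training profiles only
weighted = tfidf.transform(counts).toarray()   # reuse the fitted weights at inference time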
@abstractmethod def _get_abs_pred(self): """ @@ -326,7 +299,7 @@ def train_func_CPU(config): validation_data = batch_val, callbacks = [ReportCheckpointCallback()], class_weight = weights, - verbose = 0 + verbose = 1 ) # Checkpointing session.report({ @@ -343,62 +316,62 @@ def train_func_CPU(config): gc.collect() tf.keras.backend.clear_session() -def train_func_GPU(datasets, config): + +def train_func_GPU(config): # Parameters batch_size = config.get('batch_size', 128) epochs = config.get('epochs', 10) size = config.get('size') nb_cls = config.get('nb_cls') - taxa = config.get('taxa') - workdir = config.get('workdir') model = config.get('model') weights = config.get('weights') - checkpoint = os.path.join(workdir, model) - - # Data - train_ds = datasets['train'] - val_ds = datasets['validation'] - - # Convert datasets to tensorflow ds & generator - train_ds = train_ds.iterator().to_tf( - feature_columns = TENSOR_COLUMN_NAME, - label_columns = LABELS_COLUMN_NAME, - batch_size = batch_size - ) - val_ds = val_ds.to_tf( - feature_columns = TENSOR_COLUMN_NAME, - label_columns = LABELS_COLUMN_NAME, - batch_size = batch_size - ) - # Model construction - model = build_model(model, nb_cls, size) + strategy = tf.distribute.MirroredStrategy() + with strategy.scope(): + model = build_model(model, nb_cls, size) - # Callbacks - model_file = os.path.join(checkpoint, taxa, '{epoch:03d}.hdf5') - model_csv = os.path.join(checkpoint, taxa, 'training_log.csv') - modelckpt = ModelCheckpoint(filepath=model_file, monitor='val_loss', save_best_only=True, mode='auto') - early = EarlyStopping(monitor='val_loss', mode='auto', patience=10) - csv = CSVLogger(model_csv) - - # Training - hist = model.fit( - train_ds, - epochs = epochs, - validation_data = val_ds, - callbacks = [modelckpt, early, csv], - class_weight = weights, - verbose = 1 - ) - - # Checkpointing - best_model = np.argmin(hist.history['val_loss']) + 1 - best_model = f'{best_model:03d}.hdf5' - best_model = os.path.join(checkpoint, taxa, best_model) - - return best_model + # Data + train_data = session.get_dataset_shard('train') + val_data = session.get_dataset_shard('validation') + for _ in range(epochs): + batch_train = train_data.to_tf( + feature_columns = TENSOR_COLUMN_NAME, + label_columns = LABELS_COLUMN_NAME, + batch_size = batch_size, + local_shuffle_buffer_size = batch_size, + local_shuffle_seed = int(np.random.randint(1,10000, size = 1)) + ) + batch_val = val_data.to_tf( + feature_columns = TENSOR_COLUMN_NAME, + label_columns = LABELS_COLUMN_NAME, + batch_size = batch_size, + local_shuffle_buffer_size = batch_size, + local_shuffle_seed = int(np.random.randint(1,10000, size = 1)) + ) + # Training + history = model.fit( + x = batch_train, + validation_data = batch_val, + callbacks = [ReportCheckpointCallback()], + class_weight = weights, + verbose = 0 + ) + # Checkpointing + session.report({ + 'accuracy': history.history['accuracy'][0], + 'loss': history.history['loss'][0], + 'val_accuracy': history.history['val_accuracy'][0], + 'val_loss': history.history['val_loss'][0], + }, + checkpoint=TensorflowCheckpoint.from_model(model) + ) + gc.collect() + tf.keras.backend.clear_session() + del model + gc.collect() + tf.keras.backend.clear_session() def build_model(classifier, nb_cls, nb_kmers): if classifier == 'attention': diff --git a/src/models/kerasTF/models_linear.py b/src/models/kerasTF/models_linear.py new file mode 100644 index 0000000..adc81e3 --- /dev/null +++ b/src/models/kerasTF/models_linear.py @@ -0,0 +1,423 @@ +import os +import gc +import 
warnings +import numpy as np +import pandas as pd + +# Class construction +from abc import ABC, abstractmethod + +# Preprocessing +from ray.data.preprocessors import LabelEncoder, Chain +from models.encoders.model_label_encoder import ModelLabelEncoder +from models.encoders.one_hot_tensor_encoder import OneHotTensorEncoder +from models.preprocessors.tfidf_transformer import TensorTfIdfTransformer + +# Parent class / models +from models.models_utils import ModelsUtils +from models.kerasTF.build_neural_networks import * + +# Training +import tensorflow as tf +from ray.air import session +from ray.air.config import ScalingConfig +from ray.air.integrations.keras import ReportCheckpointCallback +from keras.callbacks import CSVLogger, ModelCheckpoint, EarlyStopping +from ray.train.tensorflow import TensorflowTrainer, TensorflowCheckpoint + +# Tuning +from ray.air.config import RunConfig + +# Predicting +from tensorflow.keras.models import load_model +from ray.train.tensorflow import TensorflowPredictor +from ray.train.batch_predictor import BatchPredictor + +# Data +from ray.air.util.data_batch_conversion import _unwrap_ndarray_object_type_if_needed + +__author__ = 'Nicolas de Montigny' + +__all__ = ['KerasTFModel'] + +TENSOR_COLUMN_NAME = '__value__' +LABELS_COLUMN_NAME = 'labels' + +# Ignore warnings to have a more comprehensible output on stdout +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' +warnings.filterwarnings('ignore') + +class KerasTFModels(ModelsUtils, ABC): + """ + Class used to build, train and predict models using Ray with Keras Tensorflow backend + + ---------- + Attributes + ---------- + + clf_file : string + Path to a file containing the trained model for this object + + nb_classes : int + Number of classes for learning + + ---------- + Methods + ---------- + + preprocess : preprocess the data before training and splitting the original dataset in case of cross-validation + + train : train a model using the given datasets + + predict : predict the classes of a dataset + ds : ray.data.Dataset + Dataset containing K-mers profiles of sequences to be classified + + threshold : float + Minimum percentage of probability to effectively classify. + Sequences will be classified as 'unknown' if the probability is under this threshold. 
+ Defaults to 80% + """ + + def __init__( + self, + classifier, + outdir_model, + batch_size, + training_epochs, + taxa, + kmers_list, + csv + ): + super().__init__( + classifier, + outdir_model, + batch_size, + training_epochs, + taxa, + kmers_list, + csv + ) + # Parameters + # Initialize hidden + self._nb_CPU_data = int(os.cpu_count() * 0.2) # 6 + self._nb_CPU_training = int(os.cpu_count() - self._nb_CPU_data) # 26 + self._nb_GPU = len(tf.config.list_physical_devices('GPU')) # 6 + # Initialize empty + self._nb_CPU_per_worker = 0 + self._nb_GPU_per_worker = 0 + # Computing variables + if self._nb_GPU > 0: + self._use_gpu = True + self._n_workers = self._nb_GPU #6 + self._nb_CPU_per_worker = int(self._nb_CPU_training / self._n_workers) # 4 + self._nb_GPU_per_worker = 1 + else: + self._use_gpu = False + self._n_workers = int(self._nb_CPU_training * 0.2) + self._nb_CPU_per_worker = int(int(self._nb_CPU_training * 0.8) / self._n_workers) + + # Data preprocessing + ######################################################################################################### + + def preprocess(self, ds, scaling = False, scaler_file = None): + print('preprocess') + # Labels encoding + self._encoder = ModelLabelEncoder(self.taxa) + self._encoder.fit(ds) + + # Labels mapping + labels = list(self._encoder.stats_[f'unique_values({self.taxa})'].keys()) + self._nb_classes = len(labels) + self._encoded = np.arange(len(labels)) + labels = np.append(labels, 'Unknown') + self._encoded = np.append(self._encoded, -1) + + for (label, encoded) in zip(labels, self._encoded): + self._labels_map[label] = encoded + + # Features scaling + self._scaler = TensorTfIdfTransformer(features = self.kmers, file = scaler_file) + self._scaler.fit(ds) + + # Class weights + self._weights = self._compute_weights() + + # Models training + ######################################################################################################### + + def fit(self, datasets): + print('fit') + # Preprocessing loop + for name, ds in datasets.items(): + # ds = ds.drop_columns(['id']) + ds = self._encoder.transform(ds) + ds = self._scaler.transform(ds) + ds = ds.materialize() + datasets[name] = ds + + if self._nb_GPU > 0: + self._fit_GPU(datasets) + else: + self._fit_CPU(datasets) + + def _fit_CPU(self, datasets): + # Training parameters + train_params = { + 'batch_size': self.batch_size, + 'epochs': self._training_epochs, + 'size': self._nb_kmers, + 'nb_cls': self._nb_classes, + 'model': self.classifier, + 'weights': self._weights + } + + # Define trainer / tuner + self._trainer = TensorflowTrainer( + train_loop_per_worker=train_func_CPU, + train_loop_config=train_params, + scaling_config=ScalingConfig( + trainer_resources={'CPU': self._nb_CPU_data}, + num_workers=self._n_workers, + use_gpu=self._use_gpu, + resources_per_worker={ + 'CPU': self._nb_CPU_per_worker + } + ), + run_config=RunConfig( + name=self.classifier, + local_dir=self._workdir, + ), + datasets=datasets, + ) + + training_result = self._trainer.fit() + self._model_ckpt = training_result.best_checkpoints[0][0] + + def _fit_GPU(self, datasets): + # Training parameters + train_params = { + 'batch_size': self.batch_size, + 'epochs': self._training_epochs, + 'size': self._nb_kmers, + 'nb_cls': self._nb_classes, + 'taxa': self.taxa, + 'workdir':self._workdir, + 'model': self.classifier, + 'weights': self._weights + } + + self._model_ckpt = train_func_GPU(datasets, train_params) + + # Models predicting + 
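# A worked instance of the CPU/GPU budgeting done in __init__ above, assuming
# a host with 32 CPUs and 6 GPUs (these counts are illustrative only):
cpu_total, gpu_total = 32, 6
nb_cpu_data = int(cpu_total * 0.2)                    # 6 CPUs kept for the data pipeline
nb_cpu_training = cpu_total - nb_cpu_data             # 26 CPUs left for training
if gpu_total > 0:
    n_workers = gpu_total                             # one Ray worker per GPU
    nb_cpu_per_worker = nb_cpu_training // n_workers  # 4
    nb_gpu_per_worker = 1
else:
    n_workers = int(nb_cpu_training * 0.2)                        # 5
    nb_cpu_per_worker = int(nb_cpu_training * 0.8) // n_workers   # 4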
######################################################################################################### + + def predict(self, ds): + print('predict') + # Predict with model + probabilities = self._predict_proba(ds) + # Convert predictions to labels + predictions = self._get_abs_pred(probabilities) + # Return decoded labels + return self._label_decode(predictions) + + def predict_proba(self, ds, threshold = 0.8): + print('predict_proba') + # Predict with model + probabilities = self._predict_proba(ds) + # Convert predictions to labels with threshold + predictions = self._get_threshold_pred(probabilities, threshold) + # Return decoded labels + return self._label_decode(predictions) + + def _predict_proba(self, ds): + print('_predict_proba') + if ds.count() > 0: + if len(ds.schema().names) > 1: + col_2_drop = [col for col in ds.schema().names if col != TENSOR_COLUMN_NAME] + ds = ds.drop_columns(col_2_drop) + + ds = self._scaler.transform(ds) + + ds = ds.materialize() + + if self._nb_GPU > 0: + probabilities = self._predict_proba_GPU(ds) + else: + probabilities = self._predict_proba_CPU(ds) + + return probabilities + else: + raise ValueError('No data to predict') + + def _predict_proba_CPU(self, ds): + print('_predict_proba_CPU') + self._predictor = BatchPredictor.from_checkpoint( + self._model_ckpt, + TensorflowPredictor, + model_definition = lambda: build_model(self.classifier, self._nb_classes, self._nb_kmers) + ) + predictions = self._predictor.predict( + data = ds, + feature_columns = [TENSOR_COLUMN_NAME], + batch_size = self.batch_size, + ) + + probabilities = _unwrap_ndarray_object_type_if_needed(predictions.to_pandas()['predictions']) + + return probabilities + + def _predict_proba_GPU(self, ds): + print('_predict_proba_GPU') + model = load_model(self._model_ckpt) + probabilities = [] + for batch in ds.iter_tf_batches(batch_size = self.batch_size): + probabilities.extend(model.predict(batch[TENSOR_COLUMN_NAME])) + + return probabilities + + @abstractmethod + def _get_abs_pred(self): + """ + """ + + @abstractmethod + def _get_threshold_pred(self): + """ + """ + +# Training/building function outside of the class as mentioned on the Ray discussion +# https://discuss.ray.io/t/statuscode-resource-exhausted/4379/16 +################################################################################ + +# Data streaming in PipelineDataset for larger than memory data, should prevent OOM +# https://docs.ray.io/en/latest/ray-air/check-ingest.html#enabling-streaming-ingest +# Smaller nb of workers + bigger nb CPU_per_worker + smaller batch_size to avoid memory overload +# https://discuss.ray.io/t/ray-sgd-distributed-tensorflow/261/8 + +def train_func_CPU(config): + # Parameters + batch_size = config.get('batch_size', 128) + epochs = config.get('epochs', 10) + size = config.get('size') + nb_cls = config.get('nb_cls') + model = config.get('model') + weights = config.get('weights') + + # Model construction + model = build_model(model, nb_cls, size) + + # Data + train_data = session.get_dataset_shard('train') + val_data = session.get_dataset_shard('validation') + + for _ in range(epochs): + batch_train = train_data.to_tf( + feature_columns = TENSOR_COLUMN_NAME, + label_columns = LABELS_COLUMN_NAME, + batch_size = batch_size, + local_shuffle_buffer_size = batch_size, + local_shuffle_seed = int(np.random.randint(1,10000, size = 1)) + ) + batch_val = val_data.to_tf( + feature_columns = TENSOR_COLUMN_NAME, + label_columns = LABELS_COLUMN_NAME, + batch_size = batch_size, + local_shuffle_buffer_size = 
batch_size, + local_shuffle_seed = int(np.random.randint(1,10000, size = 1)) + ) + # Training + history = model.fit( + x = batch_train, + validation_data = batch_val, + callbacks = [ReportCheckpointCallback()], + class_weight = weights, + verbose = 0 + ) + # Checkpointing + session.report({ + 'accuracy': history.history['accuracy'][0], + 'loss': history.history['loss'][0], + 'val_accuracy': history.history['val_accuracy'][0], + 'val_loss': history.history['val_loss'][0], + }, + checkpoint=TensorflowCheckpoint.from_model(model) + ) + gc.collect() + tf.keras.backend.clear_session() + del model + gc.collect() + tf.keras.backend.clear_session() + +def train_func_GPU(datasets, config): + # Parameters + batch_size = config.get('batch_size', 128) + epochs = config.get('epochs', 10) + size = config.get('size') + nb_cls = config.get('nb_cls') + taxa = config.get('taxa') + workdir = config.get('workdir') + model = config.get('model') + weights = config.get('weights') + + checkpoint = os.path.join(workdir, model) + + # Data + train_ds = datasets['train'] + val_ds = datasets['validation'] + + # Convert datasets to tensorflow ds & generator + train_ds = train_ds.iterator().to_tf( + feature_columns = TENSOR_COLUMN_NAME, + label_columns = LABELS_COLUMN_NAME, + batch_size = batch_size + ) + val_ds = val_ds.to_tf( + feature_columns = TENSOR_COLUMN_NAME, + label_columns = LABELS_COLUMN_NAME, + batch_size = batch_size + ) + + # Model construction + model = build_model(model, nb_cls, size) + + # Callbacks + model_file = os.path.join(checkpoint, taxa, '{epoch:03d}.hdf5') + model_csv = os.path.join(checkpoint, taxa, 'training_log.csv') + modelckpt = ModelCheckpoint(filepath=model_file, monitor='val_loss', save_best_only=True, mode='auto') + early = EarlyStopping(monitor='val_loss', mode='auto', patience=10) + csv = CSVLogger(model_csv) + + # Training + hist = model.fit( + train_ds, + epochs = epochs, + validation_data = val_ds, + callbacks = [modelckpt, early, csv], + class_weight = weights, + verbose = 1 + ) + + # Checkpointing + best_model = np.argmin(hist.history['val_loss']) + 1 + best_model = f'{best_model:03d}.hdf5' + best_model = os.path.join(checkpoint, taxa, best_model) + + return best_model + + +def build_model(classifier, nb_cls, nb_kmers): + if classifier == 'attention': + model = build_attention(nb_kmers) + elif classifier == 'lstm': + model = build_LSTM(nb_kmers) + elif classifier == 'deeplstm': + model = build_deepLSTM(nb_kmers) + elif classifier == 'lstm_attention': + model = build_LSTM_attention(nb_kmers, nb_cls) + elif classifier == 'cnn': + model = build_CNN(nb_kmers, nb_cls) + elif classifier == 'widecnn': + model = build_wideCNN(nb_kmers, nb_cls) + return model + diff --git a/src/models/sklearn/binary_models.py b/src/models/sklearn/binary_models.py index cd5e6c0..9c72baa 100644 --- a/src/models/sklearn/binary_models.py +++ b/src/models/sklearn/binary_models.py @@ -112,14 +112,9 @@ def preprocess(self, ds, scaling = False, scaler_file = None): for (label, encoded) in zip(labels, self._encoded): self._labels_map[label] = encoded - # Scaling - # self._scaler = TensorTfIdfTransformer( - # features = self.kmers, - # file = scaler_file - # ) - # self._scaler = TensorMinMaxScaler(self._nb_kmers) - # self._scaler.fit(ds) - + # Features scaling + self._scaler = TensorTfIdfTransformer(features = self.kmers,file = scaler_file) + self._scaler.fit(ds) # Model training ######################################################################################################### @@ -131,7 +126,7 @@ def 
fit(self, datasets): for name, ds in datasets.items(): # ds = ds.drop_columns(['id']) ds = self._encoder.transform(ds) - # ds = self._scaler.transform(ds) + ds = self._scaler.transform(ds) datasets[name] = ray.put(ds) try: @@ -189,7 +184,7 @@ def _build(self): def predict(self, ds): print('predict') if ds.count() > 0: - # ds = self._scaler.transform(ds) + ds = self._scaler.transform(ds) ds = ds.materialize() predict_kwargs = {'features':self.kmers, 'num_estimator_cpus':-1} self._predictor = BatchPredictor.from_checkpoint(self._model_ckpt, SklearnTensorPredictor) diff --git a/src/models/sklearn/multiclass_models.py b/src/models/sklearn/multiclass_models.py index f7e9f10..798e6aa 100644 --- a/src/models/sklearn/multiclass_models.py +++ b/src/models/sklearn/multiclass_models.py @@ -118,9 +118,10 @@ def preprocess(self, ds, scaling = False, scaler_file = None): # Class weights self._weights = self._compute_weights() - if self.classifier == 'mnb': - self._scaler = TensorMinMaxScaler(self._nb_kmers) - self._scaler.fit(ds) + # Features scaling + self._scaler = TensorTfIdfTransformer(features = self.kmers,file = scaler_file) + self._scaler.fit(ds) + # Models training ######################################################################################################### @@ -131,8 +132,7 @@ def fit(self, datasets): # ds = ds.drop_columns(['id']) train_ds = datasets['train'] train_ds = self._encoder.transform(train_ds) - if self.classifier == 'mnb': - train_ds = self._scaler.transform(train_ds) + train_ds = self._scaler.transform(train_ds) # datasets[name] = ds # One sub-model per artificial cluster of samples @@ -244,8 +244,7 @@ def predict_proba(self, ds, threshold = 0.8): def _predict_proba(self, ds): if ds.count() > 0: - if self.classifier == 'mnb': - ds = self._scaler.transform(ds) + ds = self._scaler.transform(ds) def predict_func(data): X = _unwrap_ndarray_object_type_if_needed(data[TENSOR_COLUMN_NAME]) From 312ea308c775175058d264e70fd69c9fe488b31d Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Wed, 20 Dec 2023 20:24:15 -0500 Subject: [PATCH 80/92] NN no verbose --- src/models/kerasTF/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index 126a908..7429b7a 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -299,7 +299,7 @@ def train_func_CPU(config): validation_data = batch_val, callbacks = [ReportCheckpointCallback()], class_weight = weights, - verbose = 1 + verbose = 0 ) # Checkpointing session.report({ From edd47b359bbc22385b005acefd3bcf8fb01ef877 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Wed, 20 Dec 2023 22:04:47 -0500 Subject: [PATCH 81/92] linear NN training --- src/models/kerasTF/binary_models.py | 4 +- src/models/kerasTF/models.py | 163 ++++++++++------- .../{models_linear.py => models_parallel.py} | 167 +++++++----------- 3 files changed, 167 insertions(+), 167 deletions(-) rename src/models/kerasTF/{models_linear.py => models_parallel.py} (76%) diff --git a/src/models/kerasTF/binary_models.py b/src/models/kerasTF/binary_models.py index b03165b..ae99f5c 100644 --- a/src/models/kerasTF/binary_models.py +++ b/src/models/kerasTF/binary_models.py @@ -12,7 +12,7 @@ from models.preprocessors.tfidf_transformer import TensorTfIdfTransformer # Parent class / models -from models.kerasTF.models import KerasTFModels +from models.kerasTF.models_parallel import KerasTFModels from models.kerasTF.build_neural_networks import * # Training @@ -20,7 +20,7 @@ from 
ray.air import session # from ray.air.integrations.keras import Callback from ray.air.config import ScalingConfig -from models.kerasTF.models import train_func_CPU, train_func_GPU, build_model +from models.kerasTF.models_parallel import train_func_CPU, train_func_GPU, build_model from ray.air.integrations.keras import ReportCheckpointCallback from ray.train.tensorflow import TensorflowTrainer, TensorflowCheckpoint diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index 7429b7a..adc81e3 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -156,6 +156,12 @@ def fit(self, datasets): ds = ds.materialize() datasets[name] = ds + if self._nb_GPU > 0: + self._fit_GPU(datasets) + else: + self._fit_CPU(datasets) + + def _fit_CPU(self, datasets): # Training parameters train_params = { 'batch_size': self.batch_size, @@ -166,22 +172,16 @@ def fit(self, datasets): 'weights': self._weights } - if self._nb_GPU > 0: - train_func = train_func_GPU - else: - train_func = train_func_CPU - # Define trainer / tuner self._trainer = TensorflowTrainer( - train_loop_per_worker=train_func, + train_loop_per_worker=train_func_CPU, train_loop_config=train_params, scaling_config=ScalingConfig( trainer_resources={'CPU': self._nb_CPU_data}, num_workers=self._n_workers, use_gpu=self._use_gpu, resources_per_worker={ - 'CPU': self._nb_CPU_per_worker, - 'GPU' : self._nb_GPU_per_worker + 'CPU': self._nb_CPU_per_worker } ), run_config=RunConfig( @@ -194,6 +194,21 @@ def fit(self, datasets): training_result = self._trainer.fit() self._model_ckpt = training_result.best_checkpoints[0][0] + def _fit_GPU(self, datasets): + # Training parameters + train_params = { + 'batch_size': self.batch_size, + 'epochs': self._training_epochs, + 'size': self._nb_kmers, + 'nb_cls': self._nb_classes, + 'taxa': self.taxa, + 'workdir':self._workdir, + 'model': self.classifier, + 'weights': self._weights + } + + self._model_ckpt = train_func_GPU(datasets, train_params) + # Models predicting ######################################################################################################### @@ -226,23 +241,41 @@ def _predict_proba(self, ds): ds = ds.materialize() - self._predictor = BatchPredictor.from_checkpoint( - self._model_ckpt, - TensorflowPredictor, - model_definition = lambda: build_model(self.classifier, self._nb_classes, self._nb_kmers) - ) - predictions = self._predictor.predict( - data = ds, - feature_columns = [TENSOR_COLUMN_NAME], - batch_size = self.batch_size, - ) - - probabilities = _unwrap_ndarray_object_type_if_needed(predictions.to_pandas()['predictions']) + if self._nb_GPU > 0: + probabilities = self._predict_proba_GPU(ds) + else: + probabilities = self._predict_proba_CPU(ds) return probabilities else: raise ValueError('No data to predict') + def _predict_proba_CPU(self, ds): + print('_predict_proba_CPU') + self._predictor = BatchPredictor.from_checkpoint( + self._model_ckpt, + TensorflowPredictor, + model_definition = lambda: build_model(self.classifier, self._nb_classes, self._nb_kmers) + ) + predictions = self._predictor.predict( + data = ds, + feature_columns = [TENSOR_COLUMN_NAME], + batch_size = self.batch_size, + ) + + probabilities = _unwrap_ndarray_object_type_if_needed(predictions.to_pandas()['predictions']) + + return probabilities + + def _predict_proba_GPU(self, ds): + print('_predict_proba_GPU') + model = load_model(self._model_ckpt) + probabilities = [] + for batch in ds.iter_tf_batches(batch_size = self.batch_size): + 
probabilities.extend(model.predict(batch[TENSOR_COLUMN_NAME])) + + return probabilities + @abstractmethod def _get_abs_pred(self): """ @@ -316,62 +349,62 @@ def train_func_CPU(config): gc.collect() tf.keras.backend.clear_session() - -def train_func_GPU(config): +def train_func_GPU(datasets, config): # Parameters batch_size = config.get('batch_size', 128) epochs = config.get('epochs', 10) size = config.get('size') nb_cls = config.get('nb_cls') + taxa = config.get('taxa') + workdir = config.get('workdir') model = config.get('model') weights = config.get('weights') - # Model construction - strategy = tf.distribute.MirroredStrategy() - with strategy.scope(): - model = build_model(model, nb_cls, size) + checkpoint = os.path.join(workdir, model) # Data - train_data = session.get_dataset_shard('train') - val_data = session.get_dataset_shard('validation') + train_ds = datasets['train'] + val_ds = datasets['validation'] + + # Convert datasets to tensorflow ds & generator + train_ds = train_ds.iterator().to_tf( + feature_columns = TENSOR_COLUMN_NAME, + label_columns = LABELS_COLUMN_NAME, + batch_size = batch_size + ) + val_ds = val_ds.to_tf( + feature_columns = TENSOR_COLUMN_NAME, + label_columns = LABELS_COLUMN_NAME, + batch_size = batch_size + ) + + # Model construction + model = build_model(model, nb_cls, size) + + # Callbacks + model_file = os.path.join(checkpoint, taxa, '{epoch:03d}.hdf5') + model_csv = os.path.join(checkpoint, taxa, 'training_log.csv') + modelckpt = ModelCheckpoint(filepath=model_file, monitor='val_loss', save_best_only=True, mode='auto') + early = EarlyStopping(monitor='val_loss', mode='auto', patience=10) + csv = CSVLogger(model_csv) + + # Training + hist = model.fit( + train_ds, + epochs = epochs, + validation_data = val_ds, + callbacks = [modelckpt, early, csv], + class_weight = weights, + verbose = 1 + ) + + # Checkpointing + best_model = np.argmin(hist.history['val_loss']) + 1 + best_model = f'{best_model:03d}.hdf5' + best_model = os.path.join(checkpoint, taxa, best_model) + + return best_model - for _ in range(epochs): - batch_train = train_data.to_tf( - feature_columns = TENSOR_COLUMN_NAME, - label_columns = LABELS_COLUMN_NAME, - batch_size = batch_size, - local_shuffle_buffer_size = batch_size, - local_shuffle_seed = int(np.random.randint(1,10000, size = 1)) - ) - batch_val = val_data.to_tf( - feature_columns = TENSOR_COLUMN_NAME, - label_columns = LABELS_COLUMN_NAME, - batch_size = batch_size, - local_shuffle_buffer_size = batch_size, - local_shuffle_seed = int(np.random.randint(1,10000, size = 1)) - ) - # Training - history = model.fit( - x = batch_train, - validation_data = batch_val, - callbacks = [ReportCheckpointCallback()], - class_weight = weights, - verbose = 0 - ) - # Checkpointing - session.report({ - 'accuracy': history.history['accuracy'][0], - 'loss': history.history['loss'][0], - 'val_accuracy': history.history['val_accuracy'][0], - 'val_loss': history.history['val_loss'][0], - }, - checkpoint=TensorflowCheckpoint.from_model(model) - ) - gc.collect() - tf.keras.backend.clear_session() - del model - gc.collect() - tf.keras.backend.clear_session() def build_model(classifier, nb_cls, nb_kmers): if classifier == 'attention': diff --git a/src/models/kerasTF/models_linear.py b/src/models/kerasTF/models_parallel.py similarity index 76% rename from src/models/kerasTF/models_linear.py rename to src/models/kerasTF/models_parallel.py index adc81e3..bfa04cb 100644 --- a/src/models/kerasTF/models_linear.py +++ b/src/models/kerasTF/models_parallel.py @@ -109,9 +109,9 
@@ def __init__( # Computing variables if self._nb_GPU > 0: self._use_gpu = True - self._n_workers = self._nb_GPU #6 + self._n_workers = self._nb_GPU / 2 #6 self._nb_CPU_per_worker = int(self._nb_CPU_training / self._n_workers) # 4 - self._nb_GPU_per_worker = 1 + self._nb_GPU_per_worker = 2 else: self._use_gpu = False self._n_workers = int(self._nb_CPU_training * 0.2) @@ -156,12 +156,6 @@ def fit(self, datasets): ds = ds.materialize() datasets[name] = ds - if self._nb_GPU > 0: - self._fit_GPU(datasets) - else: - self._fit_CPU(datasets) - - def _fit_CPU(self, datasets): # Training parameters train_params = { 'batch_size': self.batch_size, @@ -172,16 +166,22 @@ def _fit_CPU(self, datasets): 'weights': self._weights } + if self._nb_GPU > 0: + train_func = train_func_GPU + else: + train_func = train_func_CPU + # Define trainer / tuner self._trainer = TensorflowTrainer( - train_loop_per_worker=train_func_CPU, + train_loop_per_worker=train_func, train_loop_config=train_params, scaling_config=ScalingConfig( trainer_resources={'CPU': self._nb_CPU_data}, num_workers=self._n_workers, use_gpu=self._use_gpu, resources_per_worker={ - 'CPU': self._nb_CPU_per_worker + 'CPU': self._nb_CPU_per_worker, + 'GPU' : self._nb_GPU_per_worker } ), run_config=RunConfig( @@ -194,21 +194,6 @@ def _fit_CPU(self, datasets): training_result = self._trainer.fit() self._model_ckpt = training_result.best_checkpoints[0][0] - def _fit_GPU(self, datasets): - # Training parameters - train_params = { - 'batch_size': self.batch_size, - 'epochs': self._training_epochs, - 'size': self._nb_kmers, - 'nb_cls': self._nb_classes, - 'taxa': self.taxa, - 'workdir':self._workdir, - 'model': self.classifier, - 'weights': self._weights - } - - self._model_ckpt = train_func_GPU(datasets, train_params) - # Models predicting ######################################################################################################### @@ -241,41 +226,23 @@ def _predict_proba(self, ds): ds = ds.materialize() - if self._nb_GPU > 0: - probabilities = self._predict_proba_GPU(ds) - else: - probabilities = self._predict_proba_CPU(ds) + self._predictor = BatchPredictor.from_checkpoint( + self._model_ckpt, + TensorflowPredictor, + model_definition = lambda: build_model(self.classifier, self._nb_classes, self._nb_kmers) + ) + predictions = self._predictor.predict( + data = ds, + feature_columns = [TENSOR_COLUMN_NAME], + batch_size = self.batch_size, + ) + + probabilities = _unwrap_ndarray_object_type_if_needed(predictions.to_pandas()['predictions']) return probabilities else: raise ValueError('No data to predict') - def _predict_proba_CPU(self, ds): - print('_predict_proba_CPU') - self._predictor = BatchPredictor.from_checkpoint( - self._model_ckpt, - TensorflowPredictor, - model_definition = lambda: build_model(self.classifier, self._nb_classes, self._nb_kmers) - ) - predictions = self._predictor.predict( - data = ds, - feature_columns = [TENSOR_COLUMN_NAME], - batch_size = self.batch_size, - ) - - probabilities = _unwrap_ndarray_object_type_if_needed(predictions.to_pandas()['predictions']) - - return probabilities - - def _predict_proba_GPU(self, ds): - print('_predict_proba_GPU') - model = load_model(self._model_ckpt) - probabilities = [] - for batch in ds.iter_tf_batches(batch_size = self.batch_size): - probabilities.extend(model.predict(batch[TENSOR_COLUMN_NAME])) - - return probabilities - @abstractmethod def _get_abs_pred(self): """ @@ -349,62 +316,62 @@ def train_func_CPU(config): gc.collect() tf.keras.backend.clear_session() -def 
train_func_GPU(datasets, config): + +def train_func_GPU(config): # Parameters batch_size = config.get('batch_size', 128) epochs = config.get('epochs', 10) size = config.get('size') nb_cls = config.get('nb_cls') - taxa = config.get('taxa') - workdir = config.get('workdir') model = config.get('model') weights = config.get('weights') - checkpoint = os.path.join(workdir, model) - - # Data - train_ds = datasets['train'] - val_ds = datasets['validation'] - - # Convert datasets to tensorflow ds & generator - train_ds = train_ds.iterator().to_tf( - feature_columns = TENSOR_COLUMN_NAME, - label_columns = LABELS_COLUMN_NAME, - batch_size = batch_size - ) - val_ds = val_ds.to_tf( - feature_columns = TENSOR_COLUMN_NAME, - label_columns = LABELS_COLUMN_NAME, - batch_size = batch_size - ) - # Model construction - model = build_model(model, nb_cls, size) + strategy = tf.distribute.MirroredStrategy() + with strategy.scope(): + model = build_model(model, nb_cls, size) - # Callbacks - model_file = os.path.join(checkpoint, taxa, '{epoch:03d}.hdf5') - model_csv = os.path.join(checkpoint, taxa, 'training_log.csv') - modelckpt = ModelCheckpoint(filepath=model_file, monitor='val_loss', save_best_only=True, mode='auto') - early = EarlyStopping(monitor='val_loss', mode='auto', patience=10) - csv = CSVLogger(model_csv) - - # Training - hist = model.fit( - train_ds, - epochs = epochs, - validation_data = val_ds, - callbacks = [modelckpt, early, csv], - class_weight = weights, - verbose = 1 - ) - - # Checkpointing - best_model = np.argmin(hist.history['val_loss']) + 1 - best_model = f'{best_model:03d}.hdf5' - best_model = os.path.join(checkpoint, taxa, best_model) - - return best_model + # Data + train_data = session.get_dataset_shard('train') + val_data = session.get_dataset_shard('validation') + for _ in range(epochs): + batch_train = train_data.to_tf( + feature_columns = TENSOR_COLUMN_NAME, + label_columns = LABELS_COLUMN_NAME, + batch_size = batch_size, + local_shuffle_buffer_size = batch_size, + local_shuffle_seed = int(np.random.randint(1,10000, size = 1)) + ) + batch_val = val_data.to_tf( + feature_columns = TENSOR_COLUMN_NAME, + label_columns = LABELS_COLUMN_NAME, + batch_size = batch_size, + local_shuffle_buffer_size = batch_size, + local_shuffle_seed = int(np.random.randint(1,10000, size = 1)) + ) + # Training + history = model.fit( + x = batch_train, + validation_data = batch_val, + callbacks = [ReportCheckpointCallback()], + class_weight = weights, + verbose = 0 + ) + # Checkpointing + session.report({ + 'accuracy': history.history['accuracy'][0], + 'loss': history.history['loss'][0], + 'val_accuracy': history.history['val_accuracy'][0], + 'val_loss': history.history['val_loss'][0], + }, + checkpoint=TensorflowCheckpoint.from_model(model) + ) + gc.collect() + tf.keras.backend.clear_session() + del model + gc.collect() + tf.keras.backend.clear_session() def build_model(classifier, nb_cls, nb_kmers): if classifier == 'attention': From d4b1ea4846b1914530900a22c93e3df2a5e9770b Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Wed, 20 Dec 2023 22:12:02 -0500 Subject: [PATCH 82/92] keras bad import --- src/models/kerasTF/binary_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/models/kerasTF/binary_models.py b/src/models/kerasTF/binary_models.py index ae99f5c..05cab3c 100644 --- a/src/models/kerasTF/binary_models.py +++ b/src/models/kerasTF/binary_models.py @@ -12,7 +12,7 @@ from models.preprocessors.tfidf_transformer import TensorTfIdfTransformer # Parent class / models 
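# The "parallel" path kept in models_parallel.py gives each Ray worker two GPUs
# and builds the Keras model under tf.distribute.MirroredStrategy, as in the
# train_func_GPU above. A minimal standalone sketch of that mechanism; the layer
# sizes and compile settings are illustrative assumptions, not the project's
# architectures from build_neural_networks:
import tensorflow as tf

strategy = tf.distribute.MirroredStrategy()    # one replica per visible GPU
with strategy.scope():
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(64, activation='relu', input_shape=(1024,)),
        tf.keras.layers.Dense(1, activation='sigmoid'),
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# model.fit(...) then runs as usual; each batch is split across the replicas
# and the gradients are aggregated before the weight update.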
-from models.kerasTF.models_parallel import KerasTFModels +from models.kerasTF.models import KerasTFModels from models.kerasTF.build_neural_networks import * # Training From f12c6573e86e225c083a29d05a07d64118871595 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Wed, 20 Dec 2023 22:26:05 -0500 Subject: [PATCH 83/92] parallel NN training + more GPU / worker --- src/models/kerasTF/binary_models.py | 2 +- src/models/kerasTF/models.py | 169 +++++++----------- .../{models_parallel.py => models_linear.py} | 167 ++++++++++------- 3 files changed, 169 insertions(+), 169 deletions(-) rename src/models/kerasTF/{models_parallel.py => models_linear.py} (76%) diff --git a/src/models/kerasTF/binary_models.py b/src/models/kerasTF/binary_models.py index 05cab3c..b03165b 100644 --- a/src/models/kerasTF/binary_models.py +++ b/src/models/kerasTF/binary_models.py @@ -20,7 +20,7 @@ from ray.air import session # from ray.air.integrations.keras import Callback from ray.air.config import ScalingConfig -from models.kerasTF.models_parallel import train_func_CPU, train_func_GPU, build_model +from models.kerasTF.models import train_func_CPU, train_func_GPU, build_model from ray.air.integrations.keras import ReportCheckpointCallback from ray.train.tensorflow import TensorflowTrainer, TensorflowCheckpoint diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index adc81e3..1e69445 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -109,9 +109,9 @@ def __init__( # Computing variables if self._nb_GPU > 0: self._use_gpu = True - self._n_workers = self._nb_GPU #6 - self._nb_CPU_per_worker = int(self._nb_CPU_training / self._n_workers) # 4 - self._nb_GPU_per_worker = 1 + self._n_workers = self._nb_GPU / 2 # 3 + self._nb_CPU_per_worker = int(self._nb_CPU_training / self._n_workers) # 8 + self._nb_GPU_per_worker = 2 else: self._use_gpu = False self._n_workers = int(self._nb_CPU_training * 0.2) @@ -156,12 +156,6 @@ def fit(self, datasets): ds = ds.materialize() datasets[name] = ds - if self._nb_GPU > 0: - self._fit_GPU(datasets) - else: - self._fit_CPU(datasets) - - def _fit_CPU(self, datasets): # Training parameters train_params = { 'batch_size': self.batch_size, @@ -172,16 +166,22 @@ def _fit_CPU(self, datasets): 'weights': self._weights } + if self._nb_GPU > 0: + train_func = train_func_GPU + else: + train_func = train_func_CPU + # Define trainer / tuner self._trainer = TensorflowTrainer( - train_loop_per_worker=train_func_CPU, + train_loop_per_worker=train_func, train_loop_config=train_params, scaling_config=ScalingConfig( trainer_resources={'CPU': self._nb_CPU_data}, num_workers=self._n_workers, use_gpu=self._use_gpu, resources_per_worker={ - 'CPU': self._nb_CPU_per_worker + 'CPU': self._nb_CPU_per_worker, + 'GPU' : self._nb_GPU_per_worker } ), run_config=RunConfig( @@ -194,21 +194,6 @@ def _fit_CPU(self, datasets): training_result = self._trainer.fit() self._model_ckpt = training_result.best_checkpoints[0][0] - def _fit_GPU(self, datasets): - # Training parameters - train_params = { - 'batch_size': self.batch_size, - 'epochs': self._training_epochs, - 'size': self._nb_kmers, - 'nb_cls': self._nb_classes, - 'taxa': self.taxa, - 'workdir':self._workdir, - 'model': self.classifier, - 'weights': self._weights - } - - self._model_ckpt = train_func_GPU(datasets, train_params) - # Models predicting ######################################################################################################### @@ -241,41 +226,23 @@ def _predict_proba(self, ds): ds = 
ds.materialize() - if self._nb_GPU > 0: - probabilities = self._predict_proba_GPU(ds) - else: - probabilities = self._predict_proba_CPU(ds) + self._predictor = BatchPredictor.from_checkpoint( + self._model_ckpt, + TensorflowPredictor, + model_definition = lambda: build_model(self.classifier, self._nb_classes, self._nb_kmers) + ) + predictions = self._predictor.predict( + data = ds, + feature_columns = [TENSOR_COLUMN_NAME], + batch_size = self.batch_size, + ) + + probabilities = _unwrap_ndarray_object_type_if_needed(predictions.to_pandas()['predictions']) return probabilities else: raise ValueError('No data to predict') - def _predict_proba_CPU(self, ds): - print('_predict_proba_CPU') - self._predictor = BatchPredictor.from_checkpoint( - self._model_ckpt, - TensorflowPredictor, - model_definition = lambda: build_model(self.classifier, self._nb_classes, self._nb_kmers) - ) - predictions = self._predictor.predict( - data = ds, - feature_columns = [TENSOR_COLUMN_NAME], - batch_size = self.batch_size, - ) - - probabilities = _unwrap_ndarray_object_type_if_needed(predictions.to_pandas()['predictions']) - - return probabilities - - def _predict_proba_GPU(self, ds): - print('_predict_proba_GPU') - model = load_model(self._model_ckpt) - probabilities = [] - for batch in ds.iter_tf_batches(batch_size = self.batch_size): - probabilities.extend(model.predict(batch[TENSOR_COLUMN_NAME])) - - return probabilities - @abstractmethod def _get_abs_pred(self): """ @@ -349,62 +316,62 @@ def train_func_CPU(config): gc.collect() tf.keras.backend.clear_session() -def train_func_GPU(datasets, config): + +def train_func_GPU(config): # Parameters batch_size = config.get('batch_size', 128) epochs = config.get('epochs', 10) size = config.get('size') nb_cls = config.get('nb_cls') - taxa = config.get('taxa') - workdir = config.get('workdir') model = config.get('model') weights = config.get('weights') - checkpoint = os.path.join(workdir, model) - - # Data - train_ds = datasets['train'] - val_ds = datasets['validation'] - - # Convert datasets to tensorflow ds & generator - train_ds = train_ds.iterator().to_tf( - feature_columns = TENSOR_COLUMN_NAME, - label_columns = LABELS_COLUMN_NAME, - batch_size = batch_size - ) - val_ds = val_ds.to_tf( - feature_columns = TENSOR_COLUMN_NAME, - label_columns = LABELS_COLUMN_NAME, - batch_size = batch_size - ) - # Model construction - model = build_model(model, nb_cls, size) + strategy = tf.distribute.MirroredStrategy() + with strategy.scope(): + model = build_model(model, nb_cls, size) - # Callbacks - model_file = os.path.join(checkpoint, taxa, '{epoch:03d}.hdf5') - model_csv = os.path.join(checkpoint, taxa, 'training_log.csv') - modelckpt = ModelCheckpoint(filepath=model_file, monitor='val_loss', save_best_only=True, mode='auto') - early = EarlyStopping(monitor='val_loss', mode='auto', patience=10) - csv = CSVLogger(model_csv) - - # Training - hist = model.fit( - train_ds, - epochs = epochs, - validation_data = val_ds, - callbacks = [modelckpt, early, csv], - class_weight = weights, - verbose = 1 - ) - - # Checkpointing - best_model = np.argmin(hist.history['val_loss']) + 1 - best_model = f'{best_model:03d}.hdf5' - best_model = os.path.join(checkpoint, taxa, best_model) - - return best_model + # Data + train_data = session.get_dataset_shard('train') + val_data = session.get_dataset_shard('validation') + for _ in range(epochs): + batch_train = train_data.to_tf( + feature_columns = TENSOR_COLUMN_NAME, + label_columns = LABELS_COLUMN_NAME, + batch_size = batch_size, + 
local_shuffle_buffer_size = batch_size, + local_shuffle_seed = int(np.random.randint(1,10000, size = 1)) + ) + batch_val = val_data.to_tf( + feature_columns = TENSOR_COLUMN_NAME, + label_columns = LABELS_COLUMN_NAME, + batch_size = batch_size, + local_shuffle_buffer_size = batch_size, + local_shuffle_seed = int(np.random.randint(1,10000, size = 1)) + ) + # Training + history = model.fit( + x = batch_train, + validation_data = batch_val, + callbacks = [ReportCheckpointCallback()], + class_weight = weights, + verbose = 0 + ) + # Checkpointing + session.report({ + 'accuracy': history.history['accuracy'][0], + 'loss': history.history['loss'][0], + 'val_accuracy': history.history['val_accuracy'][0], + 'val_loss': history.history['val_loss'][0], + }, + checkpoint=TensorflowCheckpoint.from_model(model) + ) + gc.collect() + tf.keras.backend.clear_session() + del model + gc.collect() + tf.keras.backend.clear_session() def build_model(classifier, nb_cls, nb_kmers): if classifier == 'attention': diff --git a/src/models/kerasTF/models_parallel.py b/src/models/kerasTF/models_linear.py similarity index 76% rename from src/models/kerasTF/models_parallel.py rename to src/models/kerasTF/models_linear.py index bfa04cb..adc81e3 100644 --- a/src/models/kerasTF/models_parallel.py +++ b/src/models/kerasTF/models_linear.py @@ -109,9 +109,9 @@ def __init__( # Computing variables if self._nb_GPU > 0: self._use_gpu = True - self._n_workers = self._nb_GPU / 2 #6 + self._n_workers = self._nb_GPU #6 self._nb_CPU_per_worker = int(self._nb_CPU_training / self._n_workers) # 4 - self._nb_GPU_per_worker = 2 + self._nb_GPU_per_worker = 1 else: self._use_gpu = False self._n_workers = int(self._nb_CPU_training * 0.2) @@ -156,6 +156,12 @@ def fit(self, datasets): ds = ds.materialize() datasets[name] = ds + if self._nb_GPU > 0: + self._fit_GPU(datasets) + else: + self._fit_CPU(datasets) + + def _fit_CPU(self, datasets): # Training parameters train_params = { 'batch_size': self.batch_size, @@ -166,22 +172,16 @@ def fit(self, datasets): 'weights': self._weights } - if self._nb_GPU > 0: - train_func = train_func_GPU - else: - train_func = train_func_CPU - # Define trainer / tuner self._trainer = TensorflowTrainer( - train_loop_per_worker=train_func, + train_loop_per_worker=train_func_CPU, train_loop_config=train_params, scaling_config=ScalingConfig( trainer_resources={'CPU': self._nb_CPU_data}, num_workers=self._n_workers, use_gpu=self._use_gpu, resources_per_worker={ - 'CPU': self._nb_CPU_per_worker, - 'GPU' : self._nb_GPU_per_worker + 'CPU': self._nb_CPU_per_worker } ), run_config=RunConfig( @@ -194,6 +194,21 @@ def fit(self, datasets): training_result = self._trainer.fit() self._model_ckpt = training_result.best_checkpoints[0][0] + def _fit_GPU(self, datasets): + # Training parameters + train_params = { + 'batch_size': self.batch_size, + 'epochs': self._training_epochs, + 'size': self._nb_kmers, + 'nb_cls': self._nb_classes, + 'taxa': self.taxa, + 'workdir':self._workdir, + 'model': self.classifier, + 'weights': self._weights + } + + self._model_ckpt = train_func_GPU(datasets, train_params) + # Models predicting ######################################################################################################### @@ -226,23 +241,41 @@ def _predict_proba(self, ds): ds = ds.materialize() - self._predictor = BatchPredictor.from_checkpoint( - self._model_ckpt, - TensorflowPredictor, - model_definition = lambda: build_model(self.classifier, self._nb_classes, self._nb_kmers) - ) - predictions = self._predictor.predict( - data = 
ds, - feature_columns = [TENSOR_COLUMN_NAME], - batch_size = self.batch_size, - ) - - probabilities = _unwrap_ndarray_object_type_if_needed(predictions.to_pandas()['predictions']) + if self._nb_GPU > 0: + probabilities = self._predict_proba_GPU(ds) + else: + probabilities = self._predict_proba_CPU(ds) return probabilities else: raise ValueError('No data to predict') + def _predict_proba_CPU(self, ds): + print('_predict_proba_CPU') + self._predictor = BatchPredictor.from_checkpoint( + self._model_ckpt, + TensorflowPredictor, + model_definition = lambda: build_model(self.classifier, self._nb_classes, self._nb_kmers) + ) + predictions = self._predictor.predict( + data = ds, + feature_columns = [TENSOR_COLUMN_NAME], + batch_size = self.batch_size, + ) + + probabilities = _unwrap_ndarray_object_type_if_needed(predictions.to_pandas()['predictions']) + + return probabilities + + def _predict_proba_GPU(self, ds): + print('_predict_proba_GPU') + model = load_model(self._model_ckpt) + probabilities = [] + for batch in ds.iter_tf_batches(batch_size = self.batch_size): + probabilities.extend(model.predict(batch[TENSOR_COLUMN_NAME])) + + return probabilities + @abstractmethod def _get_abs_pred(self): """ @@ -316,62 +349,62 @@ def train_func_CPU(config): gc.collect() tf.keras.backend.clear_session() - -def train_func_GPU(config): +def train_func_GPU(datasets, config): # Parameters batch_size = config.get('batch_size', 128) epochs = config.get('epochs', 10) size = config.get('size') nb_cls = config.get('nb_cls') + taxa = config.get('taxa') + workdir = config.get('workdir') model = config.get('model') weights = config.get('weights') - # Model construction - strategy = tf.distribute.MirroredStrategy() - with strategy.scope(): - model = build_model(model, nb_cls, size) + checkpoint = os.path.join(workdir, model) # Data - train_data = session.get_dataset_shard('train') - val_data = session.get_dataset_shard('validation') + train_ds = datasets['train'] + val_ds = datasets['validation'] + + # Convert datasets to tensorflow ds & generator + train_ds = train_ds.iterator().to_tf( + feature_columns = TENSOR_COLUMN_NAME, + label_columns = LABELS_COLUMN_NAME, + batch_size = batch_size + ) + val_ds = val_ds.to_tf( + feature_columns = TENSOR_COLUMN_NAME, + label_columns = LABELS_COLUMN_NAME, + batch_size = batch_size + ) + + # Model construction + model = build_model(model, nb_cls, size) + + # Callbacks + model_file = os.path.join(checkpoint, taxa, '{epoch:03d}.hdf5') + model_csv = os.path.join(checkpoint, taxa, 'training_log.csv') + modelckpt = ModelCheckpoint(filepath=model_file, monitor='val_loss', save_best_only=True, mode='auto') + early = EarlyStopping(monitor='val_loss', mode='auto', patience=10) + csv = CSVLogger(model_csv) + + # Training + hist = model.fit( + train_ds, + epochs = epochs, + validation_data = val_ds, + callbacks = [modelckpt, early, csv], + class_weight = weights, + verbose = 1 + ) + + # Checkpointing + best_model = np.argmin(hist.history['val_loss']) + 1 + best_model = f'{best_model:03d}.hdf5' + best_model = os.path.join(checkpoint, taxa, best_model) + + return best_model - for _ in range(epochs): - batch_train = train_data.to_tf( - feature_columns = TENSOR_COLUMN_NAME, - label_columns = LABELS_COLUMN_NAME, - batch_size = batch_size, - local_shuffle_buffer_size = batch_size, - local_shuffle_seed = int(np.random.randint(1,10000, size = 1)) - ) - batch_val = val_data.to_tf( - feature_columns = TENSOR_COLUMN_NAME, - label_columns = LABELS_COLUMN_NAME, - batch_size = batch_size, - 
local_shuffle_buffer_size = batch_size, - local_shuffle_seed = int(np.random.randint(1,10000, size = 1)) - ) - # Training - history = model.fit( - x = batch_train, - validation_data = batch_val, - callbacks = [ReportCheckpointCallback()], - class_weight = weights, - verbose = 0 - ) - # Checkpointing - session.report({ - 'accuracy': history.history['accuracy'][0], - 'loss': history.history['loss'][0], - 'val_accuracy': history.history['val_accuracy'][0], - 'val_loss': history.history['val_loss'][0], - }, - checkpoint=TensorflowCheckpoint.from_model(model) - ) - gc.collect() - tf.keras.backend.clear_session() - del model - gc.collect() - tf.keras.backend.clear_session() def build_model(classifier, nb_cls, nb_kmers): if classifier == 'attention': From 11e02a1013ec07d60dde01d1ad11ec44391267b1 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Wed, 20 Dec 2023 22:30:12 -0500 Subject: [PATCH 84/92] adjust nb workers to int --- src/models/kerasTF/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index 1e69445..52527c0 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -109,7 +109,7 @@ def __init__( # Computing variables if self._nb_GPU > 0: self._use_gpu = True - self._n_workers = self._nb_GPU / 2 # 3 + self._n_workers = int(self._nb_GPU / 2) # 3 self._nb_CPU_per_worker = int(self._nb_CPU_training / self._n_workers) # 8 self._nb_GPU_per_worker = 2 else: From 8ba9dbe9f116ce3979ef541a984f0c4103643e95 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Wed, 20 Dec 2023 23:02:55 -0500 Subject: [PATCH 85/92] ise all gpus to avoid oom while training --- src/models/kerasTF/models.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index 52527c0..ac02efd 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -109,9 +109,9 @@ def __init__( # Computing variables if self._nb_GPU > 0: self._use_gpu = True - self._n_workers = int(self._nb_GPU / 2) # 3 - self._nb_CPU_per_worker = int(self._nb_CPU_training / self._n_workers) # 8 - self._nb_GPU_per_worker = 2 + self._n_workers = 1 # int(self._nb_GPU / 2) # 3 + self._nb_CPU_per_worker = self._nb_CPU_training # int(self._nb_CPU_training / self._n_workers) # 8 + self._nb_GPU_per_worker = self._nb_GPU # 1 else: self._use_gpu = False self._n_workers = int(self._nb_CPU_training * 0.2) From 468781a43c6b3a8ff976ad82fbbe514d916fc00a Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Wed, 20 Dec 2023 23:14:10 -0500 Subject: [PATCH 86/92] NN no random shuffle / iter --- src/models/kerasTF/models.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index ac02efd..4790656 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -109,9 +109,9 @@ def __init__( # Computing variables if self._nb_GPU > 0: self._use_gpu = True - self._n_workers = 1 # int(self._nb_GPU / 2) # 3 - self._nb_CPU_per_worker = self._nb_CPU_training # int(self._nb_CPU_training / self._n_workers) # 8 - self._nb_GPU_per_worker = self._nb_GPU # 1 + self._n_workers = self._nb_GPU # 6 + self._nb_CPU_per_worker = int(self._nb_CPU_training / self._n_workers) # 4 + self._nb_GPU_per_worker = 1 else: self._use_gpu = False self._n_workers = int(self._nb_CPU_training * 0.2) @@ -283,15 +283,15 @@ def train_func_CPU(config): feature_columns = TENSOR_COLUMN_NAME, 
label_columns = LABELS_COLUMN_NAME, batch_size = batch_size, - local_shuffle_buffer_size = batch_size, - local_shuffle_seed = int(np.random.randint(1,10000, size = 1)) + # local_shuffle_buffer_size = batch_size, + # local_shuffle_seed = int(np.random.randint(1,10000, size = 1)) ) batch_val = val_data.to_tf( feature_columns = TENSOR_COLUMN_NAME, label_columns = LABELS_COLUMN_NAME, batch_size = batch_size, - local_shuffle_buffer_size = batch_size, - local_shuffle_seed = int(np.random.randint(1,10000, size = 1)) + # local_shuffle_buffer_size = batch_size, + # local_shuffle_seed = int(np.random.randint(1,10000, size = 1)) ) # Training history = model.fit( @@ -340,15 +340,15 @@ def train_func_GPU(config): feature_columns = TENSOR_COLUMN_NAME, label_columns = LABELS_COLUMN_NAME, batch_size = batch_size, - local_shuffle_buffer_size = batch_size, - local_shuffle_seed = int(np.random.randint(1,10000, size = 1)) + # local_shuffle_buffer_size = batch_size, + # local_shuffle_seed = int(np.random.randint(1,10000, size = 1)) ) batch_val = val_data.to_tf( feature_columns = TENSOR_COLUMN_NAME, label_columns = LABELS_COLUMN_NAME, batch_size = batch_size, - local_shuffle_buffer_size = batch_size, - local_shuffle_seed = int(np.random.randint(1,10000, size = 1)) + # local_shuffle_buffer_size = batch_size, + # local_shuffle_seed = int(np.random.randint(1,10000, size = 1)) ) # Training history = model.fit( From 07a223294fc9e85c6c84d2b829168b2ad86ff7ca Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Thu, 21 Dec 2023 20:24:48 -0500 Subject: [PATCH 87/92] NN mixed precision --- src/models/kerasTF/build_neural_networks.py | 24 +++++++++++++-------- src/models/kerasTF/models.py | 11 +++++----- 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/src/models/kerasTF/build_neural_networks.py b/src/models/kerasTF/build_neural_networks.py index 12893f3..1a0fec0 100644 --- a/src/models/kerasTF/build_neural_networks.py +++ b/src/models/kerasTF/build_neural_networks.py @@ -4,10 +4,11 @@ from tensorflow.keras.losses import BinaryCrossentropy, CategoricalCrossentropy from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Conv1D, Conv2D, MaxPooling1D, MaxPooling2D, Concatenate, Flatten, Attention, Activation, Bidirectional, Reshape, AveragePooling1D - - +from tensorflow.keras import mixed_precision from models.kerasTF.attentionLayer import AttentionWeightedAverage +mixed_precision.set_global_policy('mixed_float16') + __author__ = "Nicolas de Montigny" __all__ = ['build_attention','build_LSTM','build_deepLSTM','build_LSTM_attention','build_CNN','build_wideCNN'] @@ -28,7 +29,8 @@ def build_attention(nb_features): x = Dense(128, activation = "relu")(x) x = Dropout(0.1)(x) - x = Dense(1, activation = "sigmoid")(x) + x = Dense(1)(x) + x = Activation(activation = "sigmoid", dtype = 'float32')(x) model = Model(inputs = inputs, outputs = x) model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'], jit_compile = True) @@ -48,8 +50,9 @@ def build_LSTM(nb_features): x = LSTM(128, dropout = 0.1)(inputs) - x = Dense(1, activation = 'tanh')(x) - + x = Dense(1)(x) + x = Activation(activation = "tanh", dtype = 'float32')(x) + model = Model(inputs = inputs, outputs = x) model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'], jit_compile = True) @@ -82,7 +85,9 @@ def build_deepLSTM(nb_features): net = Dense(10, activation='relu', name='D_%d'%10)(net) net = Dropout(0.1,name='fr_same')(net) - outputs = Dense(1, activation='sigmoid', name='score')(net) + net = 
Dense(1)(net) + outputs = Activation(activation = "sigmoid", dtype = 'float32')(net) + model = Model(inputs=inputs, outputs=outputs) model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'], jit_compile = True) @@ -108,7 +113,7 @@ def build_LSTM_attention(nb_features, nb_classes): net = Dropout(0.2)(net) net = Flatten()(net) net = Dense(nb_classes)(net) - outputs = Activation('softmax')(net) + outputs = Activation('softmax', dtype = 'float32')(net) model = Model(inputs = inputs, outputs = outputs) model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'], jit_compile = True) @@ -135,7 +140,7 @@ def build_CNN(nb_features, nb_classes): model.add(Activation('relu')) model.add(Dropout(0.5)) model.add(Dense(nb_classes)) - model.add(Activation('softmax')) + model.add(Activation('softmax', dtype = 'float32')) model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'], jit_compile = True) return model @@ -172,7 +177,8 @@ def build_wideCNN(nb_features, nb_classes): net = Dropout(0.5)(net) net = Dense(nb_classes)(net) - outputs = Activation('softmax')(net) + outputs = Activation('softmax', dtype = 'float32')(net) + model = Model(inputs = inputs, outputs = outputs) model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'], jit_compile = True) diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index 4790656..67d7fb9 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -47,6 +47,7 @@ os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' warnings.filterwarnings('ignore') + class KerasTFModels(ModelsUtils, ABC): """ Class used to build, train and predict models using Ray with Keras Tensorflow backend @@ -100,17 +101,17 @@ def __init__( ) # Parameters # Initialize hidden - self._nb_CPU_data = int(os.cpu_count() * 0.2) # 6 - self._nb_CPU_training = int(os.cpu_count() - self._nb_CPU_data) # 26 - self._nb_GPU = len(tf.config.list_physical_devices('GPU')) # 6 + self._nb_CPU_data = int(os.cpu_count() * 0.2) # 9 + self._nb_CPU_training = int(os.cpu_count() - self._nb_CPU_data) # 39 + self._nb_GPU = len(tf.config.list_physical_devices('GPU')) # 4 # Initialize empty self._nb_CPU_per_worker = 0 self._nb_GPU_per_worker = 0 # Computing variables if self._nb_GPU > 0: self._use_gpu = True - self._n_workers = self._nb_GPU # 6 - self._nb_CPU_per_worker = int(self._nb_CPU_training / self._n_workers) # 4 + self._n_workers = self._nb_GPU # 4 + self._nb_CPU_per_worker = int(self._nb_CPU_training / self._n_workers) # 9 self._nb_GPU_per_worker = 1 else: self._use_gpu = False From 265682300c8b3e57291ae79e984568bddc4cadd1 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Thu, 21 Dec 2023 21:07:22 -0500 Subject: [PATCH 88/92] distribution strategy in TF --- src/models/kerasTF/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index 67d7fb9..5dce3a2 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -328,7 +328,7 @@ def train_func_GPU(config): weights = config.get('weights') # Model construction - strategy = tf.distribute.MirroredStrategy() + strategy = tf.distribute.MultiWorkerMirroredStrategy() with strategy.scope(): model = build_model(model, nb_cls, size) From 01946bcdce3ac5e5aa4f2cef2189ab5e1e329300 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Thu, 21 Dec 2023 23:14:42 -0500 Subject: [PATCH 89/92] Keras NN Mirrored Workers fo CCDB 
GPUs --- src/models/kerasTF/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index 5dce3a2..67d7fb9 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -328,7 +328,7 @@ def train_func_GPU(config): weights = config.get('weights') # Model construction - strategy = tf.distribute.MultiWorkerMirroredStrategy() + strategy = tf.distribute.MirroredStrategy() with strategy.scope(): model = build_model(model, nb_cls, size) From 32bebc53e4af127ec2b7f84355d070994973536c Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Fri, 22 Dec 2023 14:40:59 -0500 Subject: [PATCH 90/92] bagging for NN predictions --- src/models/kerasTF/binary_models.py | 157 ------------------- src/models/kerasTF/build_neural_networks.py | 6 +- src/models/kerasTF/models.py | 61 +++++--- src/models/kerasTF/multiclass_models.py | 161 -------------------- 4 files changed, 45 insertions(+), 340 deletions(-) diff --git a/src/models/kerasTF/binary_models.py b/src/models/kerasTF/binary_models.py index b03165b..0a80e2d 100644 --- a/src/models/kerasTF/binary_models.py +++ b/src/models/kerasTF/binary_models.py @@ -102,166 +102,9 @@ def __init__( elif self.classifier == 'deeplstm': print('Training bacterial / host classifier based on Deep LSTM Neural Network') - # Data preprocessing - ######################################################################################################### - """ - def preprocess(self, ds, scaling = False, scaler_file = None): - print('preprocess') - # Labels encoding - self._encoder = ModelLabelEncoder(self.taxa) - self._encoder.fit(ds) - - # Labels mapping - labels = list(self._encoder.stats_[f'unique_values({self.taxa})'].keys()) - self._nb_classes = len(labels) - self._encoded = np.arange(len(labels)) - labels = np.append(labels, 'Unknown') - self._encoded = np.append(self._encoded, -1) - - for (label, encoded) in zip(labels, self._encoded): - self._labels_map[label] = encoded - - # Class weights - self._weights = self._compute_weights() - - # Scaling - # self._scaler = TensorTfIdfTransformer( - # features = self.kmers, - # file = scaler_file - # ) - # self._scaler = TensorMinMaxScaler(self._nb_kmers) - # self._scaler.fit(ds) - """ - # Model training - ######################################################################################################### - - """ - def fit(self, datasets): - print('fit') - # Preprocessing loop - for name, ds in datasets.items(): - # ds = ds.drop_columns(['id']) - ds = self._encoder.transform(ds) - # ds = self._scaler.transform(ds) - ds = ds.materialize() - datasets[name] = ds - - if self._nb_GPU > 0: - self._fit_GPU(datasets) - else: - self._fit_CPU(datasets) - - def _fit_CPU(self, datasets): - # Training parameters - train_params = { - 'batch_size': self.batch_size, - 'epochs': self._training_epochs, - 'size': self._nb_kmers, - 'nb_cls': self._nb_classes, - 'model': self.classifier, - 'weights': self._weights - } - - # Define trainer / tuner - self._trainer = TensorflowTrainer( - train_loop_per_worker=train_func_CPU, - train_loop_config=train_params, - scaling_config=ScalingConfig( - trainer_resources={'CPU': self._nb_CPU_data}, - num_workers=self._n_workers, - use_gpu=self._use_gpu, - resources_per_worker={ - 'CPU': self._nb_CPU_per_worker - } - ), - run_config=RunConfig( - name=self.classifier, - local_dir=self._workdir, - ), - datasets=datasets, - ) - - training_result = self._trainer.fit() - self._model_ckpt = training_result.best_checkpoints[0][0] 
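# For reference, a hedged sketch of the resource arithmetic that the recent
# ScalingConfig changes converge on: one Ray worker per GPU, roughly 20% of
# the CPUs reserved for Ray Data, and the remaining CPUs split evenly across
# workers. Variable names are illustrative, not project API.
#
#   import os
#   import tensorflow as tf
#   from ray.air.config import ScalingConfig
#
#   nb_cpus = os.cpu_count()
#   nb_gpus = len(tf.config.list_physical_devices('GPU'))
#   nb_cpu_data = int(nb_cpus * 0.2)              # reserved for Ray Data tasks
#   nb_cpu_train = nb_cpus - nb_cpu_data
#   n_workers = nb_gpus if nb_gpus > 0 else max(1, int(nb_cpu_train * 0.2))
#   resources_per_worker = {'CPU': nb_cpu_train // n_workers}
#   if nb_gpus > 0:
#       resources_per_worker['GPU'] = 1           # one GPU per Ray worker
#   scaling = ScalingConfig(
#       trainer_resources={'CPU': nb_cpu_data},
#       num_workers=n_workers,
#       use_gpu=nb_gpus > 0,
#       resources_per_worker=resources_per_worker,
#   )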
- - def _fit_GPU(self, datasets): - # Training parameters - train_params = { - 'batch_size': self.batch_size, - 'epochs': self._training_epochs, - 'size': self._nb_kmers, - 'nb_cls': self._nb_classes, - 'taxa': self.taxa, - 'workdir':self._workdir, - 'model': self.classifier, - 'weights': self._weights - } - - self._model_ckpt = train_func_GPU(datasets, train_params) - """ # Model predicting ######################################################################################################### - - """ - def predict(self, ds): - print('predict') - # Predict with model - probabilities = self._predict_proba(ds) - # Convert predictions to labels - predictions = self._get_abs_pred(probabilities) - # Return decoded labels - return self._label_decode(predictions) - - def predict_proba(self, ds, threshold = 0.8): - print('predict_proba') - # Predict with model - probabilities = self._predict_proba(ds) - # Convert predictions to labels with threshold - predictions = self._get_threshold_pred(probabilities, threshold) - # Return decoded labels - return self._label_decode(predictions) - def _predict_proba(self, ds): - if ds.count() > 0: - if len(ds.schema().names) > 1: - col_2_drop = [col for col in ds.schema().names if col != TENSOR_COLUMN_NAME] - ds = ds.drop_columns(col_2_drop) - - ds = ds.materialize() - - if self._nb_GPU > 0: - probabilities = self._predict_proba_GPU(ds) - else: - probabilities = self._predict_proba_CPU(ds) - - return probabilities - - else: - raise ValueError('No data to predict') - def _predict_proba_CPU(self, ds): - print('_predict_proba_CPU') - self._predictor = BatchPredictor.from_checkpoint( - self._model_ckpt, - TensorflowPredictor, - model_definition = lambda: build_model(self.classifier, self._nb_classes, self._nb_kmers) - ) - predictions = self._predictor.predict( - data = ds, - feature_columns = [TENSOR_COLUMN_NAME], - batch_size = self.batch_size, - ) - - probabilities = _unwrap_ndarray_object_type_if_needed(probabilities.to_pandas()['predictions']) - - return predictions - - def _predict_proba_GPU(self, ds): - print('_predict_proba_GPU') - model = load_model(self._model_ckpt) - probabilities = [] - for batch in ds.iter_tf_batches(batch_size = self.batch_size): - probabilities.extend(model.predict(batch[TENSOR_COLUMN_NAME])) - """ def _get_abs_pred(self, predictions): print('_get_abs_pred') return np.round(np.ravel(predictions)) diff --git a/src/models/kerasTF/build_neural_networks.py b/src/models/kerasTF/build_neural_networks.py index 1a0fec0..eb36d77 100644 --- a/src/models/kerasTF/build_neural_networks.py +++ b/src/models/kerasTF/build_neural_networks.py @@ -1,3 +1,4 @@ +import tensorflow as tf from keras.models import Model, Sequential from tensorflow.keras import mixed_precision @@ -7,7 +8,10 @@ from tensorflow.keras import mixed_precision from models.kerasTF.attentionLayer import AttentionWeightedAverage -mixed_precision.set_global_policy('mixed_float16') +if len(tf.config.list_physical_devices('GPU')) > 0: + mixed_precision.set_global_policy('mixed_float16') +else: + mixed_precision.set_global_policy('mixed_bfloat16') __author__ = "Nicolas de Montigny" diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index 67d7fb9..c916afa 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -4,6 +4,8 @@ import numpy as np import pandas as pd +from glob import glob + # Class construction from abc import ABC, abstractmethod @@ -193,8 +195,13 @@ def fit(self, datasets): ) training_result = self._trainer.fit() - self._model_ckpt 
= training_result.best_checkpoints[0][0] - + # self._model_ckpt = training_result.best_checkpoints[0][0] + self._model_ckpt = glob( + os.path.join( + os.path.dirname(training_result.best_checkpoints[0][0].path),'checkpoint_*' + ) + ) + # Models predicting ######################################################################################################### @@ -219,26 +226,38 @@ def predict_proba(self, ds, threshold = 0.8): def _predict_proba(self, ds): print('_predict_proba') if ds.count() > 0: - if len(ds.schema().names) > 1: - col_2_drop = [col for col in ds.schema().names if col != TENSOR_COLUMN_NAME] - ds = ds.drop_columns(col_2_drop) + # if len(ds.schema().names) > 1: + # col_2_drop = [col for col in ds.schema().names if col != TENSOR_COLUMN_NAME] + # ds = ds.drop_columns(col_2_drop) ds = self._scaler.transform(ds) - ds = ds.materialize() - self._predictor = BatchPredictor.from_checkpoint( - self._model_ckpt, - TensorflowPredictor, - model_definition = lambda: build_model(self.classifier, self._nb_classes, self._nb_kmers) - ) - predictions = self._predictor.predict( - data = ds, - feature_columns = [TENSOR_COLUMN_NAME], - batch_size = self.batch_size, - ) - - probabilities = _unwrap_ndarray_object_type_if_needed(predictions.to_pandas()['predictions']) + def predict_func(data): + X = _unwrap_ndarray_object_type_if_needed(data[TENSOR_COLUMN_NAME]) + pred = np.zeros((len(X), len(self._labels_map)-1)) + for ckpt in self._model_ckpt: + ckpt = TensorflowCheckpoint.from_directory(ckpt) + predictor = TensorflowPredictor().from_checkpoint(ckpt, model_definition = lambda: build_model('cnn', self._nb_classes, self._nb_kmers)) + proba = predictor.predict(X) + pred += proba['predictions'] + pred = pred / len(self._model_ckpt) + return {'predictions' : pred} + + probabilities = ds.map_batches(predict_func, batch_format = 'numpy') + probabilities = _unwrap_ndarray_object_type_if_needed(probabilities.to_pandas()['predictions']) + + # self._predictor = BatchPredictor.from_checkpoint( + # self._model_ckpt, + # TensorflowPredictor, + # model_definition = lambda: build_model(self.classifier, self._nb_classes, self._nb_kmers) + # ) + # predictions = self._predictor.predict( + # data = ds, + # feature_columns = [TENSOR_COLUMN_NAME], + # batch_size = self.batch_size, + # ) + # probabilities = _unwrap_ndarray_object_type_if_needed(predictions.to_pandas()['predictions']) return probabilities else: @@ -370,9 +389,9 @@ def train_func_GPU(config): ) gc.collect() tf.keras.backend.clear_session() - del model - gc.collect() - tf.keras.backend.clear_session() + # del model + # gc.collect() + # tf.keras.backend.clear_session() def build_model(classifier, nb_cls, nb_kmers): if classifier == 'attention': diff --git a/src/models/kerasTF/multiclass_models.py b/src/models/kerasTF/multiclass_models.py index 99bf6bd..4e564da 100644 --- a/src/models/kerasTF/multiclass_models.py +++ b/src/models/kerasTF/multiclass_models.py @@ -99,170 +99,9 @@ def __init__( ) self._nb_classes = None - # Data preprocessing - ######################################################################################################### - - """ - def preprocess(self, ds, scaling = False, scaler_file = None): - print('preprocess') - # Labels encoding - self._encoder = ModelLabelEncoder(self.taxa) - self._encoder.fit(ds) - - # Labels mapping - labels = list(self._encoder.stats_[f'unique_values({self.taxa})'].keys()) - self._nb_classes = len(labels) - self._encoded = np.arange(len(labels)) - labels = np.append(labels, 'Unknown') - self._encoded = 
np.append(self._encoded, -1) - - for (label, encoded) in zip(labels, self._encoded): - self._labels_map[label] = encoded - - # Class weights - self._weights = self._compute_weights() - - # Scaling - # self._scaler = TensorTfIdfTransformer( - # features = self.kmers, - # file = scaler_file - # ) - # self._scaler = TensorMinMaxScaler(self._nb_kmers) - # self._scaler.fit(ds) - """ - # Models training - ######################################################################################################### - - """ - def fit(self, datasets): - print('fit') - # Preprocessing loop - for name, ds in datasets.items(): - # ds = ds.drop_columns(['id']) - ds = self._encoder.transform(ds) - # ds = self._scaler.transform(ds) - ds = ds.materialize() - datasets[name] = ds - - if self._nb_GPU > 0: - self._fit_GPU(datasets) - else: - self._fit_CPU(datasets) - - def _fit_CPU(self, datasets): - # Training parameters - train_params = { - 'batch_size': self.batch_size, - 'epochs': self._training_epochs, - 'size': self._nb_kmers, - 'nb_cls': self._nb_classes, - 'model': self.classifier, - 'weights': self._weights - } - - # Define trainer / tuner - self._trainer = TensorflowTrainer( - train_loop_per_worker=train_func_CPU, - train_loop_config=train_params, - scaling_config=ScalingConfig( - trainer_resources={'CPU': self._nb_CPU_data}, - num_workers=self._n_workers, - use_gpu=self._use_gpu, - resources_per_worker={ - 'CPU': self._nb_CPU_per_worker - } - ), - run_config=RunConfig( - name=self.classifier, - local_dir=self._workdir, - ), - datasets=datasets, - ) - - training_result = self._trainer.fit() - self._model_ckpt = training_result.best_checkpoints[0][0] - - def _fit_GPU(self, datasets): - # Training parameters - train_params = { - 'batch_size': self.batch_size, - 'epochs': self._training_epochs, - 'size': self._nb_kmers, - 'nb_cls': self._nb_classes, - 'taxa': self.taxa, - 'workdir':self._workdir, - 'model': self.classifier, - 'weights': self._weights - } - - self._model_ckpt = train_func_GPU(datasets, train_params) - """ # Models predicting ######################################################################################################### - """ - def predict(self, ds): - print('predict') - # Predict with model - probabilities = self._predict_proba(ds) - # Convert predictions to labels - predictions = self._get_abs_pred(probabilities) - # Return decoded labels - return self._label_decode(predictions) - - def predict_proba(self, ds, threshold = 0.8): - print('predict_proba') - # Predict with model - probabilities = self._predict_proba(ds) - # Convert predictions to labels with threshold - predictions = self._get_threshold_pred(probabilities, threshold) - # Return decoded labels - return self._label_decode(predictions) - - def _predict_proba(self, ds): - print('_predict_proba') - if ds.count() > 0: - if len(ds.schema().names) > 1: - col_2_drop = [col for col in ds.schema().names if col != TENSOR_COLUMN_NAME] - ds = ds.drop_columns(col_2_drop) - - ds = ds.materialize() - - if self._nb_GPU > 0: - probabilities = self._predict_proba_GPU(ds) - else: - probabilities = self._predict_proba_CPU(ds) - - return probabilities - else: - raise ValueError('No data to predict') - - def _predict_proba_CPU(self, ds): - print('_predict_proba_CPU') - self._predictor = BatchPredictor.from_checkpoint( - self._model_ckpt, - TensorflowPredictor, - model_definition = lambda: build_model(self.classifier, self._nb_classes, self._nb_kmers) - ) - predictions = self._predictor.predict( - data = ds, - feature_columns = 
[TENSOR_COLUMN_NAME], - batch_size = self.batch_size, - ) - - probabilities = _unwrap_ndarray_object_type_if_needed(probabilities.to_pandas()['predictions']) - - return probabilities - - def _predict_proba_GPU(self, ds): - print('_predict_proba_GPU') - model = load_model(self._model_ckpt) - probabilities = [] - for batch in ds.iter_tf_batches(batch_size = self.batch_size): - probabilities.extend(model.predict(batch[TENSOR_COLUMN_NAME])) - - return probabilities - """ def _get_abs_pred(self, predictions): print('_get_abs_pred') return np.argmax(predictions, axis = 1) From b685c4c5d3bfbb0f59661b6fe7f26013ec97aa62 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Sat, 23 Dec 2023 20:38:11 -0500 Subject: [PATCH 91/92] scripts default options --- src/Caribou_classification.py | 2 +- src/Caribou_classification_train_cv.py | 2 +- src/Caribou_extraction.py | 2 +- src/Caribou_pipeline.py | 4 ++-- src/models/kerasTF/models.py | 17 ++--------------- 5 files changed, 7 insertions(+), 20 deletions(-) diff --git a/src/Caribou_classification.py b/src/Caribou_classification.py index 9a9473c..eef5813 100644 --- a/src/Caribou_classification.py +++ b/src/Caribou_classification.py @@ -108,7 +108,7 @@ def bacteria_classification(opt): # Optional datasets parser.add_argument('-v','--validation', default=None, type=Path, help='PATH to a npz file containing the k-mers profile for the validation dataset') # Parameters - parser.add_argument('-model','--model_type', default='lstm_attention', choices=['sgd','mnb','lstm_attention','cnn','widecnn'], help='The type of model to train') + parser.add_argument('-model','--model_type', default='sgd', choices=['sgd','mnb','lstm_attention','cnn','widecnn'], help='The type of model to train') parser.add_argument('-tx','--taxa', default=None, help='The taxonomic level to use for the classification, defaults to species. Can be one level or a list of levels separated by commas.') parser.add_argument('-bs','--batch_size', default=32, type=int, help='Size of the batch size to use, defaults to 32') parser.add_argument('-e','--training_epochs', default=100, type=int, help='The number of training iterations for the neural networks models if one ise chosen, defaults to 100') diff --git a/src/Caribou_classification_train_cv.py b/src/Caribou_classification_train_cv.py index f6d1422..09d957f 100644 --- a/src/Caribou_classification_train_cv.py +++ b/src/Caribou_classification_train_cv.py @@ -104,7 +104,7 @@ def bacteria_classification_train_cv(opt): parser.add_argument('-v','--validation', default=None, type=Path, help='PATH to a npz file containing the k-mers profile for the validation dataset') parser.add_argument('-t','--test', default=None, type=Path, help='PATH to a npz file containing the k-mers profile for the test dataset') # Parameters - parser.add_argument('-model','--model_type', default='lstm_attention', choices=['sgd','mnb','lstm_attention','cnn','widecnn'], help='The type of model to train') + parser.add_argument('-model','--model_type', required=True, choices=['sgd','mnb','lstm_attention','cnn','widecnn'], help='The type of model to train') parser.add_argument('-tx','--taxa', default=None, help='The taxonomic level to use for the classification, defaults to None. 
Can be one level or a list of levels separated by commas.') parser.add_argument('-bs','--batch_size', default=32, type=int, help='Size of the batch size to use, defaults to 32') parser.add_argument('-e','--training_epochs', default=100, type=int, help='The number of training iterations for the neural networks models if one ise chosen, defaults to 100') diff --git a/src/Caribou_extraction.py b/src/Caribou_extraction.py index 6228230..90ceebe 100644 --- a/src/Caribou_extraction.py +++ b/src/Caribou_extraction.py @@ -127,7 +127,7 @@ def bacteria_extraction(opt): parser.add_argument('-m','--merged', default=None, type=Path, help='PATH to a npz file containing the k-mers profile for the merged bacteria and host databases') parser.add_argument('-v','--validation', default=None, type=Path, help='PATH to a npz file containing the k-mers profile for the validation dataset') # Parameters - parser.add_argument('-model','--model_type', default=None, choices=[None,'onesvm','linearsvm','attention','lstm','deeplstm'], help='The type of model to train') + parser.add_argument('-model','--model_type', default='linearsvm', choices=[None,'onesvm','linearsvm','attention','lstm','deeplstm'], help='The type of model to train') parser.add_argument('-bs','--batch_size', default=32, type=int, help='Size of the batch size to use, defaults to 32') parser.add_argument('-e','--training_epochs', default=100, type=int, help='The number of training iterations for the neural networks models if one ise chosen, defaults to 100') parser.add_argument('-o','--outdir', required=True, type=Path, help='PATH to a directory on file where outputs will be saved') diff --git a/src/Caribou_pipeline.py b/src/Caribou_pipeline.py index a6fdb0b..97b42f8 100644 --- a/src/Caribou_pipeline.py +++ b/src/Caribou_pipeline.py @@ -43,8 +43,8 @@ def caribou(opt): # settings k_length = config.getint('settings', 'k', fallback = 35) cv = config.getboolean('settings', 'cross_validation', fallback = True) - binary_classifier = config.get('settings', 'host_extractor', fallback = 'attention') - multi_classifier = config.get('settings', 'bacteria_classifier', fallback = 'lstm_attention') + binary_classifier = config.get('settings', 'host_extractor', fallback = 'linearsvm') + multi_classifier = config.get('settings', 'bacteria_classifier', fallback = 'sgd') training_batch_size = config.getint('settings', 'training_batch_size', fallback = 32) training_epochs = config.getint('settings','neural_network_training_iterations', fallback = 100) classif_threshold = config.getfloat('settings', 'classification_threshold', fallback = 0.8) diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index c916afa..3bcea47 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -226,9 +226,6 @@ def predict_proba(self, ds, threshold = 0.8): def _predict_proba(self, ds): print('_predict_proba') if ds.count() > 0: - # if len(ds.schema().names) > 1: - # col_2_drop = [col for col in ds.schema().names if col != TENSOR_COLUMN_NAME] - # ds = ds.drop_columns(col_2_drop) ds = self._scaler.transform(ds) ds = ds.materialize() @@ -247,18 +244,6 @@ def predict_func(data): probabilities = ds.map_batches(predict_func, batch_format = 'numpy') probabilities = _unwrap_ndarray_object_type_if_needed(probabilities.to_pandas()['predictions']) - # self._predictor = BatchPredictor.from_checkpoint( - # self._model_ckpt, - # TensorflowPredictor, - # model_definition = lambda: build_model(self.classifier, self._nb_classes, self._nb_kmers) - # ) - # predictions = 
self._predictor.predict( - # data = ds, - # feature_columns = [TENSOR_COLUMN_NAME], - # batch_size = self.batch_size, - # ) - # probabilities = _unwrap_ndarray_object_type_if_needed(predictions.to_pandas()['predictions']) - return probabilities else: raise ValueError('No data to predict') @@ -314,6 +299,7 @@ def train_func_CPU(config): # local_shuffle_seed = int(np.random.randint(1,10000, size = 1)) ) # Training + # TODO: Move epochs to model.fit instead of in loop? history = model.fit( x = batch_train, validation_data = batch_val, @@ -371,6 +357,7 @@ def train_func_GPU(config): # local_shuffle_seed = int(np.random.randint(1,10000, size = 1)) ) # Training + # TODO: Move epochs to model.fit instead of in loop? history = model.fit( x = batch_train, validation_data = batch_val, From a7dd1ece9490c104b0c37e1d2cca91fcf5021084 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Sat, 23 Dec 2023 21:34:32 -0500 Subject: [PATCH 92/92] recurrent predictions debug --- src/Caribou_classification.py | 2 +- src/models/classification.py | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/Caribou_classification.py b/src/Caribou_classification.py index eef5813..1aae153 100644 --- a/src/Caribou_classification.py +++ b/src/Caribou_classification.py @@ -70,7 +70,7 @@ def bacteria_classification(opt): outdirs = outdirs, db_name = opt['database_name'], clf_multiclass = opt['model_type'], - taxa = 'domain', + taxa = lst_taxas, batch_size = opt['batch_size'], training_epochs = opt['training_epochs'], scaling = scaling diff --git a/src/models/classification.py b/src/models/classification.py index c164bf8..a965347 100644 --- a/src/models/classification.py +++ b/src/models/classification.py @@ -76,7 +76,7 @@ def fit(self, datasets): self._valid_assign_taxas() self._valid_classifier() tax_map = self._verify_model_trained() - + self._fit(datasets, tax_map) def predict(self, dataset): @@ -263,12 +263,12 @@ def _remove_unknown(self, ds, predict): 'ids' : ids, 'predictions' : predict }) - mapping = mapping[mapping['predictions'] != -1] + mapping = mapping[mapping['predictions'] != 'Unknown'] ids = mapping['ids'] predict = mapping['predictions'] def remove_unknown(df): - df = df[df['ids'].isin(ids)] + df = df[df['id'].isin(ids)] return df ds = ds.map_batches(remove_unknown, batch_format = 'pandas') @@ -351,6 +351,7 @@ def _valid_assign_taxas(self): self._taxas = [self._taxas] else: raise ValueError("Invalid taxa option, it must either be absent/None, be a list of taxas to extract or a string identifiying a taxa to extract") + self._valid_taxas() self._taxas = [taxa for taxa in self._database_data['taxas'] if taxa in self._taxas] self._taxas.reverse() @@ -443,7 +444,8 @@ def _save_dataset(self, ds, taxa): model = self._classifier_binary else: model = self._classifier_multiclass - file = os.path.join(self._outdirs['results'], f'data_classified_{model}_{taxa}.parquet') + file = os.path.join(self._outdirs['results_dir'], f'data_classified_{model}_{taxa}') + ds.write_parquet(file) return file \ No newline at end of file
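
The bagging introduced in PATCH 90 averages the class probabilities of every per-epoch checkpoint instead of keeping only the single best one. Stripped of the Ray plumbing, the idea reduces to the sketch below. It assumes the per-epoch weights can be restored as Keras models; `checkpoint_paths`, `X` and the -1 code for unclassified reads are illustrative stand-ins for the project's own checkpoint list, feature tensor and label mapping.

import numpy as np
import tensorflow as tf

def ensemble_predict_proba(checkpoint_paths, X, batch_size=128):
    # Average class probabilities over several saved models (simple bagging).
    proba_sum = None
    for path in checkpoint_paths:
        model = tf.keras.models.load_model(path)  # assumes Keras-loadable weights
        proba = model.predict(X, batch_size=batch_size, verbose=0)
        proba_sum = proba if proba_sum is None else proba_sum + proba
    return proba_sum / len(checkpoint_paths)

def threshold_decisions(proba, threshold=0.8):
    # Keep the argmax only when its averaged probability clears the threshold,
    # otherwise flag the read as unclassified (-1), in the spirit of the
    # threshold / 'Unknown' handling used above.
    best = np.argmax(proba, axis=1)
    best_p = proba[np.arange(len(proba)), best]
    return np.where(best_p >= threshold, best, -1)

Averaging over the per-epoch checkpoints acts as a cheap ensemble and smooths out the epoch-to-epoch variance that the one-epoch-at-a-time training loop produces.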