From 14c529839cb29efe09e77157d9516d154732e661 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Wed, 11 Oct 2023 10:56:53 -0400 Subject: [PATCH 01/92] faster method for dropping pandas columns --- setup.cfg | 2 +- src/Caribou_reduce_features.py | 10 ++----- src/data/reduction/chi2_selection.py | 17 ++++++----- src/data/reduction/occurence_exclusion.py | 36 ++++++++++------------- 4 files changed, 29 insertions(+), 36 deletions(-) diff --git a/setup.cfg b/setup.cfg index 917c168..6e6c672 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = Caribou -version = 1.3.1 +version = 1.4.1 url = https://github.com/bioinfoUQAM/Caribou/wiki author = Nicolas de Montigny author_email = de_montigny.nicolas@courrier.uqam.ca diff --git a/src/Caribou_reduce_features.py b/src/Caribou_reduce_features.py index 2ca05fc..3e735c7 100644 --- a/src/Caribou_reduce_features.py +++ b/src/Caribou_reduce_features.py @@ -38,11 +38,6 @@ def features_reduction(opt): if opt['model_type'] is None: opt['model_type'] = 'cnn' """ - - # Validate training parameters - verify_positive_int(opt['batch_size'], 'batch_size') - verify_positive_int(opt['training_epochs'], 'number of training iterations') - outdirs = define_create_outdirs(opt['outdir']) # Initialize cluster @@ -70,6 +65,9 @@ def features_reduction(opt): # Save reduced dataset data['profile'] = f"{data['profile']}_reduced" ds.write_parquet(data['profile']) + # Save reduced K-mers + with open(os.path.join(outdirs["data_dir"],'kmers_list.txt'),'w') as handle: + handle.writelines("%s\n" % item for item in data['kmers']) # Save reduced data path, ext = os.path.splitext(opt['dataset']) data_file = f'{path}_reduced{ext}' @@ -111,8 +109,6 @@ def chi2selection(ds, kmers): parser.add_argument('-dt','--dataset_name', default='dataset', help='Name of the dataset used to name files') parser.add_argument('-l','--kmers_list', default=None, type=Path, help='PATH to a file containing a list of k-mers that will be reduced') # Parameters - parser.add_argument('-bs','--batch_size', default=32, type=int, help='Size of the batch size to use, defaults to 32') - parser.add_argument('-e','--training_epochs', default=100, type=int, help='The number of training iterations for the neural networks models if one ise chosen, defaults to 100') parser.add_argument('-o','--outdir', required=True, type=Path, help='PATH to a directory on file where outputs will be saved') parser.add_argument('-wd','--workdir', default='/tmp/spill', type=Path, help='Optional. 
Path to a working directory where tuning data will be spilled') args = parser.parse_args() diff --git a/src/data/reduction/chi2_selection.py b/src/data/reduction/chi2_selection.py index ee1dff9..f0b9897 100644 --- a/src/data/reduction/chi2_selection.py +++ b/src/data/reduction/chi2_selection.py @@ -27,7 +27,7 @@ def __init__(self, features: List[str], threshold: float = 0.05): def _fit(self, ds: Dataset) -> Preprocessor: mean_chi = [] cols_keep = [] - cols_drop = [] + # cols_drop = [] # Compute chi2 over batches for batch in ds.iter_batches(batch_size = 5, batch_format = 'pandas'): X = batch[TENSOR_COLUMN_NAME].to_numpy() @@ -49,22 +49,23 @@ def _fit(self, ds: Dataset) -> Preprocessor: cols_keep = self.features warn('No values were found to have a chi2 p-value under the threshold, all features will be kept.\ You can try running this feature selector again with a different threshold to reduce the number of features') - else: - cols_drop = list(set(self.features).difference(set(cols_keep))) + # else: + # cols_drop = list(set(self.features).difference(set(cols_keep))) + # self.stats_ = {'cols_keep' : cols_keep, 'cols_drop' : cols_drop} + self.stats_ = {'cols_keep' : cols_keep} - self.stats_ = {'cols_keep' : cols_keep, 'cols_drop' : cols_drop} return self def _transform_pandas(self, df: pd.DataFrame) -> pd.DataFrame: - _validate_df(df, TENSOR_COLUMN_NAME, self._nb_features) - cols_drop = self.stats_['cols_drop'] + # _validate_df(df, TENSOR_COLUMN_NAME, self._nb_features) + cols_keep = self.stats_['cols_keep'] tensor_col = df[TENSOR_COLUMN_NAME] tensor_col = _unwrap_ndarray_object_type_if_needed(tensor_col) tensor_col = pd.DataFrame(tensor_col, columns = self.features) - tensor_col = tensor_col.drop(cols_drop, axis = 1) - tensor_col = tensor_col.to_numpy() + tensor_col = tensor_col[cols_keep].to_numpy() + # tensor_col = tensor_col.drop(cols_keep, axis = 1).to_numpy() df[TENSOR_COLUMN_NAME] = pd.Series(list(tensor_col)) diff --git a/src/data/reduction/occurence_exclusion.py b/src/data/reduction/occurence_exclusion.py index 88a80bf..2e87273 100644 --- a/src/data/reduction/occurence_exclusion.py +++ b/src/data/reduction/occurence_exclusion.py @@ -29,28 +29,29 @@ def _fit(self, ds: Dataset) -> Preprocessor: occurences += np.count_nonzero(batch, axis = 0) # Include / Exclude by sorted position - cols_drop = [] + # cols_drop = [] cols_keep = pd.Series(occurences, index = self.features) cols_keep = cols_keep.sort_values(ascending = True) # Long operation - cols_drop.extend(cols_keep.iloc[0 : self.num_features].index) - cols_drop.extend(cols_keep.iloc[(self._nb_features - self.num_features) : self._nb_features].index) + # cols_drop.extend(cols_keep.iloc[0 : self.num_features].index) + # cols_drop.extend(cols_keep.iloc[(self._nb_features - self.num_features) : self._nb_features].index) cols_keep = cols_keep.iloc[self.num_features : (self._nb_features - self.num_features)] cols_keep = list(cols_keep.index) - self.stats_ = {'cols_keep' : cols_keep, 'cols_drop' : cols_drop} + # self.stats_ = {'cols_keep' : cols_keep, 'cols_drop' : cols_drop} + self.stats_ = {'cols_keep' : cols_keep} return self def _transform_pandas(self, df: pd.DataFrame) -> pd.DataFrame: - _validate_df(df, TENSOR_COLUMN_NAME, self._nb_features) - cols_drop = self.stats_['cols_drop'] + # _validate_df(df, TENSOR_COLUMN_NAME, self._nb_features) + cols_keep = self.stats_['cols_keep'] tensor_col = df[TENSOR_COLUMN_NAME] tensor_col = _unwrap_ndarray_object_type_if_needed(tensor_col) tensor_col = pd.DataFrame(tensor_col, columns = 
self.features) - tensor_col = tensor_col.drop(cols_drop, axis = 1) - tensor_col = tensor_col.to_numpy() + tensor_col = tensor_col[cols_keep].to_numpy() + # tensor_col = tensor_col.drop(cols_keep, axis = 1).to_numpy() df[TENSOR_COLUMN_NAME] = pd.Series(list(tensor_col)) @@ -85,29 +86,24 @@ def _fit(self, ds: Dataset) -> Preprocessor: cols_keep = pd.Series(occurences, index = self.features) cols_keep = cols_keep[cols_keep.between(low_treshold, high_treshold, inclusive = 'neither')] cols_keep = list(cols_keep.index) - cols_drop = list(set(self.features).difference(set(cols_keep))) - # cols_drop = [] - # occurences = pd.Series(occurences, index = self.features) - # cols_drop.extend(occurences[low_treshold > occurences].index) - # cols_drop.extend(occurences[occurences < high_treshold].index) - - # cols_keep = list(set(self.features).difference(set(cols_drop))) + # cols_drop = list(set(self.features).difference(set(cols_keep))) + # self.stats_ = {'cols_keep' : cols_keep, 'cols_drop' : cols_drop} - self.stats_ = {'cols_keep' : cols_keep, 'cols_drop' : cols_drop} + self.stats_ = {'cols_keep' : cols_keep} return self def _transform_pandas(self, df: pd.DataFrame) -> pd.DataFrame: - _validate_df(df, TENSOR_COLUMN_NAME, self._nb_features) - cols_drop = self.stats_['cols_drop'] + # _validate_df(df, TENSOR_COLUMN_NAME, self._nb_features) + cols_keep = self.stats_['cols_keep'] tensor_col = df[TENSOR_COLUMN_NAME] tensor_col = _unwrap_ndarray_object_type_if_needed(tensor_col) tensor_col = pd.DataFrame(tensor_col, columns = self.features) - tensor_col = tensor_col.drop(cols_drop, axis = 1) - tensor_col = tensor_col.to_numpy() + tensor_col = tensor_col[cols_keep].to_numpy() + # tensor_col = tensor_col.drop(cols_keep, axis = 1).to_numpy() df[TENSOR_COLUMN_NAME] = pd.Series(list(tensor_col)) From e88db22c7054650034cf00d6c9fcd211039fd60c Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Wed, 11 Oct 2023 20:40:51 -0400 Subject: [PATCH 02/92] parallelization for training reducers --- src/Caribou_reduce_features.py | 15 ++-- src/data/kmers.py | 2 +- src/data/reduction/chi2_selection.py | 34 +++++++--- src/data/reduction/occurence_exclusion.py | 83 +++++++++++++++++++++-- 4 files changed, 112 insertions(+), 22 deletions(-) diff --git a/src/Caribou_reduce_features.py b/src/Caribou_reduce_features.py index 3e735c7..0f36aa3 100644 --- a/src/Caribou_reduce_features.py +++ b/src/Caribou_reduce_features.py @@ -46,12 +46,9 @@ def features_reduction(opt): # Features reduction ################################################################################ """ - Brute -> Affined (20% recursive removal == 40% of original) - 1. OccurenceExclusion - 2. LowVarSelection - 3. Chi2 + SelectPercentile(50) / SelectKBest(X) - 4. TruncatedSVD + text -> LSA - 5. KRFE (require to train an estimator) + Brute force -> Features statistically related to classes + 1. OccurenceExclusion (5% extremes) + 2. 
Chi2 + SelectKBest() (<0.05 p-value) """ # Load data @@ -81,6 +78,7 @@ def occurence_exclusion(ds, kmers): features = kmers, percent = 0.05 ) + ds = preprocessor.fit_transform(ds) kmers = preprocessor.stats_['cols_keep'] @@ -93,6 +91,11 @@ def chi2selection(ds, kmers): features = kmers, threshold = 0.05 ) + # TODO : PARALLELIZE FITTING LIKE IN OCCURENCES + import sys + preprocessor.fit(ds) + sys.exit() + ds = preprocessor.fit_transform(ds) kmers = preprocessor.stats_['cols_keep'] diff --git a/src/data/kmers.py b/src/data/kmers.py index 1e84479..655b320 100644 --- a/src/data/kmers.py +++ b/src/data/kmers.py @@ -342,7 +342,7 @@ def _kmers_tokenization(self): self.df = tokenizer.transform(self.df) if self.method == 'seen': self.kmers_list = tokenizer.stats_['tokens(sequence)'] - self._kmers_reduction() + # self._kmers_reduction() def _kmers_reduction(self): # Exclusion of columns occuring in less 5% / more 95% of the samples diff --git a/src/data/reduction/chi2_selection.py b/src/data/reduction/chi2_selection.py index f0b9897..9784ecb 100644 --- a/src/data/reduction/chi2_selection.py +++ b/src/data/reduction/chi2_selection.py @@ -28,22 +28,40 @@ def _fit(self, ds: Dataset) -> Preprocessor: mean_chi = [] cols_keep = [] # cols_drop = [] - # Compute chi2 over batches - for batch in ds.iter_batches(batch_size = 5, batch_format = 'pandas'): - X = batch[TENSOR_COLUMN_NAME].to_numpy() + + # Function for parallel chi2 computing + def chi_sqr(batch): + X = batch[TENSOR_COLUMN_NAME] X = _unwrap_ndarray_object_type_if_needed(X) X = pd.DataFrame(X, columns = self.features) - y = batch['species'].to_numpy().ravel() - mean_chi.append(chi2(X, y)[1]) + y = batch['species'].ravel() + return {'chi' : [chi2(X, y)[1]]} + + + # Compute chi2 over batches + # for batch in ds.iter_batches(batch_size = 5, batch_format = 'pandas'): + # X = batch[TENSOR_COLUMN_NAME].to_numpy() + # X = _unwrap_ndarray_object_type_if_needed(X) + # X = pd.DataFrame(X, columns = self.features) + # y = batch['species'].to_numpy().ravel() + # mean_chi.append(chi2(X, y)[1]) + + chi = ds.map_batches(chi_sqr, batch_format = 'numpy') + + for i, row in enumerate(chi.iter_rows()): + mean_chi.append(row['chi']) # Compute the mean of chi2 by feature mean_chi = np.array(mean_chi) mean_chi = np.mean(mean_chi, axis = 0) - cols_keep = pd.Series(mean_chi, index = self.features) - cols_keep = cols_keep[cols_keep <= self.threshold] - cols_keep = list(cols_keep.index) + # cols_keep = pd.Series(mean_chi, index = self.features) + # cols_keep = cols_keep[cols_keep <= self.threshold] + # cols_keep = list(cols_keep.index) + # Construct list of features to keep by position + cols_keep = [self.features[i] for i, chi in enumerate(mean_chi) if chi < self.threshold] + # Keep all features if none are under the threshold if len(cols_keep) == 0: cols_keep = self.features diff --git a/src/data/reduction/occurence_exclusion.py b/src/data/reduction/occurence_exclusion.py index 2e87273..097155d 100644 --- a/src/data/reduction/occurence_exclusion.py +++ b/src/data/reduction/occurence_exclusion.py @@ -59,11 +59,9 @@ def _transform_pandas(self, df: pd.DataFrame) -> pd.DataFrame: def __repr__(self): return (f"{self.__class__.__name__}(features={self._nb_features!r}, num_features={self.num_features!r})") - +""" class TensorPercentOccurenceExclusion(Preprocessor): - """ - Exclusion of the features present in less than (%) / more than (100% - %) across samples to be used as a Ray preprocessor. 
- """ + def __init__(self, features: List[str], percent : int = 0.05): # Parameters @@ -101,17 +99,88 @@ def _transform_pandas(self, df: pd.DataFrame) -> pd.DataFrame: tensor_col = df[TENSOR_COLUMN_NAME] tensor_col = _unwrap_ndarray_object_type_if_needed(tensor_col) tensor_col = pd.DataFrame(tensor_col, columns = self.features) - tensor_col = tensor_col[cols_keep].to_numpy() # tensor_col = tensor_col.drop(cols_keep, axis = 1).to_numpy() df[TENSOR_COLUMN_NAME] = pd.Series(list(tensor_col)) - + return df def __repr__(self): return (f"{self.__class__.__name__}(features={self._nb_features!r}, percent={self.percent!r}%)") +""" def _validate_df(df: pd.DataFrame, column: str, nb_features: int) -> None: if len(df.loc[0, column]) != nb_features: - raise ValueError('Discordant number of features in the tensor column with the one from the dataframe used for fitting') \ No newline at end of file + raise ValueError('Discordant number of features in the tensor column with the one from the dataframe used for fitting') + + + + +class TensorPercentOccurenceExclusion(Preprocessor): + """ + Exclusion of the features present in less than (%) / more than (100% - %) across samples to be used as a Ray preprocessor. + """ + + def __init__(self, features: List[str], percent : int = 0.05): + # Parameters + self.features = features + self.percent = percent + self._nb_features = len(features) + + def _fit(self, ds: Dataset) -> Preprocessor: + nb_samples = ds.count() + low_treshold = ceil((0 + self.percent) * nb_samples) + high_treshold = floor((1 - self.percent) * nb_samples) + + # Function for parallel occurences counting + def count_occurences(batch): + batch = batch[TENSOR_COLUMN_NAME] + return {'occurences' : [np.count_nonzero(batch, axis = 0)]} + + occur = ds.map_batches(count_occurences, batch_format = 'numpy') + + occurences = np.zeros(self._nb_features) + for batch in occur.iter_batches(batch_format = 'numpy'): + batch_occur = batch['occurences'].sum(axis = 0) + occurences += batch_occur + + # Construct list of features to keep by position + cols_keep = [self.features[i] for i, occurence in enumerate(occurences) if low_treshold < occurence < high_treshold] + + self.stats_ = {'cols_keep' : cols_keep} + + """ + # Nb of occurences + for batch in ds.iter_batches(batch_format = 'numpy'): + batch = batch[TENSOR_COLUMN_NAME] + occurences += np.count_nonzero(batch, axis = 0) + # Include / Exclude by occurences thresholds across samples + cols_keep = pd.Series(occurences, index = self.features) + cols_keep = cols_keep[cols_keep.between(low_treshold, high_treshold, inclusive = 'neither')] + cols_keep = list(cols_keep.index) + + # cols_drop = list(set(self.features).difference(set(cols_keep))) + # self.stats_ = {'cols_keep' : cols_keep, 'cols_drop' : cols_drop} + + self.stats_ = {'cols_keep' : cols_keep} + """ + + return self + + def _transform_pandas(self, df: pd.DataFrame) -> pd.DataFrame: + # _validate_df(df, TENSOR_COLUMN_NAME, self._nb_features) + cols_keep = self.stats_['cols_keep'] + + tensor_col = df[TENSOR_COLUMN_NAME] + tensor_col = _unwrap_ndarray_object_type_if_needed(tensor_col) + tensor_col = pd.DataFrame(tensor_col, columns = self.features) + tensor_col = tensor_col[cols_keep].to_numpy() + # tensor_col = tensor_col.drop(cols_keep, axis = 1).to_numpy() + + df[TENSOR_COLUMN_NAME] = pd.Series(list(tensor_col)) + + return df + + def __repr__(self): + return (f"{self.__class__.__name__}(features={self._nb_features!r}, percent={self.percent!r}%)") \ No newline at end of file From 
addfbbfb0b68f546603f0e22d05f6b7130b2f1dc Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Wed, 11 Oct 2023 22:28:08 -0400 Subject: [PATCH 03/92] reducers tested locally --- src/Caribou_reduce_features.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/Caribou_reduce_features.py b/src/Caribou_reduce_features.py index 0f36aa3..b2ffc5a 100644 --- a/src/Caribou_reduce_features.py +++ b/src/Caribou_reduce_features.py @@ -91,10 +91,6 @@ def chi2selection(ds, kmers): features = kmers, threshold = 0.05 ) - # TODO : PARALLELIZE FITTING LIKE IN OCCURENCES - import sys - preprocessor.fit(ds) - sys.exit() ds = preprocessor.fit_transform(ds) From 8df0ecac9ef5fb69cec864f626e7c8d1564b25e3 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Sun, 15 Oct 2023 23:34:24 -0400 Subject: [PATCH 04/92] features reduction test in local --- src/Caribou_reduce_features.py | 102 ++++++++++++--- src/data/kmers.py | 4 +- src/data/reduction/chi2_selection.py | 97 -------------- src/data/reduction/count_hashing.py | 1 - src/data/reduction/features_selection.py | 83 ++++++++++++ src/data/reduction/low_var_selection.py | 144 +++++++++------------ src/data/reduction/occurence_exclusion.py | 30 +---- src/models/kerasTF/models.py | 2 +- src/models/preprocessors/max_abs_scaler.py | 5 +- src/models/preprocessors/min_max_scaler.py | 4 +- src/models/sklearn/models.py | 2 +- src/utils.py | 2 +- 12 files changed, 242 insertions(+), 234 deletions(-) delete mode 100644 src/data/reduction/chi2_selection.py create mode 100644 src/data/reduction/features_selection.py diff --git a/src/Caribou_reduce_features.py b/src/Caribou_reduce_features.py index b2ffc5a..379c1ef 100644 --- a/src/Caribou_reduce_features.py +++ b/src/Caribou_reduce_features.py @@ -8,7 +8,11 @@ from time import time from pathlib import Path -from data.reduction.chi2_selection import TensorChi2Selection +from ray.data.preprocessors import Chain, LabelEncoder + +from models.preprocessors.min_max_scaler import TensorMinMaxScaler +from data.reduction.low_var_selection import TensorLowVarSelection +from data.reduction.features_selection import TensorFeaturesSelection from data.reduction.occurence_exclusion import TensorPercentOccurenceExclusion __author__ = "Nicolas de Montigny" @@ -32,12 +36,6 @@ def features_reduction(opt): # Verification of k length k_length, kmers_list = verify_kmers_list_length(k_length, opt['kmers_list']) - # Not sure if needed for training KRFE - """ - # Verify that model type is valid / choose default depending on host presence - if opt['model_type'] is None: - opt['model_type'] = 'cnn' - """ outdirs = define_create_outdirs(opt['outdir']) # Initialize cluster @@ -47,36 +45,84 @@ def features_reduction(opt): ################################################################################ """ Brute force -> Features statistically related to classes - 1. OccurenceExclusion (5% extremes) - 2. Chi2 + SelectKBest() (<0.05 p-value) + 1. OccurenceExclusion (10% extremes) + 2. LowVarSelection () + 3. 
Chi2 + SelectPercentile() (75% best values) """ # Load data ds = ray.data.read_parquet(data['profile']) - # Iterate over methods for exp results + ds_train = ray.data.read_parquet(data['profile']) + # Time the computation of transformations t_start = time() - ds, kmers_list = occurence_exclusion(ds, kmers_list) - ds, data['kmers'] = chi2selection(ds, kmers_list) + ds, data['kmers'] = exclude_select(ds, ds_train, kmers_list, data['taxas'][0]) + # ds, kmers_list = occurence_exclusion(ds, kmers_list) + # ds, kmers_list = low_var_selection(ds,kmers_list) + # ds, data['kmers'] = features_selection(ds, kmers_list, data['taxas'][0]) t_end = time() t_reduction = t_end - t_start # Save reduced dataset data['profile'] = f"{data['profile']}_reduced" ds.write_parquet(data['profile']) # Save reduced K-mers - with open(os.path.join(outdirs["data_dir"],'kmers_list.txt'),'w') as handle: + with open(os.path.join(outdirs["data_dir"],'kmers_list_reduced.txt'),'w') as handle: handle.writelines("%s\n" % item for item in data['kmers']) # Save reduced data path, ext = os.path.splitext(opt['dataset']) data_file = f'{path}_reduced{ext}' save_Xy_data(data, data_file) - print(f"Caribou finished reducing k-mers features of {opt['dataset_name']} using the combined occurence and chi2 methods from the original dataset in {t_reduction} seconds.") + print(f"Caribou finished reducing k-mers features of {opt['dataset_name']} in {t_reduction} seconds.") + +def exclude_select(ds, ds_train, kmers, taxa): + # Occurence exclusion + excluder = TensorPercentOccurenceExclusion( + features = kmers, + percent = 0.1 # remove features present in less than 10% samples + ) + + ds = excluder.fit_transform(ds) + ds_train = excluder.transform(ds_train) + + kmers = excluder.stats_['cols_keep'] + + varier = TensorLowVarSelection( + features = kmers, + threshold = 0.1, # remove features with less than 10% variance + ) + + ds = varier.fit_transform(ds) + ds_train = varier.transform(ds_train) + + kmers = varier.stats_['cols_keep'] + + # Preprocessing + preprocessor = Chain( + LabelEncoder(taxa), + TensorMinMaxScaler(kmers), + ) + + ds_train = preprocessor.fit_transform(ds_train) -# Exclusion columns occuring in less / more than 10% of the columns + # Statistical features selection + selector = TensorFeaturesSelection( + features = kmers, + taxa = taxa, + threshold = 0.25, # remove lowest 25% significance + ) + + selector.fit(ds_train) + ds = selector.transform(ds) + + kmers = selector.stats_['cols_keep'] + + return ds, kmers + +# Exclusion columns occuring in less / more than 10% of the columns = 20% removed def occurence_exclusion(ds, kmers): preprocessor = TensorPercentOccurenceExclusion( features = kmers, - percent = 0.05 + percent = 0.1 # remove features present in less than 10% samples ) ds = preprocessor.fit_transform(ds) @@ -85,19 +131,33 @@ def occurence_exclusion(ds, kmers): return ds, kmers -# Chi2 evaluation of dependance between features and classes -def chi2selection(ds, kmers): - preprocessor = TensorChi2Selection( +# Exclusion of columns with variance lower than a certain threshold +def low_var_selection(ds, kmers): + preprocessor = TensorLowVarSelection( features = kmers, - threshold = 0.05 + threshold = 0.1, # remove features with less than 10% variance ) - + ds = preprocessor.fit_transform(ds) kmers = preprocessor.stats_['cols_keep'] return ds, kmers +# Chi2 evaluation of dependance between features and classes +def features_selection(ds, kmers, taxa): + preprocessor = TensorFeaturesSelection( + features = kmers, + taxa = taxa, 
+ threshold = 0.25, # remove lowest 25% significance + ) + + ds = preprocessor.fit_transform(ds) + + kmers = preprocessor.stats_['cols_keep'] + + return ds, kmers + # Argument parsing from CLI ################################################################################ diff --git a/src/data/kmers.py b/src/data/kmers.py index 655b320..18efe97 100644 --- a/src/data/kmers.py +++ b/src/data/kmers.py @@ -16,7 +16,7 @@ from data.extraction.given_kmers_vectorizer import GivenKmersVectorizer # Features selection -from data.reduction.chi2_selection import TensorChi2Selection +from data.reduction.features_selection import TensorFeaturesSelection from data.reduction.occurence_exclusion import TensorPercentOccurenceExclusion __author__ = ['Amine Remita', 'Nicolas de Montigny'] @@ -355,7 +355,7 @@ def _kmers_reduction(self): self.kmers_list = excluder.stats_['cols_keep'] # Chi2 evaluation of dependance between features and classes - selector = TensorChi2Selection( + selector = TensorFeaturesSelection( features = self.kmers_list, threshold = 0.05 ) diff --git a/src/data/reduction/chi2_selection.py b/src/data/reduction/chi2_selection.py deleted file mode 100644 index 9784ecb..0000000 --- a/src/data/reduction/chi2_selection.py +++ /dev/null @@ -1,97 +0,0 @@ - -import numpy as np -import pandas as pd - -from typing import List -from warnings import warn -from ray.data import Dataset -from sklearn.feature_selection import chi2 -from ray.data.preprocessor import Preprocessor -from ray.air.util.data_batch_conversion import _unwrap_ndarray_object_type_if_needed - -TENSOR_COLUMN_NAME = '__value__' - -class TensorChi2Selection(Preprocessor): - """ - Custom implementation of SelectKBest with Chi2 inspired by sklearn.feature_selection.SelectPercentile and sklearn.feature_selection.chi2 features selector to be used as a Ray preprocessor. 
- https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.chi2.html#sklearn.feature_selection.chi2 - https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html#sklearn.feature_selection.SelectKBest - """ - - def __init__(self, features: List[str], threshold: float = 0.05): - # Parameters - self.features = features - self.threshold = threshold - self._nb_features = len(features) - - def _fit(self, ds: Dataset) -> Preprocessor: - mean_chi = [] - cols_keep = [] - # cols_drop = [] - - # Function for parallel chi2 computing - def chi_sqr(batch): - X = batch[TENSOR_COLUMN_NAME] - X = _unwrap_ndarray_object_type_if_needed(X) - X = pd.DataFrame(X, columns = self.features) - y = batch['species'].ravel() - return {'chi' : [chi2(X, y)[1]]} - - - # Compute chi2 over batches - # for batch in ds.iter_batches(batch_size = 5, batch_format = 'pandas'): - # X = batch[TENSOR_COLUMN_NAME].to_numpy() - # X = _unwrap_ndarray_object_type_if_needed(X) - # X = pd.DataFrame(X, columns = self.features) - # y = batch['species'].to_numpy().ravel() - # mean_chi.append(chi2(X, y)[1]) - - chi = ds.map_batches(chi_sqr, batch_format = 'numpy') - - for i, row in enumerate(chi.iter_rows()): - mean_chi.append(row['chi']) - - # Compute the mean of chi2 by feature - mean_chi = np.array(mean_chi) - mean_chi = np.mean(mean_chi, axis = 0) - - # cols_keep = pd.Series(mean_chi, index = self.features) - # cols_keep = cols_keep[cols_keep <= self.threshold] - # cols_keep = list(cols_keep.index) - - # Construct list of features to keep by position - cols_keep = [self.features[i] for i, chi in enumerate(mean_chi) if chi < self.threshold] - - # Keep all features if none are under the threshold - if len(cols_keep) == 0: - cols_keep = self.features - warn('No values were found to have a chi2 p-value under the threshold, all features will be kept.\ - You can try running this feature selector again with a different threshold to reduce the number of features') - # else: - # cols_drop = list(set(self.features).difference(set(cols_keep))) - # self.stats_ = {'cols_keep' : cols_keep, 'cols_drop' : cols_drop} - self.stats_ = {'cols_keep' : cols_keep} - - return self - - def _transform_pandas(self, df: pd.DataFrame) -> pd.DataFrame: - # _validate_df(df, TENSOR_COLUMN_NAME, self._nb_features) - cols_keep = self.stats_['cols_keep'] - - tensor_col = df[TENSOR_COLUMN_NAME] - tensor_col = _unwrap_ndarray_object_type_if_needed(tensor_col) - tensor_col = pd.DataFrame(tensor_col, columns = self.features) - - tensor_col = tensor_col[cols_keep].to_numpy() - # tensor_col = tensor_col.drop(cols_keep, axis = 1).to_numpy() - - df[TENSOR_COLUMN_NAME] = pd.Series(list(tensor_col)) - - return df - - def __repr__(self): - return (f"{self.__class__.__name__}(features={self._nb_features!r}, threshold={self.threshold!r})") - -def _validate_df(df: pd.DataFrame, column: str, nb_features: int) -> None: - if len(df.loc[0, column]) != nb_features: - raise ValueError('Discordant number of features in the tensor column with the one from the dataframe used for fitting') \ No newline at end of file diff --git a/src/data/reduction/count_hashing.py b/src/data/reduction/count_hashing.py index 6171112..1b6506e 100644 --- a/src/data/reduction/count_hashing.py +++ b/src/data/reduction/count_hashing.py @@ -24,7 +24,6 @@ def __init__(self, features: List[str], num_features: int): self.num_features = num_features def _transform_pandas(self, df: pd.DataFrame): - # TODO(matt): Use sparse matrix for efficiency. 
def row_feature_hasher(row): hash_counts = collections.defaultdict(int) for feature in self.features: diff --git a/src/data/reduction/features_selection.py b/src/data/reduction/features_selection.py new file mode 100644 index 0000000..c07b515 --- /dev/null +++ b/src/data/reduction/features_selection.py @@ -0,0 +1,83 @@ +import logging + +import numpy as np +import pandas as pd + +from typing import List +from warnings import warn +from ray.data import Dataset + +from sklearn.feature_selection import chi2 +from sklearn.feature_selection import f_classif, f_oneway + +from ray.data.preprocessor import Preprocessor +from ray.air.util.data_batch_conversion import _unwrap_ndarray_object_type_if_needed + +TENSOR_COLUMN_NAME = '__value__' + +class TensorFeaturesSelection(Preprocessor): + """ + Custom implementation of SelectKBest with Chi2 inspired by sklearn.feature_selection.SelectPercentile and sklearn.feature_selection.chi2 features selector to be used as a Ray preprocessor. + https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.chi2.html#sklearn.feature_selection.chi2 + https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html#sklearn.feature_selection.SelectKBest + """ + + def __init__(self, features: List[str], taxa: str, threshold: float = 0.5): + # Parameters + self.taxa = taxa + self.features = features + self.threshold = threshold + self._nb_features = len(features) + + def _fit(self, ds: Dataset) -> Preprocessor: + # Function for parallel stats computing + def stats(batch): + X = batch[TENSOR_COLUMN_NAME] + X = _unwrap_ndarray_object_type_if_needed(X) + X = pd.DataFrame(X, columns = self.features) + y = batch[self.taxa].ravel() + return {'chi' : [chi2(X, y)[0]]} + + mean_chi = [] + cols_keep = [] + + # Chi batches means extraction + chi = ds.map_batches(stats, batch_format = 'numpy', batch_size = 32) + for i, row in enumerate(chi.iter_rows()): + mean_chi.append(row['chi']) + + # Chi mean of batches means computing + mean_chi = np.array(mean_chi) + mean_chi = np.nanmean(mean_chi, axis = 0) + + # Determine the threshold from distribution of chi values + self.threshold = np.nanquantile(mean_chi, self.threshold) + + # Keep features with values higher than the threshold + cols_keep = [self.features[i] for i, chi in enumerate(mean_chi) if chi > self.threshold] + + self.stats_ = {'cols_keep' : cols_keep} + + return self + + def _transform_pandas(self, df: pd.DataFrame) -> pd.DataFrame: + # _validate_df(df, TENSOR_COLUMN_NAME, self._nb_features) + cols_keep = self.stats_['cols_keep'] + + if len(cols_keep) < self._nb_features: + tensor_col = df[TENSOR_COLUMN_NAME] + tensor_col = _unwrap_ndarray_object_type_if_needed(tensor_col) + tensor_col = pd.DataFrame(tensor_col, columns = self.features) + + tensor_col = tensor_col[cols_keep].to_numpy() + + df[TENSOR_COLUMN_NAME] = pd.Series(list(tensor_col)) + + return df + + def __repr__(self): + return (f"{self.__class__.__name__}(features={self._nb_features!r}, taxa={self.taxa!r}, threshold={self.threshold!r})") + +def _validate_df(df: pd.DataFrame, column: str, nb_features: int) -> None: + if len(df.loc[0, column]) != nb_features: + raise ValueError('Discordant number of features in the tensor column with the one from the dataframe used for fitting') \ No newline at end of file diff --git a/src/data/reduction/low_var_selection.py b/src/data/reduction/low_var_selection.py index 52ecb77..7fa6561 100644 --- a/src/data/reduction/low_var_selection.py +++ b/src/data/reduction/low_var_selection.py @@ 
-5,6 +5,7 @@ from typing import List from ray.data import Dataset from ray.data.preprocessor import Preprocessor +from ray.air.util.data_batch_conversion import _unwrap_ndarray_object_type_if_needed TENSOR_COLUMN_NAME = '__value__' @@ -16,103 +17,76 @@ class TensorLowVarSelection(Preprocessor): """ def __init__( self, - features_list : List[str], - threshold: float = np.inf, - nb_keep : int = np.inf, + features : List[str], + threshold: float = 0.1, ): - self.features_list = features_list - if 'id' in self.features_list: - self.features_list.remove('id') - self.nb_features = len(self.features_list) + self.features = features self.threshold = threshold - self.nb_keep = nb_keep - self.removed_features = [] + self._nb_features = len(features) def _fit(self, ds: Dataset) -> Preprocessor: - nb_records = ds.count() - # - sum_arr = np.zeros(self.nb_features) - mean_arr = np.zeros(self.nb_features) - sqr_dev_arr = np.zeros(self.nb_features) - var_arr = np.zeros(self.nb_features) - # - def sum_func(arr, sum_arr): - return np.add(sum_arr, np.sum(arr, axis=0)) + cols_keep = [] + nb_samples = ds.count() + sum_arr = np.zeros(self._nb_features) + mean_arr = np.zeros(self._nb_features) + sqr_dev_arr = np.zeros(self._nb_features) + var_arr = np.zeros(self._nb_features) + + # Function for parallel sum computing + def get_sums(batch): + df = batch[TENSOR_COLUMN_NAME] + df = _unwrap_ndarray_object_type_if_needed(df) + return({'sum' : [np.sum(df, axis = 0)]}) + + # Sum per column + sums = ds.map_batches(get_sums, batch_format = 'pandas') + for row in sums.iter_rows(): + sum_arr += row['sum'] + + # Mean per column + mean_arr = sum_arr / nb_samples + + # Function for parallel squared deviation computing + def get_sqr_dev(batch): + df = batch[TENSOR_COLUMN_NAME] + df = _unwrap_ndarray_object_type_if_needed(df) + return({'sqr_dev' : [np.sum(np.power(np.subtract(df, mean_arr), 2), axis = 0)]}) + + # Sum of deviation per column + sqr_devs = ds.map_batches(get_sqr_dev, batch_format = 'pandas') + for row in sqr_devs.iter_rows(): + sqr_dev_arr += row['sqr_dev'] + + # Variance per column + var_arr = sqr_dev_arr / nb_samples + + # Compute the threshold from distribution of variance values + self.threshold = np.nanquantile(var_arr, self.threshold) + + # Keep features with values higher than the threshold + cols_keep = [self.features[i] for i, var in enumerate(var_arr) if var > self.threshold] + + self.stats_ = {'cols_keep' : cols_keep} - def mean_func(arr, nb_records): - return np.divide(arr, nb_records) - - def sqr_dev_func(arr, mean_arr, sqr_dev_arr): - return np.add(sqr_dev_arr, np.sum(np.power(np.subtract(arr, mean_arr), 2), axis = 0)) - - if self.nb_keep != np.inf or self.threshold != np.inf: - # Get sum per column - for batch in ds.iter_batches( - batch_size = 100, - batch_format = 'numpy' - ): - sum_arr = sum_func(batch, sum_arr) - # Get mean per column - mean_arr = mean_func(sum_arr, nb_records) - # Get sum of deviation - for batch in ds.iter_batches( - batch_size = 100, - batch_format = 'numpy' - ): - sqr_dev_arr = sqr_dev_func(batch, mean_arr, sqr_dev_arr) - # Get variance per column - var_arr = mean_func(sqr_dev_arr, nb_records) - p10 = int(0.1 * self.nb_features) - - if self.nb_keep != np.inf and (self.nb_keep + (p10 * 2)) < self.nb_features: - var_mapping = {ind : var_arr[ind] for ind in np.arange(self.nb_features)} - keep_arr = np.ravel(np.sort(var_arr)) - keep_arr = keep_arr[p10:(len(keep_arr) - p10)] - keep_arr = np.random.choice(keep_arr, self.nb_keep) - remove_arr = np.ravel(np.sort(var_arr)) - 
remove_arr = np.array([ind for ind in remove_arr if ind not in keep_arr]) + return self - # Switch values from keep_arr to remove if number is discordant - if len(keep_arr) > self.nb_keep: - nb_switch = len(keep_arr) - self.nb_keep - remove_arr = np.insert(remove_arr, 0, keep_arr[:nb_switch]) - keep_arr = keep_arr[nb_switch:] - elif len(keep_arr) < self.nb_keep: - nb_switch = self.nb_keep - len(keep_arr) - keep_arr = np.insert(keep_arr, 0, remove_arr[nb_switch:]) - remove_arr = remove_arr[:nb_switch] - # Loop to assign values to remove - for k, v in var_mapping.items(): - if v in remove_arr: - pos_v = int(np.where(remove_arr == v)[0][0]) - remove_arr = np.delete(remove_arr, pos_v) - self.stats_.append(k) - self.removed_features = [self.features_list[ind] for ind in self.stats_] + def _transform_pandas(self, df: pd.DataFrame) -> pd.DataFrame: + # _validate_df(df, TENSOR_COLUMN_NAME, self._nb_features) + cols_keep = self.stats_['cols_keep'] - elif self.threshold != np.inf: - for ind in np.arange(self.nb_features): - variance = var_arr[ind] - if variance <= self.threshold: - self.stats_.append(ind) - self.removed_features = [self.features_list[ind] for ind in self.stats_] + if len(cols_keep) < self._nb_features: + tensor_col = df[TENSOR_COLUMN_NAME] + tensor_col = _unwrap_ndarray_object_type_if_needed(tensor_col) + tensor_col = pd.DataFrame(tensor_col, columns = self.features) - return self + tensor_col = tensor_col[cols_keep].to_numpy() - def _transform_pandas(self, df: pd.DataFrame) -> pd.DataFrame: - if len(self.stats_) > 0 : - _validate_df(df, TENSOR_COLUMN_NAME, self.nb_features) - df_out = pd.DataFrame(columns = [TENSOR_COLUMN_NAME]) + df[TENSOR_COLUMN_NAME] = pd.Series(list(tensor_col)) - for ind, row in enumerate(df.iterrows()): - tensor = np.delete(row[1].to_numpy()[0], self.stats_, axis=0) - df_out.loc[ind, TENSOR_COLUMN_NAME] = tensor - - return df_out - else: - return df + return df def __repr__(self): - return (f"{self.__class__.__name__}(threshold={self.threshold!r}, nb_keep={self.nb_keep!r})") + return (f"{self.__class__.__name__}(features={self._nb_features!r}, threshold={self.threshold!r})") def _validate_df(df: pd.DataFrame, column: str, nb_features: int) -> None: if len(df.loc[0, column]) != nb_features: diff --git a/src/data/reduction/occurence_exclusion.py b/src/data/reduction/occurence_exclusion.py index 097155d..78ef429 100644 --- a/src/data/reduction/occurence_exclusion.py +++ b/src/data/reduction/occurence_exclusion.py @@ -150,35 +150,19 @@ def count_occurences(batch): self.stats_ = {'cols_keep' : cols_keep} - """ - # Nb of occurences - for batch in ds.iter_batches(batch_format = 'numpy'): - batch = batch[TENSOR_COLUMN_NAME] - occurences += np.count_nonzero(batch, axis = 0) - # Include / Exclude by occurences thresholds across samples - cols_keep = pd.Series(occurences, index = self.features) - cols_keep = cols_keep[cols_keep.between(low_treshold, high_treshold, inclusive = 'neither')] - cols_keep = list(cols_keep.index) - - # cols_drop = list(set(self.features).difference(set(cols_keep))) - # self.stats_ = {'cols_keep' : cols_keep, 'cols_drop' : cols_drop} - - self.stats_ = {'cols_keep' : cols_keep} - """ - return self def _transform_pandas(self, df: pd.DataFrame) -> pd.DataFrame: # _validate_df(df, TENSOR_COLUMN_NAME, self._nb_features) cols_keep = self.stats_['cols_keep'] - tensor_col = df[TENSOR_COLUMN_NAME] - tensor_col = _unwrap_ndarray_object_type_if_needed(tensor_col) - tensor_col = pd.DataFrame(tensor_col, columns = self.features) - tensor_col = 
tensor_col[cols_keep].to_numpy() - # tensor_col = tensor_col.drop(cols_keep, axis = 1).to_numpy() - - df[TENSOR_COLUMN_NAME] = pd.Series(list(tensor_col)) + if len(cols_keep) < self._nb_features: + tensor_col = df[TENSOR_COLUMN_NAME] + tensor_col = _unwrap_ndarray_object_type_if_needed(tensor_col) + tensor_col = pd.DataFrame(tensor_col, columns = self.features) + tensor_col = tensor_col[cols_keep].to_numpy() + + df[TENSOR_COLUMN_NAME] = pd.Series(list(tensor_col)) return df diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index 165f558..96ef9cc 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -9,8 +9,8 @@ from shutil import rmtree # Preprocessing -from models.preprocessors import TensorMinMaxScaler from ray.data.preprocessors import LabelEncoder, Chain +from models.preprocessors.min_max_scaler import TensorMinMaxScaler from src.models.encoders.one_hot_tensor_encoder import OneHotTensorEncoder # Parent class / models diff --git a/src/models/preprocessors/max_abs_scaler.py b/src/models/preprocessors/max_abs_scaler.py index 3914113..cce6b8c 100644 --- a/src/models/preprocessors/max_abs_scaler.py +++ b/src/models/preprocessors/max_abs_scaler.py @@ -5,6 +5,7 @@ from ray.data.preprocessor import Preprocessor from ray.data.extensions.tensor_extension import TensorArray +from ray.air.util.data_batch_conversion import _unwrap_ndarray_object_type_if_needed TENSOR_COLUMN_NAME = '__value__' @@ -38,7 +39,9 @@ def _transform_pandas(self, batch: pd.DataFrame): """ Transform the given dataset to pandas dataframe. """ - df = pd.DataFrame(np.vstack(batch[TENSOR_COLUMN_NAME]), columns = self._features_list) + df = batch[TENSOR_COLUMN_NAME] + df = _unwrap_ndarray_object_type_if_needed(df) + df = pd.DataFrame(df, columns = self._features_list) for i, col in enumerate(self._features_list): df[col] = df[col].apply(value_transform, args=[self._absmax[i]]) diff --git a/src/models/preprocessors/min_max_scaler.py b/src/models/preprocessors/min_max_scaler.py index b430525..ebf560a 100644 --- a/src/models/preprocessors/min_max_scaler.py +++ b/src/models/preprocessors/min_max_scaler.py @@ -4,6 +4,7 @@ from ray.data.dataset import Dataset from ray.data.preprocessor import Preprocessor +from ray.air.util.data_batch_conversion import _unwrap_ndarray_object_type_if_needed TENSOR_COLUMN_NAME = '__value__' @@ -54,7 +55,8 @@ def _transform_pandas(self, batch: pd.DataFrame): """ min = self.stats_['min'] max = self.stats_['max'] - df = np.vstack(batch[TENSOR_COLUMN_NAME].to_numpy()) + df = batch[TENSOR_COLUMN_NAME] + df = _unwrap_ndarray_object_type_if_needed(df) diff = max - min diff[diff == 0] = 1 diff --git a/src/models/sklearn/models.py b/src/models/sklearn/models.py index b9aa459..4f2308d 100644 --- a/src/models/sklearn/models.py +++ b/src/models/sklearn/models.py @@ -9,7 +9,7 @@ # Preprocessing from ray.data.preprocessors import Chain, BatchMapper, LabelEncoder -from models.preprocessing.ray_tensor_min_max import TensorMinMaxScaler +from models.preprocessors.min_max_scaler import TensorMinMaxScaler from src.models.encoders.onesvm_label_encoder import OneClassSVMLabelEncoder # Training diff --git a/src/utils.py b/src/utils.py index f5133e6..f7e36a6 100644 --- a/src/utils.py +++ b/src/utils.py @@ -52,8 +52,8 @@ def init_ray_cluster(workdir): object_store_memory = mem * frac, _temp_dir = str(workdir), ) - ray.data.DataContext.get_current().execution_options.verbose_progress = True logging.getLogger("ray").setLevel(logging.WARNING) + 
ray.data.DataContext.get_current().execution_options.verbose_progress = True except ValueError : ray.shutdown() frac -= 0.05 From 787ee8f401e0e59b3d423dbef7a7f8fa783376bf Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Sun, 15 Oct 2023 23:45:06 -0400 Subject: [PATCH 05/92] integration of features reduction to kmers_collection --- src/Caribou_reduce_features.py | 62 ++++------------------------------ src/data/kmers.py | 27 +++++++++++---- 2 files changed, 26 insertions(+), 63 deletions(-) diff --git a/src/Caribou_reduce_features.py b/src/Caribou_reduce_features.py index 379c1ef..a68d9d3 100644 --- a/src/Caribou_reduce_features.py +++ b/src/Caribou_reduce_features.py @@ -10,7 +10,6 @@ from ray.data.preprocessors import Chain, LabelEncoder -from models.preprocessors.min_max_scaler import TensorMinMaxScaler from data.reduction.low_var_selection import TensorLowVarSelection from data.reduction.features_selection import TensorFeaturesSelection from data.reduction.occurence_exclusion import TensorPercentOccurenceExclusion @@ -46,19 +45,17 @@ def features_reduction(opt): """ Brute force -> Features statistically related to classes 1. OccurenceExclusion (10% extremes) - 2. LowVarSelection () + 2. LowVarSelection (variance > 10%) 3. Chi2 + SelectPercentile() (75% best values) """ # Load data ds = ray.data.read_parquet(data['profile']) - ds_train = ray.data.read_parquet(data['profile']) # Time the computation of transformations t_start = time() - ds, data['kmers'] = exclude_select(ds, ds_train, kmers_list, data['taxas'][0]) - # ds, kmers_list = occurence_exclusion(ds, kmers_list) - # ds, kmers_list = low_var_selection(ds,kmers_list) - # ds, data['kmers'] = features_selection(ds, kmers_list, data['taxas'][0]) + ds, kmers_list = occurence_exclusion(ds, kmers_list) + ds, kmers_list = low_var_selection(ds,kmers_list) + ds, data['kmers'] = features_selection(ds, kmers_list, data['taxas'][0]) t_end = time() t_reduction = t_end - t_start # Save reduced dataset @@ -74,51 +71,7 @@ def features_reduction(opt): print(f"Caribou finished reducing k-mers features of {opt['dataset_name']} in {t_reduction} seconds.") -def exclude_select(ds, ds_train, kmers, taxa): - # Occurence exclusion - excluder = TensorPercentOccurenceExclusion( - features = kmers, - percent = 0.1 # remove features present in less than 10% samples - ) - - ds = excluder.fit_transform(ds) - ds_train = excluder.transform(ds_train) - - kmers = excluder.stats_['cols_keep'] - - varier = TensorLowVarSelection( - features = kmers, - threshold = 0.1, # remove features with less than 10% variance - ) - - ds = varier.fit_transform(ds) - ds_train = varier.transform(ds_train) - - kmers = varier.stats_['cols_keep'] - - # Preprocessing - preprocessor = Chain( - LabelEncoder(taxa), - TensorMinMaxScaler(kmers), - ) - - ds_train = preprocessor.fit_transform(ds_train) - - # Statistical features selection - selector = TensorFeaturesSelection( - features = kmers, - taxa = taxa, - threshold = 0.25, # remove lowest 25% significance - ) - - selector.fit(ds_train) - ds = selector.transform(ds) - - kmers = selector.stats_['cols_keep'] - - return ds, kmers - -# Exclusion columns occuring in less / more than 10% of the columns = 20% removed +# Exclusion of columns occuring in less / more than 10% of the columns = 20% removed def occurence_exclusion(ds, kmers): preprocessor = TensorPercentOccurenceExclusion( features = kmers, @@ -126,12 +79,11 @@ def occurence_exclusion(ds, kmers): ) ds = preprocessor.fit_transform(ds) - kmers = preprocessor.stats_['cols_keep'] 
return ds, kmers -# Exclusion of columns with variance lower than a certain threshold +# Exclusion of columns with less than 10% variance def low_var_selection(ds, kmers): preprocessor = TensorLowVarSelection( features = kmers, @@ -139,7 +91,6 @@ def low_var_selection(ds, kmers): ) ds = preprocessor.fit_transform(ds) - kmers = preprocessor.stats_['cols_keep'] return ds, kmers @@ -153,7 +104,6 @@ def features_selection(ds, kmers, taxa): ) ds = preprocessor.fit_transform(ds) - kmers = preprocessor.stats_['cols_keep'] return ds, kmers diff --git a/src/data/kmers.py b/src/data/kmers.py index 18efe97..50e12ba 100644 --- a/src/data/kmers.py +++ b/src/data/kmers.py @@ -16,6 +16,7 @@ from data.extraction.given_kmers_vectorizer import GivenKmersVectorizer # Features selection +from data.reduction.low_var_selection import TensorLowVarSelection from data.reduction.features_selection import TensorFeaturesSelection from data.reduction.occurence_exclusion import TensorPercentOccurenceExclusion @@ -342,25 +343,37 @@ def _kmers_tokenization(self): self.df = tokenizer.transform(self.df) if self.method == 'seen': self.kmers_list = tokenizer.stats_['tokens(sequence)'] - # self._kmers_reduction() + self._kmers_reduction() def _kmers_reduction(self): - # Exclusion of columns occuring in less 5% / more 95% of the samples + """ + Brute force -> Features statistically related to classes + 1. OccurenceExclusion (10% extremes) + 2. LowVarSelection (variance > 10%) + 3. Chi2 + SelectPercentile() (75% best values) + """ + # Exclusion of columns occuring in less / more than 10% of the columns = 20% removed excluder = TensorPercentOccurenceExclusion( features = self.kmers_list, - percent = 0.05 + percent = 0.1 ) self.df = excluder.fit_transform(self.df) - self.kmers_list = excluder.stats_['cols_keep'] - # Chi2 evaluation of dependance between features and classes + # Exclusion of columns with less than 10% variance + varier = TensorLowVarSelection( + features = self.kmers_list, + threshold = 0.1, + ) + self.df = varier.fit_transform(self.df) + self.kmers_list = varier.stats_['cols_keep'] + + # Chi2 evaluation of dependance between features and classes to keep 75% most significative selector = TensorFeaturesSelection( features = self.kmers_list, - threshold = 0.05 + threshold = 0.25 ) self.df = selector.fit_transform(self.df) - self.kmers_list = selector.stats_['cols_keep'] def _write_dataset(self): From 319cb1967cd8706c72078d81b22c62d86f3885eb Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Mon, 16 Oct 2023 00:03:20 -0400 Subject: [PATCH 06/92] occurences debug --- src/Caribou_reduce_features.py | 4 ++-- src/data/reduction/features_selection.py | 2 +- src/data/reduction/low_var_selection.py | 4 ++-- src/data/reduction/occurence_exclusion.py | 8 ++++---- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/Caribou_reduce_features.py b/src/Caribou_reduce_features.py index a68d9d3..bcb76b0 100644 --- a/src/Caribou_reduce_features.py +++ b/src/Caribou_reduce_features.py @@ -54,8 +54,8 @@ def features_reduction(opt): # Time the computation of transformations t_start = time() ds, kmers_list = occurence_exclusion(ds, kmers_list) - ds, kmers_list = low_var_selection(ds,kmers_list) - ds, data['kmers'] = features_selection(ds, kmers_list, data['taxas'][0]) + # ds, kmers_list = low_var_selection(ds,kmers_list) + # ds, data['kmers'] = features_selection(ds, kmers_list, data['taxas'][0]) t_end = time() t_reduction = t_end - t_start # Save reduced dataset diff --git a/src/data/reduction/features_selection.py 
b/src/data/reduction/features_selection.py index c07b515..c2a02be 100644 --- a/src/data/reduction/features_selection.py +++ b/src/data/reduction/features_selection.py @@ -43,7 +43,7 @@ def stats(batch): # Chi batches means extraction chi = ds.map_batches(stats, batch_format = 'numpy', batch_size = 32) - for i, row in enumerate(chi.iter_rows()): + for row in chi.iter_rows(): mean_chi.append(row['chi']) # Chi mean of batches means computing diff --git a/src/data/reduction/low_var_selection.py b/src/data/reduction/low_var_selection.py index 7fa6561..0212c8c 100644 --- a/src/data/reduction/low_var_selection.py +++ b/src/data/reduction/low_var_selection.py @@ -39,7 +39,7 @@ def get_sums(batch): return({'sum' : [np.sum(df, axis = 0)]}) # Sum per column - sums = ds.map_batches(get_sums, batch_format = 'pandas') + sums = ds.map_batches(get_sums, batch_format = 'numpy') for row in sums.iter_rows(): sum_arr += row['sum'] @@ -53,7 +53,7 @@ def get_sqr_dev(batch): return({'sqr_dev' : [np.sum(np.power(np.subtract(df, mean_arr), 2), axis = 0)]}) # Sum of deviation per column - sqr_devs = ds.map_batches(get_sqr_dev, batch_format = 'pandas') + sqr_devs = ds.map_batches(get_sqr_dev, batch_format = 'numpy') for row in sqr_devs.iter_rows(): sqr_dev_arr += row['sqr_dev'] diff --git a/src/data/reduction/occurence_exclusion.py b/src/data/reduction/occurence_exclusion.py index 78ef429..f333215 100644 --- a/src/data/reduction/occurence_exclusion.py +++ b/src/data/reduction/occurence_exclusion.py @@ -132,18 +132,18 @@ def _fit(self, ds: Dataset) -> Preprocessor: nb_samples = ds.count() low_treshold = ceil((0 + self.percent) * nb_samples) high_treshold = floor((1 - self.percent) * nb_samples) + occurences = np.zeros(self._nb_features) # Function for parallel occurences counting def count_occurences(batch): batch = batch[TENSOR_COLUMN_NAME] + batch = _unwrap_ndarray_object_type_if_needed(batch) return {'occurences' : [np.count_nonzero(batch, axis = 0)]} occur = ds.map_batches(count_occurences, batch_format = 'numpy') - occurences = np.zeros(self._nb_features) - for batch in occur.iter_batches(batch_format = 'numpy'): - batch_occur = batch['occurences'].sum(axis = 0) - occurences += batch_occur + for row in occur.iter_rows(): + occurences += row['occurences'] # Construct list of features to keep by position cols_keep = [self.features[i] for i, occurence in enumerate(occurences) if low_treshold < occurence < high_treshold] From fe50a5a1facecb0d615858a6efcef1228957890b Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Tue, 17 Oct 2023 15:33:41 -0400 Subject: [PATCH 07/92] remove feat reduction from kmers extraction --- src/data/kmers.py | 2 +- src/data/reduction/occurence_exclusion.py | 67 ++--------------------- 2 files changed, 6 insertions(+), 63 deletions(-) diff --git a/src/data/kmers.py b/src/data/kmers.py index 50e12ba..71148d4 100644 --- a/src/data/kmers.py +++ b/src/data/kmers.py @@ -343,7 +343,7 @@ def _kmers_tokenization(self): self.df = tokenizer.transform(self.df) if self.method == 'seen': self.kmers_list = tokenizer.stats_['tokens(sequence)'] - self._kmers_reduction() + # self._kmers_reduction() def _kmers_reduction(self): """ diff --git a/src/data/reduction/occurence_exclusion.py b/src/data/reduction/occurence_exclusion.py index f333215..cbd7af1 100644 --- a/src/data/reduction/occurence_exclusion.py +++ b/src/data/reduction/occurence_exclusion.py @@ -29,11 +29,8 @@ def _fit(self, ds: Dataset) -> Preprocessor: occurences += np.count_nonzero(batch, axis = 0) # Include / Exclude by sorted position - 
# cols_drop = [] cols_keep = pd.Series(occurences, index = self.features) cols_keep = cols_keep.sort_values(ascending = True) # Long operation - # cols_drop.extend(cols_keep.iloc[0 : self.num_features].index) - # cols_drop.extend(cols_keep.iloc[(self._nb_features - self.num_features) : self._nb_features].index) cols_keep = cols_keep.iloc[self.num_features : (self._nb_features - self.num_features)] cols_keep = list(cols_keep.index) @@ -51,7 +48,6 @@ def _transform_pandas(self, df: pd.DataFrame) -> pd.DataFrame: tensor_col = pd.DataFrame(tensor_col, columns = self.features) tensor_col = tensor_col[cols_keep].to_numpy() - # tensor_col = tensor_col.drop(cols_keep, axis = 1).to_numpy() df[TENSOR_COLUMN_NAME] = pd.Series(list(tensor_col)) @@ -59,63 +55,6 @@ def _transform_pandas(self, df: pd.DataFrame) -> pd.DataFrame: def __repr__(self): return (f"{self.__class__.__name__}(features={self._nb_features!r}, num_features={self.num_features!r})") -""" -class TensorPercentOccurenceExclusion(Preprocessor): - - - def __init__(self, features: List[str], percent : int = 0.05): - # Parameters - self.features = features - self.percent = percent - self._nb_features = len(features) - - def _fit(self, ds: Dataset) -> Preprocessor: - nb_samples = ds.count() - low_treshold = ceil((0 + self.percent) * nb_samples) - high_treshold = floor((1 - self.percent) * nb_samples) - - # Nb of occurences - occurences = np.zeros(self._nb_features) - for batch in ds.iter_batches(batch_format = 'numpy'): - batch = batch[TENSOR_COLUMN_NAME] - occurences += np.count_nonzero(batch, axis = 0) - - # Include / Exclude by occurences thresholds across samples - cols_keep = pd.Series(occurences, index = self.features) - cols_keep = cols_keep[cols_keep.between(low_treshold, high_treshold, inclusive = 'neither')] - cols_keep = list(cols_keep.index) - - # cols_drop = list(set(self.features).difference(set(cols_keep))) - # self.stats_ = {'cols_keep' : cols_keep, 'cols_drop' : cols_drop} - - self.stats_ = {'cols_keep' : cols_keep} - - return self - - def _transform_pandas(self, df: pd.DataFrame) -> pd.DataFrame: - # _validate_df(df, TENSOR_COLUMN_NAME, self._nb_features) - cols_keep = self.stats_['cols_keep'] - - tensor_col = df[TENSOR_COLUMN_NAME] - tensor_col = _unwrap_ndarray_object_type_if_needed(tensor_col) - tensor_col = pd.DataFrame(tensor_col, columns = self.features) - tensor_col = tensor_col[cols_keep].to_numpy() - # tensor_col = tensor_col.drop(cols_keep, axis = 1).to_numpy() - - df[TENSOR_COLUMN_NAME] = pd.Series(list(tensor_col)) - - return df - - def __repr__(self): - return (f"{self.__class__.__name__}(features={self._nb_features!r}, percent={self.percent!r}%)") -""" - -def _validate_df(df: pd.DataFrame, column: str, nb_features: int) -> None: - if len(df.loc[0, column]) != nb_features: - raise ValueError('Discordant number of features in the tensor column with the one from the dataframe used for fitting') - - - class TensorPercentOccurenceExclusion(Preprocessor): """ @@ -167,4 +106,8 @@ def _transform_pandas(self, df: pd.DataFrame) -> pd.DataFrame: return df def __repr__(self): - return (f"{self.__class__.__name__}(features={self._nb_features!r}, percent={self.percent!r}%)") \ No newline at end of file + return (f"{self.__class__.__name__}(features={self._nb_features!r}, percent={self.percent!r}%)") + +def _validate_df(df: pd.DataFrame, column: str, nb_features: int) -> None: + if len(df.loc[0, column]) != nb_features: + raise ValueError('Discordant number of features in the tensor column with the one from the dataframe used 
for fitting') \ No newline at end of file From 400d1abc903400c883b89467133c3bd224b90747 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Thu, 19 Oct 2023 09:29:02 -0400 Subject: [PATCH 08/92] features reduction debug test --- src/Caribou_reduce_features.py | 13 ++++++++----- src/data/kmers.py | 1 + 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/Caribou_reduce_features.py b/src/Caribou_reduce_features.py index bcb76b0..270880c 100644 --- a/src/Caribou_reduce_features.py +++ b/src/Caribou_reduce_features.py @@ -6,6 +6,7 @@ from utils import * from time import time +from glob import glob from pathlib import Path from ray.data.preprocessors import Chain, LabelEncoder @@ -50,12 +51,14 @@ def features_reduction(opt): """ # Load data - ds = ray.data.read_parquet(data['profile']) + files_lst = glob(os.path.join(data['profile'], '*.parquet')) + ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + # ds = ray.data.read_parquet(data['profile']) # Time the computation of transformations t_start = time() ds, kmers_list = occurence_exclusion(ds, kmers_list) - # ds, kmers_list = low_var_selection(ds,kmers_list) - # ds, data['kmers'] = features_selection(ds, kmers_list, data['taxas'][0]) + ds, kmers_list = low_var_selection(ds,kmers_list) + ds, data['kmers'] = features_selection(ds, kmers_list, data['taxas'][0]) t_end = time() t_reduction = t_end - t_start # Save reduced dataset @@ -75,7 +78,7 @@ def features_reduction(opt): def occurence_exclusion(ds, kmers): preprocessor = TensorPercentOccurenceExclusion( features = kmers, - percent = 0.1 # remove features present in less than 10% samples + percent = 0.05 # remove features present in less than 5% samples ) ds = preprocessor.fit_transform(ds) @@ -87,7 +90,7 @@ def occurence_exclusion(ds, kmers): def low_var_selection(ds, kmers): preprocessor = TensorLowVarSelection( features = kmers, - threshold = 0.1, # remove features with less than 10% variance + threshold = 0.05, # remove features with less than 5% variance ) ds = preprocessor.fit_transform(ds) diff --git a/src/data/kmers.py b/src/data/kmers.py index 71148d4..323e3a4 100644 --- a/src/data/kmers.py +++ b/src/data/kmers.py @@ -371,6 +371,7 @@ def _kmers_reduction(self): # Chi2 evaluation of dependance between features and classes to keep 75% most significative selector = TensorFeaturesSelection( features = self.kmers_list, + taxa = self.taxas[0], threshold = 0.25 ) self.df = selector.fit_transform(self.df) From b33ba2984045d7d53493c4a3b7d9f918dd76b5ec Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Thu, 19 Oct 2023 18:03:49 -0400 Subject: [PATCH 09/92] rectify imports --- src/models/kerasTF/models.py | 2 +- src/models/sklearn/models.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index 96ef9cc..e6d57bf 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -14,7 +14,7 @@ from src.models.encoders.one_hot_tensor_encoder import OneHotTensorEncoder # Parent class / models -from src.models.models_utils import ModelsUtils +from models.models_utils import ModelsUtils from models.kerasTF.build_neural_networks import * # Training diff --git a/src/models/sklearn/models.py b/src/models/sklearn/models.py index 4f2308d..d2d7e85 100644 --- a/src/models/sklearn/models.py +++ b/src/models/sklearn/models.py @@ -10,7 +10,7 @@ # Preprocessing from ray.data.preprocessors import Chain, BatchMapper, LabelEncoder from models.preprocessors.min_max_scaler 
import TensorMinMaxScaler -from src.models.encoders.onesvm_label_encoder import OneClassSVMLabelEncoder +from models.encoders.onesvm_label_encoder import OneClassSVMLabelEncoder # Training from sklearn.naive_bayes import MultinomialNB @@ -24,10 +24,10 @@ from ray.train.sklearn.sklearn_predictor import SklearnPredictor # Parent class -from src.models.models_utils import ModelsUtils -from src.models.sklearn.partial_trainer import SklearnPartialTrainer -from src.models.sklearn.tensor_predictor import SklearnTensorPredictor -from src.models.sklearn.probability_predictor import SklearnTensorProbaPredictor +from models.models_utils import ModelsUtils +from models.sklearn.partial_trainer import SklearnPartialTrainer +from models.sklearn.tensor_predictor import SklearnTensorPredictor +from models.sklearn.probability_predictor import SklearnTensorProbaPredictor __author__ = 'Nicolas de Montigny' From 6be0ed309fd08acd3995e2ea278cb1a9ded6c3e8 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Thu, 19 Oct 2023 18:43:45 -0400 Subject: [PATCH 10/92] imports rectify --- src/models/classification.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/models/classification.py b/src/models/classification.py index f26d7a9..c98042f 100644 --- a/src/models/classification.py +++ b/src/models/classification.py @@ -8,8 +8,8 @@ from glob import glob from shutil import rmtree from utils import load_Xy_data -from src.models.sklearn.models import SklearnModel -from src.models.kerasTF.models import KerasTFModel +from models.sklearn.models import SklearnModel +from models.kerasTF.models import KerasTFModel # Simulation class from models.reads_simulation import readsSimulation From db2b978e8d11deb2e230f1be4a414a5cea12cadc Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Thu, 19 Oct 2023 18:46:15 -0400 Subject: [PATCH 11/92] imports --- src/models/kerasTF/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index e6d57bf..f41828d 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -11,7 +11,7 @@ # Preprocessing from ray.data.preprocessors import LabelEncoder, Chain from models.preprocessors.min_max_scaler import TensorMinMaxScaler -from src.models.encoders.one_hot_tensor_encoder import OneHotTensorEncoder +from models.encoders.one_hot_tensor_encoder import OneHotTensorEncoder # Parent class / models from models.models_utils import ModelsUtils From 4ee5a77c840f211e4dad9bf8e7c0c1560f6ff52c Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Sun, 22 Oct 2023 08:33:45 -0400 Subject: [PATCH 12/92] reduce 5 -> 10%, sklearn pd -> np --- src/Caribou_reduce_features.py | 4 +- src/models/encoders/model_label_encoder.py | 39 ++++++++ src/models/encoders/one_hot_tensor_encoder.py | 6 +- src/models/encoders/onesvm_label_encoder.py | 4 + src/models/kerasTF/build_neural_networks.py | 8 +- src/models/kerasTF/models.py | 89 +++++++++++-------- src/models/sklearn/models.py | 28 +++--- src/models/sklearn/partial_trainer.py | 30 ++++--- src/models/sklearn/probability_predictor.py | 2 +- src/models/sklearn/tensor_predictor.py | 2 +- 10 files changed, 137 insertions(+), 75 deletions(-) create mode 100644 src/models/encoders/model_label_encoder.py diff --git a/src/Caribou_reduce_features.py b/src/Caribou_reduce_features.py index 270880c..d3e289c 100644 --- a/src/Caribou_reduce_features.py +++ b/src/Caribou_reduce_features.py @@ -78,7 +78,7 @@ def features_reduction(opt): def 
occurence_exclusion(ds, kmers): preprocessor = TensorPercentOccurenceExclusion( features = kmers, - percent = 0.05 # remove features present in less than 5% samples + percent = 0.1 # remove features present in less than 5% samples ) ds = preprocessor.fit_transform(ds) @@ -90,7 +90,7 @@ def occurence_exclusion(ds, kmers): def low_var_selection(ds, kmers): preprocessor = TensorLowVarSelection( features = kmers, - threshold = 0.05, # remove features with less than 5% variance + threshold = 0.1, # remove features with less than 5% variance ) ds = preprocessor.fit_transform(ds) diff --git a/src/models/encoders/model_label_encoder.py b/src/models/encoders/model_label_encoder.py new file mode 100644 index 0000000..2ed90e1 --- /dev/null +++ b/src/models/encoders/model_label_encoder.py @@ -0,0 +1,39 @@ +from collections import Counter, OrderedDict +from functools import partial +from typing import Dict, List, Optional + +import numpy as np +import pandas as pd +import pandas.api.types + +from ray.data import Dataset +from ray.data.preprocessor import Preprocessor +from ray.data.preprocessors.encoder import _get_unique_value_indices, _validate_df + +LABELS_COLUMN_NAME = 'labels' + +class ModelLabelEncoder(Preprocessor): + """ + Custom implementation of Ray's LabelEncoder to set column name as it encodes labels. + """ + def __init__(self, label_column: str): + self.label_column = label_column + + def _fit(self, dataset: Dataset) -> Preprocessor: + self.stats_ = _get_unique_value_indices(dataset, [self.label_column]) + return self + + def _transform_pandas(self, df: pd.DataFrame): + _validate_df(df, self.label_column) + + def column_label_encoder(s: pd.Series): + s_values = self.stats_[f"unique_values({s.name})"] + return s.map(s_values) + + df[self.label_column] = df[self.label_column].transform(column_label_encoder) + df = df.rename(columns = {self.label_column : LABELS_COLUMN_NAME}) + + return df + + def __repr__(self): + return f"{self.__class__.__name__}(label_column={self.label_column!r})" \ No newline at end of file diff --git a/src/models/encoders/one_hot_tensor_encoder.py b/src/models/encoders/one_hot_tensor_encoder.py index 0adff44..8acd7fe 100644 --- a/src/models/encoders/one_hot_tensor_encoder.py +++ b/src/models/encoders/one_hot_tensor_encoder.py @@ -29,6 +29,9 @@ def _fit(self, dataset: Dataset) -> Preprocessor: def _transform_pandas(self, df: pd.DataFrame) -> pd.DataFrame: df = _validate_df(df, self.column) + values = self.stats_[f"unique_values({self.column})"] + nb_unique = len(values.keys()) + def tensor_col_encoding(label, nb_unique): tensor = np.zeros(nb_unique, dtype = np.int32) @@ -37,9 +40,6 @@ def tensor_col_encoding(label, nb_unique): return tensor - values = self.stats_[f"unique_values({self.column})"] - nb_unique = len(values.keys()) - df = df.assign(labels = lambda x: TensorArray([tensor_col_encoding(x.loc[ind,self.column], nb_unique) for ind in df.index])) return df diff --git a/src/models/encoders/onesvm_label_encoder.py b/src/models/encoders/onesvm_label_encoder.py index 14186fa..9464011 100644 --- a/src/models/encoders/onesvm_label_encoder.py +++ b/src/models/encoders/onesvm_label_encoder.py @@ -6,6 +6,8 @@ from ray.data.preprocessor import Preprocessor from ray.data.preprocessors.encoder import _get_unique_value_indices, _validate_df, LabelEncoder +LABELS_COLUMN_NAME = 'labels' + class OneClassSVMLabelEncoder(LabelEncoder): """ Class adapted from Ray's LabelEncoder class to encode labels as integer targets for Scikit-Learn SGDOneClassSVM model. 
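For reference, the encoding added in the new ModelLabelEncoder above amounts to mapping each known class name to an integer code and renaming the taxa column to the shared 'labels' column. A minimal pandas sketch of that transform, with a made-up rank name and a stand-in for the fitted mapping, could look like this:

import pandas as pd

LABELS_COLUMN_NAME = 'labels'
taxa = 'domain'                            # hypothetical taxonomic rank
mapping = {'bacteria': 0, 'host': 1}       # stands in for the fitted stats_ of the preprocessor

df = pd.DataFrame({taxa: ['bacteria', 'host', 'bacteria']})
df[taxa] = df[taxa].map(mapping)           # string labels -> integer codes
df = df.rename(columns={taxa: LABELS_COLUMN_NAME})
print(df[LABELS_COLUMN_NAME].tolist())     # [0, 1, 0]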
@@ -33,4 +35,6 @@ def column_label_encoder(s: pd.Series): return s df[self.label_column] = df[self.label_column].transform(column_label_encoder) + df = df.rename(columns = {self.label_column : LABELS_COLUMN_NAME}) + return df diff --git a/src/models/kerasTF/build_neural_networks.py b/src/models/kerasTF/build_neural_networks.py index aed5532..97a8489 100644 --- a/src/models/kerasTF/build_neural_networks.py +++ b/src/models/kerasTF/build_neural_networks.py @@ -25,7 +25,7 @@ def build_attention(nb_kmers): x = Dense(128, activation = "relu")(x) x = Dropout(0.1)(x) - x = Dense(2, activation = "tanh")(x) + x = Dense(1, activation = "tanh")(x) model = Model(inputs = inputs, outputs = x) model.compile(loss = BinaryCrossentropy(from_logits = False), optimizer = 'adam', metrics = ['accuracy']) @@ -45,7 +45,7 @@ def build_LSTM(nb_kmers): x = LSTM(128, recurrent_dropout = 0.1, dropout = 0.1)(x) - x = Dense(2, activation = 'tanh')(x) + x = Dense(1, activation = 'tanh')(x) model = Model(inputs = inputs, outputs = x) model.compile(loss=BinaryCrossentropy(from_logits = False), optimizer='adam', metrics=['accuracy']) @@ -77,9 +77,9 @@ def build_deepLSTM(nb_kmers): net = Dense(10, activation='relu', name='D_%d'%10)(net) net = Dropout(0.1,name='fr_same')(net) - outputs = Dense(2, activation='tanh', name='score')(net) + outputs = Dense(1, activation='sigmoid', name='score')(net) model = Model(inputs=inputs, outputs=outputs) - model.compile(loss=BinaryCrossentropy(from_logits = True), optimizer='adam', metrics=['accuracy']) + model.compile(loss=BinaryCrossentropy(from_logits = False), optimizer='adam', metrics=['accuracy']) return model diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index f41828d..39bd8d3 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -11,6 +11,7 @@ # Preprocessing from ray.data.preprocessors import LabelEncoder, Chain from models.preprocessors.min_max_scaler import TensorMinMaxScaler +from models.encoders.model_label_encoder import ModelLabelEncoder from models.encoders.one_hot_tensor_encoder import OneHotTensorEncoder # Parent class / models @@ -37,6 +38,9 @@ __all__ = ['KerasTFModel'] +TENSOR_COLUMN_NAME = '__value__' +LABELS_COLUMN_NAME = 'labels' + # Ignore warnings to have a more comprehensible output on stdout os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' warnings.filterwarnings('ignore') @@ -138,11 +142,17 @@ def preprocess(self, df): for row in df.iter_rows(): labels.append(row[self.taxa]) self._nb_classes = len(np.unique(labels)) - self._preprocessor = Chain( - TensorMinMaxScaler(self.kmers), - LabelEncoder(self.taxa), - OneHotTensorEncoder(self.taxa), - ) + if self._nb_classes == 2: + self._preprocessor = Chain( + TensorMinMaxScaler(self.kmers), + ModelLabelEncoder(self.taxa), + ) + else: + self._preprocessor = Chain( + TensorMinMaxScaler(self.kmers), + LabelEncoder(self.taxa), + OneHotTensorEncoder(self.taxa), + ) self._preprocessor.fit(df) def _label_decode(self, predict): @@ -220,6 +230,7 @@ def _fit_model(self, datasets): ), datasets=datasets, ) + training_result = self._trainer.fit() self._model_ckpt = training_result.best_checkpoints[0][0] @@ -227,7 +238,7 @@ def predict(self, df, threshold=0.8): print('predict') if df.count() > 0: if len(df.schema().names) > 1: - col_2_drop = [col for col in df.schema().names if col != '__value__'] + col_2_drop = [col for col in df.schema().names if col != TENSOR_COLUMN_NAME] df = df.drop_columns(col_2_drop) # Preprocess @@ -235,12 +246,12 @@ def predict(self, df, threshold=0.8): 
print('number of classes :', self._nb_classes) - predictor = BatchPredictor.from_checkpoint( + self._predictor = BatchPredictor.from_checkpoint( self._model_ckpt, TensorflowPredictor, model_definition = lambda: build_model(self.classifier, self._nb_classes, len(self.kmers)) ) - predictions = predictor.predict( + predictions = self._predictor.predict( data = df, batch_size = self.batch_size ) @@ -258,46 +269,44 @@ def predict(self, df, threshold=0.8): def _prob_2_cls(self, predictions, threshold): print('_prob_2_cls') def map_predicted_label_binary(df, threshold): - # lower_threshold = 0.5 - (threshold * 0.5) - # upper_threshold = 0.5 + (threshold * 0.5) - predictions = pd.DataFrame({ - 'best_proba': [df['predictions'][i][np.argmax(df['predictions'][i])] for i in range(len(df))], - 'predicted_label': df["predictions"].map(lambda x: np.array(x).argmax()) # GET POSITION OF ARGMAX + df = np.ravel(df['predictions']) + lower_threshold = 0.5 - (threshold * 0.5) + upper_threshold = 0.5 + (threshold * 0.5) + predict = pd.DataFrame({ + 'proba': df, + 'predicted_label': np.full(len(df), -1) }) - print('map_predicted_label_binary') - print(predictions) - # predict = pd.DataFrame({ - # 'proba': df['predictions'], - # 'predicted_label': np.zeros(len(df), dtype = np.float32) - # }) # predict['predicted_label'] = np.round(predict['proba']) - # predict.loc[predict['proba'] >= upper_threshold, 'predicted_label'] = 1 - # predict.loc[predict['proba'] <= lower_threshold, 'predicted_label'] = 0 - return predictions['predicted_label'].to_numpy(dtype = np.int32) + predict.loc[predict['proba'] >= upper_threshold, 'predicted_label'] = 1 + predict.loc[predict['proba'] <= lower_threshold, 'predicted_label'] = 0 + return {'predictions' : predict['predicted_label'].to_numpy(dtype = np.int32)} def map_predicted_label_multiclass(df, threshold): - predictions = pd.DataFrame({ - 'best_proba': [df['predictions'][i][np.argmax(df['predictions'][i])] for i in range(len(df))], - 'predicted_label': df["predictions"].map(lambda x: np.array(x).argmax()) + df = df['predictions'] + pred = pd.DataFrame({ + 'best_proba': [df[i][np.argmax(df[i])] for i in range(len(df))], + 'predicted_label': df.map(lambda x: np.array(x).argmax()) }) - predictions.loc[predictions['best_proba'] < threshold, 'predicted_label'] = -1 - return predictions['predicted_label'].to_numpy(dtype = np.int32) + pred.loc[pred['best_proba'] < threshold, 'predicted_label'] = -1 + return {'predictions' : pred['predicted_label'].to_numpy(dtype = np.int32)} if self._nb_classes == 2: + print('map_predicted_label_binary') fn = map_predicted_label_binary else: + print('map_predicted_label_multiclass') fn = map_predicted_label_multiclass predict = [] - for batch in predictions.iter_batches(batch_size = self.batch_size): - predict.append(lambda : fn(batch, threshold)) - - import sys - predictions.materialize() - print(predict) - sys.exit() + predictions = predictions.map_batches( + lambda batch : fn(batch, threshold), + batch_format = 'numpy', + batch_size = self.batch_size + ) + for row in predictions.iter_rows(): + predict.append(row['predictions']) - return np.concatenate(predict) + return predict # Training/building function outside of the class as mentioned on the Ray discussion @@ -317,6 +326,8 @@ def train_func(config): nb_cls = config.get('nb_cls') model = config.get('model') + + # Model construction model = build_model(model, nb_cls, size) @@ -326,15 +337,15 @@ def train_func(config): for _ in range(epochs): batch_train = train_data.to_tf( - feature_columns = 
'__value__', - label_columns = 'labels', + feature_columns = TENSOR_COLUMN_NAME, + label_columns = LABELS_COLUMN_NAME, batch_size = batch_size, local_shuffle_buffer_size = batch_size, local_shuffle_seed = int(np.random.randint(1,10000, size = 1)) ) batch_val = val_data.to_tf( - feature_columns = '__value__', - label_columns = 'labels', + feature_columns = TENSOR_COLUMN_NAME, + label_columns = LABELS_COLUMN_NAME, batch_size = batch_size, local_shuffle_buffer_size = batch_size, local_shuffle_seed = int(np.random.randint(1,10000, size = 1)) diff --git a/src/models/sklearn/models.py b/src/models/sklearn/models.py index d2d7e85..9499230 100644 --- a/src/models/sklearn/models.py +++ b/src/models/sklearn/models.py @@ -8,27 +8,29 @@ from shutil import rmtree # Preprocessing -from ray.data.preprocessors import Chain, BatchMapper, LabelEncoder +from ray.data.preprocessors import Chain, BatchMapper +from models.encoders.model_label_encoder import ModelLabelEncoder from models.preprocessors.min_max_scaler import TensorMinMaxScaler from models.encoders.onesvm_label_encoder import OneClassSVMLabelEncoder # Training from sklearn.naive_bayes import MultinomialNB from sklearn.linear_model import SGDOneClassSVM, SGDClassifier +from models.sklearn.partial_trainer import SklearnPartialTrainer +from models.sklearn.tensor_predictor import SklearnTensorPredictor # Tuning -from ray.air.config import RunConfig, ScalingConfig +from ray.air.config import RunConfig # Predicting from ray.train.batch_predictor import BatchPredictor -from ray.train.sklearn.sklearn_predictor import SklearnPredictor +from models.sklearn.probability_predictor import SklearnTensorProbaPredictor # Parent class from models.models_utils import ModelsUtils -from models.sklearn.partial_trainer import SklearnPartialTrainer -from models.sklearn.tensor_predictor import SklearnTensorPredictor -from models.sklearn.probability_predictor import SklearnTensorProbaPredictor +TENSOR_COLUMN_NAME = '__value__' +LABELS_COLUMN_NAME = 'labels' __author__ = 'Nicolas de Montigny' @@ -105,7 +107,7 @@ def preprocess(self, df): self._encoded = np.array([1,-1], dtype = np.int32) labels = np.array(['bacteria', 'unknown'], dtype = object) else: - self._encoder = LabelEncoder(self.taxa) + self._encoder = ModelLabelEncoder(self.taxa) self._preprocessor = Chain( TensorMinMaxScaler(self.kmers), @@ -211,7 +213,6 @@ def _fit_model(self, datasets): # Define trainer self._trainer = SklearnPartialTrainer( estimator=self._clf, - label_column=self.taxa, labels_list=training_labels, features_list=self.kmers, params=self._train_params, @@ -238,10 +239,8 @@ def _predict_cv(self, df): print('_predict_cv') if df.count() > 0: predict_kwargs = {'features':self.kmers, 'num_estimator_cpus':-1} -# BATCHPREDICTOR DEPRECATED : https://docs.ray.io/en/releases-2.6.3/ray-air/api/doc/ray.train.batch_predictor.BatchPredictor.html#ray.train.batch_predictor.BatchPredictor -# MUST BE CHANGED TO MAP_BATCHES self._predictor = BatchPredictor.from_checkpoint(self._model_ckpt, SklearnTensorPredictor) - predictions = self._predictor.predict(df, batch_size = self.batch_size, feature_columns = ['__value__'], **predict_kwargs) + predictions = self._predictor.predict(df, batch_size = self.batch_size, feature_columns = [TENSOR_COLUMN_NAME], **predict_kwargs) predictions = np.array(predictions.to_pandas()).reshape(-1) return self._label_decode(predictions) @@ -255,12 +254,12 @@ def predict(self, df, threshold = 0.8): if self.classifier == 'onesvm': predict_kwargs = {'features':self.kmers, 
'num_estimator_cpus':-1} self._predictor = BatchPredictor.from_checkpoint(self._models_collection['domain'], SklearnTensorPredictor) - predictions = self._predictor.predict(df, batch_size = self.batch_size, feature_columns = ['__value__'], **predict_kwargs) + predictions = self._predictor.predict(df, batch_size = self.batch_size, feature_columns = [TENSOR_COLUMN_NAME], **predict_kwargs) predictions = np.array(predictions.to_pandas()).reshape(-1) else: predict_kwargs = {'features':self.kmers, 'num_estimator_cpus':-1} self._predictor = BatchPredictor.from_checkpoint(self._model_ckpt, SklearnTensorProbaPredictor) - predictions = self._predictor.predict(df, batch_size = self.batch_size, feature_columns = ['__value__'], **predict_kwargs) + predictions = self._predictor.predict(df, batch_size = self.batch_size, feature_columns = [TENSOR_COLUMN_NAME], **predict_kwargs) predictions = self._prob_2_cls(predictions, len(self._encoded), threshold) return self._label_decode(predictions) else: @@ -279,8 +278,7 @@ def map_predicted_label(df : pd.DataFrame): if nb_cls == 1: predict = np.round(abs(np.concatenate(predict.to_pandas()['predictions']))) else: - mapper = BatchMapper(map_predicted_label, batch_format = 'pandas') - predict = mapper.transform(predict) + predict = predict.map_batches(map_predicted_label, batch_format = 'pandas') predict = np.ravel(np.array(predict.to_pandas())) return predict \ No newline at end of file diff --git a/src/models/sklearn/partial_trainer.py b/src/models/sklearn/partial_trainer.py index e6fa0d8..e2ff877 100644 --- a/src/models/sklearn/partial_trainer.py +++ b/src/models/sklearn/partial_trainer.py @@ -26,6 +26,8 @@ from ray.train.sklearn import SklearnTrainer +LABELS_COLUMN_NAME = 'labels' + simplefilter(action='ignore', category=FutureWarning) class SklearnPartialTrainer(SklearnTrainer): @@ -38,7 +40,6 @@ def __init__( *, estimator, datasets, - label_column = None, labels_list = None, features_list = None, params = None, @@ -57,7 +58,7 @@ def __init__( super().__init__( estimator = estimator, datasets = datasets, - label_column = label_column, + label_column = LABELS_COLUMN_NAME, params = params, scoring = scoring, cv = cv, @@ -204,17 +205,20 @@ def training_loop(self): start_time = time() for batch_X, batch_y in zip( epoch_X.iter_batches( - batch_size = self._batch_size, + # batch_size = self._batch_size, + batch_size = 1, batch_format = 'numpy' ), epoch_y.iter_batches( - batch_size = self._batch_size, + # batch_size = self._batch_size, + batch_size = 1, batch_format = 'numpy' ) ): if isinstance(batch_X, dict): batch_X = batch_X['__value__'] - + + """ try: batch_X = pd.DataFrame(batch_X, columns = self._features_list) except ValueError: @@ -224,6 +228,7 @@ def training_loop(self): Removing the last {} additionnal values, this may influence training.\ If this persists over multiple samples, please rerun the K-mers extraction".format(len(batch_X[i]) - len(self._features_list))) batch_X[i] = batch_X[i][:len(self._features_list)] + """ batch_y = np.ravel(batch_y[self.label_column]) try: self.estimator.partial_fit(batch_X, batch_y, classes = self._labels, **self.fit_params) @@ -240,8 +245,10 @@ def training_loop(self): )): X_calib_df[ind] = batch['__value__'] + """ X_calib = pd.DataFrame(X_calib_df, columns = self._features_list) - y_calib = y_calib.to_pandas() + """ + y_calib = y_calib.to_pandas().to_numpy() self.estimator = CalibratedClassifierCV( estimator = self.estimator, method = 'sigmoid', @@ -300,16 +307,19 @@ def _score_on_validation_sets( start_time = time() for 
batch, labels in zip(X_test.iter_batches( - batch_size = self._batch_size, + # batch_size = self._batch_size, + batch_size = 1, batch_format = 'numpy' ), y_test.iter_batches( - batch_size=self._batch_size, + # batch_size = self._batch_size, + batch_size = 1, batch_format = 'numpy' ) ): if isinstance(batch, dict): batch = batch['__value__'] + """ try: batch = pd.DataFrame(batch, columns = self._features_list) except ValueError: @@ -319,10 +329,10 @@ def _score_on_validation_sets( Removing the last {} additionnal values, this may influence training.\ If this persists over multiple samples, please rerun the K-mers extraction".format(len(batch[i]) - len(self._features_list))) batch[i] = batch[i][:len(self._features_list)] + """ + labels = np.ravel(labels[self.label_column]) - print(batch) - try: test_scores.append(_score(estimator, batch, labels, scorers)) except Exception: diff --git a/src/models/sklearn/probability_predictor.py b/src/models/sklearn/probability_predictor.py index b0f0b22..0b291bd 100644 --- a/src/models/sklearn/probability_predictor.py +++ b/src/models/sklearn/probability_predictor.py @@ -41,7 +41,7 @@ def _predict_pandas( if TENSOR_COLUMN_NAME in data: data = data[TENSOR_COLUMN_NAME] data = _unwrap_ndarray_object_type_if_needed(data) - data = pd.DataFrame(data, columns = features) + # data = pd.DataFrame(data, columns = features) with parallel_backend("ray", n_jobs=num_estimator_cpus): df = pd.DataFrame(self.estimator.predict_proba(data, **predict_kwargs)) diff --git a/src/models/sklearn/tensor_predictor.py b/src/models/sklearn/tensor_predictor.py index c03e4a8..e94538a 100644 --- a/src/models/sklearn/tensor_predictor.py +++ b/src/models/sklearn/tensor_predictor.py @@ -40,7 +40,7 @@ def _predict_pandas( data = data[TENSOR_COLUMN_NAME] data = _unwrap_ndarray_object_type_if_needed(data) - data = pd.DataFrame(data, columns = features) + # data = pd.DataFrame(data, columns = features) with parallel_backend("ray", n_jobs=num_estimator_cpus): df = pd.DataFrame(self.estimator.predict(data)) From b98e74d03e4e07a2ff6cf71b557c31913e82c762 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Mon, 23 Oct 2023 16:07:14 -0400 Subject: [PATCH 13/92] all models debugged in local --- src/models/kerasTF/models.py | 33 +++++++++++---------------- src/models/sklearn/models.py | 1 + src/models/sklearn/partial_trainer.py | 2 +- 3 files changed, 15 insertions(+), 21 deletions(-) diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index 39bd8d3..d1e8443 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -139,6 +139,7 @@ def __init__( def preprocess(self, df): print('preprocess') labels = [] + encoded = [] for row in df.iter_rows(): labels.append(row[self.taxa]) self._nb_classes = len(np.unique(labels)) @@ -154,25 +155,23 @@ def preprocess(self, df): OneHotTensorEncoder(self.taxa), ) self._preprocessor.fit(df) + # Labels mapping + labels = list(self._preprocessor.preprocessors[1].stats_[f'unique_values({self.taxa})'].keys()) + encoded = np.arange(len(labels)) + labels = np.append(labels, 'unknown') + encoded = np.append(encoded, -1) + self._labels_map = zip(labels, encoded) def _label_decode(self, predict): print('_label_decode') - if self._labels_map is None: - encoded = [] - encoded.append(-1) - labels = ['unknown'] - for k, v in self._preprocessor.preprocessors[1].stats_['unique_values({})'.format(self.taxa)].items(): - encoded.append(v) - labels.append(k) decoded = pd.Series(np.empty(len(predict), dtype=object)) - for label, coded in 
zip(labels, encoded): - decoded[predict == coded] = label + for label, encoded in self._labels_map: + decoded[predict == encoded] = label return np.array(decoded) def train(self, datasets, kmers_ds, cv = True): print('train') - if cv: self._cross_validation(datasets, kmers_ds) else: @@ -180,7 +179,6 @@ def train(self, datasets, kmers_ds, cv = True): def _cross_validation(self, datasets, kmers_ds): print('_cross_validation') - df_test = datasets.pop('test') self._fit_model(datasets) @@ -189,7 +187,7 @@ def _cross_validation(self, datasets, kmers_ds): for row in df_test.iter_rows(): y_true.append(row[self.taxa]) - y_pred = self.predict(df_test.drop_columns([self.taxa]), threshold = 0) + y_pred = self.predict(df_test.drop_columns([self.taxa]), threshold = 0.8) self._cv_score(y_true, y_pred) @@ -197,7 +195,6 @@ def _fit_model(self, datasets): print('_fit_model') # Preprocessing loop for name, ds in datasets.items(): - print(f'dataset preprocessing : {name}') ds = ds.drop_columns(['id']) ds = self._preprocessor.transform(ds) datasets[name] = ds @@ -244,8 +241,6 @@ def predict(self, df, threshold=0.8): # Preprocess df = self._preprocessor.preprocessors[0].transform(df) - print('number of classes :', self._nb_classes) - self._predictor = BatchPredictor.from_checkpoint( self._model_ckpt, TensorflowPredictor, @@ -256,8 +251,6 @@ def predict(self, df, threshold=0.8): batch_size = self.batch_size ) - print(predictions.to_pandas()) - # Convert predictions to labels predictions = self._prob_2_cls(predictions, threshold) @@ -276,7 +269,6 @@ def map_predicted_label_binary(df, threshold): 'proba': df, 'predicted_label': np.full(len(df), -1) }) - # predict['predicted_label'] = np.round(predict['proba']) predict.loc[predict['proba'] >= upper_threshold, 'predicted_label'] = 1 predict.loc[predict['proba'] <= lower_threshold, 'predicted_label'] = 0 return {'predictions' : predict['predicted_label'].to_numpy(dtype = np.int32)} @@ -284,10 +276,11 @@ def map_predicted_label_binary(df, threshold): def map_predicted_label_multiclass(df, threshold): df = df['predictions'] pred = pd.DataFrame({ - 'best_proba': [df[i][np.argmax(df[i])] for i in range(len(df))], - 'predicted_label': df.map(lambda x: np.array(x).argmax()) + 'best_proba': [np.max(arr) for arr in df], + 'predicted_label' : [np.argmax(arr) for arr in df] }) pred.loc[pred['best_proba'] < threshold, 'predicted_label'] = -1 + return {'predictions' : pred['predicted_label'].to_numpy(dtype = np.int32)} if self._nb_classes == 2: diff --git a/src/models/sklearn/models.py b/src/models/sklearn/models.py index 9499230..903fe93 100644 --- a/src/models/sklearn/models.py +++ b/src/models/sklearn/models.py @@ -114,6 +114,7 @@ def preprocess(self, df): self._encoder, ) self._preprocessor.fit(df) + # Labels mapping if self.classifier != 'onesvm': labels = list(self._preprocessor.preprocessors[1].stats_[f'unique_values({self.taxa})'].keys()) self._encoded = np.arange(len(labels)) diff --git a/src/models/sklearn/partial_trainer.py b/src/models/sklearn/partial_trainer.py index e2ff877..e9a51c6 100644 --- a/src/models/sklearn/partial_trainer.py +++ b/src/models/sklearn/partial_trainer.py @@ -255,7 +255,7 @@ def training_loop(self): cv = 'prefit', ) self.estimator.fit( - X_calib, + X_calib_df, y_calib, ) From 486036b9cc505a4a53b614295998811688372537 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Tue, 24 Oct 2023 12:52:06 -0400 Subject: [PATCH 14/92] features reduction to keep 25% best chi2 --- src/Caribou_reduce_features.py | 2 +- src/models/classification.py | 4 
++-- src/models/encoders/onesvm_label_encoder.py | 12 ++++-------- 3 files changed, 7 insertions(+), 11 deletions(-) diff --git a/src/Caribou_reduce_features.py b/src/Caribou_reduce_features.py index d3e289c..851eb77 100644 --- a/src/Caribou_reduce_features.py +++ b/src/Caribou_reduce_features.py @@ -103,7 +103,7 @@ def features_selection(ds, kmers, taxa): preprocessor = TensorFeaturesSelection( features = kmers, taxa = taxa, - threshold = 0.25, # remove lowest 25% significance + threshold = 0.75, # Keep 25% higest results ) ds = preprocessor.fit_transform(ds) diff --git a/src/models/classification.py b/src/models/classification.py index c98042f..10e93dc 100644 --- a/src/models/classification.py +++ b/src/models/classification.py @@ -535,5 +535,5 @@ def split_sim_cv_ds(self, ds, data, name): ############################################################################### def convert_archaea_bacteria(df): - df.loc[df['domain'].str.lower() == 'archaea', 'domain'] = 'Bacteria' - return df \ No newline at end of file + df.loc[df['domain'].str.lower() == 'archaea', 'domain'] = 'Bacteria' + return df \ No newline at end of file diff --git a/src/models/encoders/onesvm_label_encoder.py b/src/models/encoders/onesvm_label_encoder.py index 9464011..1743f95 100644 --- a/src/models/encoders/onesvm_label_encoder.py +++ b/src/models/encoders/onesvm_label_encoder.py @@ -26,15 +26,11 @@ def _fit(self, dataset : Dataset) -> Preprocessor: def _transform_pandas(self, df: pd.DataFrame): _validate_df(df, self.label_column) + mapping = self.stats_[f"unique_values({self.label_column})"] + df[self.label_column] = df[self.label_column].str.lower() + df[self.label_column] = df[self.label_column].map(mapping) + df[self.label_column] = df[self.label_column].fillna(-1) - def column_label_encoder(s: pd.Series): - s_values = self.stats_[f"unique_values({s.name})"] - s = s.str.lower() - s = s.map(s_values) - s = s.fillna(-1) - return s - - df[self.label_column] = df[self.label_column].transform(column_label_encoder) df = df.rename(columns = {self.label_column : LABELS_COLUMN_NAME}) return df From 84dc20858822c76c3e5d0bb33f93108d2c464083 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Wed, 25 Oct 2023 17:57:05 -0400 Subject: [PATCH 15/92] read_parquet parallelism == -1 instead of nb of files --- src/Caribou_reduce_features.py | 4 +--- src/data/kmers.py | 2 +- src/models/classification.py | 18 +++++++++--------- .../sklearn}/scoring_one_svm.py | 0 src/supplement/sklearn_tuning.py | 18 +++++++++--------- 5 files changed, 20 insertions(+), 22 deletions(-) rename src/{supplement => models/sklearn}/scoring_one_svm.py (100%) diff --git a/src/Caribou_reduce_features.py b/src/Caribou_reduce_features.py index 851eb77..748691f 100644 --- a/src/Caribou_reduce_features.py +++ b/src/Caribou_reduce_features.py @@ -51,9 +51,7 @@ def features_reduction(opt): """ # Load data - files_lst = glob(os.path.join(data['profile'], '*.parquet')) - ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - # ds = ray.data.read_parquet(data['profile']) + ds = ray.data.read_parquet(data['profile'], parallelism = -1) # Time the computation of transformations t_start = time() ds, kmers_list = occurence_exclusion(ds, kmers_list) diff --git a/src/data/kmers.py b/src/data/kmers.py index 323e3a4..5f3a2ff 100644 --- a/src/data/kmers.py +++ b/src/data/kmers.py @@ -324,7 +324,7 @@ def _make_ray_ds(self): self.df = self.df.repartition(int(self.df.count()/10)) else: self._files_list = glob(os.path.join(self._tmp_dir, '*.parquet')) - 
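The two loading strategies that this patch and the following one toggle between can be summarised in a short sketch. The calls below are the same ray.data APIs used in the diffs (Ray 2.x); the profile directory is hypothetical:

import os
from glob import glob
import ray

profile_dir = '/path/to/Xy_genome_profile'                  # hypothetical dataset directory
files_lst = glob(os.path.join(profile_dir, '*.parquet'))

# Previous behaviour (restored in the next patch): one read task per parquet file
ds = ray.data.read_parquet_bulk(files_lst, parallelism=len(files_lst))

# Change tried in this patch: let Ray choose the degree of read parallelism
# ds = ray.data.read_parquet_bulk(files_lst, parallelism=-1)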
self.df = ray.data.read_parquet_bulk(self._files_list, parallelism = len(self._files_list)) + self.df = ray.data.read_parquet_bulk(self._files_list, parallelism = -1) def _kmers_tokenization(self): print('_kmers_tokenization') diff --git a/src/models/classification.py b/src/models/classification.py index 10e93dc..245b804 100644 --- a/src/models/classification.py +++ b/src/models/classification.py @@ -112,7 +112,7 @@ def __init__( def execute_training_prediction(self, data2classify): print('execute_training_prediction') files_lst = glob(os.path.join(data2classify['profile'],'*.parquet')) - df2classify = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + df2classify = ray.data.read_parquet_bulk(files_lst, parallelism = -1) ids2classify = data2classify['ids'] for i, taxa in enumerate(self._taxas_order): if taxa in self._taxas: @@ -248,7 +248,7 @@ def _multiclass_training(self, taxa): def execute_classification(self, data2classify): print('execute_classification') files_lst = glob(os.path.join(data2classify['profile'],'*.parquet')) - df = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + df = ray.data.read_parquet_bulk(files_lst, parallelism = -1) ids = data2classify['ids'] if len(self.classified_data['sequence']) == 0: raise ValueError('Please train a model before executing classification') @@ -437,12 +437,12 @@ def _merge_database_host(self, database_data, host_data): if os.path.exists(self._merged_database_host['profile']): files_lst = glob(os.path.join(self._merged_database_host['profile'],'*.parquet')) - df_merged = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + df_merged = ray.data.read_parquet_bulk(files_lst, parallelism = -1) else: files_lst = glob(os.path.join(database_data['profile'],'*.parquet')) - df_db = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + df_db = ray.data.read_parquet_bulk(files_lst, parallelism = -1) files_lst = glob(os.path.join(host_data['profile'],'*.parquet')) - df_host = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + df_host = ray.data.read_parquet_bulk(files_lst, parallelism = -1) cols2drop = [] for col in df_db.schema().names: @@ -470,7 +470,7 @@ def _load_training_data_merged(self, taxa): print('_load_training_data_merged') if self._classifier_binary == 'onesvm' and taxa == 'domain': files_lst = glob(os.path.join(self._database_data['profile'],'*.parquet')) - df_train = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + df_train = ray.data.read_parquet_bulk(files_lst, parallelism = -1) df_train = df_train.map_batches(convert_archaea_bacteria, batch_format = 'pandas') df_val_test = self._merge_database_host(self._database_data, self._host_data) df_val_test = df_val_test.map_batches(convert_archaea_bacteria, batch_format = 'pandas') @@ -492,7 +492,7 @@ def _load_training_data_merged(self, taxa): def _load_training_data(self): print('_load_training_data') files_lst = glob(os.path.join(self._database_data['profile'],'*.parquet')) - df_train = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + df_train = ray.data.read_parquet_bulk(files_lst, parallelism = -1) df_train = df_train.map_batches(convert_archaea_bacteria, batch_format = 'pandas') df_val = self.split_sim_cv_ds(df_train,self._database_data, 'validation') self._training_datasets = {'train': df_train, 'validation': df_val} @@ -512,7 +512,7 @@ def _sim_4_cv(self, df, kmers_ds, name): cv_sim = readsSimulation(kmers_ds['fasta'], cls, list(cls['id']), 'miseq', 
sim_outdir, name) sim_data = cv_sim.simulation(self._k, kmers_ds['kmers']) files_lst = glob(os.path.join(sim_data['profile'],'*.parquet')) - df = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + df = ray.data.read_parquet_bulk(files_lst, parallelism = -1) return df def split_sim_cv_ds(self, ds, data, name): @@ -522,7 +522,7 @@ def split_sim_cv_ds(self, ds, data, name): ) if os.path.exists(ds_path): files_lst = glob(os.path.join(ds_path,'*.parquet')) - cv_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + cv_ds = ray.data.read_parquet_bulk(files_lst, parallelism = -1) else: cv_ds = ds.random_sample(0.1) if cv_ds.count() == 0: diff --git a/src/supplement/scoring_one_svm.py b/src/models/sklearn/scoring_one_svm.py similarity index 100% rename from src/supplement/scoring_one_svm.py rename to src/models/sklearn/scoring_one_svm.py diff --git a/src/supplement/sklearn_tuning.py b/src/supplement/sklearn_tuning.py index 272ddd7..76f88ad 100644 --- a/src/supplement/sklearn_tuning.py +++ b/src/supplement/sklearn_tuning.py @@ -21,7 +21,7 @@ # Preprocessing from ray.data.preprocessors import Chain, LabelEncoder # Training -from supplement.scoring_one_svm import ScoringSGDOneClassSVM +from models.sklearn.scoring_one_svm import ScoringSGDOneClassSVM from sklearn.naive_bayes import MultinomialNB from sklearn.linear_model import SGDClassifier # Tuning @@ -42,12 +42,12 @@ def merge_db_host(db_data, host_data): if os.path.exists(merged_db_host['profile']): files_lst = glob(os.path.join(merged_db_host['profile'], '*.parquet')) - df_merged = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + df_merged = ray.data.read_parquet_bulk(files_lst, parallelism = -1) else: files_lst = glob(os.path.join(db_data['profile'], '*.parquet')) - df_db = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + df_db = ray.data.read_parquet_bulk(files_lst, parallelism = -1) files_lst = glob(os.path.join(host_data['profile'], '*.parquet')) - df_host = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + df_host = ray.data.read_parquet_bulk(files_lst, parallelism = -1) col2drop = [] for col in df_db.schema().names: @@ -84,7 +84,7 @@ def sim_4_cv(df, database_data, name): cv_sim = readsSimulation(database_data['fasta'], cls, list(cls['id']), 'miseq', sim_outdir, name) sim_data = cv_sim.simulation(k, database_data['kmers']) files_lst = glob(os.path.join(sim_data['profile'], '*.parquet')) - df = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + df = ray.data.read_parquet_bulk(files_lst, parallelism = -1) return df def convert_archaea_bacteria(df): @@ -106,7 +106,7 @@ def split_val_test_ds(ds, data): test_path = os.path.join(os.path.dirname(data['profile']), f'Xy_genome_simulation_test_data_K{len(data["kmers"][0])}') if os.path.exists(val_path): files_lst = glob(os.path.join(val_path, '*.parquet')) - val_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + val_ds = ray.data.read_parquet_bulk(files_lst, parallelism = -1) val_ds = val_ds.map_batches( convert_archaea_bacteria, batch_format = 'pandas' @@ -119,7 +119,7 @@ def split_val_test_ds(ds, data): val_ds = sim_4_cv(val_ds, data, 'validation') if os.path.exists(test_path): files_lst = glob(os.path.join(test_path, '*.parquet')) - test_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + test_ds = ray.data.read_parquet_bulk(files_lst, parallelism = -1) test_ds = test_ds.map_batches( convert_archaea_bacteria, batch_format = 'pandas' 
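The convert_archaea_bacteria helper used throughout these classification and tuning scripts is a plain pandas batch mapper. A self-contained sketch of how it is applied through Ray's map_batches, on toy domain labels, is:

import pandas as pd
import ray

def convert_archaea_bacteria(df: pd.DataFrame) -> pd.DataFrame:
    # Fold the archaeal domain into 'Bacteria' so the bacteria/host split stays binary
    df.loc[df['domain'].str.lower() == 'archaea', 'domain'] = 'Bacteria'
    return df

ds = ray.data.from_pandas(pd.DataFrame({'domain': ['Archaea', 'Bacteria', 'archaea']}))
ds = ds.map_batches(convert_archaea_bacteria, batch_format='pandas')
print(ds.take_all())     # every row now carries 'Bacteria'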
@@ -164,7 +164,7 @@ def split_val_test_ds(ds, data): val_ds, test_ds = split_val_test_ds(test_val_ds,test_val_data) db_data = verify_load_data(opt['data']) files_lst = glob(os.path.join(db_data['profile'], '*.parquet')) - train_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + train_ds = ray.data.read_parquet_bulk(files_lst, parallelism = -1) elif opt['classifier'] == 'linearsvm' and opt['taxa'] == 'domain': if opt['data_host'] is None: raise ValueError('To tune for a domain taxa, a host species is required.\ @@ -175,7 +175,7 @@ def split_val_test_ds(ds, data): else: db_data = verify_load_data(opt['data']) files_lst = glob(os.path.join(db_data['profile'], '*.parquet')) - train_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + train_ds = ray.data.read_parquet_bulk(files_lst, parallelism = -1) val_ds, test_ds = split_val_test_ds(train_ds, db_data) # Preprocessing From bcf1bf0e45ceeb0e5d94d26254723a791e4488fc Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Mon, 30 Oct 2023 07:25:58 -0400 Subject: [PATCH 16/92] revert parallelism to nb of files in bulk read --- src/Caribou_reduce_features.py | 4 +++- src/data/kmers.py | 3 ++- src/models/classification.py | 35 +++++++++++++++++++++------------- 3 files changed, 27 insertions(+), 15 deletions(-) diff --git a/src/Caribou_reduce_features.py b/src/Caribou_reduce_features.py index 748691f..21ccc6c 100644 --- a/src/Caribou_reduce_features.py +++ b/src/Caribou_reduce_features.py @@ -51,7 +51,9 @@ def features_reduction(opt): """ # Load data - ds = ray.data.read_parquet(data['profile'], parallelism = -1) + files_lst = glob(data['profile']) + ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + # ds = ray.data.read_parquet(data['profile'], parallelism = -1) # Time the computation of transformations t_start = time() ds, kmers_list = occurence_exclusion(ds, kmers_list) diff --git a/src/data/kmers.py b/src/data/kmers.py index 5f3a2ff..13c1fa8 100644 --- a/src/data/kmers.py +++ b/src/data/kmers.py @@ -324,7 +324,8 @@ def _make_ray_ds(self): self.df = self.df.repartition(int(self.df.count()/10)) else: self._files_list = glob(os.path.join(self._tmp_dir, '*.parquet')) - self.df = ray.data.read_parquet_bulk(self._files_list, parallelism = -1) + self.df = ray.data.read_parquet_bulk(self._files_list, parallelism = len(self._files_list)) + # self.df = ray.data.read_parquet_bulk(self._files_list, parallelism = -1) def _kmers_tokenization(self): print('_kmers_tokenization') diff --git a/src/models/classification.py b/src/models/classification.py index 245b804..cc8fc5e 100644 --- a/src/models/classification.py +++ b/src/models/classification.py @@ -112,7 +112,8 @@ def __init__( def execute_training_prediction(self, data2classify): print('execute_training_prediction') files_lst = glob(os.path.join(data2classify['profile'],'*.parquet')) - df2classify = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + df2classify = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + # df2classify = ray.data.read_parquet_bulk(files_lst, parallelism = -1) ids2classify = data2classify['ids'] for i, taxa in enumerate(self._taxas_order): if taxa in self._taxas: @@ -129,9 +130,9 @@ def execute_training_prediction(self, data2classify): # Predicting try: if i == 0: - df2classify = self._classify_first(df2classify, taxa, ids2classify, file2classify) + df2classify = self._classify_first(df2classify, taxa, ids2classify, data2classify['profile']) else: - df2classify = 
self._classify_subsequent(df2classify, taxa, ids2classify, file2classify) + df2classify = self._classify_subsequent(df2classify, taxa, ids2classify, data2classify['profile']) except ValueError: print('Stopping classification prematurelly because there are no more sequences to classify') return taxa @@ -248,16 +249,17 @@ def _multiclass_training(self, taxa): def execute_classification(self, data2classify): print('execute_classification') files_lst = glob(os.path.join(data2classify['profile'],'*.parquet')) - df = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + df = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + # df = ray.data.read_parquet_bulk(files_lst, parallelism = -1) ids = data2classify['ids'] if len(self.classified_data['sequence']) == 0: raise ValueError('Please train a model before executing classification') for i, taxa in enumerate(self.classified_data['sequence']): try: if i == 0: - df = self._classify_first(df, taxa, ids, df_file) + df = self._classify_first(df, taxa, ids, data2classify['profile']) else: - df = self._classify_subsequent(df, taxa, ids, df_file) + df = self._classify_subsequent(df, taxa, ids, data2classify['profile']) except ValueError: print('Stopping classification prematurelly because there are no more sequences to classify') return taxa @@ -437,12 +439,15 @@ def _merge_database_host(self, database_data, host_data): if os.path.exists(self._merged_database_host['profile']): files_lst = glob(os.path.join(self._merged_database_host['profile'],'*.parquet')) - df_merged = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + df_merged = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + # df_merged = ray.data.read_parquet_bulk(files_lst, parallelism = -1) else: files_lst = glob(os.path.join(database_data['profile'],'*.parquet')) - df_db = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + df_db = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + # df_db = ray.data.read_parquet_bulk(files_lst, parallelism = -1) files_lst = glob(os.path.join(host_data['profile'],'*.parquet')) - df_host = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + df_host = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + # df_host = ray.data.read_parquet_bulk(files_lst, parallelism = -1) cols2drop = [] for col in df_db.schema().names: @@ -470,7 +475,8 @@ def _load_training_data_merged(self, taxa): print('_load_training_data_merged') if self._classifier_binary == 'onesvm' and taxa == 'domain': files_lst = glob(os.path.join(self._database_data['profile'],'*.parquet')) - df_train = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + df_train = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + # df_train = ray.data.read_parquet_bulk(files_lst, parallelism = -1) df_train = df_train.map_batches(convert_archaea_bacteria, batch_format = 'pandas') df_val_test = self._merge_database_host(self._database_data, self._host_data) df_val_test = df_val_test.map_batches(convert_archaea_bacteria, batch_format = 'pandas') @@ -492,7 +498,8 @@ def _load_training_data_merged(self, taxa): def _load_training_data(self): print('_load_training_data') files_lst = glob(os.path.join(self._database_data['profile'],'*.parquet')) - df_train = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + df_train = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + # df_train = ray.data.read_parquet_bulk(files_lst, parallelism = -1) df_train = 
df_train.map_batches(convert_archaea_bacteria, batch_format = 'pandas') df_val = self.split_sim_cv_ds(df_train,self._database_data, 'validation') self._training_datasets = {'train': df_train, 'validation': df_val} @@ -512,7 +519,8 @@ def _sim_4_cv(self, df, kmers_ds, name): cv_sim = readsSimulation(kmers_ds['fasta'], cls, list(cls['id']), 'miseq', sim_outdir, name) sim_data = cv_sim.simulation(self._k, kmers_ds['kmers']) files_lst = glob(os.path.join(sim_data['profile'],'*.parquet')) - df = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + df = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + # df = ray.data.read_parquet_bulk(files_lst, parallelism = -1) return df def split_sim_cv_ds(self, ds, data, name): @@ -522,7 +530,8 @@ def split_sim_cv_ds(self, ds, data, name): ) if os.path.exists(ds_path): files_lst = glob(os.path.join(ds_path,'*.parquet')) - cv_ds = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + cv_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + # cv_ds = ray.data.read_parquet_bulk(files_lst, parallelism = -1) else: cv_ds = ds.random_sample(0.1) if cv_ds.count() == 0: From 7feeaa036b99874d9c57936500cd35091a373c10 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Wed, 1 Nov 2023 11:21:06 -0400 Subject: [PATCH 17/92] features selection using rdf + tf-idf --- requirements.txt | 2 + src/Caribou_reduce_features.py | 54 ++++++++++-- ...selection.py => chi_features_selection.py} | 2 +- src/data/reduction/occurence_exclusion.py | 9 +- src/data/reduction/rdf_features_selection.py | 85 +++++++++++++++++++ src/models/kerasTF/build_neural_networks.py | 15 ++-- src/models/preprocessors/max_abs_scaler.py | 16 ++-- src/models/preprocessors/min_max_scaler.py | 6 +- src/models/preprocessors/power_transformer.py | 10 +-- src/models/preprocessors/tfidf_transformer.py | 66 ++++++++++++++ src/models/sklearn/models.py | 13 +-- 11 files changed, 236 insertions(+), 42 deletions(-) rename src/data/reduction/{features_selection.py => chi_features_selection.py} (98%) create mode 100644 src/data/reduction/rdf_features_selection.py create mode 100644 src/models/preprocessors/tfidf_transformer.py diff --git a/requirements.txt b/requirements.txt index 2e05ef0..a8d4ad5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -93,3 +93,5 @@ Werkzeug==2.3.6 wrapt==1.15.0 yarl==1.9.2 zipp==3.16.2 +xgboost==2.0.1 +xgboost_ray==0.1.18 \ No newline at end of file diff --git a/src/Caribou_reduce_features.py b/src/Caribou_reduce_features.py index 21ccc6c..879009f 100644 --- a/src/Caribou_reduce_features.py +++ b/src/Caribou_reduce_features.py @@ -4,15 +4,18 @@ import os.path import argparse +import numpy as np + from utils import * from time import time from glob import glob from pathlib import Path -from ray.data.preprocessors import Chain, LabelEncoder +from models.preprocessors.tfidf_transformer import TensorTfIdfTransformer from data.reduction.low_var_selection import TensorLowVarSelection -from data.reduction.features_selection import TensorFeaturesSelection +from data.reduction.chi_features_selection import TensorChiFeaturesSelection +from data.reduction.rdf_features_selection import TensorRDFFeaturesSelection from data.reduction.occurence_exclusion import TensorPercentOccurenceExclusion __author__ = "Nicolas de Montigny" @@ -44,21 +47,33 @@ def features_reduction(opt): # Features reduction ################################################################################ """ + First option : Select features relevant to classification by Random 
Forest of decision trees + Brute force -> Features statistically related to classes 1. OccurenceExclusion (10% extremes) 2. LowVarSelection (variance > 10%) 3. Chi2 + SelectPercentile() (75% best values) """ + """ + TODO: Add to preprocessing in model training + 1. Replace the MinMaxScaling -> TfidfTransformer to scale down the impact of tokens that occur very frequently (https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html#sklearn.feature_extraction.text.TfidfTransformer) + 2. TruncatedSVD to reduce dimensions and keep 10 000 features ~PCA (https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html#sklearn.decomposition.TruncatedSVD) + """ + # Load data - files_lst = glob(data['profile']) + files_lst = glob(os.path.join(data['profile'],'*.parquet')) ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) # ds = ray.data.read_parquet(data['profile'], parallelism = -1) # Time the computation of transformations t_start = time() - ds, kmers_list = occurence_exclusion(ds, kmers_list) - ds, kmers_list = low_var_selection(ds,kmers_list) - ds, data['kmers'] = features_selection(ds, kmers_list, data['taxas'][0]) + ds = tfidf_transform(ds, kmers_list) + ds, kmers_list = tree_relevant_features(ds, kmers_list, 'phylum') + print(len(kmers_list)) + if len(kmers_list) == 0: + ds, kmers_list = occurence_exclusion(ds, opt['kmers_list']) + ds, kmers_list = low_var_selection(ds,kmers_list) + ds, data['kmers'] = features_selection(ds, kmers_list, 'phylum') t_end = time() t_reduction = t_end - t_start # Save reduced dataset @@ -78,7 +93,7 @@ def features_reduction(opt): def occurence_exclusion(ds, kmers): preprocessor = TensorPercentOccurenceExclusion( features = kmers, - percent = 0.1 # remove features present in less than 5% samples + percent = 0.1 # remove features present in less than 10% samples ) ds = preprocessor.fit_transform(ds) @@ -100,7 +115,7 @@ def low_var_selection(ds, kmers): # Chi2 evaluation of dependance between features and classes def features_selection(ds, kmers, taxa): - preprocessor = TensorFeaturesSelection( + preprocessor = TensorChiFeaturesSelection( features = kmers, taxa = taxa, threshold = 0.75, # Keep 25% higest results @@ -108,9 +123,32 @@ def features_selection(ds, kmers, taxa): ds = preprocessor.fit_transform(ds) kmers = preprocessor.stats_['cols_keep'] + print(len(kmers)) return ds, kmers +# TF-IDF scaling of the features +def tfidf_transform(ds, kmers): + preprocessor = TensorTfIdfTransformer( + features = kmers + ) + ds = preprocessor.fit_transform(ds) + + return ds + +# Decision tree feature selection to keep only those identified as relevant to classification +def tree_relevant_features(ds, kmers, taxa): + preprocessor = TensorRDFFeaturesSelection( + features = kmers, + taxa = taxa + ) + preprocessor.fit_transform(ds) + + kmers = preprocessor.stats_['cols_keep'] + + return ds, kmers + + # Argument parsing from CLI ################################################################################ diff --git a/src/data/reduction/features_selection.py b/src/data/reduction/chi_features_selection.py similarity index 98% rename from src/data/reduction/features_selection.py rename to src/data/reduction/chi_features_selection.py index c2a02be..95fd013 100644 --- a/src/data/reduction/features_selection.py +++ b/src/data/reduction/chi_features_selection.py @@ -15,7 +15,7 @@ TENSOR_COLUMN_NAME = '__value__' -class TensorFeaturesSelection(Preprocessor): +class 
TensorChiFeaturesSelection(Preprocessor): """ Custom implementation of SelectKBest with Chi2 inspired by sklearn.feature_selection.SelectPercentile and sklearn.feature_selection.chi2 features selector to be used as a Ray preprocessor. https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.chi2.html#sklearn.feature_selection.chi2 diff --git a/src/data/reduction/occurence_exclusion.py b/src/data/reduction/occurence_exclusion.py index cbd7af1..17da804 100644 --- a/src/data/reduction/occurence_exclusion.py +++ b/src/data/reduction/occurence_exclusion.py @@ -19,7 +19,7 @@ def __init__(self, features: List[str], num_features: int): # Parameters self.features = features self._nb_features = len(features) - self.num_features = int((self._nb_features - num_features) / 2) + self._num_features = int(self._nb_features - num_features) def _fit(self, ds: Dataset) -> Preprocessor: # Nb of occurences @@ -31,7 +31,7 @@ def _fit(self, ds: Dataset) -> Preprocessor: # Include / Exclude by sorted position cols_keep = pd.Series(occurences, index = self.features) cols_keep = cols_keep.sort_values(ascending = True) # Long operation - cols_keep = cols_keep.iloc[self.num_features : (self._nb_features - self.num_features)] + cols_keep = cols_keep.iloc[0 : self._num_features] cols_keep = list(cols_keep.index) # self.stats_ = {'cols_keep' : cols_keep, 'cols_drop' : cols_drop} @@ -54,7 +54,7 @@ def _transform_pandas(self, df: pd.DataFrame) -> pd.DataFrame: return df def __repr__(self): - return (f"{self.__class__.__name__}(features={self._nb_features!r}, num_features={self.num_features!r})") + return (f"{self.__class__.__name__}(features={self._nb_features!r}, num_features={self._num_features!r})") class TensorPercentOccurenceExclusion(Preprocessor): """ @@ -69,7 +69,6 @@ def __init__(self, features: List[str], percent : int = 0.05): def _fit(self, ds: Dataset) -> Preprocessor: nb_samples = ds.count() - low_treshold = ceil((0 + self.percent) * nb_samples) high_treshold = floor((1 - self.percent) * nb_samples) occurences = np.zeros(self._nb_features) @@ -85,7 +84,7 @@ def count_occurences(batch): occurences += row['occurences'] # Construct list of features to keep by position - cols_keep = [self.features[i] for i, occurence in enumerate(occurences) if low_treshold < occurence < high_treshold] + cols_keep = [self.features[i] for i, occurence in enumerate(occurences) if occurence < high_treshold] self.stats_ = {'cols_keep' : cols_keep} diff --git a/src/data/reduction/rdf_features_selection.py b/src/data/reduction/rdf_features_selection.py new file mode 100644 index 0000000..4a12e44 --- /dev/null +++ b/src/data/reduction/rdf_features_selection.py @@ -0,0 +1,85 @@ +import os +import logging + +import numpy as np +import pandas as pd + +from typing import List +from warnings import warn +from ray.data import Dataset + +from xgboost import XGBRFClassifier + +from ray.air.config import ScalingConfig +from sklearn.preprocessing import LabelEncoder + + +from ray.data.preprocessor import Preprocessor +from ray.air.util.data_batch_conversion import _unwrap_ndarray_object_type_if_needed + +TENSOR_COLUMN_NAME = '__value__' + +class TensorRDFFeaturesSelection(Preprocessor): + """ + Wrapper class for using Random Forest Classifier from XGBoost in features selection as a Ray preprocessor. + XGBRFClassifier trains a random forest of decision trees that is used to determine the features that are most useful in classification. 
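A standalone sketch of the selection idea behind this new preprocessor: fit an XGBoost random forest on k-mer counts and keep only the columns that the trees actually split on, which is what the per-batch xgboost_batch function further below does on real data. The counts and feature names here are toy values for illustration:

import numpy as np
import pandas as pd
from xgboost import XGBRFClassifier

rng = np.random.default_rng(0)
features = [f'kmer_{i}' for i in range(20)]                # hypothetical k-mer names
X = pd.DataFrame(rng.integers(0, 50, size=(100, 20)), columns=features)
y = (X['kmer_3'] > 25).astype(int)                         # single informative feature

forest = XGBRFClassifier(n_estimators=50)
forest.fit(X, y)
relevant = list(forest.get_booster().get_fscore().keys())  # features used by at least one split
print(relevant)                                            # expected to include 'kmer_3'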
+ https://machinelearningmastery.com/feature-importance-and-feature-selection-with-xgboost-in-python/ + """ + + def __init__(self, features: List[str], taxa: str): + # Parameters + self.taxa = taxa + self.features = features + self._nb_features = len(features) + + def _fit(self, ds: Dataset) -> Preprocessor: + def xgboost_batch(arr: np.array): + # Labels data + y = arr[self.taxa] + encoder = LabelEncoder() + y = encoder.fit_transform(y) + # Features data + X = _unwrap_ndarray_object_type_if_needed(arr[TENSOR_COLUMN_NAME]) + X = pd.DataFrame(X, columns = self.features) + # XGBoost tree + tree = XGBRFClassifier() + tree.fit(X,y) + # Used features in the tree + tree = tree.get_booster() + relevant_features = tree.get_fscore() + relevant_features = [feat for feat in relevant_features.keys()] + + return {'features':[relevant_features]} + + cols_keep = [] + + relevant_features = ds.map_batches(xgboost_batch, batch_format = 'numpy') + for row in relevant_features.iter_rows(): + cols_keep.extend(row['features']) + cols_keep = np.unique(cols_keep) + + self.stats_ = {'cols_keep' : cols_keep} + + return self + + def _transform_pandas(self, df: pd.DataFrame) -> pd.DataFrame: + # _validate_df(df, TENSOR_COLUMN_NAME, self._nb_features) + cols_keep = self.stats_['cols_keep'] + + if len(cols_keep) < self._nb_features and len(cols_keep) > 0 : + tensor_col = df[TENSOR_COLUMN_NAME] + tensor_col = _unwrap_ndarray_object_type_if_needed(tensor_col) + tensor_col = pd.DataFrame(tensor_col, columns = self.features) + + tensor_col = tensor_col[cols_keep].to_numpy() + + df[TENSOR_COLUMN_NAME] = pd.Series(list(tensor_col)) + + return df + + def __repr__(self): + return (f"{self.__class__.__name__}(features={self._nb_features!r}, taxa={self.taxa!r}, threshold={self.threshold!r})") + +def _validate_df(df: pd.DataFrame, column: str, nb_features: int) -> None: + if len(df.loc[0, column]) != nb_features: + raise ValueError('Discordant number of features in the tensor column with the one from the dataframe used for fitting') \ No newline at end of file diff --git a/src/models/kerasTF/build_neural_networks.py b/src/models/kerasTF/build_neural_networks.py index 97a8489..81751dc 100644 --- a/src/models/kerasTF/build_neural_networks.py +++ b/src/models/kerasTF/build_neural_networks.py @@ -1,8 +1,11 @@ from keras.models import Model, Sequential +from tensorflow.keras import mixed_precision from tensorflow.keras.losses import BinaryCrossentropy, CategoricalCrossentropy from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Conv1D, Conv2D, MaxPooling1D, MaxPooling2D, Concatenate, Flatten, Attention, Activation, Bidirectional, Reshape + + from models.kerasTF.attentionLayer import AttentionWeightedAverage __author__ = "Nicolas de Montigny" @@ -28,7 +31,7 @@ def build_attention(nb_kmers): x = Dense(1, activation = "tanh")(x) model = Model(inputs = inputs, outputs = x) - model.compile(loss = BinaryCrossentropy(from_logits = False), optimizer = 'adam', metrics = ['accuracy']) + model.compile(loss = BinaryCrossentropy(from_logits = False), optimizer = 'adam', metrics = ['accuracy'], jit_compile = True) return model @@ -48,7 +51,7 @@ def build_LSTM(nb_kmers): x = Dense(1, activation = 'tanh')(x) model = Model(inputs = inputs, outputs = x) - model.compile(loss=BinaryCrossentropy(from_logits = False), optimizer='adam', metrics=['accuracy']) + model.compile(loss=BinaryCrossentropy(from_logits = False), optimizer='adam', metrics=['accuracy'], jit_compile = True) return model @@ -79,7 +82,7 @@ def build_deepLSTM(nb_kmers): outputs 
= Dense(1, activation='sigmoid', name='score')(net) model = Model(inputs=inputs, outputs=outputs) - model.compile(loss=BinaryCrossentropy(from_logits = False), optimizer='adam', metrics=['accuracy']) + model.compile(loss=BinaryCrossentropy(from_logits = False), optimizer='adam', metrics=['accuracy'], jit_compile = True) return model @@ -105,7 +108,7 @@ def build_LSTM_attention(nb_kmers, nb_classes): net = Dense(nb_classes)(net) outputs = Activation('softmax')(net) model = Model(inputs = inputs, outputs = outputs) - model.compile(loss=CategoricalCrossentropy(), optimizer='adam', metrics=['accuracy']) + model.compile(loss=CategoricalCrossentropy(), optimizer='adam', metrics=['accuracy'], jit_compile = True) return model @@ -131,7 +134,7 @@ def build_CNN(nb_kmers, nb_classes): model.add(Dropout(0.5)) model.add(Dense(nb_classes)) model.add(Activation('softmax')) - model.compile(loss=CategoricalCrossentropy(), optimizer='adam', metrics=['accuracy']) + model.compile(loss=CategoricalCrossentropy(), optimizer='adam', metrics=['accuracy'], jit_compile = True) return model @@ -169,6 +172,6 @@ def build_wideCNN(nb_kmers, nb_classes): net = Dense(nb_classes)(net) outputs = Activation('softmax')(net) model = Model(inputs = inputs, outputs = outputs) - model.compile(loss=CategoricalCrossentropy(), optimizer='adam', metrics=['accuracy']) + model.compile(loss=CategoricalCrossentropy(), optimizer='adam', metrics=['accuracy'], jit_compile = True) return model diff --git a/src/models/preprocessors/max_abs_scaler.py b/src/models/preprocessors/max_abs_scaler.py index cce6b8c..7148d4b 100644 --- a/src/models/preprocessors/max_abs_scaler.py +++ b/src/models/preprocessors/max_abs_scaler.py @@ -14,9 +14,9 @@ class TensorMaxAbsScaler(Preprocessor): Custom implementation of Ray's MaxAbsScaler for usage with tensor column in ray.data.dataset.Dataset. """ - def __init__(self, features_list): + def __init__(self, features): # Parameters - self._features_list = features_list + self._features = features # Empty inits self._absmax = None @@ -24,9 +24,9 @@ def _fit(self, dataset:ray.data.dataset.Dataset): """ Fit the MaxAbsScaler to the given dataset. 
""" - self._absmax = np.zeros(len(self._features_list), dtype = np.int32) + self._absmax = np.zeros(len(self._features), dtype = np.int32) for batch in dataset.iter_batches(batch_format = "numpy"): - for i in np.arange(len(self._features_list)): + for i in np.arange(len(self._features)): local_max = max(batch[TENSOR_COLUMN_NAME][:,i]) if local_max > self._absmax[i]: self._absmax[i] = local_max @@ -41,8 +41,8 @@ def _transform_pandas(self, batch: pd.DataFrame): """ df = batch[TENSOR_COLUMN_NAME] df = _unwrap_ndarray_object_type_if_needed(df) - df = pd.DataFrame(df, columns = self._features_list) - for i, col in enumerate(self._features_list): + df = pd.DataFrame(df, columns = self._features) + for i, col in enumerate(self._features): df[col] = df[col].apply(value_transform, args=[self._absmax[i]]) batch[TENSOR_COLUMN_NAME] = TensorArray(np.array(df)) @@ -55,7 +55,7 @@ def _transform_numpy(self, batch: dict): """ df = np.array(batch[TENSOR_COLUMN_NAME], dtype = np.float32) vecfunc = np.vectorize(value_transform) - for i in np.arange(len(self._features_list)): + for i in np.arange(len(self._features)): df[:,i] = vecfunc(df[:,i], self._absmax[i]) batch[TENSOR_COLUMN_NAME] = df @@ -63,7 +63,7 @@ def _transform_numpy(self, batch: dict): return batch def __repr__(self): - return f"{self.__class__.__name__}(columns={self._features_list!r})" + return f"{self.__class__.__name__}(columns={self._features!r})" # Function to map to the data, used by both data representations def value_transform(x, _min, _max): diff --git a/src/models/preprocessors/min_max_scaler.py b/src/models/preprocessors/min_max_scaler.py index ebf560a..1cb6aa0 100644 --- a/src/models/preprocessors/min_max_scaler.py +++ b/src/models/preprocessors/min_max_scaler.py @@ -13,9 +13,9 @@ class TensorMinMaxScaler(Preprocessor): Custom implementation of Ray's MinMax Scaler for usage with tensor column in ray.data.dataset.Dataset. """ - def __init__(self, features_list): + def __init__(self, features): # Parameters - self._features_list = features_list + self._features = features def _fit(self, ds: Dataset) -> Preprocessor: """ @@ -23,7 +23,7 @@ def _fit(self, ds: Dataset) -> Preprocessor: """ min = [] max = [] - nb_features = len(self._features_list) + nb_features = len(self._features) def Min(dct): arr = dct[TENSOR_COLUMN_NAME] diff --git a/src/models/preprocessors/power_transformer.py b/src/models/preprocessors/power_transformer.py index 1cc9c8d..fd5cb2c 100644 --- a/src/models/preprocessors/power_transformer.py +++ b/src/models/preprocessors/power_transformer.py @@ -14,8 +14,8 @@ class TensorPowerTransformer(Preprocessor): """ Custom implementation of Ray's PowerTransformer for usage with tensor column in ray.data.dataset.Dataset. 
""" - def __init__(self, features_list: List[str]): - self._features_list = features_list + def __init__(self, features: List[str]): + self._features = features self.method = "yeo-johnson" self.stats_ = {} @@ -25,7 +25,7 @@ def _fit(self, ds: ray.data.dataset.Dataset): """ nb_samples = ds.count() dct_values = {} - for feature in self._features_list: + for feature in self._features: dct_values[feature] = np.zeros(nb_samples, dtype = np.int32) previous_pos = 0 @@ -33,7 +33,7 @@ def _fit(self, ds: ray.data.dataset.Dataset): for batch in ds.iter_batches(batch_format = 'numpy'): batch = batch[TENSOR_COLUMN_NAME] batch_size = len(batch) - for i, feature in enumerate(self._features_list): + for i, feature in enumerate(self._features): dct_values[feature][previous_pos:(previous_pos+batch_size)] = batch[:,i] previous_pos = previous_pos + batch_size @@ -49,7 +49,7 @@ def _transform_pandas(self, batch: pd.DataFrame): """ Transform the given dataset to pandas dataframe. """ - df = pd.DataFrame(np.vstack(batch[TENSOR_COLUMN_NAME]), columns = self._features_list) + df = pd.DataFrame(np.vstack(batch[TENSOR_COLUMN_NAME]), columns = self._features) for feature, transformer in self.stats_.items(): transformed = df[feature].to_numpy().reshape(-1,1) transformed = transformer.transform(transformed) diff --git a/src/models/preprocessors/tfidf_transformer.py b/src/models/preprocessors/tfidf_transformer.py new file mode 100644 index 0000000..a6032fa --- /dev/null +++ b/src/models/preprocessors/tfidf_transformer.py @@ -0,0 +1,66 @@ + +import numpy as np +import pandas as pd +import scipy.sparse as sp + + +from ray.data.dataset import Dataset +from sklearn.preprocessing import normalize +from ray.data.preprocessor import Preprocessor +from ray.air.util.data_batch_conversion import _unwrap_ndarray_object_type_if_needed + +TENSOR_COLUMN_NAME = '__value__' + +class TensorTfIdfTransformer(Preprocessor): + """ + Custom implementation of TF-IDF transformation inspired by sklearn.feature_extraction.text.TfidfTransformer features scaler to be used as a Ray preprocessor. + https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html#sklearn.feature_extraction.text.TfidfTransformer + TF-IDF transformation is used to scale down the impact of tokens that occur very frequently and scale up the impact of those that occur very rarely. 
+ """ + + def __init__(self, features): + # Parameters + self._features = features + self._nb_features = len(features) + + def _fit(self, ds: Dataset) -> Preprocessor: + nb_samples = ds.count() + + # Nb of occurences + occurences = np.zeros(self._nb_features) + for batch in ds.iter_batches(batch_format = 'numpy'): + batch = batch[TENSOR_COLUMN_NAME] + occurences += np.count_nonzero(batch, axis = 0) + + idf = np.log(nb_samples / occurences) + 1 + + idf_diag = sp.diags( + idf, + offsets=0, + shape=(self._nb_features, self._nb_features), + format="csr", + dtype=np.float64, + ) + + self.stats_ = {'idf_diag' : idf_diag} + + return self + + def _transform_pandas(self, batch: pd.DataFrame) -> pd.DataFrame: + # _validate_df(df, TENSOR_COLUMN_NAME, self._nb_features) + idf_diag = self.stats_['idf_diag'] + + df = batch[TENSOR_COLUMN_NAME] + df = _unwrap_ndarray_object_type_if_needed(df) + + df = df * idf_diag + + df = normalize(df, norm = 'l2', copy = False) + + batch[TENSOR_COLUMN_NAME] = pd.Series(list(df)) + + return batch + +def _validate_df(df: pd.DataFrame, column: str, nb_features: int) -> None: + if len(df.loc[0, column]) != nb_features: + raise ValueError('Discordant number of features in the tensor column with the one from the dataframe used for fitting') \ No newline at end of file diff --git a/src/models/sklearn/models.py b/src/models/sklearn/models.py index 903fe93..bc91fab 100644 --- a/src/models/sklearn/models.py +++ b/src/models/sklearn/models.py @@ -14,6 +14,7 @@ from models.encoders.onesvm_label_encoder import OneClassSVMLabelEncoder # Training +from ray.air.config import ScalingConfig from sklearn.naive_bayes import MultinomialNB from sklearn.linear_model import SGDOneClassSVM, SGDClassifier from models.sklearn.partial_trainer import SklearnPartialTrainer @@ -155,7 +156,7 @@ def _cross_validation(self, datasets, kmers_ds): y_true = list(y_true) y_pred = self._predict_cv(df_test.drop_columns([self.taxa])) - + self._cv_score(y_true, y_pred) def _build(self): @@ -221,11 +222,11 @@ def _fit_model(self, datasets): batch_size=self.batch_size, training_epochs=self._training_epochs, set_estimator_cpus=True, - # scaling_config=ScalingConfig( - # trainer_resources={ - # 'CPU': int(os.cpu_count()*0.6) - # } - # ), + scaling_config=ScalingConfig( + trainer_resources={ + 'CPU': int(os.cpu_count()*0.6) + } + ), run_config=RunConfig( name=self.classifier, local_dir=self._workdir From 58cbe4da4fea3f037db3b8c6b0af565a3e1cfed3 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Thu, 2 Nov 2023 09:47:12 -0400 Subject: [PATCH 18/92] add rdf features selection & svd reduction + reduction in training steps --- src/Caribou_reduce_features.py | 24 +++--- src/data/kmers.py | 40 +--------- src/data/reduction/rdf_features_selection.py | 5 -- src/data/reduction/truncated_svd_reduction.py | 80 +++++++++++++++++++ src/models/kerasTF/models.py | 35 ++++++-- src/models/models_utils.py | 2 + src/models/sklearn/models.py | 29 ++++--- 7 files changed, 143 insertions(+), 72 deletions(-) create mode 100644 src/data/reduction/truncated_svd_reduction.py diff --git a/src/Caribou_reduce_features.py b/src/Caribou_reduce_features.py index 879009f..44c3b8a 100644 --- a/src/Caribou_reduce_features.py +++ b/src/Caribou_reduce_features.py @@ -11,11 +11,12 @@ from glob import glob from pathlib import Path -from models.preprocessors.tfidf_transformer import TensorTfIdfTransformer from data.reduction.low_var_selection import TensorLowVarSelection +from models.preprocessors.tfidf_transformer import TensorTfIdfTransformer from 
data.reduction.chi_features_selection import TensorChiFeaturesSelection from data.reduction.rdf_features_selection import TensorRDFFeaturesSelection +from data.reduction.truncated_svd_reduction import TensorTruncatedSVDReduction from data.reduction.occurence_exclusion import TensorPercentOccurenceExclusion __author__ = "Nicolas de Montigny" @@ -55,12 +56,6 @@ def features_reduction(opt): 3. Chi2 + SelectPercentile() (75% best values) """ - """ - TODO: Add to preprocessing in model training - 1. Replace the MinMaxScaling -> TfidfTransformer to scale down the impact of tokens that occur very frequently (https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html#sklearn.feature_extraction.text.TfidfTransformer) - 2. TruncatedSVD to reduce dimensions and keep 10 000 features ~PCA (https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html#sklearn.decomposition.TruncatedSVD) - """ - # Load data files_lst = glob(os.path.join(data['profile'],'*.parquet')) ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) @@ -68,12 +63,11 @@ def features_reduction(opt): # Time the computation of transformations t_start = time() ds = tfidf_transform(ds, kmers_list) - ds, kmers_list = tree_relevant_features(ds, kmers_list, 'phylum') - print(len(kmers_list)) + ds, kmers_list = tree_relevant_features(ds, kmers_list, opt['taxa']) if len(kmers_list) == 0: ds, kmers_list = occurence_exclusion(ds, opt['kmers_list']) ds, kmers_list = low_var_selection(ds,kmers_list) - ds, data['kmers'] = features_selection(ds, kmers_list, 'phylum') + ds, data['kmers'] = features_selection(ds, kmers_list, opt['taxa']) t_end = time() t_reduction = t_end - t_start # Save reduced dataset @@ -148,6 +142,15 @@ def tree_relevant_features(ds, kmers, taxa): return ds, kmers +# Features decomposition for dimension reduction +def truncated_svd(ds, kmers): + preprocessor = TensorTruncatedSVDReduction( + features = kmers, + nb_components = 10 + ) + ds = preprocessor.fit_transform(ds) + + return ds # Argument parsing from CLI ################################################################################ @@ -159,6 +162,7 @@ def tree_relevant_features(ds, kmers, taxa): parser.add_argument('-dt','--dataset_name', default='dataset', help='Name of the dataset used to name files') parser.add_argument('-l','--kmers_list', default=None, type=Path, help='PATH to a file containing a list of k-mers that will be reduced') # Parameters + parser.add_argument('-t','--taxa', default='phylum', help='The taxonomic level to use for the classification, defaults to Phylum.') parser.add_argument('-o','--outdir', required=True, type=Path, help='PATH to a directory on file where outputs will be saved') parser.add_argument('-wd','--workdir', default='/tmp/spill', type=Path, help='Optional. 
Path to a working directory where tuning data will be spilled') args = parser.parse_args() diff --git a/src/data/kmers.py b/src/data/kmers.py index 13c1fa8..5a0fbe5 100644 --- a/src/data/kmers.py +++ b/src/data/kmers.py @@ -15,11 +15,6 @@ from data.extraction.seen_kmers_vectorizer import SeenKmersVectorizer from data.extraction.given_kmers_vectorizer import GivenKmersVectorizer -# Features selection -from data.reduction.low_var_selection import TensorLowVarSelection -from data.reduction.features_selection import TensorFeaturesSelection -from data.reduction.occurence_exclusion import TensorPercentOccurenceExclusion - __author__ = ['Amine Remita', 'Nicolas de Montigny'] __all__ = ['KmersCollection'] @@ -344,40 +339,7 @@ def _kmers_tokenization(self): self.df = tokenizer.transform(self.df) if self.method == 'seen': self.kmers_list = tokenizer.stats_['tokens(sequence)'] - # self._kmers_reduction() - - def _kmers_reduction(self): - """ - Brute force -> Features statistically related to classes - 1. OccurenceExclusion (10% extremes) - 2. LowVarSelection (variance > 10%) - 3. Chi2 + SelectPercentile() (75% best values) - """ - # Exclusion of columns occuring in less / more than 10% of the columns = 20% removed - excluder = TensorPercentOccurenceExclusion( - features = self.kmers_list, - percent = 0.1 - ) - self.df = excluder.fit_transform(self.df) - self.kmers_list = excluder.stats_['cols_keep'] - - # Exclusion of columns with less than 10% variance - varier = TensorLowVarSelection( - features = self.kmers_list, - threshold = 0.1, - ) - self.df = varier.fit_transform(self.df) - self.kmers_list = varier.stats_['cols_keep'] - - # Chi2 evaluation of dependance between features and classes to keep 75% most significative - selector = TensorFeaturesSelection( - features = self.kmers_list, - taxa = self.taxas[0], - threshold = 0.25 - ) - self.df = selector.fit_transform(self.df) - self.kmers_list = selector.stats_['cols_keep'] - + def _write_dataset(self): self.df.write_parquet(self.Xy_file) rmtree(self._tmp_dir) diff --git a/src/data/reduction/rdf_features_selection.py b/src/data/reduction/rdf_features_selection.py index 4a12e44..7b67d0c 100644 --- a/src/data/reduction/rdf_features_selection.py +++ b/src/data/reduction/rdf_features_selection.py @@ -1,16 +1,11 @@ -import os -import logging - import numpy as np import pandas as pd from typing import List -from warnings import warn from ray.data import Dataset from xgboost import XGBRFClassifier -from ray.air.config import ScalingConfig from sklearn.preprocessing import LabelEncoder diff --git a/src/data/reduction/truncated_svd_reduction.py b/src/data/reduction/truncated_svd_reduction.py new file mode 100644 index 0000000..9e64773 --- /dev/null +++ b/src/data/reduction/truncated_svd_reduction.py @@ -0,0 +1,80 @@ +import numpy as np +import pandas as pd + +from typing import List +from warnings import warn +from ray.data import Dataset + +from sklearn.utils.extmath import randomized_svd + +from ray.data.preprocessor import Preprocessor +from ray.air.util.data_batch_conversion import _unwrap_ndarray_object_type_if_needed + +TENSOR_COLUMN_NAME = '__value__' + +class TensorTruncatedSVDReduction(Preprocessor): + """ + Custom class for using a mix of TruncatedSVD inspired by sklearn.decomposition.TruncatedSVD and applying a batched strategy inspired by sklearn.decomposition.IncrementalPCA to process batches in parallel. + This makes it possible to use the class as a Ray preprocessor in a features reduction strategy. 
+ TruncatedSVD performs linear dimensionality reduction by means of truncated singular value decomposition (SVD). + When it is applied following the TF-IDF normalisation, it becomes a latent semantic analysis (LSA). + https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html#sklearn.decomposition.TruncatedSVD + """ + + def __init__(self, features: List[str], nb_components: int = 10000): + # Parameters + self.features = features + self._nb_features = len(features) + self._nb_components = nb_components + + + def _fit(self, ds: Dataset) -> Preprocessor: + def svd_batch(arr: np.array): + df = arr['__value__'] + df = _unwrap_ndarray_object_type_if_needed(df) + U, Sigma, VT = randomized_svd( + df, + n_components = self._nb_components, + n_iter = 5, + n_oversamples = 10, + power_iteration_normalizer = 'LU', + random_state = None + ) + + return {'VT': [VT]} + + if self._nb_features > self._nb_components: + # Exec svd + components = [] + svd_vt = ds.map_batches(svd_batch, batch_format = 'numpy') + + for row in svd_vt.iter_rows(): + components.append(row['VT']) + + components = np.mean(components, axis = 0) + + self.stats_ = {'components' : components} + else: + warn('No features reduction to do because the number of features is already lower than the required number of components') + self.stats_ = {'components' : False} + + return self + + def _transform_pandas(self, df: pd.DataFrame) -> pd.DataFrame: + # _validate_df(df, TENSOR_COLUMN_NAME, self._nb_features) + components = self.stats_['components'] + + if components is not False: + tensor_col = df[TENSOR_COLUMN_NAME] + tensor_col = _unwrap_ndarray_object_type_if_needed(tensor_col) + tensor_col = np.dot(tensor_col, components.T) + df[TENSOR_COLUMN_NAME] = pd.Series(list(tensor_col)) + + return df + + def __repr__(self): + return (f"{self.__class__.__name__}(features={self._nb_features!r}, taxa={self.taxa!r}, threshold={self.threshold!r})") + +def _validate_df(df: pd.DataFrame, column: str, nb_features: int) -> None: + if len(df.loc[0, column]) != nb_features: + raise ValueError('Discordant number of features in the tensor column with the one from the dataframe used for fitting') diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index d1e8443..79c763d 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -8,6 +8,11 @@ from glob import glob from shutil import rmtree +# Dimensions reduction +from models.preprocessors.tfidf_transformer import TensorTfIdfTransformer +from data.reduction.rdf_features_selection import TensorRDFFeaturesSelection +from data.reduction.truncated_svd_reduction import TensorTruncatedSVDReduction + # Preprocessing from ray.data.preprocessors import LabelEncoder, Chain from models.preprocessors.min_max_scaler import TensorMinMaxScaler @@ -144,19 +149,31 @@ def preprocess(self, df): labels.append(row[self.taxa]) self._nb_classes = len(np.unique(labels)) if self._nb_classes == 2: + self._encoder = ModelLabelEncoder(self.taxa) self._preprocessor = Chain( - TensorMinMaxScaler(self.kmers), - ModelLabelEncoder(self.taxa), + TensorTfIdfTransformer(self.kmers), + TensorRDFFeaturesSelection(self.kmers, self.taxa), ) else: - self._preprocessor = Chain( - TensorMinMaxScaler(self.kmers), + self._encoder = Chain( LabelEncoder(self.taxa), - OneHotTensorEncoder(self.taxa), + OneHotTensorEncoder(self.taxa) + ) + self._preprocessor = Chain( + TensorTfIdfTransformer(self.kmers), + TensorRDFFeaturesSelection(self.kmers, self.taxa), ) - self._preprocessor.fit(df) + + + 
self._encoder.fit(df) + df = self._preprocessor.fit_transform(df) + self._reductor = TensorTruncatedSVDReduction(self.kmers) + self._reductor.fit(df) # Labels mapping - labels = list(self._preprocessor.preprocessors[1].stats_[f'unique_values({self.taxa})'].keys()) + if self._nb_classes == 2: + labels = list(self._encoder.stats_[f'unique_values({self.taxa})'].keys()) + else: + labels = list(self._encoder.preprocessors[0].stats_[f'unique_values({self.taxa})'].keys()) encoded = np.arange(len(labels)) labels = np.append(labels, 'unknown') encoded = np.append(encoded, -1) @@ -196,7 +213,9 @@ def _fit_model(self, datasets): # Preprocessing loop for name, ds in datasets.items(): ds = ds.drop_columns(['id']) + ds = self._encoder.transform(ds) ds = self._preprocessor.transform(ds) + ds = self._reductor.transform(ds) datasets[name] = ds # Training parameters @@ -239,7 +258,7 @@ def predict(self, df, threshold=0.8): df = df.drop_columns(col_2_drop) # Preprocess - df = self._preprocessor.preprocessors[0].transform(df) + df = self._preprocessor.transform(df) self._predictor = BatchPredictor.from_checkpoint( self._model_ckpt, diff --git a/src/models/models_utils.py b/src/models/models_utils.py index 2f38ff8..c38ca25 100644 --- a/src/models/models_utils.py +++ b/src/models/models_utils.py @@ -85,7 +85,9 @@ def __init__( self._predict_ids = [] # Initialize Ray variables self._clf = None + self._encoder = None self._preprocessor = None + self._reductor = None self._model_ckpt = None self._trainer = None self._train_params = {} diff --git a/src/models/sklearn/models.py b/src/models/sklearn/models.py index bc91fab..2557952 100644 --- a/src/models/sklearn/models.py +++ b/src/models/sklearn/models.py @@ -7,8 +7,13 @@ from glob import glob from shutil import rmtree +# Dimensions reduction +from models.preprocessors.tfidf_transformer import TensorTfIdfTransformer +from data.reduction.rdf_features_selection import TensorRDFFeaturesSelection +from data.reduction.truncated_svd_reduction import TensorTruncatedSVDReduction + # Preprocessing -from ray.data.preprocessors import Chain, BatchMapper +from ray.data.preprocessors import Chain from models.encoders.model_label_encoder import ModelLabelEncoder from models.preprocessors.min_max_scaler import TensorMinMaxScaler from models.encoders.onesvm_label_encoder import OneClassSVMLabelEncoder @@ -97,7 +102,6 @@ def __init__( ) # Parameters self._encoded = [] - self._encoder = None # Computes self._build() @@ -111,13 +115,18 @@ def preprocess(self, df): self._encoder = ModelLabelEncoder(self.taxa) self._preprocessor = Chain( - TensorMinMaxScaler(self.kmers), - self._encoder, + TensorTfIdfTransformer(self.kmers), + TensorRDFFeaturesSelection(self.kmers, self.taxa), ) - self._preprocessor.fit(df) + self._encoder.fit(df) + df = self._preprocessor.fit_transform(df) + self.kmers = self._preprocessor.preprocessors[1].stats_['cols_keep'] + self._reductor = TensorTruncatedSVDReduction(self.kmers) + self._reductor.fit(df) + # Labels mapping if self.classifier != 'onesvm': - labels = list(self._preprocessor.preprocessors[1].stats_[f'unique_values({self.taxa})'].keys()) + labels = list(self._encoder.stats_[f'unique_values({self.taxa})'].keys()) self._encoded = np.arange(len(labels)) labels = np.append(labels, 'unknown') self._encoded = np.append(self._encoded, -1) @@ -146,8 +155,6 @@ def _cross_validation(self, datasets, kmers_ds): self._fit_model(datasets) - df_test = self._preprocessor.preprocessors[0].transform(df_test) - y_true = [] for row in df_test.iter_rows(): 
y_true.append(row[self.taxa]) @@ -202,9 +209,10 @@ def _fit_model(self, datasets): print('_fit_model') for name, ds in datasets.items(): ds = ds.drop_columns(['id']) + ds = self._encoder.transform(ds) ds = self._preprocessor.transform(ds) + ds = self._reductor.transform(ds) datasets[name] = ray.put(ds) - try: training_labels = self._encoded.copy() training_labels = np.delete( @@ -252,7 +260,8 @@ def _predict_cv(self, df): def predict(self, df, threshold = 0.8): print('predict') if df.count() > 0: - df = self._preprocessor.preprocessors[0].transform(df) + df = self._preprocessor.transform(df) + df = self._reductor.transform(df) if self.classifier == 'onesvm': predict_kwargs = {'features':self.kmers, 'num_estimator_cpus':-1} self._predictor = BatchPredictor.from_checkpoint(self._models_collection['domain'], SklearnTensorPredictor) From a9629932e4361d5da7eb158b8d2b5c2449144578 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Sat, 4 Nov 2023 10:19:45 -0400 Subject: [PATCH 19/92] features reduction 2.0 + tf-rdf scaling --- src/Caribou_reduce_features.py | 109 ++++++++----------- src/data/reduction/chi_features_selection.py | 5 +- src/data/reduction/low_var_selection.py | 7 +- src/data/reduction/occurence_exclusion.py | 5 +- src/data/reduction/rdf_features_selection.py | 7 +- src/models/sklearn/models.py | 1 + src/models/sklearn/partial_trainer.py | 45 ++++---- 7 files changed, 90 insertions(+), 89 deletions(-) diff --git a/src/Caribou_reduce_features.py b/src/Caribou_reduce_features.py index 44c3b8a..0157423 100644 --- a/src/Caribou_reduce_features.py +++ b/src/Caribou_reduce_features.py @@ -38,7 +38,7 @@ def features_reduction(opt): verify_file(opt['kmers_list']) # Verification of k length - k_length, kmers_list = verify_kmers_list_length(k_length, opt['kmers_list']) + k_length, kmers = verify_kmers_list_length(k_length, opt['kmers_list']) outdirs = define_create_outdirs(opt['outdir']) @@ -48,31 +48,38 @@ def features_reduction(opt): # Features reduction ################################################################################ """ - First option : Select features relevant to classification by Random Forest of decision trees - - Brute force -> Features statistically related to classes - 1. OccurenceExclusion (10% extremes) - 2. LowVarSelection (variance > 10%) - 3. Chi2 + SelectPercentile() (75% best values) + Two-step features reduction : + 0. Features scaling + 1. TF-IDF scaling (diminish impact of more present and augment impact of less present) + 1. Brute force features exclusion + 1. OccurenceExclusion (exclusion of features present in more than 95% of samples) + 2. LowVarSelection (exclusion of features with less than 5% variance) + 2. Statistical features selection + 1. Chi2 + SelectPercentile() (select 25% of features with highest Chi2 values) + 3. In training features selection + 1. RandomForestClassification (select features identified as useful for classification) + 2. 
TruncatedSVD decomposition (map the features to 10 000 decomposed features if there is still more) """ # Load data files_lst = glob(os.path.join(data['profile'],'*.parquet')) - ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - # ds = ray.data.read_parquet(data['profile'], parallelism = -1) + export_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + train_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) # Time the computation of transformations t_start = time() - ds = tfidf_transform(ds, kmers_list) - ds, kmers_list = tree_relevant_features(ds, kmers_list, opt['taxa']) - if len(kmers_list) == 0: - ds, kmers_list = occurence_exclusion(ds, opt['kmers_list']) - ds, kmers_list = low_var_selection(ds,kmers_list) - ds, data['kmers'] = features_selection(ds, kmers_list, opt['taxa']) + # Features scaling + train_ds = tfidf_transform(train_ds, kmers) + # Brute force features exclusion + train_ds, export_ds, kmers = occurence_exclusion(train_ds, export_ds, kmers) + train_ds, export_ds, kmers = low_var_selection(train_ds, export_ds, kmers) + # Statistical features selection + train_ds, export_ds, data['kmers'] = features_selection(train_ds, export_ds, kmers, opt['taxa']) + # Time the computation of transformations t_end = time() t_reduction = t_end - t_start # Save reduced dataset data['profile'] = f"{data['profile']}_reduced" - ds.write_parquet(data['profile']) + export_ds.write_parquet(data['profile']) # Save reduced K-mers with open(os.path.join(outdirs["data_dir"],'kmers_list_reduced.txt'),'w') as handle: handle.writelines("%s\n" % item for item in data['kmers']) @@ -83,74 +90,54 @@ def features_reduction(opt): print(f"Caribou finished reducing k-mers features of {opt['dataset_name']} in {t_reduction} seconds.") -# Exclusion of columns occuring in less / more than 10% of the columns = 20% removed -def occurence_exclusion(ds, kmers): +# TF-IDF scaling of the features +def tfidf_transform(ds, kmers): + preprocessor = TensorTfIdfTransformer( + features = kmers + ) + ds = preprocessor.fit_transform(ds) + + return ds + +# Exclusion of columns occuring in more than 95% of the samples +def occurence_exclusion(train_ds, export_ds, kmers): preprocessor = TensorPercentOccurenceExclusion( features = kmers, - percent = 0.1 # remove features present in less than 10% samples + percent = 0.5 ) - ds = preprocessor.fit_transform(ds) + train_ds = preprocessor.fit_transform(train_ds) + export_ds = preprocessor.transform(export_ds) kmers = preprocessor.stats_['cols_keep'] - return ds, kmers + return train_ds, export_ds, kmers -# Exclusion of columns with less than 10% variance -def low_var_selection(ds, kmers): +# Exclusion of columns with less than 5% variance +def low_var_selection(train_ds, export_ds, kmers): preprocessor = TensorLowVarSelection( features = kmers, - threshold = 0.1, # remove features with less than 5% variance + threshold = 0.05, ) - ds = preprocessor.fit_transform(ds) + train_ds = preprocessor.fit_transform(train_ds) + export_ds = preprocessor.transform(export_ds) kmers = preprocessor.stats_['cols_keep'] - return ds, kmers + return train_ds, export_ds, kmers # Chi2 evaluation of dependance between features and classes -def features_selection(ds, kmers, taxa): +def features_selection(train_ds, export_ds, kmers, taxa): preprocessor = TensorChiFeaturesSelection( features = kmers, taxa = taxa, threshold = 0.75, # Keep 25% higest results ) - ds = preprocessor.fit_transform(ds) - kmers = preprocessor.stats_['cols_keep'] - 
print(len(kmers)) - - return ds, kmers - -# TF-IDF scaling of the features -def tfidf_transform(ds, kmers): - preprocessor = TensorTfIdfTransformer( - features = kmers - ) - ds = preprocessor.fit_transform(ds) - - return ds - -# Decision tree feature selection to keep only those identified as relevant to classification -def tree_relevant_features(ds, kmers, taxa): - preprocessor = TensorRDFFeaturesSelection( - features = kmers, - taxa = taxa - ) - preprocessor.fit_transform(ds) - + train_ds = preprocessor.fit_transform(train_ds) + export_ds = preprocessor.transform(export_ds) kmers = preprocessor.stats_['cols_keep'] - return ds, kmers - -# Features decomposition for dimension reduction -def truncated_svd(ds, kmers): - preprocessor = TensorTruncatedSVDReduction( - features = kmers, - nb_components = 10 - ) - ds = preprocessor.fit_transform(ds) - - return ds + return train_ds, export_ds, kmers # Argument parsing from CLI ################################################################################ diff --git a/src/data/reduction/chi_features_selection.py b/src/data/reduction/chi_features_selection.py index 95fd013..9f51310 100644 --- a/src/data/reduction/chi_features_selection.py +++ b/src/data/reduction/chi_features_selection.py @@ -56,7 +56,10 @@ def stats(batch): # Keep features with values higher than the threshold cols_keep = [self.features[i] for i, chi in enumerate(mean_chi) if chi > self.threshold] - self.stats_ = {'cols_keep' : cols_keep} + if 0 < len(cols_keep) : + self.stats_ = {'cols_keep' : cols_keep} + else: + self.stats_ = {'cols_keep' : self.features} return self diff --git a/src/data/reduction/low_var_selection.py b/src/data/reduction/low_var_selection.py index 0212c8c..912e65a 100644 --- a/src/data/reduction/low_var_selection.py +++ b/src/data/reduction/low_var_selection.py @@ -18,7 +18,7 @@ class TensorLowVarSelection(Preprocessor): def __init__( self, features : List[str], - threshold: float = 0.1, + threshold: float = 0.05, ): self.features = features self.threshold = threshold @@ -66,7 +66,10 @@ def get_sqr_dev(batch): # Keep features with values higher than the threshold cols_keep = [self.features[i] for i, var in enumerate(var_arr) if var > self.threshold] - self.stats_ = {'cols_keep' : cols_keep} + if 0 < len(cols_keep) : + self.stats_ = {'cols_keep' : cols_keep} + else: + self.stats_ = {'cols_keep' : self.features} return self diff --git a/src/data/reduction/occurence_exclusion.py b/src/data/reduction/occurence_exclusion.py index 17da804..fe9b45d 100644 --- a/src/data/reduction/occurence_exclusion.py +++ b/src/data/reduction/occurence_exclusion.py @@ -86,7 +86,10 @@ def count_occurences(batch): # Construct list of features to keep by position cols_keep = [self.features[i] for i, occurence in enumerate(occurences) if occurence < high_treshold] - self.stats_ = {'cols_keep' : cols_keep} + if 0 < len(cols_keep) : + self.stats_ = {'cols_keep' : cols_keep} + else: + self.stats_ = {'cols_keep' : self.features} return self diff --git a/src/data/reduction/rdf_features_selection.py b/src/data/reduction/rdf_features_selection.py index 7b67d0c..c2ad667 100644 --- a/src/data/reduction/rdf_features_selection.py +++ b/src/data/reduction/rdf_features_selection.py @@ -53,7 +53,10 @@ def xgboost_batch(arr: np.array): cols_keep.extend(row['features']) cols_keep = np.unique(cols_keep) - self.stats_ = {'cols_keep' : cols_keep} + if 0 < len(cols_keep) : + self.stats_ = {'cols_keep' : cols_keep} + else: + self.stats_ = {'cols_keep' : self.features} return self @@ -61,7 +64,7 @@ def 
_transform_pandas(self, df: pd.DataFrame) -> pd.DataFrame: # _validate_df(df, TENSOR_COLUMN_NAME, self._nb_features) cols_keep = self.stats_['cols_keep'] - if len(cols_keep) < self._nb_features and len(cols_keep) > 0 : + if len(cols_keep) < self._nb_features: tensor_col = df[TENSOR_COLUMN_NAME] tensor_col = _unwrap_ndarray_object_type_if_needed(tensor_col) tensor_col = pd.DataFrame(tensor_col, columns = self.features) diff --git a/src/models/sklearn/models.py b/src/models/sklearn/models.py index 2557952..fa8139e 100644 --- a/src/models/sklearn/models.py +++ b/src/models/sklearn/models.py @@ -188,6 +188,7 @@ def _build(self): 'eta0' : 0.001, 'n_jobs' : -1 } +# TODO: Test performances for classifiers, if need more accuracy -> sklearn.multiclass.OneVsRestClassifier for multiple binary problems elif self.classifier == 'sgd': print('Training multiclass SGD classifier') self._clf = SGDClassifier() diff --git a/src/models/sklearn/partial_trainer.py b/src/models/sklearn/partial_trainer.py index e9a51c6..f08dd7c 100644 --- a/src/models/sklearn/partial_trainer.py +++ b/src/models/sklearn/partial_trainer.py @@ -236,28 +236,29 @@ def training_loop(self): self.estimator.partial_fit(batch_X, batch_y, **self.fit_params) fit_time = time() - start_time - if len(self._labels) > 2: - with parallel_backend("ray", n_jobs=num_cpus): - X_calib_df = np.empty((X_calib.count(), len(self._features_list))) - for ind, batch in enumerate(X_calib.iter_batches( - batch_size = 1, - batch_format = 'numpy' - )): - X_calib_df[ind] = batch['__value__'] - - """ - X_calib = pd.DataFrame(X_calib_df, columns = self._features_list) - """ - y_calib = y_calib.to_pandas().to_numpy() - self.estimator = CalibratedClassifierCV( - estimator = self.estimator, - method = 'sigmoid', - cv = 'prefit', - ) - self.estimator.fit( - X_calib_df, - y_calib, - ) + # Calibrated classifier was meant to give the predict_proba method but all used models implement it and learning should be faster without it + # if len(self._labels) > 2: + # with parallel_backend("ray", n_jobs=num_cpus): + # X_calib_df = np.empty((X_calib.count(), len(self._features_list))) + # for ind, batch in enumerate(X_calib.iter_batches( + # batch_size = 1, + # batch_format = 'numpy' + # )): + # X_calib_df[ind] = batch['__value__'] + + # """ + # X_calib = pd.DataFrame(X_calib_df, columns = self._features_list) + # """ + # y_calib = y_calib.to_pandas().to_numpy() + # self.estimator = CalibratedClassifierCV( + # estimator = self.estimator, + # method = 'sigmoid', + # cv = 'prefit', + # ) + # self.estimator.fit( + # X_calib_df, + # y_calib, + # ) with tune.checkpoint_dir(step=1) as checkpoint_dir: with open(os.path.join(checkpoint_dir, MODEL_KEY), "wb") as f: From 940af1c10db17ab1a9ccac3066bf535aed6c3f9c Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Sat, 4 Nov 2023 22:41:36 -0400 Subject: [PATCH 20/92] script for sim --- setup.cfg | 1 + src/Caribou_classification.py | 40 +-- src/Caribou_classification_train_cv.py | 31 ++- src/Caribou_extraction.py | 27 +- src/Caribou_extraction_train_cv.py | 26 +- src/Caribou_pipeline.py | 2 +- src/Caribou_reduce_features.py | 6 +- src/Caribou_simulate_test_val.py | 90 +++++++ src/models/classification.py | 360 ++++++------------------- src/models/classification_old.py | 335 +++++++++++++++++++++++ src/models/kerasTF/models.py | 1 - src/supplement/sklearn_tuning.py | 39 +-- src/utils.py | 104 ++++++- 13 files changed, 715 insertions(+), 347 deletions(-) create mode 100644 src/Caribou_simulate_test_val.py create mode 100644 
src/models/classification_old.py diff --git a/setup.cfg b/setup.cfg index 6e6c672..fce114a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -34,6 +34,7 @@ scripts = src/Caribou_pipeline.py src/Caribou_kmers.py src/Caribou_reduce_features.py + src/Caribou_simulate_test_val.py src/Caribou_extraction.py src/Caribou_classification.py src/Caribou_extraction_train_cv.py diff --git a/src/Caribou_classification.py b/src/Caribou_classification.py index d421389..9c29172 100644 --- a/src/Caribou_classification.py +++ b/src/Caribou_classification.py @@ -6,7 +6,7 @@ from utils import * from time import time from pathlib import Path -from models.classification import ClassificationMethods +from models.classification_old import ClassificationMethods __author__ = "Nicolas de Montigny" @@ -15,17 +15,8 @@ # Initialisation / validation of parameters from CLI ################################################################################ def bacteria_classification(opt): - # Verify existence of files and load data - data_bacteria = verify_load_data(opt['data_bacteria']) - data_metagenome = verify_load_data(opt['data_metagenome']) - k_length = len(data_bacteria['kmers'][0]) - if opt['preclassified_data'] is not None: - preclassified_data = verify_load_preclassified(opt['preclassified_data']) - else: - preclassified_data = None - - # Verify that model type is valid / choose default depending on host presence + # Verify that model type is valid / choose default if opt['model_type'] is None: opt['model_type'] = 'cnn' @@ -35,22 +26,37 @@ def bacteria_classification(opt): outdirs = define_create_outdirs(opt['outdir']) + # Initialize cluster + init_ray_cluster(opt['workdir']) + +# Data loading +################################################################################ + + db_data, db_ds = verify_load_db(opt['data_bacteria']) + data_metagenome = verify_load_data(opt['data_metagenome']) + + k_length = len(db_data['kmers'][0]) + + if opt['preclassified_data'] is not None: + preclassified_data = verify_load_preclassified(opt['preclassified_data']) + else: + preclassified_data = None + # Validate and extract list of taxas if opt['taxa'] is not None: - lst_taxas = verify_taxas(opt['taxa'], data_bacteria['taxas']) + lst_taxas = verify_taxas(opt['taxa'], db_data['taxas']) else: - lst_taxas = data_bacteria['taxas'].copy() + lst_taxas = db_data['taxas'].copy() if 'domain' in lst_taxas: lst_taxas.remove('domain') - # Initialize cluster - init_ray_cluster(opt['workdir']) + val_ds = split_sim_dataset(db_ds, db_data, 'validation') # Definition of model for bacteria taxonomic classification + training ################################################################################ clf = ClassificationMethods( - database_k_mers = data_bacteria, + database_k_mers = db_data, k = k_length, outdirs = outdirs, database = opt['database_name'], @@ -66,7 +72,7 @@ def bacteria_classification(opt): ################################################################################ t_start = time() - end_taxa = clf.execute_training_prediction(data_metagenome) + end_taxa = clf.fit_predict(data_metagenome) t_end = time() t_classif = t_end - t_start clf_data = merge_save_data( diff --git a/src/Caribou_classification_train_cv.py b/src/Caribou_classification_train_cv.py index 856c7fe..d7d25a5 100644 --- a/src/Caribou_classification_train_cv.py +++ b/src/Caribou_classification_train_cv.py @@ -7,7 +7,7 @@ from time import time from pathlib import Path from logging import ERROR -from models.classification import ClassificationMethods +from 
models.classification_old import ClassificationMethods warnings.filterwarnings('ignore') @@ -18,11 +18,8 @@ # Initialisation / validation of parameters from CLI ################################################################################ def bacteria_classification_train_cv(opt): - # Verify existence of files and load data - data_bacteria = verify_load_data(opt['data_bacteria']) - k_length = len(data_bacteria['kmers'][0]) - # Verify that model type is valid / choose default depending on host presence + # Verify that model type is valid / choose default if opt['model_type'] is None: opt['model_type'] = 'cnn' @@ -32,24 +29,34 @@ def bacteria_classification_train_cv(opt): outdirs = define_create_outdirs(opt['outdir']) + # Initialize cluster + init_ray_cluster(opt['workdir']) + +# Data loading +################################################################################ + + db_data, db_ds = verify_load_db(opt['data_bacteria']) + + k_length = len(db_data['kmers'][0]) + # Validate and extract list of taxas if opt['taxa'] is not None: - lst_taxas = verify_taxas(opt['taxa'], data_bacteria['taxas']) + lst_taxas = verify_taxas(opt['taxa'], db_data['taxas']) else: - lst_taxas = data_bacteria['taxas'].copy() + lst_taxas = db_data['taxas'].copy() if 'domain' in lst_taxas: lst_taxas.remove('domain') - - # Initialize cluster - init_ray_cluster(opt['workdir']) + + test_ds = split_sim_dataset(db_ds, db_data, 'test') + val_ds = split_sim_dataset(db_ds, db_data, 'validation') # Training and cross-validation of models for classification of bacterias ################################################################################ t_start = time() ClassificationMethods( - database_k_mers = data_bacteria, + database_k_mers = db_data, k = k_length, outdirs = outdirs, database = opt['database_name'], @@ -60,7 +67,7 @@ def bacteria_classification_train_cv(opt): training_epochs = opt['training_epochs'], verbose = opt['verbose'], cv = True - ).execute_training() + ).fit() t_end = time() t_classify = t_end - t_start print( diff --git a/src/Caribou_extraction.py b/src/Caribou_extraction.py index c0dbaa0..d3ea11f 100644 --- a/src/Caribou_extraction.py +++ b/src/Caribou_extraction.py @@ -5,7 +5,7 @@ from utils import * from time import time from pathlib import Path -from models.classification import ClassificationMethods +from models.classification_old import ClassificationMethods __author__ = "Nicolas de Montigny" @@ -14,14 +14,6 @@ # Initialisation / validation of parameters from CLI ################################################################################ def bacteria_extraction(opt): - # Verify existence of files and load data - data_bacteria = verify_load_data(opt['data_bacteria']) - if opt['data_host'] is not None: - data_host = verify_load_data(opt['data_host']) - verify_concordance_klength(len(data_bacteria['kmers'][0]), len(data_host['kmers'][0])) - data_metagenome = verify_load_data(opt['data_metagenome']) - - k_length = len(data_bacteria['kmers'][0]) # Verify that model type is valid / choose default depending on host presence if opt['host_name'] is None: @@ -38,11 +30,24 @@ def bacteria_extraction(opt): # Initialize cluster init_ray_cluster(opt['workdir']) +# Data loading +################################################################################ + + if opt['data_host'] is not None: + db_data, db_ds = verify_load_host_merge(opt['data_bacteria'], opt['data_host']) + else: + db_data, db_ds = verify_load_db(opt['data_bacteria']) + data_metagenome = verify_load_data(opt['data_metagenome']) + 
+ k_length = len(db_data['kmers'][0]) + + val_ds = split_sim_dataset(db_ds, db_data, 'validation') + # Definition of model for bacteria extraction / host removal + execution ################################################################################ if opt['host_name'] is None: clf = ClassificationMethods( - database_k_mers = data_bacteria, + database_k_mers = (db_data, db_ds), k = k_length, outdirs = outdirs, database = opt['database_name'], @@ -55,7 +60,7 @@ def bacteria_extraction(opt): ) else: clf = ClassificationMethods( - database_k_mers = (data_bacteria, data_host), + database_k_mers = (db_data, db_ds), k = k_length, outdirs = outdirs, database = opt['database_name'], diff --git a/src/Caribou_extraction_train_cv.py b/src/Caribou_extraction_train_cv.py index aafa701..fccb0c5 100644 --- a/src/Caribou_extraction_train_cv.py +++ b/src/Caribou_extraction_train_cv.py @@ -5,7 +5,7 @@ from utils import * from time import time from pathlib import Path -from models.classification import ClassificationMethods +from models.classification_old import ClassificationMethods __author__ = "Nicolas de Montigny" @@ -14,13 +14,6 @@ # Initialisation / validation of parameters from CLI ################################################################################ def bacteria_extraction_train_cv(opt): - # Verify existence of files and load data - data_bacteria = verify_load_data(opt['data_bacteria']) - if opt['data_host'] is not None: - data_host = verify_load_data(opt['data_host']) - verify_concordance_klength(len(data_bacteria['kmers'][0]), len(data_host['kmers'][0])) - - k_length = len(data_bacteria['kmers'][0]) # Validate training parameters verify_positive_int(opt['batch_size'], 'batch_size') @@ -31,6 +24,19 @@ def bacteria_extraction_train_cv(opt): # Initialize cluster init_ray_cluster(opt['workdir']) +# Data loading +################################################################################ + + if opt['data_host'] is not None: + db_data, db_ds = verify_load_host_merge(opt['data_bacteria'], opt['data_host']) + else: + db_data, db_ds = verify_load_db(opt['data_bacteria']) + + k_length = len(db_data['kmers'][0]) + + test_ds = split_sim_dataset(db_ds, db_data, 'test') + val_ds = split_sim_dataset(db_ds, db_data, 'validation') + # Training and cross-validation of models for bacteria extraction / host removal ################################################################################ @@ -38,7 +44,7 @@ def bacteria_extraction_train_cv(opt): if opt['host_name'] is None: ClassificationMethods( - database_k_mers = data_bacteria, + database_k_mers = (db_data, db_ds), k = k_length, outdirs = outdirs, database = opt['database_name'], @@ -51,7 +57,7 @@ def bacteria_extraction_train_cv(opt): ).execute_training() else: ClassificationMethods( - database_k_mers = (data_bacteria, data_host), + database_k_mers = (db_data, db_ds), k = k_length, outdirs = outdirs, database = opt['database_name'], diff --git a/src/Caribou_pipeline.py b/src/Caribou_pipeline.py index b81800b..f6b1fe5 100644 --- a/src/Caribou_pipeline.py +++ b/src/Caribou_pipeline.py @@ -9,7 +9,7 @@ from pathlib import Path from outputs.out import Outputs from data.build_data import build_load_save_data -from models.classification import ClassificationMethods +from models.classification_old import ClassificationMethods __author__ = 'Nicolas de Montigny' diff --git a/src/Caribou_reduce_features.py b/src/Caribou_reduce_features.py index 0157423..412b8bc 100644 --- a/src/Caribou_reduce_features.py +++ b/src/Caribou_reduce_features.py @@ -15,8 
+15,6 @@ from data.reduction.low_var_selection import TensorLowVarSelection from models.preprocessors.tfidf_transformer import TensorTfIdfTransformer from data.reduction.chi_features_selection import TensorChiFeaturesSelection -from data.reduction.rdf_features_selection import TensorRDFFeaturesSelection -from data.reduction.truncated_svd_reduction import TensorTruncatedSVDReduction from data.reduction.occurence_exclusion import TensorPercentOccurenceExclusion __author__ = "Nicolas de Montigny" @@ -25,7 +23,6 @@ """ This script computes features reduction to a given K-mers dataset and then applies it. -The method is based on the KRFE algorithm (Lebatteux et al., 2019) """ # Initialisation / validation of parameters from CLI @@ -126,11 +123,12 @@ def low_var_selection(train_ds, export_ds, kmers): return train_ds, export_ds, kmers # Chi2 evaluation of dependance between features and classes +# Select 25% of features with highest Chi2 values def features_selection(train_ds, export_ds, kmers, taxa): preprocessor = TensorChiFeaturesSelection( features = kmers, taxa = taxa, - threshold = 0.75, # Keep 25% higest results + threshold = 0.75, ) train_ds = preprocessor.fit_transform(train_ds) diff --git a/src/Caribou_simulate_test_val.py b/src/Caribou_simulate_test_val.py new file mode 100644 index 0000000..a1ee878 --- /dev/null +++ b/src/Caribou_simulate_test_val.py @@ -0,0 +1,90 @@ +#!/usr/bin python3 + +import argparse + +from utils import * +from time import time +from pathlib import Path + +__author__ = "Nicolas de Montigny" + +__all__ = ['simulation'] + +""" +This script simulate sequencing reads for validation and/or testing dataset(s) from a whole genome dataset +The dataset should be in the form of a k-mers counts matrix and could have the k-mers reduced as well +The script leverages the InSilicoSeq package for simulation of sequencing reads +""" + +# Initialisation / validation of parameters from CLI +################################################################################ +def simulation(opt): + """ + 1. Verify existence of files and load data + 2. Verify k-mers length concordance + 3. Initialize cluster + """ + if opt['hostset'] is not None: + db_data, db_ds = verify_load_host_merge(opt['dataset'], opt['hostset']) + else: + db_data, db_ds = verify_load_db(opt['dataset']) + + verify_file(opt['kmers_list']) + + outdirs = define_create_outdirs(opt['outdir']) + + init_ray_cluster(opt['workdir']) + +# Dataset(s) simulation +################################################################################ + """ + 1. Verify the datasets to simulate + 2. Split the database dataset (possibly merged) into required dataset + 3. 
Run the simulation for each dataset required + """ + t_test = None + t_val = None + if opt['test']: + t_s = time() + test_ds = split_dataset(db_ds, db_data, 'test') + if test_ds is not None: + sim_dataset(test_ds, db_data, 'test') + t_test = time() - t_s + if opt['validation']: + t_s = time() + val_ds = split_dataset(db_ds, db_data, 'validation') + if val_ds is not None: + sim_dataset(val_ds, db_data, 'validation') + t_val = time() - t_s + + if t_test is not None: + print(f'Caribou finished generating the test dataset in {t_test} seconds') + if t_val is not None: + print(f'Caribou finished generating the validation dataset simulated in {t_val} seconds') + +# Argument parsing from CLI +################################################################################ + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='This script simulate sequencing reads for validation and/or testing dataset(s) from a whole genome dataset') + # Database + parser.add_argument('-db','--dataset', required=True, type=Path, help='PATH to a npz file containing the data corresponding to the k-mers profile for the bacteria database') + parser.add_argument('-dt','--dataset_name', default='dataset', help='Name of the dataset used to name files') + # Host + parser.add_argument('-dh','--hostset', default=None, type=Path, help='Path to .npz data for extracted k-mers profile of host') + parser.add_argument('-ds','--hostset_name', default=None, help='Name of the host database used to name files') + # Simulation flags + parser.add_argument('-v', '--validation', action='store_true', help='Flag argument for making a "validation"-named simulated dataset') + parser.add_argument('-t', '--test', action='store_true', help='Flag argument for making a "test"-named simulated dataset') + # Parameters + parser.add_argument('-l','--kmers_list', type=Path, default=None, help='Optional. PATH to a file containing a list of k-mers to be extracted after the simulation. Should be the same as the reference database') + parser.add_argument('-o','--outdir', required=True, type=Path, help='Path to folder for outputing tuning results') + parser.add_argument('-wd','--workdir', default='/tmp/spill', type=Path, help='Optional. 
Path to a working directory where tuning data will be spilled') + args = parser.parse_args() + + opt = vars(args) + + if not opt['test'] and not opt['validation']: + raise ValueError('Missing flags for datasets to simulate, please use the -v and/or -t flags to decide which dataset to generate.') + else: + simulation(opt) \ No newline at end of file diff --git a/src/models/classification.py b/src/models/classification.py index cc8fc5e..0b1412b 100644 --- a/src/models/classification.py +++ b/src/models/classification.py @@ -6,6 +6,7 @@ import pandas as pd from glob import glob +from typing import Dict from shutil import rmtree from utils import load_Xy_data from models.sklearn.models import SklearnModel @@ -45,18 +46,18 @@ class ClassificationMethods(): """ def __init__( self, - database_k_mers, - k, - outdirs, - database, - classifier_binary = 'deeplstm', - classifier_multiclass = 'widecnn', - taxa = None, - threshold = 0.8, - batch_size = 32, - training_epochs = 100, - verbose = True, - cv = False + database_k_mers: Dict, + k: int, + outdirs: Dict, + database: str, + classifier_binary: str = 'deeplstm', + classifier_multiclass: str = 'widecnn', + taxa: str = None, + threshold: float = 0.8, + batch_size: int = 32, + training_epochs: int = 100, + verbose: bool = True, + cv: bool = False ): # Parameters self._k = k @@ -104,20 +105,48 @@ def __init__( self._taxas_order.reverse() # Automatic executions self._verify_assign_taxas(taxa) - - # Main functions - ######################################################################################################### - # Wrapper function for training and predicting over each known taxa - def execute_training_prediction(self, data2classify): - print('execute_training_prediction') - files_lst = glob(os.path.join(data2classify['profile'],'*.parquet')) - df2classify = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - # df2classify = ray.data.read_parquet_bulk(files_lst, parallelism = -1) - ids2classify = data2classify['ids'] - for i, taxa in enumerate(self._taxas_order): + # Public functions + ######################################################################################################### +# TODO: Revise documentation in heading +# TODO: Remove parameters from global if they are only required for certain functions +# TODO: Finish transfering the functions & calls from the old version +# TODO: Validation of params before execution of private functions + def fit(self): + """ + Wrapper function to call the fitting method + """ + # TODO: Pass training/validation data here + + def predict(self): + """ + Wrapper function to call the predicting method + """ + # TODO: Pass data to predict here + + def fit_predict(self): + """ + Wrapper function for calling fit and predict + """ + # TODO: Pass training/validation data here + # TODO: Pass data to predict here + + def cross_validation(self): + """ + Wrapper function to call the cross-validation method + """ + # TODO: Pass training/validation data here + # TODO: Pass testing data here + + # Private principal functions + ######################################################################################################### +# TODO: Pass training/validation data here + def _fit(self): + """ + Fit the given model to the training dataset + """ + for taxa in self._taxas_order: if taxa in self._taxas: - # Training if taxa in ['domain','bacteria','host']: clf = self._classifier_binary else: @@ -126,46 +155,44 @@ def execute_training_prediction(self, data2classify): self._model_file = 
os.path.join(self._outdirs['models_dir'], f'{clf}_{taxa}.pkl') train = self._verify_load_data_model(self._data_file, self._model_file, taxa) if train: - self._train_model(taxa) - # Predicting - try: - if i == 0: - df2classify = self._classify_first(df2classify, taxa, ids2classify, data2classify['profile']) + if taxa in ['domain','bacteria','host']: + self._binary_training(taxa) else: - df2classify = self._classify_subsequent(df2classify, taxa, ids2classify, data2classify['profile']) - except ValueError: - print('Stopping classification prematurelly because there are no more sequences to classify') - return taxa - return None + self._multiclass_training(taxa) - - # Execute training of model(s) - def execute_training(self): - print('execute_training') - for taxa in self._taxas_order: - if taxa in self._taxas: - if taxa in ['domain','bacteria','host']: - clf = self._classifier_binary +# TODO: Pass data to predict here + def _predict(self, data2classify): + """ + Predict the given data using the trained model + """ + files_lst = glob(os.path.join(data2classify['profile'],'*.parquet')) + df = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + # df = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + ids = data2classify['ids'] + if len(self.classified_data['sequence']) == 0: + raise ValueError('Please train a model before executing classification') + for i, taxa in enumerate(self.classified_data['sequence']): + try: + if i == 0: + df = self._classify_first(df, taxa, ids, data2classify['profile']) else: - clf = self._classifier_multiclass - self._data_file = os.path.join(self._outdirs['data_dir'], f'Xy_{taxa}_database_K{self._k}_{clf}_{self._database}_data.npz') - self._model_file = os.path.join(self._outdirs['models_dir'], f'{clf}_{taxa}.pkl') - train = self._verify_load_data_model(self._data_file, self._model_file, taxa) - if train: - self._train_model(taxa) - - # Train model according to passed taxa - def _train_model(self, taxa): - print('_train_model') - if taxa in ['domain','bacteria','host']: - self._binary_training(taxa) - else: - self._multiclass_training(taxa) + df = self._classify_subsequent(df, taxa, ids, data2classify['profile']) + except ValueError: + print('Stopping classification prematurelly because there are no more sequences to classify') + return taxa + return None + + def _cross_validation(self): + """ + Execute cross-validation of a model by fitting a model and predicting over a test dataset + """ + # Private training secondary functions + ######################################################################################################### +# TODO: Remove data loading & verification from inside these functions def _binary_training(self, taxa): print('_binary_training') self._verify_classifier_binary() - self._load_training_data_merged(taxa) if self._classifier_binary == 'onesvm': self.models[taxa] = SklearnModel( self._classifier_binary, @@ -209,7 +236,7 @@ def _binary_training(self, taxa): self.models[taxa].preprocess(self._merged_training_datasets['train']) self.models[taxa].train(self._merged_training_datasets, self._merged_database_host, self._cv) - self._save_model(self._model_file, taxa) + self._save_model(self._model_file, taxa) def _multiclass_training(self, taxa): print('_multiclass_training') @@ -244,27 +271,10 @@ def _multiclass_training(self, taxa): self.models[taxa].preprocess(self._training_datasets['train']) self.models[taxa].train(self._training_datasets, self._database_data, self._cv) self._save_model(self._model_file, taxa) - - 
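Editor's note: the hunks above replace the old execute_* entry points with a fit/predict facade that walks the taxonomic levels and hands each one to a binary or multiclass trainer. As a reading aid, here is a minimal, self-contained sketch of that control flow only; TrainerSketch, BINARY_TAXAS and the placeholder training methods are illustrative assumptions, not the project's classes.

    # Minimal sketch of the fit()/_fit() split introduced in the diff above.
    # Only the dispatch logic mirrors the real code; everything else is a stand-in.
    BINARY_TAXAS = {'domain', 'bacteria', 'host'}   # levels handled by the binary classifier

    class TrainerSketch:
        def __init__(self, taxas_order, classifier_binary='deeplstm', classifier_multiclass='widecnn'):
            self.taxas_order = taxas_order                      # e.g. ['species', ..., 'domain']
            self.classifier_binary = classifier_binary
            self.classifier_multiclass = classifier_multiclass
            self.models = {}

        def fit(self, datasets):
            # Public wrapper: the real method is meant to validate parameters before delegating
            self._fit(datasets)

        def _fit(self, datasets):
            for taxa in self.taxas_order:
                if taxa in BINARY_TAXAS:
                    self.models[taxa] = self._binary_training(taxa, datasets)
                else:
                    self.models[taxa] = self._multiclass_training(taxa, datasets)

        def _binary_training(self, taxa, datasets):
            # Placeholder: the project builds a SklearnModel or KerasTFModel here
            return f'{self.classifier_binary} for {taxa}'

        def _multiclass_training(self, taxa, datasets):
            return f'{self.classifier_multiclass} for {taxa}'

    trainer = TrainerSketch(['species', 'genus', 'domain'])
    trainer.fit(datasets={})
    print(trainer.models)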
# Execute classification using trained model(s) over a given taxa - def execute_classification(self, data2classify): - print('execute_classification') - files_lst = glob(os.path.join(data2classify['profile'],'*.parquet')) - df = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - # df = ray.data.read_parquet_bulk(files_lst, parallelism = -1) - ids = data2classify['ids'] - if len(self.classified_data['sequence']) == 0: - raise ValueError('Please train a model before executing classification') - for i, taxa in enumerate(self.classified_data['sequence']): - try: - if i == 0: - df = self._classify_first(df, taxa, ids, data2classify['profile']) - else: - df = self._classify_subsequent(df, taxa, ids, data2classify['profile']) - except ValueError: - print('Stopping classification prematurelly because there are no more sequences to classify') - return taxa - return None + # Private predicting secondary functions + ######################################################################################################### +# TODO: Revise these functions to parallelise with Ray + ease process # Classify sequences for first iteration def _classify_first(self, df, taxa, ids, df_file): print('_classify_first') @@ -351,198 +361,6 @@ def _extract_subset(self, df, df_file, ids, taxa, status): df_clf.write_parquet(clf_file) return df_clf, clf_file - # Utils functions + # Helper functions ######################################################################################################### - - # Verify taxas and assign to class variable - def _verify_assign_taxas(self, taxa): - print('_verify_assign_taxas') - if taxa is None: - self._taxas = self._database_data['taxas'].copy() - elif isinstance(taxa, list): - self._taxas = taxa - elif isinstance(taxa, str): - self._taxas = [taxa] - else: - raise ValueError("Invalid taxa option, it must either be absent/None, be a list of taxas to extract or a string identifiying a taxa to extract") - self._verify_taxas() - - # Verify if selected taxas are in database - def _verify_taxas(self): - print('_verify_taxas') - for taxa in self._taxas: - if taxa not in self._database_data['taxas']: - raise ValueError("Taxa {} not found in database".format(taxa)) - - # Caller function for verifying if the data and model already exist - def _verify_load_data_model(self, data_file, model_file, taxa): - print('_verify_load_data_model') - self._verify_files(data_file, taxa) - return self._verify_load_model(model_file, taxa) - - # Load extracted data if already exists - def _verify_files(self, file, taxa): - print('_verify_files') - self.classified_data['sequence'].append(taxa) - if os.path.isfile(file): - self.classified_data[taxa] = load_Xy_data(file) - else: - self.classified_data[taxa] = {} - - # Load model if already exists - def _verify_load_model(self, model_file, taxa): - print('_verify_load_model') - if os.path.exists(model_file): - with open(model_file, 'rb') as f: - self.models[taxa] = cloudpickle.load(f) - return False - else: - return True - - def _save_model(self, model_file, taxa): - print('_save_model') - with open(model_file, 'wb') as f: - cloudpickle.dump(self.models[taxa], f) - - def _verify_classifier_binary(self): - print('_verify_classifier_binary') - if self._classifier_binary == 'onesvm': - if self._cv == True and self._host == True: - pass - elif self._cv == True and self._host == False: - raise ValueError('Classifier One-Class SVM cannot be cross-validated with bacteria data only!\nEither add host data from parameters or choose to predict 
directly using this method') - elif self._cv == False and self._host == True: - raise ValueError('Classifier One-Class SVM cannot classify with host data!\nEither remove host data from parameters or choose another bacteria extraction method') - elif self._cv == False and self._host == False: - pass - elif self._classifier_binary == 'onesvm' and self._host == False: - pass - elif self._classifier_binary in ['linearsvm','attention','lstm','deeplstm'] and self._host == True: - pass - elif self._classifier_binary in ['linearsvm','attention','lstm','deeplstm'] and self._host == False: - raise ValueError('Classifier {} cannot classify without host data!\nEither add host data to config file or choose the One-Class SVM classifier'.format(self._classifier_binary)) - else: - raise ValueError('Invalid classifier option for bacteria extraction!\n\tModels implemented at this moment are :\n\tBacteria isolator : One Class SVM (onesvm)\n\tClassic algorithm : Linear SVM (linearsvm)\n\tNeural networks : Attention (attention), Shallow LSTM (lstm) and Deep LSTM (deeplstm)') - def _verify_classifier_multiclass(self): - print('_verify_classifier_multiclass') - if self._classifier_multiclass in ['sgd','mnb','lstm_attention','cnn','widecnn']: - pass - else: - raise ValueError('Invalid classifier option for bacteria classification!\n\tModels implemented at this moment are :\n\tClassic algorithm : Stochastic Gradient Descent (sgd) and Multinomial Naïve Bayes (mnb)\n\tNeural networks : Deep hybrid between LSTM and Attention (lstm_attention), CNN (cnn) and Wide CNN (widecnn)') - - # Merge database and host reference data for bacteria extraction training - def _merge_database_host(self, database_data, host_data): - print('_merge_database_host') - self._merged_database_host = {} - self._merged_database_host['profile'] = f"{database_data['profile']}_host_merged" # Kmers profile - - if os.path.exists(self._merged_database_host['profile']): - files_lst = glob(os.path.join(self._merged_database_host['profile'],'*.parquet')) - df_merged = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - # df_merged = ray.data.read_parquet_bulk(files_lst, parallelism = -1) - else: - files_lst = glob(os.path.join(database_data['profile'],'*.parquet')) - df_db = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - # df_db = ray.data.read_parquet_bulk(files_lst, parallelism = -1) - files_lst = glob(os.path.join(host_data['profile'],'*.parquet')) - df_host = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - # df_host = ray.data.read_parquet_bulk(files_lst, parallelism = -1) - - cols2drop = [] - for col in df_db.schema().names: - if col not in ['id','domain','__value__']: - cols2drop.append(col) - df_db = df_db.drop_columns(cols2drop) - cols2drop = [] - for col in df_host.schema().names: - if col not in ['id','domain','__value__']: - cols2drop.append(col) - df_host = df_host.drop_columns(cols2drop) - - df_merged = df_db.union(df_host) - df_merged.write_parquet(self._merged_database_host['profile']) - - self._merged_database_host['ids'] = np.concatenate((database_data["ids"], host_data["ids"])) # IDs - self._merged_database_host['kmers'] = database_data["kmers"] # Features - self._merged_database_host['taxas'] = ['domain'] # Known taxas for classification - self._merged_database_host['fasta'] = (database_data['fasta'], host_data['fasta']) # Fasta file needed for reads simulation - - return df_merged - - # Load, merge db + host & simulate validation / test datasets - def 
_load_training_data_merged(self, taxa): - print('_load_training_data_merged') - if self._classifier_binary == 'onesvm' and taxa == 'domain': - files_lst = glob(os.path.join(self._database_data['profile'],'*.parquet')) - df_train = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - # df_train = ray.data.read_parquet_bulk(files_lst, parallelism = -1) - df_train = df_train.map_batches(convert_archaea_bacteria, batch_format = 'pandas') - df_val_test = self._merge_database_host(self._database_data, self._host_data) - df_val_test = df_val_test.map_batches(convert_archaea_bacteria, batch_format = 'pandas') - df_val = self.split_sim_cv_ds(df_val_test,self._merged_database_host, 'merged_validation') - self._merged_training_datasets = {'train': df_train, 'validation': df_val} - if self._cv: - df_test = self.split_sim_cv_ds(df_val_test,self._merged_database_host, 'merged_test') - self._merged_training_datasets['test'] = df_test - else: - df_train = self._merge_database_host(self._database_data, self._host_data) - df_train = df_train.map_batches(convert_archaea_bacteria, batch_format = 'pandas') - df_val = self.split_sim_cv_ds(df_train,self._merged_database_host, 'merged_validation') - self._merged_training_datasets = {'train': df_train, 'validation': df_val} - if self._cv: - df_test = self.split_sim_cv_ds(df_train,self._merged_database_host, 'merged_test') - self._merged_training_datasets['test'] = df_test - - # Load db & simulate validation / test datasets - def _load_training_data(self): - print('_load_training_data') - files_lst = glob(os.path.join(self._database_data['profile'],'*.parquet')) - df_train = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - # df_train = ray.data.read_parquet_bulk(files_lst, parallelism = -1) - df_train = df_train.map_batches(convert_archaea_bacteria, batch_format = 'pandas') - df_val = self.split_sim_cv_ds(df_train,self._database_data, 'validation') - self._training_datasets = {'train': df_train, 'validation': df_val} - if self._cv: - df_test = self.split_sim_cv_ds(df_train,self._database_data, 'test') - self._training_datasets['test'] = df_test - - def _sim_4_cv(self, df, kmers_ds, name): - print('_sim_4_cv') - cols = ['id'] - cols.extend(kmers_ds['taxas']) - cls = pd.DataFrame(columns = cols) - for batch in df.iter_batches(batch_format = 'pandas'): - cls = pd.concat([cls, batch[cols]], axis = 0, ignore_index = True) - - sim_outdir = os.path.dirname(kmers_ds['profile']) - cv_sim = readsSimulation(kmers_ds['fasta'], cls, list(cls['id']), 'miseq', sim_outdir, name) - sim_data = cv_sim.simulation(self._k, kmers_ds['kmers']) - files_lst = glob(os.path.join(sim_data['profile'],'*.parquet')) - df = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - # df = ray.data.read_parquet_bulk(files_lst, parallelism = -1) - return df - - def split_sim_cv_ds(self, ds, data, name): - ds_path = os.path.join( - os.path.dirname(data['profile']), - f'Xy_genome_simulation_{name}_data_K{len(data["kmers"][0])}' - ) - if os.path.exists(ds_path): - files_lst = glob(os.path.join(ds_path,'*.parquet')) - cv_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - # cv_ds = ray.data.read_parquet_bulk(files_lst, parallelism = -1) - else: - cv_ds = ds.random_sample(0.1) - if cv_ds.count() == 0: - nb_smpl = round(ds.count() * 0.1) - cv_ds = ds.random_shuffle().limit(nb_smpl) - cv_ds = self._sim_4_cv(cv_ds, data, name) - return cv_ds - -# Helper functions outside of class 
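Editor's note: split_sim_cv_ds above draws a roughly 10% subset for validation/test simulation and falls back to shuffle-and-limit when random sampling returns nothing (very small datasets). A self-contained sketch of just that sampling pattern, using a toy Ray dataset in place of the parquet-backed k-mers profile; the max(1, ...) guard is an addition for the toy case, not in the original.

    import ray

    def sample_fraction(ds, fraction=0.1):
        # Take roughly `fraction` of the rows; fall back to shuffle+limit if the sample is empty
        sampled = ds.random_sample(fraction)
        if sampled.count() == 0:
            n = max(1, round(ds.count() * fraction))   # assumption: guard against n == 0 on toy data
            sampled = ds.random_shuffle().limit(n)
        return sampled

    if __name__ == '__main__':
        toy = ray.data.range(50)                       # stand-in for the k-mers profile
        print(sample_fraction(toy).count())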
-############################################################################### - -def convert_archaea_bacteria(df): - df.loc[df['domain'].str.lower() == 'archaea', 'domain'] = 'Bacteria' - return df \ No newline at end of file diff --git a/src/models/classification_old.py b/src/models/classification_old.py new file mode 100644 index 0000000..7419d92 --- /dev/null +++ b/src/models/classification_old.py @@ -0,0 +1,335 @@ +import os +import ray +import cloudpickle + +import numpy as np +import pandas as pd + +from glob import glob +from shutil import rmtree +from utils import load_Xy_data +from models.sklearn.models import SklearnModel +from models.kerasTF.models import KerasTFModel + +# Simulation class +from models.reads_simulation import readsSimulation + +__author__ = 'Nicolas de Montigny' + +__all__ = ['ClassificationMethods'] + +class ClassificationMethods(): + """ + Utilities class for classifying sequences from metagenomes using ray + + ---------- + Attributes + ---------- + + classified_data : dictionary + Dictionary containing the classified data for each classified taxonomic level + + models : dictionary + Dictionary containing the trained models for each taxonomic level + + ---------- + Methods + ---------- + + execute_training : launch the training of the models for the chosen taxonomic levels + no parameters to pass + + execute_classification : + data2classify : a dictionnary containing the data to classify produced by the function Caribou.src.data.build_data.build_X_data + + """ + def __init__( + self, + database_k_mers, + k, + outdirs, + database, + classifier_binary = 'deeplstm', + classifier_multiclass = 'widecnn', + taxa = None, + threshold = 0.8, + batch_size = 32, + training_epochs = 100, + verbose = True, + cv = False + ): + # Parameters + self._k = k + self._cv = cv + self._taxas = taxa + self._outdirs = outdirs + self._database = database + self._verbose = verbose + self._threshold = threshold + self._classifier_binary = classifier_binary + self._classifier_multiclass = classifier_multiclass + self._batch_size = batch_size + self._training_epochs = training_epochs + # Initialize with values + self.classified_data = { + 'sequence': [], + 'classification' : None, + 'classified_ids' : [], + 'unknown_ids' : [] + } + # Empty initializations + self.models = {} + self._host = False + self._taxas_order = [] + self._host_data = None + self._database_data = None + self._training_datasets = None + self._merged_training_datasets = None + self._merged_database_host = None + self.previous_taxa_unclassified = None + # Extract database data + if isinstance(database_k_mers, tuple): + self._host = True + self._database_data = database_k_mers[0] + self._host_data = database_k_mers[1] + else: + self._database_data = database_k_mers + # Remove 'id' from kmers if present + if 'id' in self._database_data['kmers']: + self._database_data['kmers'].remove('id') + if self._host and 'id' in self._host_data['kmers']: + self._host_data['kmers'].remove('id') + # Assign taxas order for top-down strategy + self._taxas_order = self._database_data['taxas'].copy() + self._taxas_order.reverse() + # Automatic executions + self._verify_assign_taxas(taxa) + + # Main functions + ######################################################################################################### + + # Wrapper function for training and predicting over each known taxa + def execute_training_prediction(self, data2classify): + print('execute_training_prediction') + files_lst = 
glob(os.path.join(data2classify['profile'],'*.parquet')) + df2classify = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + # df2classify = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + ids2classify = data2classify['ids'] + for i, taxa in enumerate(self._taxas_order): + if taxa in self._taxas: + # Training + if taxa in ['domain','bacteria','host']: + clf = self._classifier_binary + else: + clf = self._classifier_multiclass + self._data_file = os.path.join(self._outdirs['data_dir'], f'Xy_{taxa}_database_K{self._k}_{clf}_{self._database}_data.npz') + self._model_file = os.path.join(self._outdirs['models_dir'], f'{clf}_{taxa}.pkl') + train = self._verify_load_data_model(self._data_file, self._model_file, taxa) + if train: + self._train_model(taxa) + # Predicting + try: + if i == 0: + df2classify = self._classify_first(df2classify, taxa, ids2classify, data2classify['profile']) + else: + df2classify = self._classify_subsequent(df2classify, taxa, ids2classify, data2classify['profile']) + except ValueError: + print('Stopping classification prematurelly because there are no more sequences to classify') + return taxa + return None + + # Utils functions + ######################################################################################################### + + # Verify taxas and assign to class variable + def _verify_assign_taxas(self, taxa): + print('_verify_assign_taxas') + if taxa is None: + self._taxas = self._database_data['taxas'].copy() + elif isinstance(taxa, list): + self._taxas = taxa + elif isinstance(taxa, str): + self._taxas = [taxa] + else: + raise ValueError("Invalid taxa option, it must either be absent/None, be a list of taxas to extract or a string identifiying a taxa to extract") + self._verify_taxas() + + # Verify if selected taxas are in database + def _verify_taxas(self): + print('_verify_taxas') + for taxa in self._taxas: + if taxa not in self._database_data['taxas']: + raise ValueError("Taxa {} not found in database".format(taxa)) + + # Caller function for verifying if the data and model already exist + def _verify_load_data_model(self, data_file, model_file, taxa): + print('_verify_load_data_model') + self._verify_files(data_file, taxa) + return self._verify_load_model(model_file, taxa) + + # Load extracted data if already exists + def _verify_files(self, file, taxa): + print('_verify_files') + self.classified_data['sequence'].append(taxa) + if os.path.isfile(file): + self.classified_data[taxa] = load_Xy_data(file) + else: + self.classified_data[taxa] = {} + + # Load model if already exists + def _verify_load_model(self, model_file, taxa): + print('_verify_load_model') + if os.path.exists(model_file): + with open(model_file, 'rb') as f: + self.models[taxa] = cloudpickle.load(f) + return False + else: + return True + + def _save_model(self, model_file, taxa): + print('_save_model') + with open(model_file, 'wb') as f: + cloudpickle.dump(self.models[taxa], f) + + def _verify_classifier_binary(self): + print('_verify_classifier_binary') + if self._classifier_binary == 'onesvm': + if self._cv == True and self._host == True: + pass + elif self._cv == True and self._host == False: + raise ValueError('Classifier One-Class SVM cannot be cross-validated with bacteria data only!\nEither add host data from parameters or choose to predict directly using this method') + elif self._cv == False and self._host == True: + raise ValueError('Classifier One-Class SVM cannot classify with host data!\nEither remove host data from parameters or choose another 
bacteria extraction method') + elif self._cv == False and self._host == False: + pass + elif self._classifier_binary == 'onesvm' and self._host == False: + pass + elif self._classifier_binary in ['linearsvm','attention','lstm','deeplstm'] and self._host == True: + pass + elif self._classifier_binary in ['linearsvm','attention','lstm','deeplstm'] and self._host == False: + raise ValueError('Classifier {} cannot classify without host data!\nEither add host data to config file or choose the One-Class SVM classifier'.format(self._classifier_binary)) + else: + raise ValueError('Invalid classifier option for bacteria extraction!\n\tModels implemented at this moment are :\n\tBacteria isolator : One Class SVM (onesvm)\n\tClassic algorithm : Linear SVM (linearsvm)\n\tNeural networks : Attention (attention), Shallow LSTM (lstm) and Deep LSTM (deeplstm)') + + def _verify_classifier_multiclass(self): + print('_verify_classifier_multiclass') + if self._classifier_multiclass in ['sgd','mnb','lstm_attention','cnn','widecnn']: + pass + else: + raise ValueError('Invalid classifier option for bacteria classification!\n\tModels implemented at this moment are :\n\tClassic algorithm : Stochastic Gradient Descent (sgd) and Multinomial Naïve Bayes (mnb)\n\tNeural networks : Deep hybrid between LSTM and Attention (lstm_attention), CNN (cnn) and Wide CNN (widecnn)') + + # Merge database and host reference data for bacteria extraction training + def _merge_database_host(self, database_data, host_data): + print('_merge_database_host') + self._merged_database_host = {} + self._merged_database_host['profile'] = f"{database_data['profile']}_host_merged" # Kmers profile + + if os.path.exists(self._merged_database_host['profile']): + files_lst = glob(os.path.join(self._merged_database_host['profile'],'*.parquet')) + df_merged = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + # df_merged = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + else: + files_lst = glob(os.path.join(database_data['profile'],'*.parquet')) + df_db = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + # df_db = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + files_lst = glob(os.path.join(host_data['profile'],'*.parquet')) + df_host = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + # df_host = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + + cols2drop = [] + for col in df_db.schema().names: + if col not in ['id','domain','__value__']: + cols2drop.append(col) + df_db = df_db.drop_columns(cols2drop) + cols2drop = [] + for col in df_host.schema().names: + if col not in ['id','domain','__value__']: + cols2drop.append(col) + df_host = df_host.drop_columns(cols2drop) + + df_merged = df_db.union(df_host) + df_merged.write_parquet(self._merged_database_host['profile']) + + self._merged_database_host['ids'] = np.concatenate((database_data["ids"], host_data["ids"])) # IDs + self._merged_database_host['kmers'] = database_data["kmers"] # Features + self._merged_database_host['taxas'] = ['domain'] # Known taxas for classification + self._merged_database_host['fasta'] = (database_data['fasta'], host_data['fasta']) # Fasta file needed for reads simulation + + return df_merged + + # Load, merge db + host & simulate validation / test datasets + def _load_training_data_merged(self, taxa): + print('_load_training_data_merged') + if self._classifier_binary == 'onesvm' and taxa == 'domain': + files_lst = glob(os.path.join(self._database_data['profile'],'*.parquet')) + df_train = 
ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + # df_train = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + df_train = df_train.map_batches(convert_archaea_bacteria, batch_format = 'pandas') + df_val_test = self._merge_database_host(self._database_data, self._host_data) + df_val_test = df_val_test.map_batches(convert_archaea_bacteria, batch_format = 'pandas') + df_val = self.split_sim_cv_ds(df_val_test,self._merged_database_host, 'merged_validation') + self._merged_training_datasets = {'train': df_train, 'validation': df_val} + if self._cv: + df_test = self.split_sim_cv_ds(df_val_test,self._merged_database_host, 'merged_test') + self._merged_training_datasets['test'] = df_test + else: + df_train = self._merge_database_host(self._database_data, self._host_data) + df_train = df_train.map_batches(convert_archaea_bacteria, batch_format = 'pandas') + df_val = self.split_sim_cv_ds(df_train,self._merged_database_host, 'merged_validation') + self._merged_training_datasets = {'train': df_train, 'validation': df_val} + if self._cv: + df_test = self.split_sim_cv_ds(df_train,self._merged_database_host, 'merged_test') + self._merged_training_datasets['test'] = df_test + + # Load db & simulate validation / test datasets + def _load_training_data(self): + print('_load_training_data') + files_lst = glob(os.path.join(self._database_data['profile'],'*.parquet')) + df_train = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + # df_train = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + df_train = df_train.map_batches(convert_archaea_bacteria, batch_format = 'pandas') + df_val = self.split_sim_cv_ds(df_train,self._database_data, 'validation') + self._training_datasets = {'train': df_train, 'validation': df_val} + if self._cv: + df_test = self.split_sim_cv_ds(df_train,self._database_data, 'test') + self._training_datasets['test'] = df_test + + def _sim_4_cv(self, df, kmers_ds, name): + print('_sim_4_cv') + cols = ['id'] + cols.extend(kmers_ds['taxas']) + cls = pd.DataFrame(columns = cols) + for batch in df.iter_batches(batch_format = 'pandas'): + cls = pd.concat([cls, batch[cols]], axis = 0, ignore_index = True) + + sim_outdir = os.path.dirname(kmers_ds['profile']) + cv_sim = readsSimulation(kmers_ds['fasta'], cls, list(cls['id']), 'miseq', sim_outdir, name) + sim_data = cv_sim.simulation(self._k, kmers_ds['kmers']) + files_lst = glob(os.path.join(sim_data['profile'],'*.parquet')) + df = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + # df = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + return df + + def split_sim_cv_ds(self, ds, data, name): + ds_path = os.path.join( + os.path.dirname(data['profile']), + f'Xy_genome_simulation_{name}_data_K{len(data["kmers"][0])}' + ) + if os.path.exists(ds_path): + files_lst = glob(os.path.join(ds_path,'*.parquet')) + cv_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + # cv_ds = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + else: + cv_ds = ds.random_sample(0.1) + if cv_ds.count() == 0: + nb_smpl = round(ds.count() * 0.1) + cv_ds = ds.random_shuffle().limit(nb_smpl) + cv_ds = self._sim_4_cv(cv_ds, data, name) + return cv_ds + +# Helper functions outside of class +############################################################################### + +def convert_archaea_bacteria(df): + df.loc[df['domain'].str.lower() == 'archaea', 'domain'] = 'Bacteria' + return df \ No newline at end of file diff --git a/src/models/kerasTF/models.py 
b/src/models/kerasTF/models.py index 79c763d..cd57ef5 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -164,7 +164,6 @@ def preprocess(self, df): TensorRDFFeaturesSelection(self.kmers, self.taxa), ) - self._encoder.fit(df) df = self._preprocessor.fit_transform(df) self._reductor = TensorTruncatedSVDReduction(self.kmers) diff --git a/src/supplement/sklearn_tuning.py b/src/supplement/sklearn_tuning.py index 76f88ad..ed52dc3 100644 --- a/src/supplement/sklearn_tuning.py +++ b/src/supplement/sklearn_tuning.py @@ -14,16 +14,19 @@ from utils import * from models.reads_simulation import readsSimulation from models.ray_tensor_min_max import TensorMinMaxScaler + # from ray.data.preprocessors import MinMaxScaler from src.models.sklearn.partial_trainer import SklearnPartialTrainer from src.models.encoders.onesvm_label_encoder import OneClassSVMLabelEncoder # Preprocessing from ray.data.preprocessors import Chain, LabelEncoder + # Training from models.sklearn.scoring_one_svm import ScoringSGDOneClassSVM from sklearn.naive_bayes import MultinomialNB from sklearn.linear_model import SGDClassifier + # Tuning from ray import tune from ray.tune import Tuner, TuneConfig @@ -42,41 +45,41 @@ def merge_db_host(db_data, host_data): if os.path.exists(merged_db_host['profile']): files_lst = glob(os.path.join(merged_db_host['profile'], '*.parquet')) - df_merged = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + merged_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) else: files_lst = glob(os.path.join(db_data['profile'], '*.parquet')) - df_db = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + db_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) files_lst = glob(os.path.join(host_data['profile'], '*.parquet')) - df_host = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + host_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) col2drop = [] - for col in df_db.schema().names: + for col in db_ds.schema().names: if col not in ['id','domain','__value__']: col2drop.append(col) - df_db = df_db.drop_columns(col2drop) + db_ds = db_ds.drop_columns(col2drop) col2drop = [] - for col in df_host.schema().names: + for col in host_ds.schema().names: if col not in ['id','domain','__value__']: col2drop.append(col) - df_host = df_host.drop_columns(col2drop) + host_ds = host_ds.drop_columns(col2drop) - df_merged = df_db.union(df_host) - df_merged.write_parquet(merged_db_host['profile']) + merged_ds = db_ds.union(host_ds) + merged_ds.write_parquet(merged_db_host['profile']) merged_db_host['ids'] = np.concatenate((db_data["ids"], host_data["ids"])) # IDs merged_db_host['kmers'] = db_data['kmers'] # Features merged_db_host['taxas'] = ['domain'] # Known taxas for classification merged_db_host['fasta'] = (db_data['fasta'], host_data['fasta']) # Fasta file needed for reads simulation - return merged_db_host, df_merged + return merged_db_host, merged_ds -def sim_4_cv(df, database_data, name): +def sim_4_cv(ds, database_data, name): print('_sim_4_cv') k = len(database_data['kmers'][0]) cols = ['id'] cols.extend(database_data['taxas']) cls = pd.DataFrame(columns = cols) - for batch in df.iter_batches(batch_format = 'pandas'): + for batch in ds.iter_batches(batch_format = 'pandas'): cls = pd.concat([cls, batch[cols]], axis = 0, ignore_index = True) sim_outdir = os.path.dirname(database_data['profile']) @@ -84,8 +87,8 @@ def sim_4_cv(df, database_data, name): cv_sim = readsSimulation(database_data['fasta'], cls, 
list(cls['id']), 'miseq', sim_outdir, name) sim_data = cv_sim.simulation(k, database_data['kmers']) files_lst = glob(os.path.join(sim_data['profile'], '*.parquet')) - df = ray.data.read_parquet_bulk(files_lst, parallelism = -1) - return df + ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + return ds def convert_archaea_bacteria(df): df.loc[df['domain'].str.lower() == 'archaea', 'domain'] = 'Bacteria' @@ -106,7 +109,7 @@ def split_val_test_ds(ds, data): test_path = os.path.join(os.path.dirname(data['profile']), f'Xy_genome_simulation_test_data_K{len(data["kmers"][0])}') if os.path.exists(val_path): files_lst = glob(os.path.join(val_path, '*.parquet')) - val_ds = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + val_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) val_ds = val_ds.map_batches( convert_archaea_bacteria, batch_format = 'pandas' @@ -119,7 +122,7 @@ def split_val_test_ds(ds, data): val_ds = sim_4_cv(val_ds, data, 'validation') if os.path.exists(test_path): files_lst = glob(os.path.join(test_path, '*.parquet')) - test_ds = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + test_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) test_ds = test_ds.map_batches( convert_archaea_bacteria, batch_format = 'pandas' @@ -164,7 +167,7 @@ def split_val_test_ds(ds, data): val_ds, test_ds = split_val_test_ds(test_val_ds,test_val_data) db_data = verify_load_data(opt['data']) files_lst = glob(os.path.join(db_data['profile'], '*.parquet')) - train_ds = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + train_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) elif opt['classifier'] == 'linearsvm' and opt['taxa'] == 'domain': if opt['data_host'] is None: raise ValueError('To tune for a domain taxa, a host species is required.\ @@ -175,7 +178,7 @@ def split_val_test_ds(ds, data): else: db_data = verify_load_data(opt['data']) files_lst = glob(os.path.join(db_data['profile'], '*.parquet')) - train_ds = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + train_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) val_ds, test_ds = split_val_test_ds(train_ds, db_data) # Preprocessing diff --git a/src/utils.py b/src/utils.py index f7e36a6..8b826b0 100644 --- a/src/utils.py +++ b/src/utils.py @@ -2,14 +2,19 @@ import ray import json import logging +import warnings + import numpy as np import pandas as pd import pyarrow as pa +from glob import glob from pathlib import Path from warnings import warn from psutil import virtual_memory +from models.reads_simulation import readsSimulation + __author__ = "Nicolas de Montigny" __all__ = [ @@ -36,7 +41,13 @@ 'verify_load_preclassified', 'merge_save_data', 'zip_X_y', - 'ensure_length_ds' + 'ensure_length_ds', + 'convert_archaea_bacteria', + 'verify_load_db', + 'verify_load_host_merge', + 'merge_db_host', + 'split_sim_dataset', + 'sim_dataset' ] # System @@ -58,7 +69,7 @@ def init_ray_cluster(workdir): ray.shutdown() frac -= 0.05 -# Data handling +# Data I/O ######################################################################################################### # Load data from file @@ -282,3 +293,92 @@ def ensure_length_ds(len_x, len_y): if len_x != len_y: raise ValueError( 'X and y have different lengths: {} and {}'.format(len_x, len_y)) + +# Datasets handling +######################################################################################################### + +def convert_archaea_bacteria(df): + 
df.loc[df['domain'].str.lower() == 'archaea', 'domain'] = 'Bacteria' + return df + +def verify_load_db(db_data): + """ + Wrapper function for verifying and loading the db dataset + """ + db_data = verify_load_data(db_data) + files_lst = glob(os.path.join(db_data['profile'], '*.parquet')) + db_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + db_ds = db_ds.map_batches(convert_archaea_bacteria, batch_format = 'pandas') + + return db_data, db_ds + +def verify_load_host_merge(db_data, host_data): + """ + Wrapper function for verifying, loading and merging both datasets + """ + db_data = verify_load_data(db_data) + host_data = verify_load_data(host_data) + verify_concordance_klength(len(db_data['kmers'][0]), len(host_data['kmers'][0])) + merged_data, merged_ds = merge_db_host(db_data, host_data) + + return merged_data, merged_ds + +def merge_db_host(db_data, host_data): + """ + Merge the two databases along the rows axis + """ + merged_db_host = {} + merged_db_host['profile'] = f"{db_data['profile']}_host_merged" + + if os.path.exists(merged_db_host['profile']): + files_lst = glob(os.path.join(merged_db_host['profile'], '*.parquet')) + merged_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + else: + files_lst = glob(os.path.join(db_data['profile'], '*.parquet')) + db_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + files_lst = glob(os.path.join(host_data['profile'], '*.parquet')) + host_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + + merged_ds = db_ds.union(host_ds) + merged_ds = merged_ds.map_batches(convert_archaea_bacteria, batch_format = 'pandas') + merged_ds.write_parquet(merged_db_host['profile']) + + merged_db_host['ids'] = np.concatenate((db_data["ids"], host_data["ids"])) # IDs + merged_db_host['kmers'] = db_data['kmers'] # Features + merged_db_host['taxas'] = ['domain'] # Known taxas for classification + merged_db_host['fasta'] = (db_data['fasta'], host_data['fasta']) # Fasta file needed for reads simulation + + return merged_db_host, merged_ds + +def split_sim_dataset(ds, data, name): + splitted_path = os.path.join(os.path.dirname(data['profile']), f'Xy_genome_simulation_{name}_data_K{len(data["kmers"][0])}') + if os.path.exists(splitted_path): + warnings.warn(f'Splitted dataset {name} already exists, skipping simulation') + return None + else: + splitted_ds = ds.random_sample(0.1) + if splitted_ds.count() == 0: + nb_samples = round(ds.count() * 0.1) + splitted_ds = ds.random_shuffle().limit(nb_samples) + + sim_dataset(ds, data, name) + return splitted_ds + +def sim_dataset(ds, data, name): + """ + Simulate the dataset from the database and generate its data + """ + k = len(data['kmers'][0]) + cols = ['id'] + cols.extend(data['taxas']) + cls = pd.DataFrame(columns = cols) + for batch in ds.iter_batches(batch_format = 'pandas'): + cls = pd.concat([cls, batch[cols]], axis = 0, ignore_index = True) + + sim_outdir = os.path.dirname(data['profile']) + cv_sim = readsSimulation(data['fasta'], cls, list(cls['id']), 'miseq', sim_outdir, name) + sim_data = cv_sim.simulation(k, data['kmers']) + files_lst = glob(os.path.join(sim_data['profile'], '*.parquet')) + sim_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + return sim_ds + From a44dcf7459330b212917eb5aba09cbb77fc7e13e Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Sat, 4 Nov 2023 22:50:49 -0400 Subject: [PATCH 21/92] debug circular import --- src/Caribou_simulate_test_val.py | 9 +++----- 
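Editor's note: the utils.py helpers added above all share the same loading idiom: glob the parquet fragments of a profile directory, read them with ray.data.read_parquet_bulk, then harmonise the domain labels through map_batches. A compact sketch of that idiom follows; load_profile is an illustrative name, not a function in the codebase.

    import os
    import ray
    import pandas as pd
    from glob import glob

    def convert_archaea_bacteria(df: pd.DataFrame) -> pd.DataFrame:
        # Archaea are folded into the Bacteria label for the binary (domain) task
        df.loc[df['domain'].str.lower() == 'archaea', 'domain'] = 'Bacteria'
        return df

    def load_profile(profile_dir: str) -> ray.data.Dataset:
        # One read task per parquet fragment, mirroring parallelism = len(files) in the diff
        files = glob(os.path.join(profile_dir, '*.parquet'))
        ds = ray.data.read_parquet_bulk(files, parallelism=len(files))
        return ds.map_batches(convert_archaea_bacteria, batch_format='pandas')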
src/models/reads_simulation.py | 39 ++++++++++++++++++++++++++++++-- src/utils.py | 39 +------------------------------- 3 files changed, 41 insertions(+), 46 deletions(-) diff --git a/src/Caribou_simulate_test_val.py b/src/Caribou_simulate_test_val.py index a1ee878..969f533 100644 --- a/src/Caribou_simulate_test_val.py +++ b/src/Caribou_simulate_test_val.py @@ -5,6 +5,7 @@ from utils import * from time import time from pathlib import Path +from models.reads_simulation import split_sim_dataset __author__ = "Nicolas de Montigny" @@ -46,15 +47,11 @@ def simulation(opt): t_val = None if opt['test']: t_s = time() - test_ds = split_dataset(db_ds, db_data, 'test') - if test_ds is not None: - sim_dataset(test_ds, db_data, 'test') + test_ds = split_sim_dataset(db_ds, db_data, 'test') t_test = time() - t_s if opt['validation']: t_s = time() - val_ds = split_dataset(db_ds, db_data, 'validation') - if val_ds is not None: - sim_dataset(val_ds, db_data, 'validation') + val_ds = split_sim_dataset(db_ds, db_data, 'validation') t_val = time() - t_s if t_test is not None: diff --git a/src/models/reads_simulation.py b/src/models/reads_simulation.py index 0c47f81..5437dbd 100644 --- a/src/models/reads_simulation.py +++ b/src/models/reads_simulation.py @@ -15,7 +15,7 @@ __author__ = "Nicolas de Montigny" -__all__ = ['ReadsSimulation'] +__all__ = ['ReadsSimulation','split_sim_dataset','sim_dataset'] # Reduce number of cpus used to reduce nb of tmp files # reduce number of reads generated @@ -203,4 +203,39 @@ def _verify_sim_arguments(self, k, kmers_list): elif k is not None and kmers_list is None: warn("K is provided but k-mers list is None, k-mers list will be generated") raise ValueError("k value was provided but not k-mers list, please provide a k-mers list or no k value") - return k, kmers_list \ No newline at end of file + return k, kmers_list + +# Helper functions +######################################################################################################### + +def split_sim_dataset(ds, data, name): + splitted_path = os.path.join(os.path.dirname(data['profile']), f'Xy_genome_simulation_{name}_data_K{len(data["kmers"][0])}') + if os.path.exists(splitted_path): + warnings.warn(f'Splitted dataset {name} already exists, skipping simulation') + return None + else: + splitted_ds = ds.random_sample(0.1) + if splitted_ds.count() == 0: + nb_samples = round(ds.count() * 0.1) + splitted_ds = ds.random_shuffle().limit(nb_samples) + + sim_dataset(ds, data, name) + return splitted_ds + +def sim_dataset(ds, data, name): + """ + Simulate the dataset from the database and generate its data + """ + k = len(data['kmers'][0]) + cols = ['id'] + cols.extend(data['taxas']) + cls = pd.DataFrame(columns = cols) + for batch in ds.iter_batches(batch_format = 'pandas'): + cls = pd.concat([cls, batch[cols]], axis = 0, ignore_index = True) + + sim_outdir = os.path.dirname(data['profile']) + cv_sim = readsSimulation(data['fasta'], cls, list(cls['id']), 'miseq', sim_outdir, name) + sim_data = cv_sim.simulation(k, data['kmers']) + files_lst = glob(os.path.join(sim_data['profile'], '*.parquet')) + sim_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + return sim_ds \ No newline at end of file diff --git a/src/utils.py b/src/utils.py index 8b826b0..f52d467 100644 --- a/src/utils.py +++ b/src/utils.py @@ -13,8 +13,6 @@ from warnings import warn from psutil import virtual_memory -from models.reads_simulation import readsSimulation - __author__ = "Nicolas de Montigny" __all__ = [ @@ -45,9 +43,7 @@ 
'convert_archaea_bacteria', 'verify_load_db', 'verify_load_host_merge', - 'merge_db_host', - 'split_sim_dataset', - 'sim_dataset' + 'merge_db_host' ] # System @@ -349,36 +345,3 @@ def merge_db_host(db_data, host_data): merged_db_host['fasta'] = (db_data['fasta'], host_data['fasta']) # Fasta file needed for reads simulation return merged_db_host, merged_ds - -def split_sim_dataset(ds, data, name): - splitted_path = os.path.join(os.path.dirname(data['profile']), f'Xy_genome_simulation_{name}_data_K{len(data["kmers"][0])}') - if os.path.exists(splitted_path): - warnings.warn(f'Splitted dataset {name} already exists, skipping simulation') - return None - else: - splitted_ds = ds.random_sample(0.1) - if splitted_ds.count() == 0: - nb_samples = round(ds.count() * 0.1) - splitted_ds = ds.random_shuffle().limit(nb_samples) - - sim_dataset(ds, data, name) - return splitted_ds - -def sim_dataset(ds, data, name): - """ - Simulate the dataset from the database and generate its data - """ - k = len(data['kmers'][0]) - cols = ['id'] - cols.extend(data['taxas']) - cls = pd.DataFrame(columns = cols) - for batch in ds.iter_batches(batch_format = 'pandas'): - cls = pd.concat([cls, batch[cols]], axis = 0, ignore_index = True) - - sim_outdir = os.path.dirname(data['profile']) - cv_sim = readsSimulation(data['fasta'], cls, list(cls['id']), 'miseq', sim_outdir, name) - sim_data = cv_sim.simulation(k, data['kmers']) - files_lst = glob(os.path.join(sim_data['profile'], '*.parquet')) - sim_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - return sim_ds - From b081221da800e14d6d8112a63001f47bb58117b2 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Sun, 5 Nov 2023 17:22:31 -0500 Subject: [PATCH 22/92] reads_simulation debug --- src/models/reads_simulation.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/models/reads_simulation.py b/src/models/reads_simulation.py index 5437dbd..b78d7d9 100644 --- a/src/models/reads_simulation.py +++ b/src/models/reads_simulation.py @@ -4,7 +4,9 @@ import pandas as pd import os +import ray import gzip +import warnings from Bio import SeqIO from glob import glob From 5393fdba745d27f4b9267e482e6dc239ce4471c8 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Sun, 5 Nov 2023 18:50:59 -0500 Subject: [PATCH 23/92] datasets loading in split for test/val + host merge --- src/Caribou_classification.py | 5 ++++- src/Caribou_classification_train_cv.py | 8 ++++++-- src/Caribou_extraction.py | 7 ++++++- src/Caribou_extraction_train_cv.py | 10 ++++++++-- src/Caribou_pipeline.py | 1 + src/Caribou_simulate_test_val.py | 25 ++++++++++++++++--------- src/models/classification.py | 16 +++++++++++----- src/models/reads_simulation.py | 16 ++++++++++------ src/utils.py | 19 +++++++++++++++++-- 9 files changed, 79 insertions(+), 28 deletions(-) diff --git a/src/Caribou_classification.py b/src/Caribou_classification.py index 9c29172..a1992c7 100644 --- a/src/Caribou_classification.py +++ b/src/Caribou_classification.py @@ -6,12 +6,15 @@ from utils import * from time import time from pathlib import Path +from models.reads_simulation import split_sim_dataset from models.classification_old import ClassificationMethods __author__ = "Nicolas de Montigny" __all__ = ['bacteria_classification_train_cv'] +VALIDATION_DATASET_NAME = 'validation' + # Initialisation / validation of parameters from CLI ################################################################################ def bacteria_classification(opt): @@ -51,7 +54,7 @@ def bacteria_classification(opt): if 
'domain' in lst_taxas: lst_taxas.remove('domain') - val_ds = split_sim_dataset(db_ds, db_data, 'validation') + val_ds, val_data = split_sim_dataset(db_ds, db_data, VALIDATION_DATASET_NAME) # Definition of model for bacteria taxonomic classification + training ################################################################################ diff --git a/src/Caribou_classification_train_cv.py b/src/Caribou_classification_train_cv.py index d7d25a5..aac75d8 100644 --- a/src/Caribou_classification_train_cv.py +++ b/src/Caribou_classification_train_cv.py @@ -7,6 +7,7 @@ from time import time from pathlib import Path from logging import ERROR +from models.reads_simulation import split_sim_dataset from models.classification_old import ClassificationMethods warnings.filterwarnings('ignore') @@ -15,6 +16,9 @@ __all__ = ['bacteria_classification_train_cv'] +VALIDATION_DATASET_NAME = 'validation' +TEST_DATASET_NAME = 'test' + # Initialisation / validation of parameters from CLI ################################################################################ def bacteria_classification_train_cv(opt): @@ -48,8 +52,8 @@ def bacteria_classification_train_cv(opt): if 'domain' in lst_taxas: lst_taxas.remove('domain') - test_ds = split_sim_dataset(db_ds, db_data, 'test') - val_ds = split_sim_dataset(db_ds, db_data, 'validation') + test_ds, test_data = split_sim_dataset(db_ds, db_data, TEST_DATASET_NAME) + val_ds, val_data = split_sim_dataset(db_ds, db_data, VALIDATION_DATASET_NAME) # Training and cross-validation of models for classification of bacterias ################################################################################ diff --git a/src/Caribou_extraction.py b/src/Caribou_extraction.py index d3ea11f..eda156b 100644 --- a/src/Caribou_extraction.py +++ b/src/Caribou_extraction.py @@ -5,12 +5,15 @@ from utils import * from time import time from pathlib import Path +from models.reads_simulation import split_sim_dataset from models.classification_old import ClassificationMethods __author__ = "Nicolas de Montigny" __all__ = ['bacteria_extraction_train_cv'] +VALIDATION_DATASET_NAME = 'validation' + # Initialisation / validation of parameters from CLI ################################################################################ def bacteria_extraction(opt): @@ -35,13 +38,15 @@ def bacteria_extraction(opt): if opt['data_host'] is not None: db_data, db_ds = verify_load_host_merge(opt['data_bacteria'], opt['data_host']) + db_name = 'host_merged' else: db_data, db_ds = verify_load_db(opt['data_bacteria']) + db_name = opt['dataset_name'] data_metagenome = verify_load_data(opt['data_metagenome']) k_length = len(db_data['kmers'][0]) - val_ds = split_sim_dataset(db_ds, db_data, 'validation') + val_ds, val_data = split_sim_dataset(db_ds, db_data, f'{VALIDATION_DATASET_NAME}_{db_name}') # Definition of model for bacteria extraction / host removal + execution ################################################################################ diff --git a/src/Caribou_extraction_train_cv.py b/src/Caribou_extraction_train_cv.py index fccb0c5..2a77471 100644 --- a/src/Caribou_extraction_train_cv.py +++ b/src/Caribou_extraction_train_cv.py @@ -5,12 +5,16 @@ from utils import * from time import time from pathlib import Path +from models.reads_simulation import split_sim_dataset from models.classification_old import ClassificationMethods __author__ = "Nicolas de Montigny" __all__ = ['bacteria_extraction_train_cv'] +VALIDATION_DATASET_NAME = 'validation' +TEST_DATASET_NAME = 'test' + # Initialisation / validation of 
parameters from CLI ################################################################################ def bacteria_extraction_train_cv(opt): @@ -29,13 +33,15 @@ def bacteria_extraction_train_cv(opt): if opt['data_host'] is not None: db_data, db_ds = verify_load_host_merge(opt['data_bacteria'], opt['data_host']) + db_name = 'host_merged' else: db_data, db_ds = verify_load_db(opt['data_bacteria']) + db_name = opt['dataset_name'] k_length = len(db_data['kmers'][0]) - test_ds = split_sim_dataset(db_ds, db_data, 'test') - val_ds = split_sim_dataset(db_ds, db_data, 'validation') + test_ds, test_data = split_sim_dataset(db_ds, db_data, f'{TEST_DATASET_NAME}_{db_name}') + val_ds, val_data = split_sim_dataset(db_ds, db_data, f'{VALIDATION_DATASET_NAME}_{db_name}') # Training and cross-validation of models for bacteria extraction / host removal ################################################################################ diff --git a/src/Caribou_pipeline.py b/src/Caribou_pipeline.py index f6b1fe5..a6fdb0b 100644 --- a/src/Caribou_pipeline.py +++ b/src/Caribou_pipeline.py @@ -15,6 +15,7 @@ __all__ = ['caribou'] + # Part 0 - Initialisation / extraction of parameters from config file ################################################################################ def caribou(opt): diff --git a/src/Caribou_simulate_test_val.py b/src/Caribou_simulate_test_val.py index 969f533..ca45c4d 100644 --- a/src/Caribou_simulate_test_val.py +++ b/src/Caribou_simulate_test_val.py @@ -17,6 +17,9 @@ The script leverages the InSilicoSeq package for simulation of sequencing reads """ +VALIDATION_DATASET_NAME = 'validation' +TEST_DATASET_NAME = 'test' + # Initialisation / validation of parameters from CLI ################################################################################ def simulation(opt): @@ -24,18 +27,22 @@ def simulation(opt): 1. Verify existence of files and load data 2. Verify k-mers length concordance 3. Initialize cluster + 4. 
Load data and merge if necessary """ - if opt['hostset'] is not None: - db_data, db_ds = verify_load_host_merge(opt['dataset'], opt['hostset']) - else: - db_data, db_ds = verify_load_db(opt['dataset']) - + verify_file(opt['kmers_list']) outdirs = define_create_outdirs(opt['outdir']) init_ray_cluster(opt['workdir']) + if opt['hostset'] is not None: + db_data, db_ds = verify_load_host_merge(opt['dataset'], opt['hostset']) + db_name = 'host_merged' + else: + db_data, db_ds = verify_load_db(opt['dataset']) + db_name = opt['dataset_name'] + # Dataset(s) simulation ################################################################################ """ @@ -47,17 +54,17 @@ def simulation(opt): t_val = None if opt['test']: t_s = time() - test_ds = split_sim_dataset(db_ds, db_data, 'test') + test_ds, test_data = split_sim_dataset(db_ds, db_data, f'{TEST_DATASET_NAME}_{db_name}') t_test = time() - t_s if opt['validation']: t_s = time() - val_ds = split_sim_dataset(db_ds, db_data, 'validation') + val_ds, val_data = split_sim_dataset(db_ds, db_data, f'{VALIDATION_DATASET_NAME}_{db_name}') t_val = time() - t_s if t_test is not None: - print(f'Caribou finished generating the test dataset in {t_test} seconds') + print(f'Caribou finished generating the {TEST_DATASET_NAME} dataset in {t_test} seconds') if t_val is not None: - print(f'Caribou finished generating the validation dataset simulated in {t_val} seconds') + print(f'Caribou finished generating the {VALIDATION_DATASET_NAME} dataset simulated in {t_val} seconds') # Argument parsing from CLI ################################################################################ diff --git a/src/models/classification.py b/src/models/classification.py index 0b1412b..b9bc523 100644 --- a/src/models/classification.py +++ b/src/models/classification.py @@ -19,6 +19,10 @@ __all__ = ['ClassificationMethods'] +TRAINING_DATASET_NAME = 'train' +VALIDATION_DATASET_NAME = 'validation' +TEST_DATASET_NAME = 'test' + class ClassificationMethods(): """ Utilities class for classifying sequences from metagenomes using ray @@ -37,12 +41,14 @@ class ClassificationMethods(): Methods ---------- - execute_training : launch the training of the models for the chosen taxonomic levels - no parameters to pass + fit : function to call the fitting method + + predict : function to call the predicting method - execute_classification : - data2classify : a dictionnary containing the data to classify produced by the function Caribou.src.data.build_data.build_X_data + fit_predict : wrapper function for calling fit and predict + cross_validation : function to call the cross-validation process + """ def __init__( self, @@ -112,7 +118,7 @@ def __init__( # TODO: Remove parameters from global if they are only required for certain functions # TODO: Finish transfering the functions & calls from the old version # TODO: Validation of params before execution of private functions - def fit(self): + def fit(self, datasets, ): """ Wrapper function to call the fitting method """ diff --git a/src/models/reads_simulation.py b/src/models/reads_simulation.py index b78d7d9..e517cc4 100644 --- a/src/models/reads_simulation.py +++ b/src/models/reads_simulation.py @@ -8,6 +8,7 @@ import gzip import warnings +from utils import * from Bio import SeqIO from glob import glob from pathlib import Path @@ -211,18 +212,21 @@ def _verify_sim_arguments(self, k, kmers_list): ######################################################################################################### def split_sim_dataset(ds, data, name): - 
splitted_path = os.path.join(os.path.dirname(data['profile']), f'Xy_genome_simulation_{name}_data_K{len(data["kmers"][0])}') + splitted_path = os.path.join(os.path.dirname(data['profile']), f'Xy_genome_simulation_{name}_data_K{len(data["kmers"][0])}.npz') if os.path.exists(splitted_path): - warnings.warn(f'Splitted dataset {name} already exists, skipping simulation') - return None + warnings.warn(f'The {name} dataset already exists, skipping simulation and loading the dataset') + splitted_data = load_Xy_data(splitted_path) + files_lst = glob(os.path.join(splitted_data['profile'],'*.parquet')) + splitted_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + return splitted_ds, splitted_data else: splitted_ds = ds.random_sample(0.1) if splitted_ds.count() == 0: nb_samples = round(ds.count() * 0.1) splitted_ds = ds.random_shuffle().limit(nb_samples) - sim_dataset(ds, data, name) - return splitted_ds + splitted_ds, splitted_data = sim_dataset(ds, data, name) + return splitted_ds, splitted_data def sim_dataset(ds, data, name): """ @@ -240,4 +244,4 @@ def sim_dataset(ds, data, name): sim_data = cv_sim.simulation(k, data['kmers']) files_lst = glob(os.path.join(sim_data['profile'], '*.parquet')) sim_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - return sim_ds \ No newline at end of file + return sim_ds, sim_data \ No newline at end of file diff --git a/src/utils.py b/src/utils.py index f52d467..f9f1d4b 100644 --- a/src/utils.py +++ b/src/utils.py @@ -46,6 +46,11 @@ 'merge_db_host' ] +# Constants +######################################################################################################### + +TENSOR_COLUMN_NAME = '__value__' + # System ######################################################################################################### @@ -324,17 +329,25 @@ def merge_db_host(db_data, host_data): Merge the two databases along the rows axis """ merged_db_host = {} - merged_db_host['profile'] = f"{db_data['profile']}_host_merged" + merged_db_host_file = f"{db_data['profile']}_host_merged.npz" - if os.path.exists(merged_db_host['profile']): + if os.path.exists(merged_db_host_file): + merged_db_host = load_Xy_data(merged_db_host_file) files_lst = glob(os.path.join(merged_db_host['profile'], '*.parquet')) merged_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) else: + merged_db_host['profile'] = f"{db_data['profile']}_host_merged" files_lst = glob(os.path.join(db_data['profile'], '*.parquet')) db_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) files_lst = glob(os.path.join(host_data['profile'], '*.parquet')) host_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + cols2drop = [col for col in db_ds.schema().names if col not in ['id','domain',TENSOR_COLUMN_NAME]] + db_ds = db_ds.drop_columns(cols2drop) + + cols2drop = [col for col in host_ds.schema().names if col not in ['id','domain',TENSOR_COLUMN_NAME]] + host_ds = host_ds.drop_columns(cols2drop) + merged_ds = db_ds.union(host_ds) merged_ds = merged_ds.map_batches(convert_archaea_bacteria, batch_format = 'pandas') merged_ds.write_parquet(merged_db_host['profile']) @@ -344,4 +357,6 @@ def merge_db_host(db_data, host_data): merged_db_host['taxas'] = ['domain'] # Known taxas for classification merged_db_host['fasta'] = (db_data['fasta'], host_data['fasta']) # Fasta file needed for reads simulation + save_Xy_data(merged_db_host, merged_db_host_file) + return merged_db_host, merged_ds From 
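Editor's note: both split_sim_dataset and merge_db_host above now follow a cache-or-build pattern: if the artefact already exists on disk it is reloaded, otherwise it is built, written as parquet and described by a small metadata file. A condensed sketch of the merge side, limited to the shared column subset; merge_profiles is an illustrative name and the .npz metadata bookkeeping is omitted here.

    import os
    import ray
    from glob import glob

    KEEP_COLS = ['id', 'domain', '__value__']          # shared schema kept for the merged profile

    def merge_profiles(db_profile: str, host_profile: str, out_profile: str) -> ray.data.Dataset:
        if os.path.exists(out_profile):                # cache hit: reuse the previous merge
            files = glob(os.path.join(out_profile, '*.parquet'))
            return ray.data.read_parquet_bulk(files, parallelism=len(files))

        def _load(profile):
            files = glob(os.path.join(profile, '*.parquet'))
            ds = ray.data.read_parquet_bulk(files, parallelism=len(files))
            extra = [c for c in ds.schema().names if c not in KEEP_COLS]
            return ds.drop_columns(extra)

        merged = _load(db_profile).union(_load(host_profile))
        merged.write_parquet(out_profile)              # persist so the next run takes the cache path
        return merged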
5cc20ee4e59052b7d30674d0f14ad8dd72d80cc2 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Mon, 6 Nov 2023 08:20:13 -0500 Subject: [PATCH 24/92] simulation wrong dataset passed --- src/models/reads_simulation.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/models/reads_simulation.py b/src/models/reads_simulation.py index e517cc4..8057421 100644 --- a/src/models/reads_simulation.py +++ b/src/models/reads_simulation.py @@ -12,6 +12,7 @@ from Bio import SeqIO from glob import glob from pathlib import Path +from shutil import rmtree from warnings import warn from data.build_data import build_load_save_data from joblib import Parallel, delayed, parallel_backend @@ -92,7 +93,11 @@ def __init__( self._cls_out = os.path.join(outdir, f'sim_{self._name}_class.csv') # Dataset variables self.kmers_data = {} - os.mkdir(self._tmp_path) + try: + os.mkdir(self._tmp_path) + except FileExistsError: + rmtree(self._tmp_path) + os.mkdir(self._tmp_path) def simulation(self, k = None, kmers_list = None): k, kmers_list = self._verify_sim_arguments(k, kmers_list) @@ -224,8 +229,7 @@ def split_sim_dataset(ds, data, name): if splitted_ds.count() == 0: nb_samples = round(ds.count() * 0.1) splitted_ds = ds.random_shuffle().limit(nb_samples) - - splitted_ds, splitted_data = sim_dataset(ds, data, name) + splitted_ds, splitted_data = sim_dataset(splitted_ds, data, name) return splitted_ds, splitted_data def sim_dataset(ds, data, name): @@ -238,7 +242,6 @@ def sim_dataset(ds, data, name): cls = pd.DataFrame(columns = cols) for batch in ds.iter_batches(batch_format = 'pandas'): cls = pd.concat([cls, batch[cols]], axis = 0, ignore_index = True) - sim_outdir = os.path.dirname(data['profile']) cv_sim = readsSimulation(data['fasta'], cls, list(cls['id']), 'miseq', sim_outdir, name) sim_data = cv_sim.simulation(k, data['kmers']) From d94bd2cfd50e081f2affce0b5280c41d471c28a7 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Mon, 6 Nov 2023 17:29:46 -0500 Subject: [PATCH 25/92] read parallelism -1 + reduce nb simulation --- src/data/kmers.py | 3 +-- src/models/classification.py | 3 +-- src/models/classification_old.py | 24 ++++++++---------------- src/models/reads_simulation.py | 6 +++--- src/utils.py | 8 ++++---- 5 files changed, 17 insertions(+), 27 deletions(-) diff --git a/src/data/kmers.py b/src/data/kmers.py index 5a0fbe5..7b1cb52 100644 --- a/src/data/kmers.py +++ b/src/data/kmers.py @@ -319,8 +319,7 @@ def _make_ray_ds(self): self.df = self.df.repartition(int(self.df.count()/10)) else: self._files_list = glob(os.path.join(self._tmp_dir, '*.parquet')) - self.df = ray.data.read_parquet_bulk(self._files_list, parallelism = len(self._files_list)) - # self.df = ray.data.read_parquet_bulk(self._files_list, parallelism = -1) + self.df = ray.data.read_parquet_bulk(self._files_list, parallelism = -1) def _kmers_tokenization(self): print('_kmers_tokenization') diff --git a/src/models/classification.py b/src/models/classification.py index b9bc523..a2e8613 100644 --- a/src/models/classification.py +++ b/src/models/classification.py @@ -172,8 +172,7 @@ def _predict(self, data2classify): Predict the given data using the trained model """ files_lst = glob(os.path.join(data2classify['profile'],'*.parquet')) - df = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - # df = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + df = ray.data.read_parquet_bulk(files_lst, parallelism = -1) ids = data2classify['ids'] if len(self.classified_data['sequence']) == 0: raise 
ValueError('Please train a model before executing classification') diff --git a/src/models/classification_old.py b/src/models/classification_old.py index 7419d92..55847ec 100644 --- a/src/models/classification_old.py +++ b/src/models/classification_old.py @@ -112,8 +112,7 @@ def __init__( def execute_training_prediction(self, data2classify): print('execute_training_prediction') files_lst = glob(os.path.join(data2classify['profile'],'*.parquet')) - df2classify = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - # df2classify = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + df2classify = ray.data.read_parquet_bulk(files_lst, parallelism = -1) ids2classify = data2classify['ids'] for i, taxa in enumerate(self._taxas_order): if taxa in self._taxas: @@ -226,15 +225,12 @@ def _merge_database_host(self, database_data, host_data): if os.path.exists(self._merged_database_host['profile']): files_lst = glob(os.path.join(self._merged_database_host['profile'],'*.parquet')) - df_merged = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - # df_merged = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + df_merged = ray.data.read_parquet_bulk(files_lst, parallelism = -1) else: files_lst = glob(os.path.join(database_data['profile'],'*.parquet')) - df_db = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - # df_db = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + df_db = ray.data.read_parquet_bulk(files_lst, parallelism = -1) files_lst = glob(os.path.join(host_data['profile'],'*.parquet')) - df_host = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - # df_host = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + df_host = ray.data.read_parquet_bulk(files_lst, parallelism = -1) cols2drop = [] for col in df_db.schema().names: @@ -262,8 +258,7 @@ def _load_training_data_merged(self, taxa): print('_load_training_data_merged') if self._classifier_binary == 'onesvm' and taxa == 'domain': files_lst = glob(os.path.join(self._database_data['profile'],'*.parquet')) - df_train = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - # df_train = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + df_train = ray.data.read_parquet_bulk(files_lst, parallelism = -1) df_train = df_train.map_batches(convert_archaea_bacteria, batch_format = 'pandas') df_val_test = self._merge_database_host(self._database_data, self._host_data) df_val_test = df_val_test.map_batches(convert_archaea_bacteria, batch_format = 'pandas') @@ -285,8 +280,7 @@ def _load_training_data_merged(self, taxa): def _load_training_data(self): print('_load_training_data') files_lst = glob(os.path.join(self._database_data['profile'],'*.parquet')) - df_train = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - # df_train = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + df_train = ray.data.read_parquet_bulk(files_lst, parallelism = -1) df_train = df_train.map_batches(convert_archaea_bacteria, batch_format = 'pandas') df_val = self.split_sim_cv_ds(df_train,self._database_data, 'validation') self._training_datasets = {'train': df_train, 'validation': df_val} @@ -306,8 +300,7 @@ def _sim_4_cv(self, df, kmers_ds, name): cv_sim = readsSimulation(kmers_ds['fasta'], cls, list(cls['id']), 'miseq', sim_outdir, name) sim_data = cv_sim.simulation(self._k, kmers_ds['kmers']) files_lst = glob(os.path.join(sim_data['profile'],'*.parquet')) - df = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - # 
df = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + df = ray.data.read_parquet_bulk(files_lst, parallelism = -1) return df def split_sim_cv_ds(self, ds, data, name): @@ -317,8 +310,7 @@ def split_sim_cv_ds(self, ds, data, name): ) if os.path.exists(ds_path): files_lst = glob(os.path.join(ds_path,'*.parquet')) - cv_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - # cv_ds = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + cv_ds = ray.data.read_parquet_bulk(files_lst, parallelism = -1) else: cv_ds = ds.random_sample(0.1) if cv_ds.count() == 0: diff --git a/src/models/reads_simulation.py b/src/models/reads_simulation.py index 8057421..630f86f 100644 --- a/src/models/reads_simulation.py +++ b/src/models/reads_simulation.py @@ -79,7 +79,7 @@ def __init__( self._fasta_host = None self._cls_in = cls self._genomes = genomes - self._nb_reads = len(genomes) * 5 + self._nb_reads = len(genomes) * 3 self._sequencing = sequencing self._path = outdir self._tmp_path = os.path.join(outdir,'tmp') @@ -222,7 +222,7 @@ def split_sim_dataset(ds, data, name): warnings.warn(f'The {name} dataset already exists, skipping simulation and loading the dataset') splitted_data = load_Xy_data(splitted_path) files_lst = glob(os.path.join(splitted_data['profile'],'*.parquet')) - splitted_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + splitted_ds = ray.data.read_parquet_bulk(files_lst, parallelism = -1) return splitted_ds, splitted_data else: splitted_ds = ds.random_sample(0.1) @@ -246,5 +246,5 @@ def sim_dataset(ds, data, name): cv_sim = readsSimulation(data['fasta'], cls, list(cls['id']), 'miseq', sim_outdir, name) sim_data = cv_sim.simulation(k, data['kmers']) files_lst = glob(os.path.join(sim_data['profile'], '*.parquet')) - sim_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + sim_ds = ray.data.read_parquet_bulk(files_lst, parallelism = -1) return sim_ds, sim_data \ No newline at end of file diff --git a/src/utils.py b/src/utils.py index f9f1d4b..71c564d 100644 --- a/src/utils.py +++ b/src/utils.py @@ -308,7 +308,7 @@ def verify_load_db(db_data): """ db_data = verify_load_data(db_data) files_lst = glob(os.path.join(db_data['profile'], '*.parquet')) - db_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + db_ds = ray.data.read_parquet_bulk(files_lst, parallelism = -1) db_ds = db_ds.map_batches(convert_archaea_bacteria, batch_format = 'pandas') return db_data, db_ds @@ -334,13 +334,13 @@ def merge_db_host(db_data, host_data): if os.path.exists(merged_db_host_file): merged_db_host = load_Xy_data(merged_db_host_file) files_lst = glob(os.path.join(merged_db_host['profile'], '*.parquet')) - merged_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + merged_ds = ray.data.read_parquet_bulk(files_lst, parallelism = -1) else: merged_db_host['profile'] = f"{db_data['profile']}_host_merged" files_lst = glob(os.path.join(db_data['profile'], '*.parquet')) - db_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + db_ds = ray.data.read_parquet_bulk(files_lst, parallelism = -1) files_lst = glob(os.path.join(host_data['profile'], '*.parquet')) - host_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + host_ds = ray.data.read_parquet_bulk(files_lst, parallelism = -1) cols2drop = [col for col in db_ds.schema().names if col not in ['id','domain',TENSOR_COLUMN_NAME]] db_ds = db_ds.drop_columns(cols2drop) From 75bd9d1fdfbedb1cf257b8eeda77751df494133a Mon Sep 
17 00:00:00 2001 From: Nicolas de Montigny Date: Mon, 6 Nov 2023 17:35:35 -0500 Subject: [PATCH 26/92] reads parallelism = len(files_lst)/100 --- src/data/kmers.py | 2 +- src/models/classification.py | 2 +- src/models/classification_old.py | 16 ++++++++-------- src/models/reads_simulation.py | 4 ++-- src/utils.py | 8 ++++---- 5 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/data/kmers.py b/src/data/kmers.py index 7b1cb52..bc19b21 100644 --- a/src/data/kmers.py +++ b/src/data/kmers.py @@ -319,7 +319,7 @@ def _make_ray_ds(self): self.df = self.df.repartition(int(self.df.count()/10)) else: self._files_list = glob(os.path.join(self._tmp_dir, '*.parquet')) - self.df = ray.data.read_parquet_bulk(self._files_list, parallelism = -1) + self.df = ray.data.read_parquet_bulk(self._files_list, parallelism = len(files_lst)/100) def _kmers_tokenization(self): print('_kmers_tokenization') diff --git a/src/models/classification.py b/src/models/classification.py index a2e8613..e1e65c1 100644 --- a/src/models/classification.py +++ b/src/models/classification.py @@ -172,7 +172,7 @@ def _predict(self, data2classify): Predict the given data using the trained model """ files_lst = glob(os.path.join(data2classify['profile'],'*.parquet')) - df = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + df = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)/100) ids = data2classify['ids'] if len(self.classified_data['sequence']) == 0: raise ValueError('Please train a model before executing classification') diff --git a/src/models/classification_old.py b/src/models/classification_old.py index 55847ec..15d8f23 100644 --- a/src/models/classification_old.py +++ b/src/models/classification_old.py @@ -112,7 +112,7 @@ def __init__( def execute_training_prediction(self, data2classify): print('execute_training_prediction') files_lst = glob(os.path.join(data2classify['profile'],'*.parquet')) - df2classify = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + df2classify = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)/100) ids2classify = data2classify['ids'] for i, taxa in enumerate(self._taxas_order): if taxa in self._taxas: @@ -225,12 +225,12 @@ def _merge_database_host(self, database_data, host_data): if os.path.exists(self._merged_database_host['profile']): files_lst = glob(os.path.join(self._merged_database_host['profile'],'*.parquet')) - df_merged = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + df_merged = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)/100) else: files_lst = glob(os.path.join(database_data['profile'],'*.parquet')) - df_db = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + df_db = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)/100) files_lst = glob(os.path.join(host_data['profile'],'*.parquet')) - df_host = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + df_host = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)/100) cols2drop = [] for col in df_db.schema().names: @@ -258,7 +258,7 @@ def _load_training_data_merged(self, taxa): print('_load_training_data_merged') if self._classifier_binary == 'onesvm' and taxa == 'domain': files_lst = glob(os.path.join(self._database_data['profile'],'*.parquet')) - df_train = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + df_train = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)/100) df_train = df_train.map_batches(convert_archaea_bacteria, batch_format = 'pandas') df_val_test = 
self._merge_database_host(self._database_data, self._host_data) df_val_test = df_val_test.map_batches(convert_archaea_bacteria, batch_format = 'pandas') @@ -280,7 +280,7 @@ def _load_training_data_merged(self, taxa): def _load_training_data(self): print('_load_training_data') files_lst = glob(os.path.join(self._database_data['profile'],'*.parquet')) - df_train = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + df_train = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)/100) df_train = df_train.map_batches(convert_archaea_bacteria, batch_format = 'pandas') df_val = self.split_sim_cv_ds(df_train,self._database_data, 'validation') self._training_datasets = {'train': df_train, 'validation': df_val} @@ -300,7 +300,7 @@ def _sim_4_cv(self, df, kmers_ds, name): cv_sim = readsSimulation(kmers_ds['fasta'], cls, list(cls['id']), 'miseq', sim_outdir, name) sim_data = cv_sim.simulation(self._k, kmers_ds['kmers']) files_lst = glob(os.path.join(sim_data['profile'],'*.parquet')) - df = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + df = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)/100) return df def split_sim_cv_ds(self, ds, data, name): @@ -310,7 +310,7 @@ def split_sim_cv_ds(self, ds, data, name): ) if os.path.exists(ds_path): files_lst = glob(os.path.join(ds_path,'*.parquet')) - cv_ds = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + cv_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)/100) else: cv_ds = ds.random_sample(0.1) if cv_ds.count() == 0: diff --git a/src/models/reads_simulation.py b/src/models/reads_simulation.py index 630f86f..eb61077 100644 --- a/src/models/reads_simulation.py +++ b/src/models/reads_simulation.py @@ -222,7 +222,7 @@ def split_sim_dataset(ds, data, name): warnings.warn(f'The {name} dataset already exists, skipping simulation and loading the dataset') splitted_data = load_Xy_data(splitted_path) files_lst = glob(os.path.join(splitted_data['profile'],'*.parquet')) - splitted_ds = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + splitted_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)/100) return splitted_ds, splitted_data else: splitted_ds = ds.random_sample(0.1) @@ -246,5 +246,5 @@ def sim_dataset(ds, data, name): cv_sim = readsSimulation(data['fasta'], cls, list(cls['id']), 'miseq', sim_outdir, name) sim_data = cv_sim.simulation(k, data['kmers']) files_lst = glob(os.path.join(sim_data['profile'], '*.parquet')) - sim_ds = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + sim_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)/100) return sim_ds, sim_data \ No newline at end of file diff --git a/src/utils.py b/src/utils.py index 71c564d..5127ed3 100644 --- a/src/utils.py +++ b/src/utils.py @@ -308,7 +308,7 @@ def verify_load_db(db_data): """ db_data = verify_load_data(db_data) files_lst = glob(os.path.join(db_data['profile'], '*.parquet')) - db_ds = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + db_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)/100) db_ds = db_ds.map_batches(convert_archaea_bacteria, batch_format = 'pandas') return db_data, db_ds @@ -334,13 +334,13 @@ def merge_db_host(db_data, host_data): if os.path.exists(merged_db_host_file): merged_db_host = load_Xy_data(merged_db_host_file) files_lst = glob(os.path.join(merged_db_host['profile'], '*.parquet')) - merged_ds = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + merged_ds = ray.data.read_parquet_bulk(files_lst, 
parallelism = len(files_lst)/100) else: merged_db_host['profile'] = f"{db_data['profile']}_host_merged" files_lst = glob(os.path.join(db_data['profile'], '*.parquet')) - db_ds = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + db_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)/100) files_lst = glob(os.path.join(host_data['profile'], '*.parquet')) - host_ds = ray.data.read_parquet_bulk(files_lst, parallelism = -1) + host_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)/100) cols2drop = [col for col in db_ds.schema().names if col not in ['id','domain',TENSOR_COLUMN_NAME]] db_ds = db_ds.drop_columns(cols2drop) From 9e9a663de32beb3d4e72afd18a9d8a48016e8147 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Mon, 6 Nov 2023 17:39:00 -0500 Subject: [PATCH 27/92] parallelism = len(files_lst) --- src/data/kmers.py | 2 +- src/models/classification.py | 2 +- src/models/classification_old.py | 16 ++++++++-------- src/models/reads_simulation.py | 4 ++-- src/utils.py | 8 ++++---- 5 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/data/kmers.py b/src/data/kmers.py index bc19b21..a42f7d9 100644 --- a/src/data/kmers.py +++ b/src/data/kmers.py @@ -319,7 +319,7 @@ def _make_ray_ds(self): self.df = self.df.repartition(int(self.df.count()/10)) else: self._files_list = glob(os.path.join(self._tmp_dir, '*.parquet')) - self.df = ray.data.read_parquet_bulk(self._files_list, parallelism = len(files_lst)/100) + self.df = ray.data.read_parquet_bulk(self._files_list, parallelism = len(files_lst)) def _kmers_tokenization(self): print('_kmers_tokenization') diff --git a/src/models/classification.py b/src/models/classification.py index e1e65c1..6a0cbc0 100644 --- a/src/models/classification.py +++ b/src/models/classification.py @@ -172,7 +172,7 @@ def _predict(self, data2classify): Predict the given data using the trained model """ files_lst = glob(os.path.join(data2classify['profile'],'*.parquet')) - df = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)/100) + df = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) ids = data2classify['ids'] if len(self.classified_data['sequence']) == 0: raise ValueError('Please train a model before executing classification') diff --git a/src/models/classification_old.py b/src/models/classification_old.py index 15d8f23..7638c17 100644 --- a/src/models/classification_old.py +++ b/src/models/classification_old.py @@ -112,7 +112,7 @@ def __init__( def execute_training_prediction(self, data2classify): print('execute_training_prediction') files_lst = glob(os.path.join(data2classify['profile'],'*.parquet')) - df2classify = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)/100) + df2classify = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) ids2classify = data2classify['ids'] for i, taxa in enumerate(self._taxas_order): if taxa in self._taxas: @@ -225,12 +225,12 @@ def _merge_database_host(self, database_data, host_data): if os.path.exists(self._merged_database_host['profile']): files_lst = glob(os.path.join(self._merged_database_host['profile'],'*.parquet')) - df_merged = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)/100) + df_merged = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) else: files_lst = glob(os.path.join(database_data['profile'],'*.parquet')) - df_db = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)/100) + df_db = ray.data.read_parquet_bulk(files_lst, parallelism = 
len(files_lst)) files_lst = glob(os.path.join(host_data['profile'],'*.parquet')) - df_host = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)/100) + df_host = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) cols2drop = [] for col in df_db.schema().names: @@ -258,7 +258,7 @@ def _load_training_data_merged(self, taxa): print('_load_training_data_merged') if self._classifier_binary == 'onesvm' and taxa == 'domain': files_lst = glob(os.path.join(self._database_data['profile'],'*.parquet')) - df_train = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)/100) + df_train = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) df_train = df_train.map_batches(convert_archaea_bacteria, batch_format = 'pandas') df_val_test = self._merge_database_host(self._database_data, self._host_data) df_val_test = df_val_test.map_batches(convert_archaea_bacteria, batch_format = 'pandas') @@ -280,7 +280,7 @@ def _load_training_data_merged(self, taxa): def _load_training_data(self): print('_load_training_data') files_lst = glob(os.path.join(self._database_data['profile'],'*.parquet')) - df_train = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)/100) + df_train = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) df_train = df_train.map_batches(convert_archaea_bacteria, batch_format = 'pandas') df_val = self.split_sim_cv_ds(df_train,self._database_data, 'validation') self._training_datasets = {'train': df_train, 'validation': df_val} @@ -300,7 +300,7 @@ def _sim_4_cv(self, df, kmers_ds, name): cv_sim = readsSimulation(kmers_ds['fasta'], cls, list(cls['id']), 'miseq', sim_outdir, name) sim_data = cv_sim.simulation(self._k, kmers_ds['kmers']) files_lst = glob(os.path.join(sim_data['profile'],'*.parquet')) - df = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)/100) + df = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) return df def split_sim_cv_ds(self, ds, data, name): @@ -310,7 +310,7 @@ def split_sim_cv_ds(self, ds, data, name): ) if os.path.exists(ds_path): files_lst = glob(os.path.join(ds_path,'*.parquet')) - cv_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)/100) + cv_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) else: cv_ds = ds.random_sample(0.1) if cv_ds.count() == 0: diff --git a/src/models/reads_simulation.py b/src/models/reads_simulation.py index eb61077..463c077 100644 --- a/src/models/reads_simulation.py +++ b/src/models/reads_simulation.py @@ -222,7 +222,7 @@ def split_sim_dataset(ds, data, name): warnings.warn(f'The {name} dataset already exists, skipping simulation and loading the dataset') splitted_data = load_Xy_data(splitted_path) files_lst = glob(os.path.join(splitted_data['profile'],'*.parquet')) - splitted_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)/100) + splitted_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) return splitted_ds, splitted_data else: splitted_ds = ds.random_sample(0.1) @@ -246,5 +246,5 @@ def sim_dataset(ds, data, name): cv_sim = readsSimulation(data['fasta'], cls, list(cls['id']), 'miseq', sim_outdir, name) sim_data = cv_sim.simulation(k, data['kmers']) files_lst = glob(os.path.join(sim_data['profile'], '*.parquet')) - sim_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)/100) + sim_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) return sim_ds, sim_data \ No newline at end of file diff 
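# The three patches above flip the `parallelism` argument of
# ray.data.read_parquet_bulk between len(files_lst), -1 (let Ray choose) and
# len(files_lst)/100; note that the kmers.py hunk passes len(files_lst) in a
# scope whose surrounding code defines self._files_list. `parallelism` is a
# count of read tasks, so a fractional value needs integer division. A hedged
# sketch, with a hypothetical glob pattern standing in for the real profile
# directory:

import ray
from glob import glob

files = glob('/path/to/profile/*.parquet')   # hypothetical location
read_tasks = max(1, len(files) // 100)       # integer, never below 1
ds = ray.data.read_parquet_bulk(files, parallelism=read_tasks)
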
--git a/src/utils.py b/src/utils.py index 5127ed3..f9f1d4b 100644 --- a/src/utils.py +++ b/src/utils.py @@ -308,7 +308,7 @@ def verify_load_db(db_data): """ db_data = verify_load_data(db_data) files_lst = glob(os.path.join(db_data['profile'], '*.parquet')) - db_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)/100) + db_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) db_ds = db_ds.map_batches(convert_archaea_bacteria, batch_format = 'pandas') return db_data, db_ds @@ -334,13 +334,13 @@ def merge_db_host(db_data, host_data): if os.path.exists(merged_db_host_file): merged_db_host = load_Xy_data(merged_db_host_file) files_lst = glob(os.path.join(merged_db_host['profile'], '*.parquet')) - merged_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)/100) + merged_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) else: merged_db_host['profile'] = f"{db_data['profile']}_host_merged" files_lst = glob(os.path.join(db_data['profile'], '*.parquet')) - db_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)/100) + db_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) files_lst = glob(os.path.join(host_data['profile'], '*.parquet')) - host_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)/100) + host_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) cols2drop = [col for col in db_ds.schema().names if col not in ['id','domain',TENSOR_COLUMN_NAME]] db_ds = db_ds.drop_columns(cols2drop) From 8e828f3d3724ce9e8127207f4eb6a7551493bd79 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Thu, 9 Nov 2023 17:50:56 -0500 Subject: [PATCH 28/92] new version of ClassificationMethods without data loading + move cv out of framework specific classes + debug sklearn --- src/Caribou_classification.py | 77 +-- src/Caribou_classification_train_cv.py | 59 +- src/Caribou_extraction.py | 109 ++-- src/Caribou_extraction_train_cv.py | 87 ++- src/models/classification.py | 596 ++++++++++-------- src/models/classification_old.py | 327 ---------- src/models/encoders/model_label_encoder.py | 1 + src/models/encoders/one_hot_tensor_encoder.py | 2 +- src/models/kerasTF/models.py | 79 +-- src/models/models_utils.py | 59 +- src/models/preprocessors/tfidf_transformer.py | 2 +- src/models/sklearn/models.py | 94 +-- src/models/sklearn/partial_trainer.py | 7 +- src/utils.py | 22 +- 14 files changed, 564 insertions(+), 957 deletions(-) delete mode 100644 src/models/classification_old.py diff --git a/src/Caribou_classification.py b/src/Caribou_classification.py index a1992c7..0c4b460 100644 --- a/src/Caribou_classification.py +++ b/src/Caribou_classification.py @@ -7,12 +7,13 @@ from time import time from pathlib import Path from models.reads_simulation import split_sim_dataset -from models.classification_old import ClassificationMethods +from models.classification import ClassificationMethods __author__ = "Nicolas de Montigny" __all__ = ['bacteria_classification_train_cv'] +TRAINING_DATASET_NAME = 'train' VALIDATION_DATASET_NAME = 'validation' # Initialisation / validation of parameters from CLI @@ -36,14 +37,6 @@ def bacteria_classification(opt): ################################################################################ db_data, db_ds = verify_load_db(opt['data_bacteria']) - data_metagenome = verify_load_data(opt['data_metagenome']) - - k_length = len(db_data['kmers'][0]) - - if opt['preclassified_data'] is not None: - preclassified_data = 
verify_load_preclassified(opt['preclassified_data']) - else: - preclassified_data = None # Validate and extract list of taxas if opt['taxa'] is not None: @@ -56,60 +49,60 @@ def bacteria_classification(opt): val_ds, val_data = split_sim_dataset(db_ds, db_data, VALIDATION_DATASET_NAME) -# Definition of model for bacteria taxonomic classification + training + datasets = { + TRAINING_DATASET_NAME : db_ds, + VALIDATION_DATASET_NAME : val_ds + } + + metagenome_data, metagenome_ds = verify_load_metagenome(opt['data_metagenome']) + +# Definition of model for bacteria taxonomic classification ################################################################################ + clf = ClassificationMethods( - database_k_mers = db_data, - k = k_length, + db_data = db_data, outdirs = outdirs, - database = opt['database_name'], - classifier_multiclass = opt['model_type'], - taxa = lst_taxas, + db_name = opt['database_name'], + clf_multiclass = opt['model_type'], + taxa = 'domain', batch_size = opt['batch_size'], - training_epochs = opt['training_epochs'], - verbose = opt['verbose'], - cv = False + training_epochs = opt['training_epochs'] ) # Execution of bacteria taxonomic classification on metagenome + save results ################################################################################ - t_start = time() - end_taxa = clf.fit_predict(data_metagenome) - t_end = time() - t_classif = t_end - t_start - clf_data = merge_save_data( - clf.classified_data, - data_metagenome, - end_taxa, - outdirs['results_dir'], - opt['metagenome_name'], - preclassified = preclassified_data, - ) - if opt['taxa'] is None: - opt['taxa'] = 'all' - clf_data['classification'].to_csv(os.path.join(outdirs['results_dir'], f"classification_K{k_length}_{opt['taxa']}_{opt['model_type']}.csv")) - if end_taxa is None: - print(f"Caribou finished training the {opt['model_type']} model and classifying bacterial sequences at {opt['taxa']} taxonomic level with it. \ - \nThe training and classification steps took {t_classif} seconds to execute.") - else: - print(f"Caribou finished training the {opt['model_type']} model and classifying bacterial sequences at {opt['taxa']} taxonomic level until {end_taxa} because there were no more sequences to classify. \ - \nThe training and classification steps took {t_classif} seconds to execute.") + t_s = time() + clf.fit(datasets) + t_fit = time() - t_s + + t_s = time() + predictions = clf.predict(metagenome_ds) + t_clf = time() - t_s + + Xy_file = os.path.join(outdirs['results_dir'], f"extracted_bacteria_{opt['metagenome_name']}_{opt['model_type']}.npz") + save_Xy_data(predictions, Xy_file) + + print(f""" + Caribou finished training the {opt['model_type']} model in {t_fit} seconds. + Classification of bacteria from {opt['metagenome_name']} dataset was then executed in {t_clf} seconds. 
+ """) # Argument parsing from CLI ################################################################################ if __name__ == "__main__": parser = argparse.ArgumentParser(description='This script trains a model and classifies bacteria sequences iteratively over known taxonomic levels.') + # Database parser.add_argument('-db','--data_bacteria', required=True, type=Path, help='PATH to a npz file containing the data corresponding to the k-mers profile for the bacteria database') - parser.add_argument('-mg','--data_metagenome', required=True, type=Path, help='PATH to a npz file containing the data corresponding to the k-mers profile for the metagenome to classify') parser.add_argument('-dt','--database_name', required=True, help='Name of the bacteria database used to name files') + # Dataset + parser.add_argument('-mg','--data_metagenome', required=True, type=Path, help='PATH to a npz file containing the data corresponding to the k-mers profile for the metagenome to classify') parser.add_argument('-mn','--metagenome_name', required=True, help='Name of the metagenome to classify used to name files') - parser.add_argument('-pc','--preclassified_data', default=None, type=Path,help='Optional. PATH to a .npz file contianing classified data at another taxonomic level than the ones in the current analysis') + # Parameters parser.add_argument('-model','--model_type', default='lstm_attention', choices=['sgd','mnb','lstm_attention','cnn','widecnn'], help='The type of model to train') parser.add_argument('-t','--taxa', default=None, help='The taxonomic level to use for the classification, defaults to species. Can be one level or a list of levels separated by commas.') parser.add_argument('-bs','--batch_size', default=32, type=int, help='Size of the batch size to use, defaults to 32') parser.add_argument('-e','--training_epochs', default=100, type=int, help='The number of training iterations for the neural networks models if one ise chosen, defaults to 100') - parser.add_argument('-v','--verbose', action='store_true', help='Should the program be verbose') parser.add_argument('-o','--outdir', required=True, type=Path, help='PATH to a directory on file where outputs will be saved') parser.add_argument('-wd','--workdir', default='/tmp/spill', type=Path, help='Optional. 
Path to a working directory where Ray Tune will output and spill tuning data') args = parser.parse_args() diff --git a/src/Caribou_classification_train_cv.py b/src/Caribou_classification_train_cv.py index aac75d8..f6832a8 100644 --- a/src/Caribou_classification_train_cv.py +++ b/src/Caribou_classification_train_cv.py @@ -8,7 +8,7 @@ from pathlib import Path from logging import ERROR from models.reads_simulation import split_sim_dataset -from models.classification_old import ClassificationMethods +from models.classification import ClassificationMethods warnings.filterwarnings('ignore') @@ -16,6 +16,7 @@ __all__ = ['bacteria_classification_train_cv'] +TRAINING_DATASET_NAME = 'train' VALIDATION_DATASET_NAME = 'validation' TEST_DATASET_NAME = 'test' @@ -41,53 +42,59 @@ def bacteria_classification_train_cv(opt): db_data, db_ds = verify_load_db(opt['data_bacteria']) - k_length = len(db_data['kmers'][0]) - # Validate and extract list of taxas if opt['taxa'] is not None: lst_taxas = verify_taxas(opt['taxa'], db_data['taxas']) else: lst_taxas = db_data['taxas'].copy() - + if 'domain' in lst_taxas: lst_taxas.remove('domain') + + for taxa in lst_taxas: + + test_ds, test_data = split_sim_dataset(db_ds, db_data, TEST_DATASET_NAME) + val_ds, val_data = split_sim_dataset(db_ds, db_data, VALIDATION_DATASET_NAME) - test_ds, test_data = split_sim_dataset(db_ds, db_data, TEST_DATASET_NAME) - val_ds, val_data = split_sim_dataset(db_ds, db_data, VALIDATION_DATASET_NAME) + datasets = { + TRAINING_DATASET_NAME : db_ds, + TEST_DATASET_NAME : test_ds, + VALIDATION_DATASET_NAME : val_ds + } # Training and cross-validation of models for classification of bacterias ################################################################################ - t_start = time() - ClassificationMethods( - database_k_mers = db_data, - k = k_length, - outdirs = outdirs, - database = opt['database_name'], - classifier_binary = None, - classifier_multiclass = opt['model_type'], - taxa = lst_taxas, - batch_size = opt['batch_size'], - training_epochs = opt['training_epochs'], - verbose = opt['verbose'], - cv = True - ).fit() - t_end = time() - t_classify = t_end - t_start - print( - f"Caribou finished training and cross-validating the {opt['model_type']} model in {t_classify} seconds") + clf = ClassificationMethods( + db_data = db_data, + outdirs = outdirs, + db_name = opt['database_name'], + clf_multiclass = opt['model_type'], + taxa = taxa, + batch_size = opt['batch_size'], + training_epochs = opt['training_epochs'] + ) + + t_s = time() + + cv_scores = clf.cross_validation(datasets) + + t_clf = time() - t_s + + print(f"Caribou finished training and cross-validating the {opt['model_type']} model at taxa {taxa} in {t_clf} seconds") # Argument parsing from CLI ################################################################################ if __name__ == "__main__": parser = argparse.ArgumentParser(description='This script trains and cross-validates a model for the bacteria classification step.') + # Database parser.add_argument('-db','--data_bacteria', required=True, type=Path, help='PATH to a npz file containing the data corresponding to the k-mers profile for the bacteria database') - parser.add_argument('-dt','--database_name', required=True, help='Name of the bacteria database used to name files') + parser.add_argument('-dn','--database_name', required=True, help='Name of the bacteria database used to name files') + # Parameters parser.add_argument('-model','--model_type', default='lstm_attention', 
choices=['sgd','mnb','lstm_attention','cnn','widecnn'], help='The type of model to train') parser.add_argument('-t','--taxa', default=None, help='The taxonomic level to use for the classification, defaults to None. Can be one level or a list of levels separated by commas.') parser.add_argument('-bs','--batch_size', default=32, type=int, help='Size of the batch size to use, defaults to 32') parser.add_argument('-e','--training_epochs', default=100, type=int, help='The number of training iterations for the neural networks models if one ise chosen, defaults to 100') - parser.add_argument('-v','--verbose', action='store_true', help='Should the program be verbose') parser.add_argument('-o','--outdir', required=True, type=Path, help='PATH to a directory on file where outputs will be saved') parser.add_argument('-wd','--workdir', default='/tmp/spill', type=Path, help='Optional. Path to a working directory where Ray Tune will output and spill tuning data') args = parser.parse_args() diff --git a/src/Caribou_extraction.py b/src/Caribou_extraction.py index eda156b..3876f2b 100644 --- a/src/Caribou_extraction.py +++ b/src/Caribou_extraction.py @@ -1,17 +1,19 @@ #!/usr/bin python3 +import os import argparse from utils import * from time import time from pathlib import Path from models.reads_simulation import split_sim_dataset -from models.classification_old import ClassificationMethods +from models.classification import ClassificationMethods __author__ = "Nicolas de Montigny" __all__ = ['bacteria_extraction_train_cv'] +TRAINING_DATASET_NAME = 'train' VALIDATION_DATASET_NAME = 'validation' # Initialisation / validation of parameters from CLI @@ -36,82 +38,79 @@ def bacteria_extraction(opt): # Data loading ################################################################################ - if opt['data_host'] is not None: + if opt['model_type'] != 'onesvm': + if opt['data_host'] is not None: + db_data, db_ds = verify_load_host_merge(opt['data_bacteria'], opt['data_host']) + db_name = 'host_merged' + else: + db_data, db_ds = verify_load_db(opt['data_bacteria']) + db_name = opt['dataset_name'] + + val_ds, val_data = split_sim_dataset(db_ds, db_data, f'{VALIDATION_DATASET_NAME}_{db_name}') + else: db_data, db_ds = verify_load_host_merge(opt['data_bacteria'], opt['data_host']) db_name = 'host_merged' - else: + + val_ds, val_data = split_sim_dataset(db_ds, db_data, f'{VALIDATION_DATASET_NAME}_{db_name}') + db_data, db_ds = verify_load_db(opt['data_bacteria']) db_name = opt['dataset_name'] - data_metagenome = verify_load_data(opt['data_metagenome']) - k_length = len(db_data['kmers'][0]) + datasets = { + TRAINING_DATASET_NAME : db_ds, + VALIDATION_DATASET_NAME : val_ds + } - val_ds, val_data = split_sim_dataset(db_ds, db_data, f'{VALIDATION_DATASET_NAME}_{db_name}') + metagenome_data, metagenome_ds = verify_load_metagenome(opt['data_metagenome']) -# Definition of model for bacteria extraction / host removal + execution +# Definition of model for bacteria extraction / host removal ################################################################################ - if opt['host_name'] is None: - clf = ClassificationMethods( - database_k_mers = (db_data, db_ds), - k = k_length, - outdirs = outdirs, - database = opt['database_name'], - classifier_binary = opt['model_type'], - taxa = 'domain', - batch_size = opt['batch_size'], - training_epochs = opt['training_epochs'], - verbose = opt['verbose'], - cv = False - ) - else: - clf = ClassificationMethods( - database_k_mers = (db_data, db_ds), - k = k_length, - outdirs = 
outdirs, - database = opt['database_name'], - classifier_binary = opt['model_type'], - taxa = 'domain', - batch_size = opt['batch_size'], - training_epochs = opt['training_epochs'], - verbose = opt['verbose'], - cv = False - ) + + clf = ClassificationMethods( + db_data = db_data, + outdirs = outdirs, + db_name = opt['database_name'], + clf_binary = opt['model_type'], + taxa = 'domain', + batch_size = opt['batch_size'], + training_epochs = opt['training_epochs'] + ) + # Execution of bacteria extraction / host removal on metagenome + save results ################################################################################ - t_start = time() - end_taxa = clf.execute_training_prediction(data_metagenome) - t_end = time() - t_classify = t_end - t_start - - if end_taxa is None: - clf_data = merge_save_data( - clf.classified_data, - data_bacteria, - end_taxa, - outdirs['results_dir'], - opt['metagenome_name'], - ) - print(f"Caribou finished training the {opt['model_type']} model and extracting bacteria with it. \ - \nThe training and classification steps took {t_classify} seconds.") - else: - print(f"Caribou finished training the {opt['model_type']} model but there was no data to classify. \ - \nThe training and classification steps took {t_classify} seconds.") + t_s = time() + clf.fit(datasets) + t_fit = time() - t_s + + t_s = time() + predictions = clf.predict(metagenome_ds) + t_clf = time() - t_s + + Xy_file = os.path.join(outdirs['results_dir'], f"extracted_bacteria_{opt['metagenome_name']}_{opt['model_type']}.npz") + save_Xy_data(predictions, Xy_file) + + print(f""" + Caribou finished training the {opt['model_type']} model in {t_fit} seconds. + Extraction of bacteria from {opt['metagenome_name']} dataset was then executed in {t_clf} seconds. + """) # Argument parsing from CLI ################################################################################ if __name__ == "__main__": parser = argparse.ArgumentParser(description='This script trains a model and extracts bacteria / host sequences.') + # Database parser.add_argument('-db','--data_bacteria', required=True, type=Path, help='PATH to a npz file containing the data corresponding to the k-mers profile for the bacteria database') parser.add_argument('-dh','--data_host', default=None, type=Path, help='PATH to a npz file containing the data corresponding to the k-mers profile for the host') - parser.add_argument('-mg','--data_metagenome', required=True, type=Path, help='PATH to a npz file containing the data corresponding to the k-mers profile for the metagenome to classify') - parser.add_argument('-dt','--database_name', required=True, help='Name of the bacteria database used to name files') - parser.add_argument('-ds','--host_name', default=None, help='Name of the host database used to name files') + parser.add_argument('-dn','--database_name', required=True, help='Name of the bacteria database used to name files') + parser.add_argument('-hn','--host_name', default=None, help='Name of the host database used to name files') + # Dataset + parser.add_argument('-dm','--data_metagenome', required=True, type=Path, help='PATH to a npz file containing the data corresponding to the k-mers profile for the metagenome to classify') parser.add_argument('-mn','--metagenome_name', required=True, help='Name of the metagenome to classify used to name files') + # Parameters parser.add_argument('-model','--model_type', default=None, choices=[None,'onesvm','linearsvm','attention','lstm','deeplstm'], help='The type of model to train') 
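# After the refactor above, the command-line scripts build a plain mapping of
# named Ray Datasets and hand it to ClassificationMethods instead of letting
# the class load data itself. A minimal sketch of that mapping, assuming db_ds,
# val_ds and test_ds come from verify_load_db / split_sim_dataset as in the
# scripts:

datasets = {
    'train': db_ds,
    'validation': val_ds,
}
# cross_validation additionally expects a held-out split:
cv_datasets = {**datasets, 'test': test_ds}
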
parser.add_argument('-bs','--batch_size', default=32, type=int, help='Size of the batch size to use, defaults to 32') parser.add_argument('-e','--training_epochs', default=100, type=int, help='The number of training iterations for the neural networks models if one ise chosen, defaults to 100') - parser.add_argument('-v','--verbose', action='store_true', help='Should the program be verbose') parser.add_argument('-o','--outdir', required=True, type=Path, help='PATH to a directory on file where outputs will be saved') parser.add_argument('-wd','--workdir', default='/tmp/spill', type=Path, help='Optional. Path to a working directory where Ray Tune will output and spill tuning data') args = parser.parse_args() diff --git a/src/Caribou_extraction_train_cv.py b/src/Caribou_extraction_train_cv.py index 2a77471..1c73cad 100644 --- a/src/Caribou_extraction_train_cv.py +++ b/src/Caribou_extraction_train_cv.py @@ -6,12 +6,13 @@ from time import time from pathlib import Path from models.reads_simulation import split_sim_dataset -from models.classification_old import ClassificationMethods +from models.classification import ClassificationMethods __author__ = "Nicolas de Montigny" __all__ = ['bacteria_extraction_train_cv'] +TRAINING_DATASET_NAME = 'train' VALIDATION_DATASET_NAME = 'validation' TEST_DATASET_NAME = 'test' @@ -31,68 +32,66 @@ def bacteria_extraction_train_cv(opt): # Data loading ################################################################################ - if opt['data_host'] is not None: + if opt['model_type'] != 'onesvm': + if opt['data_host'] is not None: + db_data, db_ds = verify_load_host_merge(opt['data_bacteria'], opt['data_host']) + db_name = 'host_merged' + else: + db_data, db_ds = verify_load_db(opt['data_bacteria']) + db_name = opt['database_name'] + + test_ds, test_data = split_sim_dataset(db_ds, db_data, f'{TEST_DATASET_NAME}_{db_name}') + val_ds, val_data = split_sim_dataset(db_ds, db_data, f'{VALIDATION_DATASET_NAME}_{db_name}') + else: db_data, db_ds = verify_load_host_merge(opt['data_bacteria'], opt['data_host']) db_name = 'host_merged' - else: - db_data, db_ds = verify_load_db(opt['data_bacteria']) - db_name = opt['dataset_name'] - k_length = len(db_data['kmers'][0]) + test_ds, test_data = split_sim_dataset(db_ds, db_data, f'{TEST_DATASET_NAME}_{db_name}') + val_ds, val_data = split_sim_dataset(db_ds, db_data, f'{VALIDATION_DATASET_NAME}_{db_name}') + + db_data, db_ds = verify_load_db(opt['data_bacteria']) + db_name = opt['database_name'] - test_ds, test_data = split_sim_dataset(db_ds, db_data, f'{TEST_DATASET_NAME}_{db_name}') - val_ds, val_data = split_sim_dataset(db_ds, db_data, f'{VALIDATION_DATASET_NAME}_{db_name}') + datasets = { + TRAINING_DATASET_NAME : db_ds, + TEST_DATASET_NAME : test_ds, + VALIDATION_DATASET_NAME : val_ds + } # Training and cross-validation of models for bacteria extraction / host removal ################################################################################ - t_start = time() - - if opt['host_name'] is None: - ClassificationMethods( - database_k_mers = (db_data, db_ds), - k = k_length, - outdirs = outdirs, - database = opt['database_name'], - classifier_binary = opt['model_type'], - taxa = 'domain', - batch_size = opt['batch_size'], - training_epochs = opt['training_epochs'], - verbose = opt['verbose'], - cv = True - ).execute_training() - else: - ClassificationMethods( - database_k_mers = (db_data, db_ds), - k = k_length, - outdirs = outdirs, - database = opt['database_name'], - classifier_binary = opt['model_type'], - taxa = 
'domain', - batch_size = opt['batch_size'], - training_epochs = opt['training_epochs'], - verbose = opt['verbose'], - cv = True - ).execute_training() - - t_end = time() - t_classify = t_end - t_start - print( - f"Caribou finished training and cross-validating the {opt['model_type']} model in {t_classify} seconds") + clf = ClassificationMethods( + db_data = db_data, + outdirs = outdirs, + db_name = opt['database_name'], + clf_binary = opt['model_type'], + taxa = 'domain', + batch_size = opt['batch_size'], + training_epochs = opt['training_epochs'] + ) + + t_s = time() + + cv_scores = clf.cross_validation(datasets) + + t_clf = time() - t_s + print(f"Caribou finished training and cross-validating the {opt['model_type']} model in {t_clf} seconds") # Argument parsing from CLI ################################################################################ if __name__ == "__main__": parser = argparse.ArgumentParser(description='This script trains and cross-validates a model for the bacteria extraction / host removal step.') + # Database parser.add_argument('-db','--data_bacteria', required=True, type=Path, help='PATH to a npz file containing the data corresponding to the k-mers profile for the bacteria database') parser.add_argument('-dh','--data_host', default=None, type=Path, help='PATH to a npz file containing the data corresponding to the k-mers profile for the host') - parser.add_argument('-dt','--database_name', required=True, help='Name of the bacteria database used to name files') - parser.add_argument('-ds','--host_name', default=None, help='Name of the host database used to name files') + parser.add_argument('-dn','--database_name', required=True, help='Name of the bacteria database used to name files') + parser.add_argument('-hn','--host_name', default=None, help='Name of the host database used to name files') + # Parameters parser.add_argument('-model','--model_type', required = True, choices=['onesvm','linearsvm','attention','lstm','deeplstm'], help='The type of model to train') parser.add_argument('-bs','--batch_size', default=32, type=int, help='Size of the batch size to use, defaults to 32') parser.add_argument('-e','--training_epochs', default=100, type=int, help='The number of training iterations for the neural networks models if one is chosen, defaults to 100') - parser.add_argument('-v','--verbose', action='store_true', help='Should the program be verbose') parser.add_argument('-o','--outdir', required=True, type=Path, help='PATH to a directory on file where outputs will be saved') parser.add_argument('-wd','--workdir', default='/tmp/spill', type=Path, help='Optional. 
Path to a working directory where Ray Tune will output and spill tuning data') args = parser.parse_args() diff --git a/src/models/classification.py b/src/models/classification.py index 6a0cbc0..cbad2be 100644 --- a/src/models/classification.py +++ b/src/models/classification.py @@ -5,15 +5,13 @@ import numpy as np import pandas as pd -from glob import glob -from typing import Dict -from shutil import rmtree -from utils import load_Xy_data +from warnings import warn +from typing import Dict, List from models.sklearn.models import SklearnModel from models.kerasTF.models import KerasTFModel -# Simulation class -from models.reads_simulation import readsSimulation +# CV metrics +from sklearn.metrics import precision_recall_fscore_support __author__ = 'Nicolas de Montigny' @@ -22,20 +20,11 @@ TRAINING_DATASET_NAME = 'train' VALIDATION_DATASET_NAME = 'validation' TEST_DATASET_NAME = 'test' +TENSOR_COLUMN_NAME = '__value__' class ClassificationMethods(): """ - Utilities class for classifying sequences from metagenomes using ray - - ---------- - Attributes - ---------- - - classified_data : dictionary - Dictionary containing the classified data for each classified taxonomic level - - models : dictionary - Dictionary containing the trained models for each taxonomic level + Class for classifying sequences from metagenomes in a recursive manner ---------- Methods @@ -52,320 +41,387 @@ class ClassificationMethods(): """ def __init__( self, - database_k_mers: Dict, - k: int, + db_data: Dict, outdirs: Dict, - database: str, - classifier_binary: str = 'deeplstm', - classifier_multiclass: str = 'widecnn', - taxa: str = None, - threshold: float = 0.8, + db_name: str, + clf_binary: str = None, + clf_multiclass: str = None, + taxa: [str, List] = None, batch_size: int = 32, - training_epochs: int = 100, - verbose: bool = True, - cv: bool = False + training_epochs: int = 100 ): # Parameters - self._k = k - self._cv = cv self._taxas = taxa self._outdirs = outdirs - self._database = database - self._verbose = verbose - self._threshold = threshold - self._classifier_binary = classifier_binary - self._classifier_multiclass = classifier_multiclass + self._database = db_name + self._database_data = db_data + self._classifier_binary = clf_binary + self._classifier_multiclass = clf_multiclass self._batch_size = batch_size self._training_epochs = training_epochs - # Initialize with values - self.classified_data = { - 'sequence': [], - 'classification' : None, - 'classified_ids' : [], - 'unknown_ids' : [] - } - # Empty initializations - self.models = {} - self._host = False - self._taxas_order = [] - self._host_data = None - self._database_data = None - self._training_datasets = None - self._merged_training_datasets = None - self._merged_database_host = None - self.previous_taxa_unclassified = None - # Extract database data - if isinstance(database_k_mers, tuple): - self._host = True - self._database_data = database_k_mers[0] - self._host_data = database_k_mers[1] - else: - self._database_data = database_k_mers - # Remove 'id' from kmers if present - if 'id' in self._database_data['kmers']: - self._database_data['kmers'].remove('id') - if self._host and 'id' in self._host_data['kmers']: - self._host_data['kmers'].remove('id') - # Assign taxas order for top-down strategy - self._taxas_order = self._database_data['taxas'].copy() - self._taxas_order.reverse() - # Automatic executions - self._verify_assign_taxas(taxa) + # Init not fitted + self.is_fitted = False # Public functions 
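# A short usage sketch of the public interface declared below, mirroring the
# way the refactored Caribou_classification.py and Caribou_extraction.py
# scripts call it. The argument values are illustrative, and db_data, outdirs,
# db_ds, val_ds and metagenome_ds stand in for objects loaded earlier in those
# scripts:

clf = ClassificationMethods(
    db_data=db_data,            # k-mers profile metadata loaded from .npz
    outdirs=outdirs,
    db_name='my_database',      # illustrative name
    clf_multiclass='sgd',       # or clf_binary='linearsvm' for domain-level extraction
    taxa='species',             # illustrative taxonomic level
    batch_size=32,
    training_epochs=100,
)
clf.fit({'train': db_ds, 'validation': val_ds})
predictions = clf.predict(metagenome_ds)
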
######################################################################################################### -# TODO: Revise documentation in heading -# TODO: Remove parameters from global if they are only required for certain functions -# TODO: Finish transfering the functions & calls from the old version -# TODO: Validation of params before execution of private functions - def fit(self, datasets, ): + + def fit(self, datasets): """ - Wrapper function to call the fitting method + Public function to call the fitting method after validation of parameters """ - # TODO: Pass training/validation data here - - def predict(self): + self._valid_assign_taxas() + self._valid_classifier() + tax_map = self._verify_model_trained() + + self._fit(datasets, tax_map) + + def predict(self, dataset): """ - Wrapper function to call the predicting method + Public function to call the predicting method after validation of parameters """ - # TODO: Pass data to predict here + model_mapping = self._verify_load_model() + predictions = self._predict(dataset, model_mapping) + + return predictions - def fit_predict(self): + def fit_predict(self, datasets, predict_ds): """ - Wrapper function for calling fit and predict + Public function for calling fit and predict after validation of parameters """ - # TODO: Pass training/validation data here - # TODO: Pass data to predict here + self._valid_assign_taxas() + self._valid_classifier() + tax_map = self._verify_model_trained() + + self._fit(datasets, tax_map) + + model_mapping = self._verify_load_model() + predictions = self._predict(predict_ds, model_mapping) - def cross_validation(self): + return predictions + + def cross_validation(self, datasets): """ - Wrapper function to call the cross-validation method + Public function to call the cross-validation method after validation of parameters + Executes cross-validation of a model by fitting it and predicting over a test dataset """ - # TODO: Pass training/validation data here - # TODO: Pass testing data here + + if isinstance(self._taxas, str): + self._valid_assign_taxas() + tax_map = self._verify_model_trained() + + test_ds = datasets.pop(TEST_DATASET_NAME) + y_true, test_ds = self._get_true_classif(test_ds, self._taxas) + + self._fit(datasets, tax_map) + + model_mapping = self._verify_load_model() + y_pred = self._cv_predict(test_ds, model_mapping) + cv_scores = self._score_cv(y_true, y_pred, self._taxas[0]) + + return cv_scores + else: + raise ValueError('Cross-validation can only be done on one taxa, please pass one taxa while initiating the ClassificationMethods object') + # Private principal functions ######################################################################################################### -# TODO: Pass training/validation data here - def _fit(self): + + def _fit(self, datasets, tax_map): """ Fit the given model to the training dataset """ - for taxa in self._taxas_order: - if taxa in self._taxas: - if taxa in ['domain','bacteria','host']: - clf = self._classifier_binary - else: - clf = self._classifier_multiclass - self._data_file = os.path.join(self._outdirs['data_dir'], f'Xy_{taxa}_database_K{self._k}_{clf}_{self._database}_data.npz') - self._model_file = os.path.join(self._outdirs['models_dir'], f'{clf}_{taxa}.pkl') - train = self._verify_load_data_model(self._data_file, self._model_file, taxa) - if train: - if taxa in ['domain','bacteria','host']: - self._binary_training(taxa) - else: - self._multiclass_training(taxa) - -# TODO: Pass data to predict here - def _predict(self, data2classify): - """ 
- Predict the given data using the trained model - """ - files_lst = glob(os.path.join(data2classify['profile'],'*.parquet')) - df = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - ids = data2classify['ids'] - if len(self.classified_data['sequence']) == 0: - raise ValueError('Please train a model before executing classification') - for i, taxa in enumerate(self.classified_data['sequence']): + for taxa, file in tax_map.items(): + if taxa in ['domain','bacteria','host']: + self._binary_training(datasets, taxa, file) + else: + self._multiclass_training(datasets, taxa, file) + self.is_fitted = True + + def _predict(self, ds, model_map): + """ + Predict the given data using the trained model in a recursive manner over taxas using a top-down approach + Returns a mapping of the predictions made by the models for the targeted taxas + """ + mapping = {} + if self.is_fitted: try: - if i == 0: - df = self._classify_first(df, taxa, ids, data2classify['profile']) - else: - df = self._classify_subsequent(df, taxa, ids, data2classify['profile']) + for taxa, model in model_map.items(): + predictions = model.predict(ds) # np.array + ds, predictions, ids = self._remove_unknown(ds, predictions) + file = self._save_dataset(ds, taxa) + mapping[taxa] = { + 'classification' : predictions, + 'ids' : ids, + 'dataset' : file + } + return mapping except ValueError: print('Stopping classification prematurelly because there are no more sequences to classify') - return taxa - return None - - def _cross_validation(self): + return mapping + else: + raise ValueError('The model was not fitted yet! Please call either the `fit` or the `fit_predict` method before making predictions') + + def _cv_predict(self, ds, model_map): """ - Execute cross-validation of a model by fitting a model and predicting over a test dataset + Predict the given data using the trained model for cross-validation + Returns a mapping of the predictions made by the models for the targeted taxas """ + mapping = {} + for taxa, model in model_map.items(): + mapping[taxa] = model.predict(ds) # np.array + return mapping # Private training secondary functions ######################################################################################################### -# TODO: Remove data loading & verification from inside these functions - def _binary_training(self, taxa): + + def _binary_training(self, datasets, taxa, file): print('_binary_training') - self._verify_classifier_binary() if self._classifier_binary == 'onesvm': - self.models[taxa] = SklearnModel( + model = SklearnModel( + self._classifier_binary, + self._outdirs['models_dir'], + self._batch_size, + self._training_epochs, + taxa, + self._database_data['kmers'] + ) + elif self._classifier_binary == 'linearsvm': + model = SklearnModel( self._classifier_binary, - self._database, self._outdirs['models_dir'], - self._outdirs['results_dir'], self._batch_size, self._training_epochs, - self._k, taxa, - self._database_data['kmers'], - self._verbose + self._database_data['kmers'] ) else: - if self._classifier_binary == 'linearsvm': - self.models[taxa] = SklearnModel( - self._classifier_binary, - self._database, - self._outdirs['models_dir'], - self._outdirs['results_dir'], - self._batch_size, - self._training_epochs, - self._k, - taxa, - self._merged_database_host['kmers'], - self._verbose - ) - else: - self.models[taxa] = KerasTFModel( - self._classifier_binary, - self._database, - self._outdirs['models_dir'], - self._outdirs['results_dir'], - self._batch_size, - self._training_epochs, - 
self._k, - taxa, - self._merged_database_host['kmers'], - self._verbose - ) - self.models[taxa].preprocess(self._merged_training_datasets['train']) - self.models[taxa].train(self._merged_training_datasets, self._merged_database_host, self._cv) - - self._save_model(self._model_file, taxa) - - def _multiclass_training(self, taxa): + model = KerasTFModel( + self._classifier_binary, + self._outdirs['models_dir'], + self._batch_size, + self._training_epochs, + taxa, + self._database_data['kmers'] + ) + model.preprocess(datasets[TRAINING_DATASET_NAME]) + model.fit(datasets) + + self._save_model(model, file) + + def _multiclass_training(self, datasets, taxa, file): print('_multiclass_training') - self._verify_classifier_multiclass() - self._load_training_data() if self._classifier_multiclass in ['sgd','mnb']: - self.models[taxa] = SklearnModel( + model = SklearnModel( self._classifier_multiclass, - self._database, self._outdirs['models_dir'], - self._outdirs['results_dir'], self._batch_size, self._training_epochs, - self._k, taxa, - self._database_data['kmers'], - self._verbose + self._database_data['kmers'] ) else: - self.models[taxa] = KerasTFModel( + model = KerasTFModel( self._classifier_multiclass, - self._database, self._outdirs['models_dir'], - self._outdirs['results_dir'], self._batch_size, self._training_epochs, - self._k, taxa, - self._database_data['kmers'], - self._verbose + self._database_data['kmers'] ) - self.models[taxa].preprocess(self._training_datasets['train']) - self.models[taxa].train(self._training_datasets, self._database_data, self._cv) - self._save_model(self._model_file, taxa) + model.preprocess(datasets[TRAINING_DATASET_NAME]) + model.fit(datasets) + + self._save_model(model, file) # Private predicting secondary functions ######################################################################################################### -# TODO: Revise these functions to parallelise with Ray + ease process - # Classify sequences for first iteration - def _classify_first(self, df, taxa, ids, df_file): - print('_classify_first') - try: - pred_df = self._predict_sequences(df, taxa, ids) - not_pred_df = pred_df[pred_df[taxa] == 'unknown'] - pred_df = pred_df[pred_df[taxa] != 'unknown'] - - self.classified_data['classified_ids'] = list(pred_df['id'].values) - self.classified_data['unknown_ids'] = list(not_pred_df['id'].values) - - self.classified_data['classification'] = pred_df - - if taxa == 'domain': - if self._host == True: - pred_df_host = pred_df[pred_df['domain'] == 'host'] - pred_df = pred_df[pred_df['domain'] != 'host'] - classified_host, classified_host_file = self._extract_subset(df, df_file, list(pred_df_host['id'].values), taxa, 'bacteria') - self.classified_data[taxa]['host'] = { - 'classification' : classified_host_file - } - classified, classified_file = self._extract_subset(df, df_file, self.classified_data['classified_ids'], taxa, 'bacteria') - self.classified_data[taxa]['bacteria'] = classified_file - not_classified, not_classified_file = self._extract_subset(df, df_file, self.classified_data['unknown_ids'], taxa, 'unknown') - self.classified_data[taxa]['unknown'] = not_classified_file - return classified - else: - classified, classified_file = self._extract_subset(df, df_file, self.classified_data['classified_ids'], taxa, 'bacteria') - self.classified_data[taxa]['classified'] = classified_file - not_classified, not_classified_file = self._extract_subset(df, df_file, self.classified_data['unknown_ids'], taxa, 'unknown') - self.classified_data[taxa]['unknown'] = 
not_classified_file - return classified - except: - raise ValueError('No sequences to classify for {}.'.format(taxa)) - - # Classify sequences according to passed taxa and model - def _classify_subsequent(self, df, taxa, ids, df_file): - print('_classify_subsequent') - try: - pred_df = self._predict_sequences(df, taxa, ids) - not_pred_df = pred_df[pred_df[taxa] == 'unknown'] - pred_df = pred_df[pred_df[taxa] != 'unknown'] - - self.classified_data['classification'] = self.classified_data['classification'].join(pred_df, how = 'outer', on = 'id') - - classified, classified_file = self._extract_subset(df, df_file, list(pred_df['id'].values), taxa, 'classified') - self.classified_data[taxa]['classified'] = classified_file - not_classified, not_classified_file = self._extract_subset(df, df_file, list(not_pred_df['id'].values), taxa, 'unknown') - self.classified_data[taxa]['unknown'] = not_classified_file - - return classified - except: - raise ValueError('No sequences to classify for {}.'.format(taxa)) - - # Make predictions - def _predict_sequences(self, df, taxa, ids): - print('_predict_sequences') - try: - predictions = self.models[taxa].predict(df, self._threshold) - pred_df = pd.DataFrame({'id': ids, taxa: predictions.values}) - - taxa_pos = self.classified_data['sequence'].index(taxa) - lst_taxa = self.classified_data['sequence'][taxa_pos:] - db_df = pd.DataFrame( - self._database_data['classes'], - columns=self._database_data['taxas'] - )[lst_taxa] - pred_df = pred_df.merge(db_df, on=taxa, how='left') - - return pred_df - except ValueError: - raise ValueError('No sequences to classify for {}.'.format(taxa)) - - # Extract subset of classified or not classified sequences - def _extract_subset(self, df, df_file, ids, taxa, status): - print('_extract_subset') - clf_file = df_file + '_{}_{}'.format(taxa, status) - rows_clf = [] - for row in df.iter_rows(): - if row['id'] in ids: - rows_clf.append(row) - df_clf = ray.data.from_items(rows_clf) - if df_clf.count() > 0: - df_clf.write_parquet(clf_file) - return df_clf, clf_file - - # Helper functions + + def _remove_unknown(self, ds, predict): + ids = [] + for row in ds.iter_rows(): + ids.append(row['id']) + mapping = pd.DataFrame({ + 'ids' : ids, + 'predictions' : predict + }) + mapping = mapping[mapping['predictions'] != -1] + ids = mapping['ids'] + predict = mapping['predictions'] + + def remove_unknown(df): + df = df[df['ids'].isin(ids)] + return df + + ds = ds.map_batches(remove_unknown, batch_format = 'pandas') + + return ds, predict, ids + + # Private cross-validation secondary methods ######################################################################################################### + def _get_true_classif(self, ds, taxas): + """ + Extract the true classification of the dataset used for cross-validation + """ + classif = {taxa : [] for taxa in taxas} + + cols2drop = [col for col in ds.schema().names if col not in ['id', taxas[0]]] + classif_ds = ds.drop_columns(cols2drop) + + cols2drop = [col for col in ds.schema().names if col not in ['id',TENSOR_COLUMN_NAME]] + ds = ds.drop_columns(cols2drop) + + for row in classif_ds.iter_rows(): + for taxa in taxas: + classif[taxa].append(row[taxa]) + + return classif, ds + + def _score_cv(self, y_true, y_pred, taxa): + """ + Compute the cross validation scores + """ + if self._classifier_binary is not None: + model = self._classifier_binary + else : + model = self._classifier_multiclass + + cv_csv = os.path.join(self._outdirs['results_dir'],f'{self._database}_{model}_{taxa}_cv_scores.csv') + + + 
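The scoring that follows reduces to scikit-learn's precision_recall_fscore_support with a weighted average over the lower-cased true and predicted labels of the popped test split. A standalone sketch of that computation with made-up labels (values are illustrative only):

    import pandas as pd
    from sklearn.metrics import precision_recall_fscore_support

    # Illustrative labels; in the patch they come from _get_true_classif and _cv_predict
    y_true = pd.Series(['Bacteroides', 'Prevotella', 'Bacteroides', 'Clostridium']).str.lower()
    y_pred = pd.Series(['Bacteroides', 'Bacteroides', 'Bacteroides', 'Clostridium']).str.lower()

    precision, recall, fscore, _ = precision_recall_fscore_support(
        y_true, y_pred, average = 'weighted', zero_division = 0
    )

    scores = pd.DataFrame({'genus': [precision, recall, fscore]},
                          index = ['Precision', 'Recall', 'F-score'])
    print(scores.T)   # one row per taxa, the layout written to the *_cv_scores.csv file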
y_compare = pd.DataFrame({ + 'y_true': y_true[taxa], + 'y_pred': y_pred[taxa] + }) + y_compare['y_true'] = y_compare['y_true'].str.lower() + y_compare['y_pred'] = y_compare['y_pred'].str.lower() + y_compare.to_csv(os.path.join(self._outdirs['models_dir'], f'y_compare_{self._database}_{model}_{taxa}.csv')) + + support = precision_recall_fscore_support( + y_compare['y_true'], + y_compare['y_pred'], + average = 'weighted' + ) + + scores = pd.DataFrame({ + taxa : [support[0],support[1],support[2]] + }, + index = ['Precision','Recall','F-score'] + ) + + scores.T.to_csv(cv_csv, index = True) + + return scores + + # Validation & verification methods + ######################################################################################################### + + def _valid_assign_taxas(self): + """ + Validate taxas and assign to class variable + Assign order for top-down strategy + """ + print('_valid_assign_taxas') + if self._taxas is None: + self._taxas = self._database_data['taxas'].copy() + elif isinstance(self._taxas, list): + self._taxas = self._taxas + elif isinstance(self._taxas, str): + self._taxas = [self._taxas] + else: + raise ValueError("Invalid taxa option, it must either be absent/None, be a list of taxas to extract or a string identifiying a taxa to extract") + self._valid_taxas() + self._taxas = [taxa for taxa in self._database_data['taxas'] if taxa in self._taxas] + self._taxas.reverse() + + def _valid_taxas(self): + """ + Validate that selected taxas are in database + """ + print('_valid_taxas') + for taxa in self._taxas: + if taxa not in self._database_data['taxas']: + raise ValueError("Taxa {} not found in database".format(taxa)) + + def _valid_classifier(self): + if self._classifier_binary is not None: + if self._classifier_binary not in ['onesvm','linearsvm','attention','lstm','deeplstm']: + raise ValueError(""" + Invalid classifier option for bacteria extraction! + Models implemented at this moment are : + Classic algorithm : One-class SVM (onesvm) and Linear SVM (linearsvm) + Neural networks : Attention (attention), LSTM (lstm) and Deep LSTM (deeplstm) + """) + if self._classifier_multiclass is not None: + if self._classifier_multiclass not in ['sgd','mnb','lstm_attention','cnn','widecnn']: + raise ValueError(""" + Invalid classifier option for bacteria classification! 
+ Models implemented at this moment are : + Classic algorithm : Stochastic Gradient Descent (sgd) and Multinomial Naïve Bayes (mnb) + Neural networks : Deep hybrid between LSTM and Attention (lstm_attention), CNN (cnn) and Wide CNN (widecnn) + """) + + def _verify_model_trained(self): + """ + Verify if the model is already trained for all desired taxas + Taxas for which a model is already trained will be removed from the list + Returns a mapping of the file per taxa to train + """ + mapping = {} + for taxa in self._taxas: + if taxa in ['domain','bacteria','host']: + clf = self._classifier_binary + else: + clf = self._classifier_multiclass + file = os.path.join(self._outdirs['models_dir'], f'{clf}_{taxa}.pkl') + if not os.path.isfile(file): + mapping[taxa] = file + + return mapping + + def _verify_load_model(self): + """ + Verify if the model is already trained for all desired taxas + Taxas for which no model was not trained will raise a ValueError + Returns a mapping of the model per taxa for predicting + """ + mapping = {} + for taxa in self._taxas: + if taxa in ['domain','bacteria','host']: + clf = self._classifier_binary + else: + clf = self._classifier_multiclass + file = os.path.join(self._outdirs['models_dir'], f'{clf}_{taxa}.pkl') + if not os.path.isfile(file): + raise ValueError(f'No model found for {taxa}') + else: + mapping[taxa] = self._load_model(file, taxa) + return mapping + + def _load_model(self, file, taxa): + """ + Load a model from the specified file + """ + print('_load_model') + with open(file, 'rb') as handle: + return cloudpickle.load(handle) + + def _save_model(self, model, file): + """ + Save a model to a specified file + """ + print('_save_model') + with open(file, 'wb') as handle: + cloudpickle.dump(model, handle) + + def _save_dataset(self, ds, taxa): + """ + Save a dataset to disk and return the filename + """ + if taxa in ['domain','bacteria','host']: + model = self._classifier_binary + else: + model = self._classifier_multiclass + file = os.path.join(self._outdirs['results'], f'data_classified_{model}_{taxa}.parquet') + ds.write_parquet(file) + return file \ No newline at end of file diff --git a/src/models/classification_old.py b/src/models/classification_old.py deleted file mode 100644 index 7638c17..0000000 --- a/src/models/classification_old.py +++ /dev/null @@ -1,327 +0,0 @@ -import os -import ray -import cloudpickle - -import numpy as np -import pandas as pd - -from glob import glob -from shutil import rmtree -from utils import load_Xy_data -from models.sklearn.models import SklearnModel -from models.kerasTF.models import KerasTFModel - -# Simulation class -from models.reads_simulation import readsSimulation - -__author__ = 'Nicolas de Montigny' - -__all__ = ['ClassificationMethods'] - -class ClassificationMethods(): - """ - Utilities class for classifying sequences from metagenomes using ray - - ---------- - Attributes - ---------- - - classified_data : dictionary - Dictionary containing the classified data for each classified taxonomic level - - models : dictionary - Dictionary containing the trained models for each taxonomic level - - ---------- - Methods - ---------- - - execute_training : launch the training of the models for the chosen taxonomic levels - no parameters to pass - - execute_classification : - data2classify : a dictionnary containing the data to classify produced by the function Caribou.src.data.build_data.build_X_data - - """ - def __init__( - self, - database_k_mers, - k, - outdirs, - database, - classifier_binary = 'deeplstm', - 
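Model persistence in the new classification.py is reduced to a cloudpickle round-trip on one '{classifier}_{taxa}.pkl' file per taxa under models_dir. A minimal sketch of that round-trip; the file name below is a hypothetical example and 'model' can be any picklable object:

    import os
    import cloudpickle

    def save_model(model, file):
        # cloudpickle rather than plain pickle: it also serializes locally defined
        # functions and closures, which plain pickle refuses
        with open(file, 'wb') as handle:
            cloudpickle.dump(model, handle)

    def load_model(file):
        with open(file, 'rb') as handle:
            return cloudpickle.load(handle)

    model_file = os.path.join('models_dir', 'sgd_genus.pkl')   # hypothetical per-taxa file name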
classifier_multiclass = 'widecnn', - taxa = None, - threshold = 0.8, - batch_size = 32, - training_epochs = 100, - verbose = True, - cv = False - ): - # Parameters - self._k = k - self._cv = cv - self._taxas = taxa - self._outdirs = outdirs - self._database = database - self._verbose = verbose - self._threshold = threshold - self._classifier_binary = classifier_binary - self._classifier_multiclass = classifier_multiclass - self._batch_size = batch_size - self._training_epochs = training_epochs - # Initialize with values - self.classified_data = { - 'sequence': [], - 'classification' : None, - 'classified_ids' : [], - 'unknown_ids' : [] - } - # Empty initializations - self.models = {} - self._host = False - self._taxas_order = [] - self._host_data = None - self._database_data = None - self._training_datasets = None - self._merged_training_datasets = None - self._merged_database_host = None - self.previous_taxa_unclassified = None - # Extract database data - if isinstance(database_k_mers, tuple): - self._host = True - self._database_data = database_k_mers[0] - self._host_data = database_k_mers[1] - else: - self._database_data = database_k_mers - # Remove 'id' from kmers if present - if 'id' in self._database_data['kmers']: - self._database_data['kmers'].remove('id') - if self._host and 'id' in self._host_data['kmers']: - self._host_data['kmers'].remove('id') - # Assign taxas order for top-down strategy - self._taxas_order = self._database_data['taxas'].copy() - self._taxas_order.reverse() - # Automatic executions - self._verify_assign_taxas(taxa) - - # Main functions - ######################################################################################################### - - # Wrapper function for training and predicting over each known taxa - def execute_training_prediction(self, data2classify): - print('execute_training_prediction') - files_lst = glob(os.path.join(data2classify['profile'],'*.parquet')) - df2classify = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - ids2classify = data2classify['ids'] - for i, taxa in enumerate(self._taxas_order): - if taxa in self._taxas: - # Training - if taxa in ['domain','bacteria','host']: - clf = self._classifier_binary - else: - clf = self._classifier_multiclass - self._data_file = os.path.join(self._outdirs['data_dir'], f'Xy_{taxa}_database_K{self._k}_{clf}_{self._database}_data.npz') - self._model_file = os.path.join(self._outdirs['models_dir'], f'{clf}_{taxa}.pkl') - train = self._verify_load_data_model(self._data_file, self._model_file, taxa) - if train: - self._train_model(taxa) - # Predicting - try: - if i == 0: - df2classify = self._classify_first(df2classify, taxa, ids2classify, data2classify['profile']) - else: - df2classify = self._classify_subsequent(df2classify, taxa, ids2classify, data2classify['profile']) - except ValueError: - print('Stopping classification prematurelly because there are no more sequences to classify') - return taxa - return None - - # Utils functions - ######################################################################################################### - - # Verify taxas and assign to class variable - def _verify_assign_taxas(self, taxa): - print('_verify_assign_taxas') - if taxa is None: - self._taxas = self._database_data['taxas'].copy() - elif isinstance(taxa, list): - self._taxas = taxa - elif isinstance(taxa, str): - self._taxas = [taxa] - else: - raise ValueError("Invalid taxa option, it must either be absent/None, be a list of taxas to extract or a string identifiying a taxa to 
extract") - self._verify_taxas() - - # Verify if selected taxas are in database - def _verify_taxas(self): - print('_verify_taxas') - for taxa in self._taxas: - if taxa not in self._database_data['taxas']: - raise ValueError("Taxa {} not found in database".format(taxa)) - - # Caller function for verifying if the data and model already exist - def _verify_load_data_model(self, data_file, model_file, taxa): - print('_verify_load_data_model') - self._verify_files(data_file, taxa) - return self._verify_load_model(model_file, taxa) - - # Load extracted data if already exists - def _verify_files(self, file, taxa): - print('_verify_files') - self.classified_data['sequence'].append(taxa) - if os.path.isfile(file): - self.classified_data[taxa] = load_Xy_data(file) - else: - self.classified_data[taxa] = {} - - # Load model if already exists - def _verify_load_model(self, model_file, taxa): - print('_verify_load_model') - if os.path.exists(model_file): - with open(model_file, 'rb') as f: - self.models[taxa] = cloudpickle.load(f) - return False - else: - return True - - def _save_model(self, model_file, taxa): - print('_save_model') - with open(model_file, 'wb') as f: - cloudpickle.dump(self.models[taxa], f) - - def _verify_classifier_binary(self): - print('_verify_classifier_binary') - if self._classifier_binary == 'onesvm': - if self._cv == True and self._host == True: - pass - elif self._cv == True and self._host == False: - raise ValueError('Classifier One-Class SVM cannot be cross-validated with bacteria data only!\nEither add host data from parameters or choose to predict directly using this method') - elif self._cv == False and self._host == True: - raise ValueError('Classifier One-Class SVM cannot classify with host data!\nEither remove host data from parameters or choose another bacteria extraction method') - elif self._cv == False and self._host == False: - pass - elif self._classifier_binary == 'onesvm' and self._host == False: - pass - elif self._classifier_binary in ['linearsvm','attention','lstm','deeplstm'] and self._host == True: - pass - elif self._classifier_binary in ['linearsvm','attention','lstm','deeplstm'] and self._host == False: - raise ValueError('Classifier {} cannot classify without host data!\nEither add host data to config file or choose the One-Class SVM classifier'.format(self._classifier_binary)) - else: - raise ValueError('Invalid classifier option for bacteria extraction!\n\tModels implemented at this moment are :\n\tBacteria isolator : One Class SVM (onesvm)\n\tClassic algorithm : Linear SVM (linearsvm)\n\tNeural networks : Attention (attention), Shallow LSTM (lstm) and Deep LSTM (deeplstm)') - - def _verify_classifier_multiclass(self): - print('_verify_classifier_multiclass') - if self._classifier_multiclass in ['sgd','mnb','lstm_attention','cnn','widecnn']: - pass - else: - raise ValueError('Invalid classifier option for bacteria classification!\n\tModels implemented at this moment are :\n\tClassic algorithm : Stochastic Gradient Descent (sgd) and Multinomial Naïve Bayes (mnb)\n\tNeural networks : Deep hybrid between LSTM and Attention (lstm_attention), CNN (cnn) and Wide CNN (widecnn)') - - # Merge database and host reference data for bacteria extraction training - def _merge_database_host(self, database_data, host_data): - print('_merge_database_host') - self._merged_database_host = {} - self._merged_database_host['profile'] = f"{database_data['profile']}_host_merged" # Kmers profile - - if os.path.exists(self._merged_database_host['profile']): - files_lst = 
glob(os.path.join(self._merged_database_host['profile'],'*.parquet')) - df_merged = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - else: - files_lst = glob(os.path.join(database_data['profile'],'*.parquet')) - df_db = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - files_lst = glob(os.path.join(host_data['profile'],'*.parquet')) - df_host = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - - cols2drop = [] - for col in df_db.schema().names: - if col not in ['id','domain','__value__']: - cols2drop.append(col) - df_db = df_db.drop_columns(cols2drop) - cols2drop = [] - for col in df_host.schema().names: - if col not in ['id','domain','__value__']: - cols2drop.append(col) - df_host = df_host.drop_columns(cols2drop) - - df_merged = df_db.union(df_host) - df_merged.write_parquet(self._merged_database_host['profile']) - - self._merged_database_host['ids'] = np.concatenate((database_data["ids"], host_data["ids"])) # IDs - self._merged_database_host['kmers'] = database_data["kmers"] # Features - self._merged_database_host['taxas'] = ['domain'] # Known taxas for classification - self._merged_database_host['fasta'] = (database_data['fasta'], host_data['fasta']) # Fasta file needed for reads simulation - - return df_merged - - # Load, merge db + host & simulate validation / test datasets - def _load_training_data_merged(self, taxa): - print('_load_training_data_merged') - if self._classifier_binary == 'onesvm' and taxa == 'domain': - files_lst = glob(os.path.join(self._database_data['profile'],'*.parquet')) - df_train = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - df_train = df_train.map_batches(convert_archaea_bacteria, batch_format = 'pandas') - df_val_test = self._merge_database_host(self._database_data, self._host_data) - df_val_test = df_val_test.map_batches(convert_archaea_bacteria, batch_format = 'pandas') - df_val = self.split_sim_cv_ds(df_val_test,self._merged_database_host, 'merged_validation') - self._merged_training_datasets = {'train': df_train, 'validation': df_val} - if self._cv: - df_test = self.split_sim_cv_ds(df_val_test,self._merged_database_host, 'merged_test') - self._merged_training_datasets['test'] = df_test - else: - df_train = self._merge_database_host(self._database_data, self._host_data) - df_train = df_train.map_batches(convert_archaea_bacteria, batch_format = 'pandas') - df_val = self.split_sim_cv_ds(df_train,self._merged_database_host, 'merged_validation') - self._merged_training_datasets = {'train': df_train, 'validation': df_val} - if self._cv: - df_test = self.split_sim_cv_ds(df_train,self._merged_database_host, 'merged_test') - self._merged_training_datasets['test'] = df_test - - # Load db & simulate validation / test datasets - def _load_training_data(self): - print('_load_training_data') - files_lst = glob(os.path.join(self._database_data['profile'],'*.parquet')) - df_train = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - df_train = df_train.map_batches(convert_archaea_bacteria, batch_format = 'pandas') - df_val = self.split_sim_cv_ds(df_train,self._database_data, 'validation') - self._training_datasets = {'train': df_train, 'validation': df_val} - if self._cv: - df_test = self.split_sim_cv_ds(df_train,self._database_data, 'test') - self._training_datasets['test'] = df_test - - def _sim_4_cv(self, df, kmers_ds, name): - print('_sim_4_cv') - cols = ['id'] - cols.extend(kmers_ds['taxas']) - cls = pd.DataFrame(columns = cols) - for batch in 
df.iter_batches(batch_format = 'pandas'): - cls = pd.concat([cls, batch[cols]], axis = 0, ignore_index = True) - - sim_outdir = os.path.dirname(kmers_ds['profile']) - cv_sim = readsSimulation(kmers_ds['fasta'], cls, list(cls['id']), 'miseq', sim_outdir, name) - sim_data = cv_sim.simulation(self._k, kmers_ds['kmers']) - files_lst = glob(os.path.join(sim_data['profile'],'*.parquet')) - df = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - return df - - def split_sim_cv_ds(self, ds, data, name): - ds_path = os.path.join( - os.path.dirname(data['profile']), - f'Xy_genome_simulation_{name}_data_K{len(data["kmers"][0])}' - ) - if os.path.exists(ds_path): - files_lst = glob(os.path.join(ds_path,'*.parquet')) - cv_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - else: - cv_ds = ds.random_sample(0.1) - if cv_ds.count() == 0: - nb_smpl = round(ds.count() * 0.1) - cv_ds = ds.random_shuffle().limit(nb_smpl) - cv_ds = self._sim_4_cv(cv_ds, data, name) - return cv_ds - -# Helper functions outside of class -############################################################################### - -def convert_archaea_bacteria(df): - df.loc[df['domain'].str.lower() == 'archaea', 'domain'] = 'Bacteria' - return df \ No newline at end of file diff --git a/src/models/encoders/model_label_encoder.py b/src/models/encoders/model_label_encoder.py index 2ed90e1..7084b2b 100644 --- a/src/models/encoders/model_label_encoder.py +++ b/src/models/encoders/model_label_encoder.py @@ -2,6 +2,7 @@ from functools import partial from typing import Dict, List, Optional +import ray import numpy as np import pandas as pd import pandas.api.types diff --git a/src/models/encoders/one_hot_tensor_encoder.py b/src/models/encoders/one_hot_tensor_encoder.py index 8acd7fe..3ae7950 100644 --- a/src/models/encoders/one_hot_tensor_encoder.py +++ b/src/models/encoders/one_hot_tensor_encoder.py @@ -23,7 +23,7 @@ def _fit(self, dataset: Dataset) -> Preprocessor: [self.column], encode_lists = False, ) - + return self def _transform_pandas(self, df: pd.DataFrame) -> pd.DataFrame: diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index cd57ef5..ff51baa 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -73,7 +73,7 @@ class KerasTFModel(ModelsUtils): train : train a model using the given datasets predict : predict the classes of a dataset - df : ray.data.Dataset + ds : ray.data.Dataset Dataset containing K-mers profiles of sequences to be classified threshold : float @@ -86,27 +86,19 @@ class KerasTFModel(ModelsUtils): def __init__( self, classifier, - dataset, outdir_model, - outdir_results, batch_size, training_epochs, - k, taxa, - kmers_list, - verbose + kmers_list ): super().__init__( classifier, - dataset, outdir_model, - outdir_results, batch_size, training_epochs, - k, taxa, - kmers_list, - verbose + kmers_list ) # Parameters # Initialize hidden @@ -141,11 +133,11 @@ def __init__( elif self.classifier == 'widecnn': print('Training multiclass classifier based on Wide CNN Network') - def preprocess(self, df): + def preprocess(self, ds): print('preprocess') labels = [] encoded = [] - for row in df.iter_rows(): + for row in ds.iter_rows(): labels.append(row[self.taxa]) self._nb_classes = len(np.unique(labels)) if self._nb_classes == 2: @@ -164,10 +156,10 @@ def preprocess(self, df): TensorRDFFeaturesSelection(self.kmers, self.taxa), ) - self._encoder.fit(df) - df = self._preprocessor.fit_transform(df) + self._encoder.fit(ds) + ds = 
self._preprocessor.fit_transform(ds) self._reductor = TensorTruncatedSVDReduction(self.kmers) - self._reductor.fit(df) + self._reductor.fit(ds) # Labels mapping if self._nb_classes == 2: labels = list(self._encoder.stats_[f'unique_values({self.taxa})'].keys()) @@ -186,29 +178,8 @@ def _label_decode(self, predict): return np.array(decoded) - def train(self, datasets, kmers_ds, cv = True): - print('train') - if cv: - self._cross_validation(datasets, kmers_ds) - else: - self._fit_model(datasets) - - def _cross_validation(self, datasets, kmers_ds): - print('_cross_validation') - df_test = datasets.pop('test') - - self._fit_model(datasets) - - y_true = [] - for row in df_test.iter_rows(): - y_true.append(row[self.taxa]) - - y_pred = self.predict(df_test.drop_columns([self.taxa]), threshold = 0.8) - - self._cv_score(y_true, y_pred) - - def _fit_model(self, datasets): - print('_fit_model') + def fit(self, datasets): + print('fit') # Preprocessing loop for name, ds in datasets.items(): ds = ds.drop_columns(['id']) @@ -249,15 +220,15 @@ def _fit_model(self, datasets): training_result = self._trainer.fit() self._model_ckpt = training_result.best_checkpoints[0][0] - def predict(self, df, threshold=0.8): + def predict(self, ds, threshold=0.8): print('predict') - if df.count() > 0: - if len(df.schema().names) > 1: - col_2_drop = [col for col in df.schema().names if col != TENSOR_COLUMN_NAME] - df = df.drop_columns(col_2_drop) + if ds.count() > 0: + if len(ds.schema().names) > 1: + col_2_drop = [col for col in ds.schema().names if col != TENSOR_COLUMN_NAME] + ds = ds.drop_columns(col_2_drop) # Preprocess - df = self._preprocessor.transform(df) + ds = self._preprocessor.transform(ds) self._predictor = BatchPredictor.from_checkpoint( self._model_ckpt, @@ -265,7 +236,7 @@ def predict(self, df, threshold=0.8): model_definition = lambda: build_model(self.classifier, self._nb_classes, len(self.kmers)) ) predictions = self._predictor.predict( - data = df, + data = ds, batch_size = self.batch_size ) @@ -279,23 +250,23 @@ def predict(self, df, threshold=0.8): # Iterate over batches of predictions to transform probabilities to labels without mapping def _prob_2_cls(self, predictions, threshold): print('_prob_2_cls') - def map_predicted_label_binary(df, threshold): - df = np.ravel(df['predictions']) + def map_predicted_label_binary(ds, threshold): + ds = np.ravel(ds['predictions']) lower_threshold = 0.5 - (threshold * 0.5) upper_threshold = 0.5 + (threshold * 0.5) predict = pd.DataFrame({ - 'proba': df, - 'predicted_label': np.full(len(df), -1) + 'proba': ds, + 'predicted_label': np.full(len(ds), -1) }) predict.loc[predict['proba'] >= upper_threshold, 'predicted_label'] = 1 predict.loc[predict['proba'] <= lower_threshold, 'predicted_label'] = 0 return {'predictions' : predict['predicted_label'].to_numpy(dtype = np.int32)} - def map_predicted_label_multiclass(df, threshold): - df = df['predictions'] + def map_predicted_label_multiclass(ds, threshold): + ds = ds['predictions'] pred = pd.DataFrame({ - 'best_proba': [np.max(arr) for arr in df], - 'predicted_label' : [np.argmax(arr) for arr in df] + 'best_proba': [np.max(arr) for arr in ds], + 'predicted_label' : [np.argmax(arr) for arr in ds] }) pred.loc[pred['best_proba'] < threshold, 'predicted_label'] = -1 diff --git a/src/models/models_utils.py b/src/models/models_utils.py index c38ca25..9ccc27d 100644 --- a/src/models/models_utils.py +++ b/src/models/models_utils.py @@ -5,9 +5,6 @@ # Class construction from abc import ABC, abstractmethod -# CV metrics -from 
sklearn.metrics import precision_recall_fscore_support - __author__ = 'Nicolas de Montigny' __all__ = ['ModelsUtils'] @@ -43,14 +40,11 @@ class ModelsUtils(ABC): Methods ---------- - train : only train or cross-validate training of classifier + fit : only train or cross-validate training of classifier X : ray.data.Dataset Dataset containing the K-mers profiles of sequences for learning y : ray.data.Dataset Dataset containing the classes of sequences for learning - cv : boolean - Should cross-validation be verified or not. - Defaults to True. predict : abstract method to predict the classes of a dataset @@ -58,31 +52,22 @@ class ModelsUtils(ABC): def __init__( self, classifier, - dataset, outdir_model, - outdir_results, batch_size, training_epochs, - k, taxa, - kmers_list, - verbose + kmers_list ): # Parameters self.classifier = classifier - self.dataset = dataset - self.outdir_results = outdir_results self.batch_size = batch_size - self.k = k self.taxa = taxa self.kmers = kmers_list - self.verbose = verbose # Initialize hidden self._nb_kmers = len(kmers_list) self._training_epochs = training_epochs # Initialize empty self._labels_map = None - self._predict_ids = [] # Initialize Ray variables self._clf = None self._encoder = None @@ -93,53 +78,17 @@ def __init__( self._train_params = {} self._predictor = None self._workdir = outdir_model - # Files - self._cv_csv = os.path.join(self.outdir_results,'{}_{}_K{}_cv_scores.csv'.format(self.classifier, self.taxa, self.k)) - - @abstractmethod - def preprocess(self, df): - """ - """ @abstractmethod - def train(self): + def preprocess(self, ds): """ """ @abstractmethod - def _fit_model(self): + def fit(self): """ """ - @abstractmethod - def _cross_validation(self): - """ - """ - - def _cv_score(self, y_true, y_pred): - print('_cv_score') - - y_compare = pd.DataFrame({ - 'y_true': y_true, - 'y_pred': y_pred - }) - y_compare['y_true'] = y_compare['y_true'].str.lower() - y_compare['y_pred'] = y_compare['y_pred'].str.lower() - y_compare.to_csv(os.path.join(self._workdir, f'y_compare_{self.dataset}_{self.classifier}.csv')) - - support = precision_recall_fscore_support( - y_compare['y_true'], - y_compare['y_pred'], - average = 'weighted' - ) - - scores = pd.DataFrame( - {self.classifier : [support[0],support[1],support[2]]}, - index = ['Precision','Recall','F-score'] - ) - - scores.to_csv(self._cv_csv, index = True) - @abstractmethod def predict(self): """ diff --git a/src/models/preprocessors/tfidf_transformer.py b/src/models/preprocessors/tfidf_transformer.py index a6032fa..88d899c 100644 --- a/src/models/preprocessors/tfidf_transformer.py +++ b/src/models/preprocessors/tfidf_transformer.py @@ -47,7 +47,7 @@ def _fit(self, ds: Dataset) -> Preprocessor: return self def _transform_pandas(self, batch: pd.DataFrame) -> pd.DataFrame: - # _validate_df(df, TENSOR_COLUMN_NAME, self._nb_features) + # _validate_df(batch, TENSOR_COLUMN_NAME, self._nb_features) idf_diag = self.stats_['idf_diag'] df = batch[TENSOR_COLUMN_NAME] diff --git a/src/models/sklearn/models.py b/src/models/sklearn/models.py index fa8139e..93bebaf 100644 --- a/src/models/sklearn/models.py +++ b/src/models/sklearn/models.py @@ -21,8 +21,9 @@ # Training from ray.air.config import ScalingConfig from sklearn.naive_bayes import MultinomialNB -from sklearn.linear_model import SGDOneClassSVM, SGDClassifier +from sklearn.linear_model import SGDClassifier from models.sklearn.partial_trainer import SklearnPartialTrainer +from models.sklearn.scoring_one_svm import ScoringSGDOneClassSVM from 
models.sklearn.tensor_predictor import SklearnTensorPredictor # Tuning @@ -66,7 +67,7 @@ class SklearnModel(ModelsUtils): train : train a model using the given datasets predict : predict the classes of a dataset - df : ray.data.Dataset + ds : ray.data.Dataset Dataset containing K-mers profiles of sequences to be classified threshold : float @@ -78,34 +79,26 @@ class SklearnModel(ModelsUtils): def __init__( self, classifier, - dataset, outdir_model, - outdir_results, batch_size, training_epochs, - k, taxa, - kmers_list, - verbose + kmers_list ): super().__init__( classifier, - dataset, outdir_model, - outdir_results, batch_size, training_epochs, - k, taxa, - kmers_list, - verbose + kmers_list ) # Parameters self._encoded = [] # Computes self._build() - def preprocess(self, df): + def preprocess(self, ds): print('preprocess') if self.classifier == 'onesvm': self._encoder = OneClassSVMLabelEncoder(self.taxa) @@ -118,11 +111,11 @@ def preprocess(self, df): TensorTfIdfTransformer(self.kmers), TensorRDFFeaturesSelection(self.kmers, self.taxa), ) - self._encoder.fit(df) - df = self._preprocessor.fit_transform(df) + self._encoder.fit(ds) + ds = self._preprocessor.fit_transform(ds) self.kmers = self._preprocessor.preprocessors[1].stats_['cols_keep'] self._reductor = TensorTruncatedSVDReduction(self.kmers) - self._reductor.fit(df) + self._reductor.fit(ds) # Labels mapping if self.classifier != 'onesvm': @@ -140,37 +133,11 @@ def _label_decode(self, predict): return np.array(decoded) - def train(self, datasets, kmers_ds, cv = True): - print('train') - - if cv: - self._cross_validation(datasets, kmers_ds) - else: - self._fit_model(datasets) - - def _cross_validation(self, datasets, kmers_ds): - print('_cross_validation') - - df_test = datasets.pop('test') - - self._fit_model(datasets) - - y_true = [] - for row in df_test.iter_rows(): - y_true.append(row[self.taxa]) - - y_true = np.array(y_true) - y_true = list(y_true) - - y_pred = self._predict_cv(df_test.drop_columns([self.taxa])) - - self._cv_score(y_true, y_pred) - def _build(self): print('_build') if self.classifier == 'onesvm': print('Training bacterial extractor with One Class SVM') - self._clf = SGDOneClassSVM() + self._clf = ScoringSGDOneClassSVM() self._train_params = { 'nu' : 0.026441491, 'learning_rate' : 'constant', @@ -206,7 +173,7 @@ def _build(self): 'fit_prior' : True } - def _fit_model(self, datasets): + def fit(self, datasets): print('_fit_model') for name, ds in datasets.items(): ds = ds.drop_columns(['id']) @@ -216,8 +183,7 @@ def _fit_model(self, datasets): datasets[name] = ray.put(ds) try: training_labels = self._encoded.copy() - training_labels = np.delete( - training_labels, np.where(training_labels == -1)) + training_labels = np.delete(training_labels, np.where(training_labels == -1)) except: pass @@ -246,43 +212,25 @@ def _fit_model(self, datasets): training_result = self._trainer.fit() self._model_ckpt = training_result.checkpoint - def _predict_cv(self, df): - print('_predict_cv') - if df.count() > 0: + def predict(self, ds, threshold = 0.8): + print('predict') + if ds.count() > 0: + ds = self._preprocessor.transform(ds) + ds = self._reductor.transform(ds) predict_kwargs = {'features':self.kmers, 'num_estimator_cpus':-1} self._predictor = BatchPredictor.from_checkpoint(self._model_ckpt, SklearnTensorPredictor) - predictions = self._predictor.predict(df, batch_size = self.batch_size, feature_columns = [TENSOR_COLUMN_NAME], **predict_kwargs) + predictions = self._predictor.predict(ds, batch_size = self.batch_size, 
feature_columns = [TENSOR_COLUMN_NAME], **predict_kwargs) predictions = np.array(predictions.to_pandas()).reshape(-1) - - return self._label_decode(predictions) - else: - raise ValueError('No data to predict') - - def predict(self, df, threshold = 0.8): - print('predict') - if df.count() > 0: - df = self._preprocessor.transform(df) - df = self._reductor.transform(df) - if self.classifier == 'onesvm': - predict_kwargs = {'features':self.kmers, 'num_estimator_cpus':-1} - self._predictor = BatchPredictor.from_checkpoint(self._models_collection['domain'], SklearnTensorPredictor) - predictions = self._predictor.predict(df, batch_size = self.batch_size, feature_columns = [TENSOR_COLUMN_NAME], **predict_kwargs) - predictions = np.array(predictions.to_pandas()).reshape(-1) - else: - predict_kwargs = {'features':self.kmers, 'num_estimator_cpus':-1} - self._predictor = BatchPredictor.from_checkpoint(self._model_ckpt, SklearnTensorProbaPredictor) - predictions = self._predictor.predict(df, batch_size = self.batch_size, feature_columns = [TENSOR_COLUMN_NAME], **predict_kwargs) - predictions = self._prob_2_cls(predictions, len(self._encoded), threshold) return self._label_decode(predictions) else: raise ValueError('No data to predict') def _prob_2_cls(self, predict, nb_cls, threshold): print('_prob_2_cls') - def map_predicted_label(df : pd.DataFrame): + def map_predicted_label(ds : pd.DataFrame): predict = pd.DataFrame({ - 'best_proba': [max(df.iloc[i].values) for i in range(len(df))], - 'predicted_label': [np.argmax(df.iloc[i].values) for i in range(len(df))] + 'best_proba': [max(ds.iloc[i].values) for i in range(len(ds))], + 'predicted_label': [np.argmax(ds.iloc[i].values) for i in range(len(ds))] }) predict.loc[predict['best_proba'] < threshold, 'predicted_label'] = -1 return pd.DataFrame(predict['predicted_label']) diff --git a/src/models/sklearn/partial_trainer.py b/src/models/sklearn/partial_trainer.py index f08dd7c..021d9ce 100644 --- a/src/models/sklearn/partial_trainer.py +++ b/src/models/sklearn/partial_trainer.py @@ -26,6 +26,7 @@ from ray.train.sklearn import SklearnTrainer +TENSOR_COLUMN_NAME = '__value__' LABELS_COLUMN_NAME = 'labels' simplefilter(action='ignore', category=FutureWarning) @@ -216,7 +217,7 @@ def training_loop(self): ) ): if isinstance(batch_X, dict): - batch_X = batch_X['__value__'] + batch_X = batch_X[TENSOR_COLUMN_NAME] """ try: @@ -244,7 +245,7 @@ def training_loop(self): # batch_size = 1, # batch_format = 'numpy' # )): - # X_calib_df[ind] = batch['__value__'] + # X_calib_df[ind] = batch[TENSOR_COLUMN_NAME] # """ # X_calib = pd.DataFrame(X_calib_df, columns = self._features_list) @@ -318,7 +319,7 @@ def _score_on_validation_sets( ) ): if isinstance(batch, dict): - batch = batch['__value__'] + batch = batch[TENSOR_COLUMN_NAME] """ try: diff --git a/src/utils.py b/src/utils.py index f9f1d4b..5e7924f 100644 --- a/src/utils.py +++ b/src/utils.py @@ -1,8 +1,6 @@ import os import ray -import json import logging -import warnings import numpy as np import pandas as pd @@ -41,6 +39,7 @@ 'zip_X_y', 'ensure_length_ds', 'convert_archaea_bacteria', + 'verify_load_metagenome', 'verify_load_db', 'verify_load_host_merge', 'merge_db_host' @@ -75,12 +74,12 @@ def init_ray_cluster(workdir): # Load data from file def load_Xy_data(Xy_file): - with np.load(Xy_file, allow_pickle=True) as f: - return f['data'].tolist() + with np.load(Xy_file, allow_pickle=True) as handle: + return handle['data'].tolist() # Save data to file -def save_Xy_data(df, Xy_file): - np.savez(Xy_file, data = df) +def 
save_Xy_data(data, Xy_file): + np.savez(Xy_file, data = data) # User arguments verification ######################################################################################################### @@ -302,6 +301,17 @@ def convert_archaea_bacteria(df): df.loc[df['domain'].str.lower() == 'archaea', 'domain'] = 'Bacteria' return df +def verify_load_metagenome(data): + """ + Wrapper function for verifying and loading the metagenome dataset + """ + data = verify_load_data(data) + files_lst = glob(os.path.join(data['profile'], '*.parquet')) + ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + + return data, ds + + def verify_load_db(db_data): """ Wrapper function for verifying and loading the db dataset From 9babd8fb4d66e2243bab7957d713694499005c03 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Fri, 10 Nov 2023 08:24:09 -0500 Subject: [PATCH 29/92] remove XGBoost model for features selection from training --- src/models/kerasTF/models.py | 27 ++++++++++++++++----------- src/models/models_utils.py | 1 + src/models/sklearn/models.py | 21 +++++++++++++-------- 3 files changed, 30 insertions(+), 19 deletions(-) diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index ff51baa..f2d0a7d 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -142,22 +142,25 @@ def preprocess(self, ds): self._nb_classes = len(np.unique(labels)) if self._nb_classes == 2: self._encoder = ModelLabelEncoder(self.taxa) - self._preprocessor = Chain( - TensorTfIdfTransformer(self.kmers), - TensorRDFFeaturesSelection(self.kmers, self.taxa), - ) + self._scaler = TensorTfIdfTransformer(self.kmers) + # self._preprocessor = Chain( + # TensorTfIdfTransformer(self.kmers), + # TensorRDFFeaturesSelection(self.kmers, self.taxa), + # ) else: self._encoder = Chain( LabelEncoder(self.taxa), OneHotTensorEncoder(self.taxa) ) - self._preprocessor = Chain( - TensorTfIdfTransformer(self.kmers), - TensorRDFFeaturesSelection(self.kmers, self.taxa), - ) + self._scaler = TensorTfIdfTransformer(self.kmers) + # self._preprocessor = Chain( + # TensorTfIdfTransformer(self.kmers), + # TensorRDFFeaturesSelection(self.kmers, self.taxa), + # ) self._encoder.fit(ds) - ds = self._preprocessor.fit_transform(ds) + ds = self._scaler.fit_transform(ds) + # ds = self._preprocessor.fit_transform(ds) self._reductor = TensorTruncatedSVDReduction(self.kmers) self._reductor.fit(ds) # Labels mapping @@ -184,7 +187,8 @@ def fit(self, datasets): for name, ds in datasets.items(): ds = ds.drop_columns(['id']) ds = self._encoder.transform(ds) - ds = self._preprocessor.transform(ds) + ds = self._scaler.transform(ds) + # ds = self._preprocessor.transform(ds) ds = self._reductor.transform(ds) datasets[name] = ds @@ -228,7 +232,8 @@ def predict(self, ds, threshold=0.8): ds = ds.drop_columns(col_2_drop) # Preprocess - ds = self._preprocessor.transform(ds) + ds = self._scaler.transform(ds) + # ds = self._preprocessor.transform(ds) self._predictor = BatchPredictor.from_checkpoint( self._model_ckpt, diff --git a/src/models/models_utils.py b/src/models/models_utils.py index 9ccc27d..7e6e50f 100644 --- a/src/models/models_utils.py +++ b/src/models/models_utils.py @@ -71,6 +71,7 @@ def __init__( # Initialize Ray variables self._clf = None self._encoder = None + self._scaler = None self._preprocessor = None self._reductor = None self._model_ckpt = None diff --git a/src/models/sklearn/models.py b/src/models/sklearn/models.py index 93bebaf..73ca634 100644 --- a/src/models/sklearn/models.py +++ 
b/src/models/sklearn/models.py @@ -107,13 +107,16 @@ def preprocess(self, ds): else: self._encoder = ModelLabelEncoder(self.taxa) - self._preprocessor = Chain( - TensorTfIdfTransformer(self.kmers), - TensorRDFFeaturesSelection(self.kmers, self.taxa), - ) + self._scaler = TensorTfIdfTransformer(self.kmers) + + # self._preprocessor = Chain( + # TensorTfIdfTransformer(self.kmers), + # TensorRDFFeaturesSelection(self.kmers, self.taxa), + # ) self._encoder.fit(ds) - ds = self._preprocessor.fit_transform(ds) - self.kmers = self._preprocessor.preprocessors[1].stats_['cols_keep'] + ds = self._scaler.fit_transform(ds) + # ds = self._preprocessor.fit_transform(ds) + # self.kmers = self._preprocessor.preprocessors[1].stats_['cols_keep'] self._reductor = TensorTruncatedSVDReduction(self.kmers) self._reductor.fit(ds) @@ -178,7 +181,8 @@ def fit(self, datasets): for name, ds in datasets.items(): ds = ds.drop_columns(['id']) ds = self._encoder.transform(ds) - ds = self._preprocessor.transform(ds) + ds = self._scaler.transform(ds) + # ds = self._preprocessor.transform(ds) ds = self._reductor.transform(ds) datasets[name] = ray.put(ds) try: @@ -215,7 +219,8 @@ def fit(self, datasets): def predict(self, ds, threshold = 0.8): print('predict') if ds.count() > 0: - ds = self._preprocessor.transform(ds) + ds = self._scaler.transform(ds) + # ds = self._preprocessor.transform(ds) ds = self._reductor.transform(ds) predict_kwargs = {'features':self.kmers, 'num_estimator_cpus':-1} self._predictor = BatchPredictor.from_checkpoint(self._model_ckpt, SklearnTensorPredictor) From 1796002d97c6605e9bd5cb7baafcf861e4442ffe Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Fri, 10 Nov 2023 15:03:54 -0500 Subject: [PATCH 30/92] truncated svd incremental fitting --- src/data/reduction/truncated_svd_reduction.py | 45 ++++++++++++++++--- 1 file changed, 39 insertions(+), 6 deletions(-) diff --git a/src/data/reduction/truncated_svd_reduction.py b/src/data/reduction/truncated_svd_reduction.py index 9e64773..4d2a43a 100644 --- a/src/data/reduction/truncated_svd_reduction.py +++ b/src/data/reduction/truncated_svd_reduction.py @@ -18,7 +18,9 @@ class TensorTruncatedSVDReduction(Preprocessor): This makes it possible to use the class as a Ray preprocessor in a features reduction strategy. TruncatedSVD performs linear dimensionality reduction by means of truncated singular value decomposition (SVD). When it is applied following the TF-IDF normalisation, it becomes a latent semantic analysis (LSA). 
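The incremental _fit below approximates a truncated SVD of the whole dataset without materializing it, in the spirit of sklearn.decomposition.IncrementalPCA: for each new batch, the previously retained components (scaled by their singular values) are stacked on top of the incoming rows and re-decomposed with randomized_svd. A self-contained sketch of that idea on plain NumPy arrays (batch and component sizes are arbitrary, and the result is an approximation of the exact SVD):

    import numpy as np
    from sklearn.utils.extmath import randomized_svd

    def incremental_truncated_svd(batches, n_components):
        """Approximate the top right-singular vectors of vstack(batches), one batch at a time."""
        components, singular_values = None, None
        for batch in batches:                    # each batch: (n_rows, n_features) ndarray
            if components is not None:
                # carry the previous batches forward as Sigma * VT stacked on the new rows
                batch = np.vstack((singular_values.reshape(-1, 1) * components, batch))
            _, singular_values, components = randomized_svd(
                batch,
                n_components = n_components,
                n_iter = 1,
                power_iteration_normalizer = 'LU',
            )
        return components                        # shape (n_components, n_features)

    rng = np.random.default_rng(0)
    batches = [rng.random((64, 200)) for _ in range(5)]
    vt = incremental_truncated_svd(batches, n_components = 10)
    reduced = batches[0] @ vt.T                  # project a batch onto the reduced space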
+ https://scikit-learn.org/stable/modules/decomposition.html#truncated-singular-value-decomposition-and-latent-semantic-analysis https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html#sklearn.decomposition.TruncatedSVD + https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.IncrementalPCA.html#sklearn.decomposition.IncrementalPCA """ def __init__(self, features: List[str], nb_components: int = 10000): @@ -26,17 +28,20 @@ def __init__(self, features: List[str], nb_components: int = 10000): self.features = features self._nb_features = len(features) self._nb_components = nb_components - - + self._n_samples_seen = 0 + self._mean = 0.0 + self._var = 0.0 + def _fit(self, ds: Dataset) -> Preprocessor: + # Parallel + """ def svd_batch(arr: np.array): - df = arr['__value__'] + df = arr[TENSOR_COLUMN_NAME] df = _unwrap_ndarray_object_type_if_needed(df) U, Sigma, VT = randomized_svd( df, n_components = self._nb_components, - n_iter = 5, - n_oversamples = 10, + n_iter = 1, power_iteration_normalizer = 'LU', random_state = None ) @@ -51,8 +56,36 @@ def svd_batch(arr: np.array): for row in svd_vt.iter_rows(): components.append(row['VT']) - components = np.mean(components, axis = 0) + components = np.concatenate(components, axis = 0) + self.stats_ = {'components' : components} + """ + # Incremental + # If too long to exec, will have to parallelise internal SVD computations + components = None + singular_values = None + if self._nb_features > self._nb_components: + for batch in ds.iter_batches(batch_format = 'numpy'): + batch = batch[TENSOR_COLUMN_NAME] + batch = _unwrap_ndarray_object_type_if_needed(batch) + if components is not None: + # Build matrix of previous computations + batch = np.vstack( + ( + singular_values.reshape((-1, 1)) * components, + batch, + ) + ) + + U, Sigma, VT = randomized_svd( + batch, + n_components = self._nb_components, + n_iter = 1, + power_iteration_normalizer = 'LU', + ) + components = VT + singular_values = Sigma + self.stats_ = {'components' : components} else: warn('No features reduction to do because the number of features is already lower than the required number of components') From b02f7c7c4414a7bafa2dfe0775243c3d1070250d Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Fri, 10 Nov 2023 16:05:49 -0500 Subject: [PATCH 31/92] tqdm for TruncSVD + materialize ds after preprocessing --- src/data/reduction/truncated_svd_reduction.py | 3 ++- src/models/kerasTF/models.py | 4 +++- src/models/sklearn/models.py | 3 +++ 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/data/reduction/truncated_svd_reduction.py b/src/data/reduction/truncated_svd_reduction.py index 4d2a43a..ccf0187 100644 --- a/src/data/reduction/truncated_svd_reduction.py +++ b/src/data/reduction/truncated_svd_reduction.py @@ -1,6 +1,7 @@ import numpy as np import pandas as pd +from tqdm import tqdm from typing import List from warnings import warn from ray.data import Dataset @@ -65,7 +66,7 @@ def svd_batch(arr: np.array): components = None singular_values = None if self._nb_features > self._nb_components: - for batch in ds.iter_batches(batch_format = 'numpy'): + for batch in tqdm(ds.iter_batches(batch_format = 'numpy')): batch = batch[TENSOR_COLUMN_NAME] batch = _unwrap_ndarray_object_type_if_needed(batch) if components is not None: diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index f2d0a7d..cf1936d 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -190,6 +190,9 @@ def fit(self, 
datasets): ds = self._scaler.transform(ds) # ds = self._preprocessor.transform(ds) ds = self._reductor.transform(ds) + # Trigger the preprocessing computations before ingest in trainer + # Otherwise, it would be executed at each epoch + ds = ds.materialize() datasets[name] = ds # Training parameters @@ -321,7 +324,6 @@ def train_func(config): train_data = session.get_dataset_shard('train') val_data = session.get_dataset_shard('validation') - for _ in range(epochs): batch_train = train_data.to_tf( feature_columns = TENSOR_COLUMN_NAME, diff --git a/src/models/sklearn/models.py b/src/models/sklearn/models.py index 73ca634..04b38b2 100644 --- a/src/models/sklearn/models.py +++ b/src/models/sklearn/models.py @@ -184,6 +184,9 @@ def fit(self, datasets): ds = self._scaler.transform(ds) # ds = self._preprocessor.transform(ds) ds = self._reductor.transform(ds) + # Trigger the preprocessing computations before ingest in trainer + # Otherwise, it would be executed at each epoch + ds = ds.materialize() datasets[name] = ray.put(ds) try: training_labels = self._encoded.copy() From 9b827f11aaa8a37bd0b66fb5986b6439cf792f7d Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Mon, 13 Nov 2023 09:33:03 -0500 Subject: [PATCH 32/92] parallel occurence counting --- src/data/reduction/occurence_exclusion.py | 15 +++++++--- src/data/reduction/truncated_svd_reduction.py | 28 ++----------------- src/models/preprocessors/tfidf_transformer.py | 14 ++++++++-- 3 files changed, 25 insertions(+), 32 deletions(-) diff --git a/src/data/reduction/occurence_exclusion.py b/src/data/reduction/occurence_exclusion.py index fe9b45d..ab65389 100644 --- a/src/data/reduction/occurence_exclusion.py +++ b/src/data/reduction/occurence_exclusion.py @@ -22,12 +22,19 @@ def __init__(self, features: List[str], num_features: int): self._num_features = int(self._nb_features - num_features) def _fit(self, ds: Dataset) -> Preprocessor: + def get_occurences(batch): + batch = batch[TENSOR_COLUMN_NAME] + return {'occurences' : np.count_nonzero(batch, axis = 0)} + # Nb of occurences occurences = np.zeros(self._nb_features) - for batch in ds.iter_batches(batch_format = 'numpy'): - batch = batch[TENSOR_COLUMN_NAME] - occurences += np.count_nonzero(batch, axis = 0) - + occur = ds.map_batches(get_occurences, batch_format = 'numpy') + # for batch in ds.iter_batches(batch_format = 'numpy'): + # batch = batch[TENSOR_COLUMN_NAME] + # occurences += np.count_nonzero(batch, axis = 0) + for row in occur.iter_rows(): + occurences += row['occurences'] + # Include / Exclude by sorted position cols_keep = pd.Series(occurences, index = self.features) cols_keep = cols_keep.sort_values(ascending = True) # Long operation diff --git a/src/data/reduction/truncated_svd_reduction.py b/src/data/reduction/truncated_svd_reduction.py index ccf0187..ed653cb 100644 --- a/src/data/reduction/truncated_svd_reduction.py +++ b/src/data/reduction/truncated_svd_reduction.py @@ -36,33 +36,11 @@ def __init__(self, features: List[str], nb_components: int = 10000): def _fit(self, ds: Dataset) -> Preprocessor: # Parallel """ - def svd_batch(arr: np.array): - df = arr[TENSOR_COLUMN_NAME] - df = _unwrap_ndarray_object_type_if_needed(df) - U, Sigma, VT = randomized_svd( - df, - n_components = self._nb_components, - n_iter = 1, - power_iteration_normalizer = 'LU', - random_state = None - ) - - return {'VT': [VT]} - - if self._nb_features > self._nb_components: - # Exec svd - components = [] - svd_vt = ds.map_batches(svd_batch, batch_format = 'numpy') - - for row in svd_vt.iter_rows(): - 
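The parallel variant introduced above pushes the per-batch np.count_nonzero into ds.map_batches and reduces the resulting count vectors on the driver, instead of streaming every batch through iter_batches. A sketch of that pattern as a standalone function; the extra leading axis on the returned counts is an assumption made here so that every mapped batch yields exactly one output row holding the whole count vector (this detail differs from the patch code):

    import numpy as np
    import ray

    TENSOR_COLUMN_NAME = '__value__'

    def count_occurences(ds: ray.data.Dataset, nb_features: int) -> np.ndarray:
        """Count, per feature, in how many rows of ds the value is non-zero."""
        def get_occurences(batch):
            counts = np.count_nonzero(batch[TENSOR_COLUMN_NAME], axis = 0)
            return {'occurences': counts[np.newaxis, :]}   # one output row per input batch

        occurences = np.zeros(nb_features)
        for row in ds.map_batches(get_occurences, batch_format = 'numpy').iter_rows():
            occurences += row['occurences']                # reduce the per-batch count vectors
        return occurences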
components.append(row['VT']) - - components = np.concatenate(components, axis = 0) - - self.stats_ = {'components' : components} + # TODO: implement parallel computation for svd + # https://docs.scipy.org/doc/scipy/reference/generated/scipy.linalg.svd.html#scipy.linalg.svd + # https://github.com/scipy/scipy/blob/v1.11.3/scipy/linalg/_decomp_svd.py#L13-L138 """ # Incremental - # If too long to exec, will have to parallelise internal SVD computations components = None singular_values = None if self._nb_features > self._nb_components: diff --git a/src/models/preprocessors/tfidf_transformer.py b/src/models/preprocessors/tfidf_transformer.py index 88d899c..ba4dcc3 100644 --- a/src/models/preprocessors/tfidf_transformer.py +++ b/src/models/preprocessors/tfidf_transformer.py @@ -27,10 +27,18 @@ def _fit(self, ds: Dataset) -> Preprocessor: nb_samples = ds.count() # Nb of occurences - occurences = np.zeros(self._nb_features) - for batch in ds.iter_batches(batch_format = 'numpy'): + def get_occurences(batch): batch = batch[TENSOR_COLUMN_NAME] - occurences += np.count_nonzero(batch, axis = 0) + return {'occurences' : np.count_nonzero(batch, axis = 0)} + + # Nb of occurences + occurences = np.zeros(self._nb_features) + occur = ds.map_batches(get_occurences, batch_format = 'numpy') + # for batch in ds.iter_batches(batch_format = 'numpy'): + # batch = batch[TENSOR_COLUMN_NAME] + # occurences += np.count_nonzero(batch, axis = 0) + for row in occur.iter_rows(): + occurences += row['occurences'] idf = np.log(nb_samples / occurences) + 1 From f5ccbec396590506118b4762deb2de39e78ff585 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Mon, 13 Nov 2023 09:40:54 -0500 Subject: [PATCH 33/92] serial occurence counting --- src/data/reduction/occurence_exclusion.py | 12 ++++++------ src/models/preprocessors/tfidf_transformer.py | 12 ++++++------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/data/reduction/occurence_exclusion.py b/src/data/reduction/occurence_exclusion.py index ab65389..05aa6d3 100644 --- a/src/data/reduction/occurence_exclusion.py +++ b/src/data/reduction/occurence_exclusion.py @@ -28,12 +28,12 @@ def get_occurences(batch): # Nb of occurences occurences = np.zeros(self._nb_features) - occur = ds.map_batches(get_occurences, batch_format = 'numpy') - # for batch in ds.iter_batches(batch_format = 'numpy'): - # batch = batch[TENSOR_COLUMN_NAME] - # occurences += np.count_nonzero(batch, axis = 0) - for row in occur.iter_rows(): - occurences += row['occurences'] + # occur = ds.map_batches(get_occurences, batch_format = 'numpy') + for batch in ds.iter_batches(batch_format = 'numpy'): + batch = batch[TENSOR_COLUMN_NAME] + occurences += np.count_nonzero(batch, axis = 0) + # for row in occur.iter_rows(): + # occurences += row['occurences'] # Include / Exclude by sorted position cols_keep = pd.Series(occurences, index = self.features) diff --git a/src/models/preprocessors/tfidf_transformer.py b/src/models/preprocessors/tfidf_transformer.py index ba4dcc3..188ae0b 100644 --- a/src/models/preprocessors/tfidf_transformer.py +++ b/src/models/preprocessors/tfidf_transformer.py @@ -33,12 +33,12 @@ def get_occurences(batch): # Nb of occurences occurences = np.zeros(self._nb_features) - occur = ds.map_batches(get_occurences, batch_format = 'numpy') - # for batch in ds.iter_batches(batch_format = 'numpy'): - # batch = batch[TENSOR_COLUMN_NAME] - # occurences += np.count_nonzero(batch, axis = 0) - for row in occur.iter_rows(): - occurences += row['occurences'] + # occur = 
ds.map_batches(get_occurences, batch_format = 'numpy') + for batch in ds.iter_batches(batch_format = 'numpy'): + batch = batch[TENSOR_COLUMN_NAME] + occurences += np.count_nonzero(batch, axis = 0) + # for row in occur.iter_rows(): + # occurences += row['occurences'] idf = np.log(nb_samples / occurences) + 1 From a071131454a8d85baea9c333f3162c48f0ca9943 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Mon, 13 Nov 2023 17:40:12 -0500 Subject: [PATCH 34/92] parallel LSA --- src/data/reduction/occurence_exclusion.py | 7 -- ...tion.py => truncated_svd_decomposition.py} | 80 +++++++++++++------ src/models/kerasTF/models.py | 4 +- src/models/preprocessors/tfidf_transformer.py | 8 -- src/models/sklearn/models.py | 4 +- 5 files changed, 58 insertions(+), 45 deletions(-) rename src/data/reduction/{truncated_svd_reduction.py => truncated_svd_decomposition.py} (56%) diff --git a/src/data/reduction/occurence_exclusion.py b/src/data/reduction/occurence_exclusion.py index 05aa6d3..8eee147 100644 --- a/src/data/reduction/occurence_exclusion.py +++ b/src/data/reduction/occurence_exclusion.py @@ -22,18 +22,11 @@ def __init__(self, features: List[str], num_features: int): self._num_features = int(self._nb_features - num_features) def _fit(self, ds: Dataset) -> Preprocessor: - def get_occurences(batch): - batch = batch[TENSOR_COLUMN_NAME] - return {'occurences' : np.count_nonzero(batch, axis = 0)} - # Nb of occurences occurences = np.zeros(self._nb_features) - # occur = ds.map_batches(get_occurences, batch_format = 'numpy') for batch in ds.iter_batches(batch_format = 'numpy'): batch = batch[TENSOR_COLUMN_NAME] occurences += np.count_nonzero(batch, axis = 0) - # for row in occur.iter_rows(): - # occurences += row['occurences'] # Include / Exclude by sorted position cols_keep = pd.Series(occurences, index = self.features) diff --git a/src/data/reduction/truncated_svd_reduction.py b/src/data/reduction/truncated_svd_decomposition.py similarity index 56% rename from src/data/reduction/truncated_svd_reduction.py rename to src/data/reduction/truncated_svd_decomposition.py index ed653cb..6d4aa6f 100644 --- a/src/data/reduction/truncated_svd_reduction.py +++ b/src/data/reduction/truncated_svd_decomposition.py @@ -13,9 +13,9 @@ TENSOR_COLUMN_NAME = '__value__' -class TensorTruncatedSVDReduction(Preprocessor): +class TensorTruncatedSVDDecomposition(Preprocessor): """ - Custom class for using a mix of TruncatedSVD inspired by sklearn.decomposition.TruncatedSVD and applying a batched strategy inspired by sklearn.decomposition.IncrementalPCA to process batches in parallel. + Custom class for using a mix of TruncatedSVD inspired by sklearn.decomposition.TruncatedSVD and applying a batched strategy inspired by sklearn.decomposition.IncrementalPCA to process batches sequentially. This makes it possible to use the class as a Ray preprocessor in a features reduction strategy. TruncatedSVD performs linear dimensionality reduction by means of truncated singular value decomposition (SVD). When it is applied following the TF-IDF normalisation, it becomes a latent semantic analysis (LSA). 
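As an aside, the TF-IDF weighting followed by truncated SVD that these patches wire together (i.e. LSA) can be illustrated outside of Ray in a few lines. The sketch below is not the Caribou implementation: the count matrix, its shape and the number of components are invented for the example, the idf formula is the one used by TensorTfIdfTransformer (log(n_samples / occurrences) + 1), and a small guard against empty columns is added that only matters for the toy data.

# Minimal LSA sketch: TF-IDF-like weighting, then a truncated SVD projection.
# Toy data only; in Caribou both steps run batch-wise over a Ray Dataset.
import numpy as np
from sklearn.utils.extmath import randomized_svd

rng = np.random.default_rng(0)
X = rng.integers(0, 5, size=(100, 512)).astype(float)  # fake k-mer counts (samples x features)

# idf = log(n_samples / nb_samples_containing_feature) + 1
occurrences = np.count_nonzero(X, axis=0)
idf = np.log(X.shape[0] / np.maximum(occurrences, 1)) + 1  # maximum() only guards the toy example
X_tfidf = X * idf

# Truncated SVD of the weighted matrix; VT spans the reduced feature space
U, Sigma, VT = randomized_svd(X_tfidf, n_components=50, n_iter=2,
                              power_iteration_normalizer='LU')

# Project samples into the reduced space, mirroring np.dot(tensor_col, components.T)
X_reduced = X_tfidf @ VT.T
print(X_reduced.shape)  # (100, 50)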
@@ -23,7 +23,6 @@ class TensorTruncatedSVDReduction(Preprocessor): https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html#sklearn.decomposition.TruncatedSVD https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.IncrementalPCA.html#sklearn.decomposition.IncrementalPCA """ - def __init__(self, features: List[str], nb_components: int = 10000): # Parameters self.features = features @@ -34,36 +33,65 @@ def __init__(self, features: List[str], nb_components: int = 10000): self._var = 0.0 def _fit(self, ds: Dataset) -> Preprocessor: - # Parallel + # Parallel MiniBatchPCA """ - # TODO: implement parallel computation for svd - # https://docs.scipy.org/doc/scipy/reference/generated/scipy.linalg.svd.html#scipy.linalg.svd - # https://github.com/scipy/scipy/blob/v1.11.3/scipy/linalg/_decomp_svd.py#L13-L138 + Possibilities for parallel TruncatedSVD + * sklearn minibatch PCA -> PCA / SVD mostly equivalent + * implement parallel based on other library + * dask-ml has a truncated svd + * tf has a svd function """ - # Incremental - components = None - singular_values = None - if self._nb_features > self._nb_components: - for batch in tqdm(ds.iter_batches(batch_format = 'numpy')): - batch = batch[TENSOR_COLUMN_NAME] - batch = _unwrap_ndarray_object_type_if_needed(batch) - if components is not None: - # Build matrix of previous computations - batch = np.vstack( - ( - singular_values.reshape((-1, 1)) * components, - batch, - ) - ) - - U, Sigma, VT = randomized_svd( + """ + Option to implement parallel computation for SVD + 1- Sparse Dictionnary Learning -> encode data to sparse representation by sample + 2- Sparse PCA (sparse SVD?) -> construct a PCA from sparsely encoded data + It is possible to parallelize batches computation by applying the logic from MiniBatchDictionaryLearning and MiniBatchSparsePCA + """ + components = [] + def batch_svd(batch): + batch = batch[TENSOR_COLUMN_NAME] + batch = _unwrap_ndarray_object_type_if_needed(batch) + + U, Sigma, VT = randomized_svd( batch, n_components = self._nb_components, n_iter = 1, power_iteration_normalizer = 'LU', ) - components = VT - singular_values = Sigma + return {'VT' : VT} + + if self._nb_features > self._nb_components: + svd = ds.map_batches(batch_svd, batch_format = 'numpy') + for row in svd.iter_rows(): + components.append(row['VT']) + components = np.sum(components, axis = 0) + + # Incremental + # components = None + # singular_values = None + # if self._nb_features > self._nb_components: + # for batch in tqdm(ds.iter_batches(batch_format = 'numpy')): + # batch = batch[TENSOR_COLUMN_NAME] + # batch = _unwrap_ndarray_object_type_if_needed(batch) + # if components is not None: + # # Build matrix of previous computations + # batch = np.vstack( + # ( + # singular_values.reshape((-1, 1)) * components, + # batch, + # ) + # ) + # # U : (1000, 100), S : (100,), V : (100, 1024) + # # S.reshape : (100, 1), S.reshape * components : (100, 1024) + # # batch : (1000, 1024), vstack : (1100, 1024) + # U, Sigma, VT = randomized_svd( + # batch, + # n_components = self._nb_components, + # n_iter = 1, + # power_iteration_normalizer = 'LU', + # ) + # components = VT + # singular_values = Sigma self.stats_ = {'components' : components} else: diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index cf1936d..a6bf191 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -11,7 +11,7 @@ # Dimensions reduction from models.preprocessors.tfidf_transformer import 
TensorTfIdfTransformer from data.reduction.rdf_features_selection import TensorRDFFeaturesSelection -from data.reduction.truncated_svd_reduction import TensorTruncatedSVDReduction +from data.reduction.truncated_svd_decomposition import TensorTruncatedSVDDecomposition # Preprocessing from ray.data.preprocessors import LabelEncoder, Chain @@ -161,7 +161,7 @@ def preprocess(self, ds): self._encoder.fit(ds) ds = self._scaler.fit_transform(ds) # ds = self._preprocessor.fit_transform(ds) - self._reductor = TensorTruncatedSVDReduction(self.kmers) + self._reductor = TensorTruncatedSVDDecomposition(self.kmers) self._reductor.fit(ds) # Labels mapping if self._nb_classes == 2: diff --git a/src/models/preprocessors/tfidf_transformer.py b/src/models/preprocessors/tfidf_transformer.py index 188ae0b..88d899c 100644 --- a/src/models/preprocessors/tfidf_transformer.py +++ b/src/models/preprocessors/tfidf_transformer.py @@ -26,19 +26,11 @@ def __init__(self, features): def _fit(self, ds: Dataset) -> Preprocessor: nb_samples = ds.count() - # Nb of occurences - def get_occurences(batch): - batch = batch[TENSOR_COLUMN_NAME] - return {'occurences' : np.count_nonzero(batch, axis = 0)} - # Nb of occurences occurences = np.zeros(self._nb_features) - # occur = ds.map_batches(get_occurences, batch_format = 'numpy') for batch in ds.iter_batches(batch_format = 'numpy'): batch = batch[TENSOR_COLUMN_NAME] occurences += np.count_nonzero(batch, axis = 0) - # for row in occur.iter_rows(): - # occurences += row['occurences'] idf = np.log(nb_samples / occurences) + 1 diff --git a/src/models/sklearn/models.py b/src/models/sklearn/models.py index 04b38b2..0591372 100644 --- a/src/models/sklearn/models.py +++ b/src/models/sklearn/models.py @@ -10,7 +10,7 @@ # Dimensions reduction from models.preprocessors.tfidf_transformer import TensorTfIdfTransformer from data.reduction.rdf_features_selection import TensorRDFFeaturesSelection -from data.reduction.truncated_svd_reduction import TensorTruncatedSVDReduction +from data.reduction.truncated_svd_decomposition import TensorTruncatedSVDDecomposition # Preprocessing from ray.data.preprocessors import Chain @@ -117,7 +117,7 @@ def preprocess(self, ds): ds = self._scaler.fit_transform(ds) # ds = self._preprocessor.fit_transform(ds) # self.kmers = self._preprocessor.preprocessors[1].stats_['cols_keep'] - self._reductor = TensorTruncatedSVDReduction(self.kmers) + self._reductor = TensorTruncatedSVDDecomposition(self.kmers) self._reductor.fit(ds) # Labels mapping From e51e10bd72694223228731f0b8c031b8cce7680e Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Tue, 14 Nov 2023 16:51:07 -0500 Subject: [PATCH 35/92] features reduction using CountHashing --- src/data/reduction/count_hashing.py | 2 +- .../reduction/truncated_svd_decomposition.py | 47 +++++++++++-------- src/models/classification.py | 4 +- src/models/kerasTF/models.py | 16 ++----- src/models/sklearn/models.py | 13 ++--- 5 files changed, 39 insertions(+), 43 deletions(-) diff --git a/src/data/reduction/count_hashing.py b/src/data/reduction/count_hashing.py index 1b6506e..d23e0cf 100644 --- a/src/data/reduction/count_hashing.py +++ b/src/data/reduction/count_hashing.py @@ -19,7 +19,7 @@ class TensorCountHashing(Preprocessor): """ _is_fittable = False - def __init__(self, features: List[str], num_features: int): + def __init__(self, features: List[str], num_features: int = 1000): self.features = features self.num_features = num_features diff --git a/src/data/reduction/truncated_svd_decomposition.py 
b/src/data/reduction/truncated_svd_decomposition.py index 6d4aa6f..3cc14c8 100644 --- a/src/data/reduction/truncated_svd_decomposition.py +++ b/src/data/reduction/truncated_svd_decomposition.py @@ -1,3 +1,5 @@ +import os + import numpy as np import pandas as pd @@ -5,6 +7,7 @@ from typing import List from warnings import warn from ray.data import Dataset +from utils import save_Xy_data, load_Xy_data from sklearn.utils.extmath import randomized_svd @@ -23,17 +26,15 @@ class TensorTruncatedSVDDecomposition(Preprocessor): https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html#sklearn.decomposition.TruncatedSVD https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.IncrementalPCA.html#sklearn.decomposition.IncrementalPCA """ - def __init__(self, features: List[str], nb_components: int = 10000): + def __init__(self, features: List[str], nb_components: int = 100, file: str = ''): # Parameters self.features = features self._nb_features = len(features) self._nb_components = nb_components - self._n_samples_seen = 0 - self._mean = 0.0 - self._var = 0.0 - + self._file = file + def _fit(self, ds: Dataset) -> Preprocessor: - # Parallel MiniBatchPCA + # Parallel """ Possibilities for parallel TruncatedSVD * sklearn minibatch PCA -> PCA / SVD mostly equivalent @@ -51,20 +52,28 @@ def _fit(self, ds: Dataset) -> Preprocessor: def batch_svd(batch): batch = batch[TENSOR_COLUMN_NAME] batch = _unwrap_ndarray_object_type_if_needed(batch) - - U, Sigma, VT = randomized_svd( - batch, - n_components = self._nb_components, - n_iter = 1, - power_iteration_normalizer = 'LU', - ) - return {'VT' : VT} + U, S, V = randomized_svd( + batch, + n_components = self._nb_components, + n_iter = 1, + power_iteration_normalizer = 'LU', + ) + print(V.shape) + return {'V' : V} if self._nb_features > self._nb_components: - svd = ds.map_batches(batch_svd, batch_format = 'numpy') - for row in svd.iter_rows(): - components.append(row['VT']) - components = np.sum(components, axis = 0) + if os.path.isfile(self._file): + components = np.array(load_Xy_data(self._file)) + else: + # sampl = ds.random_sample(0.1) + # svd = sampl.map_batches(batch_svd, batch_format = 'numpy') + svd = ds.map_batches(batch_svd, batch_format = 'numpy') + print(svd.to_pandas()) + for row in svd.iter_rows(): + components.append(row['V']) + # components = np.vstack(components) + components = np.sum(components, axis = 0) + save_Xy_data(components, self._file) # Incremental # components = None @@ -108,7 +117,7 @@ def _transform_pandas(self, df: pd.DataFrame) -> pd.DataFrame: tensor_col = df[TENSOR_COLUMN_NAME] tensor_col = _unwrap_ndarray_object_type_if_needed(tensor_col) tensor_col = np.dot(tensor_col, components.T) - df[TENSOR_COLUMN_NAME] = pd.Series(list(tensor_col)) + df[TENSOR_COLUMN_NAME] = pd.Series(list(tensor_col)) return df diff --git a/src/models/classification.py b/src/models/classification.py index cbad2be..eaa682c 100644 --- a/src/models/classification.py +++ b/src/models/classification.py @@ -203,7 +203,7 @@ def _binary_training(self, datasets, taxa, file): taxa, self._database_data['kmers'] ) - model.preprocess(datasets[TRAINING_DATASET_NAME]) + model.preprocess(datasets[TRAINING_DATASET_NAME], os.path.join(self._outdirs['models_dir'], f'TruncatedSVD_components.npz')) model.fit(datasets) self._save_model(model, file) @@ -228,7 +228,7 @@ def _multiclass_training(self, datasets, taxa, file): taxa, self._database_data['kmers'] ) - model.preprocess(datasets[TRAINING_DATASET_NAME]) + 
model.preprocess(datasets[TRAINING_DATASET_NAME], os.path.join(self._outdirs['models_dir'], f'TruncatedSVD_components.npz')) model.fit(datasets) self._save_model(model, file) diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index a6bf191..a955274 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -9,6 +9,7 @@ from shutil import rmtree # Dimensions reduction +from data.reduction.count_hashing import TensorCountHashing from models.preprocessors.tfidf_transformer import TensorTfIdfTransformer from data.reduction.rdf_features_selection import TensorRDFFeaturesSelection from data.reduction.truncated_svd_decomposition import TensorTruncatedSVDDecomposition @@ -133,7 +134,7 @@ def __init__( elif self.classifier == 'widecnn': print('Training multiclass classifier based on Wide CNN Network') - def preprocess(self, ds): + def preprocess(self, ds, reductor_file): print('preprocess') labels = [] encoded = [] @@ -143,25 +144,16 @@ def preprocess(self, ds): if self._nb_classes == 2: self._encoder = ModelLabelEncoder(self.taxa) self._scaler = TensorTfIdfTransformer(self.kmers) - # self._preprocessor = Chain( - # TensorTfIdfTransformer(self.kmers), - # TensorRDFFeaturesSelection(self.kmers, self.taxa), - # ) else: self._encoder = Chain( LabelEncoder(self.taxa), OneHotTensorEncoder(self.taxa) ) self._scaler = TensorTfIdfTransformer(self.kmers) - # self._preprocessor = Chain( - # TensorTfIdfTransformer(self.kmers), - # TensorRDFFeaturesSelection(self.kmers, self.taxa), - # ) - + self._encoder.fit(ds) ds = self._scaler.fit_transform(ds) - # ds = self._preprocessor.fit_transform(ds) - self._reductor = TensorTruncatedSVDDecomposition(self.kmers) + self._reductor = TensorCountHashing(self.kmers, 10000) self._reductor.fit(ds) # Labels mapping if self._nb_classes == 2: diff --git a/src/models/sklearn/models.py b/src/models/sklearn/models.py index 0591372..650b62f 100644 --- a/src/models/sklearn/models.py +++ b/src/models/sklearn/models.py @@ -8,12 +8,12 @@ from shutil import rmtree # Dimensions reduction +from data.reduction.count_hashing import TensorCountHashing from models.preprocessors.tfidf_transformer import TensorTfIdfTransformer from data.reduction.rdf_features_selection import TensorRDFFeaturesSelection from data.reduction.truncated_svd_decomposition import TensorTruncatedSVDDecomposition # Preprocessing -from ray.data.preprocessors import Chain from models.encoders.model_label_encoder import ModelLabelEncoder from models.preprocessors.min_max_scaler import TensorMinMaxScaler from models.encoders.onesvm_label_encoder import OneClassSVMLabelEncoder @@ -98,7 +98,7 @@ def __init__( # Computes self._build() - def preprocess(self, ds): + def preprocess(self, ds, reductor_file): print('preprocess') if self.classifier == 'onesvm': self._encoder = OneClassSVMLabelEncoder(self.taxa) @@ -109,15 +109,10 @@ def preprocess(self, ds): self._scaler = TensorTfIdfTransformer(self.kmers) - # self._preprocessor = Chain( - # TensorTfIdfTransformer(self.kmers), - # TensorRDFFeaturesSelection(self.kmers, self.taxa), - # ) self._encoder.fit(ds) ds = self._scaler.fit_transform(ds) - # ds = self._preprocessor.fit_transform(ds) - # self.kmers = self._preprocessor.preprocessors[1].stats_['cols_keep'] - self._reductor = TensorTruncatedSVDDecomposition(self.kmers) + + self._reductor = TensorCountHashing(self.kmers, 10000) self._reductor.fit(ds) # Labels mapping From 4609ff97a22968dc24df35791bdacd2b16f62b6f Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Tue, 14 Nov 2023 
16:54:16 -0500 Subject: [PATCH 36/92] remove tfidf transform in preprocessing --- src/models/kerasTF/models.py | 2 +- src/models/sklearn/models.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index a955274..8d0d992 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -152,7 +152,7 @@ def preprocess(self, ds, reductor_file): self._scaler = TensorTfIdfTransformer(self.kmers) self._encoder.fit(ds) - ds = self._scaler.fit_transform(ds) + self._scaler.fit(ds) self._reductor = TensorCountHashing(self.kmers, 10000) self._reductor.fit(ds) # Labels mapping diff --git a/src/models/sklearn/models.py b/src/models/sklearn/models.py index 650b62f..f74fc8e 100644 --- a/src/models/sklearn/models.py +++ b/src/models/sklearn/models.py @@ -110,7 +110,7 @@ def preprocess(self, ds, reductor_file): self._scaler = TensorTfIdfTransformer(self.kmers) self._encoder.fit(ds) - ds = self._scaler.fit_transform(ds) + self._scaler.fit(ds) self._reductor = TensorCountHashing(self.kmers, 10000) self._reductor.fit(ds) From 20ad4875b36f1fdfef0be521f412d5deab712406 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Wed, 15 Nov 2023 18:13:27 -0500 Subject: [PATCH 37/92] TruncatedSVD + keras debugging --- src/data/reduction/count_hashing.py | 30 +++++++----- .../reduction/truncated_svd_decomposition.py | 49 ++++++++++++++----- src/models/kerasTF/build_neural_networks.py | 44 ++++++++--------- src/models/kerasTF/models.py | 12 +++-- src/models/models_utils.py | 2 +- src/models/sklearn/models.py | 7 +-- src/models/sklearn/partial_trainer.py | 11 +++-- 7 files changed, 95 insertions(+), 60 deletions(-) diff --git a/src/data/reduction/count_hashing.py b/src/data/reduction/count_hashing.py index d23e0cf..89f4c13 100644 --- a/src/data/reduction/count_hashing.py +++ b/src/data/reduction/count_hashing.py @@ -21,27 +21,33 @@ class TensorCountHashing(Preprocessor): def __init__(self, features: List[str], num_features: int = 1000): self.features = features - self.num_features = num_features + self._nb_features = len(features) + self._num_features = num_features def _transform_pandas(self, df: pd.DataFrame): def row_feature_hasher(row): hash_counts = collections.defaultdict(int) for feature in self.features: - hashed_value = simple_hash(feature, self.num_features) + hashed_value = simple_hash(feature, self._num_features) hash_counts[hashed_value] += row[feature] - return {f"hash_{i}": hash_counts[i] for i in range(self.num_features)} + return {f"hash_{i}": hash_counts[i] for i in range(self._num_features)} - tensor_col = df[TENSOR_COLUMN_NAME] - tensor_col = _unwrap_ndarray_object_type_if_needed(tensor_col) - tensor_col = pd.DataFrame(tensor_col, columns = self.features) + if self._nb_features > self._num_features: + tensor_col = df[TENSOR_COLUMN_NAME] + tensor_col = _unwrap_ndarray_object_type_if_needed(tensor_col) + tensor_col = pd.DataFrame(tensor_col, columns = self.features) - tensor_col = tensor_col.apply( - row_feature_hasher, axis=1, result_type="expand" - ) - - tensor_col = tensor_col.to_numpy() + tensor_col = tensor_col.apply( + row_feature_hasher, axis=1, result_type="expand" + ) + + self.stats_ = {'nb_features' : self._num_features} - df[TENSOR_COLUMN_NAME] = pd.Series(list(tensor_col)) + tensor_col = tensor_col.to_numpy() + + df[TENSOR_COLUMN_NAME] = pd.Series(list(tensor_col)) + + self.stats_ = {'nb_features' : self._nb_features} return df diff --git a/src/data/reduction/truncated_svd_decomposition.py 
b/src/data/reduction/truncated_svd_decomposition.py index 3cc14c8..e48c2c5 100644 --- a/src/data/reduction/truncated_svd_decomposition.py +++ b/src/data/reduction/truncated_svd_decomposition.py @@ -10,6 +10,7 @@ from utils import save_Xy_data, load_Xy_data from sklearn.utils.extmath import randomized_svd +from sklearn.decomposition import DictionaryLearning from ray.data.preprocessor import Preprocessor from ray.air.util.data_batch_conversion import _unwrap_ndarray_object_type_if_needed @@ -26,7 +27,7 @@ class TensorTruncatedSVDDecomposition(Preprocessor): https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html#sklearn.decomposition.TruncatedSVD https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.IncrementalPCA.html#sklearn.decomposition.IncrementalPCA """ - def __init__(self, features: List[str], nb_components: int = 100, file: str = ''): + def __init__(self, features: List[str], nb_components: int = 10000, file: str = ''): # Parameters self.features = features self._nb_features = len(features) @@ -48,33 +49,61 @@ def _fit(self, ds: Dataset) -> Preprocessor: 2- Sparse PCA (sparse SVD?) -> construct a PCA from sparsely encoded data It is possible to parallelize batches computation by applying the logic from MiniBatchDictionaryLearning and MiniBatchSparsePCA """ - components = [] + # Parallel + def batch_svd(batch): batch = batch[TENSOR_COLUMN_NAME] batch = _unwrap_ndarray_object_type_if_needed(batch) U, S, V = randomized_svd( batch, n_components = self._nb_components, - n_iter = 1, + n_iter = 2, power_iteration_normalizer = 'LU', ) - print(V.shape) return {'V' : V} + components = [] if self._nb_features > self._nb_components: if os.path.isfile(self._file): components = np.array(load_Xy_data(self._file)) else: # sampl = ds.random_sample(0.1) # svd = sampl.map_batches(batch_svd, batch_format = 'numpy') + svd = ds.map_batches(batch_svd, batch_size = 1, batch_format = 'numpy') + components = svd.random_shuffle().limit(self._nb_components).to_pandas()['V'] + components = _unwrap_ndarray_object_type_if_needed(components) + + save_Xy_data(components, self._file) + + self.stats_ = {'components' : components} + else: + warn('No features reduction to do because the number of features is already lower than the required number of components') + self.stats_ = {'components' : False} + """ + # Parallel multiple MiniBatchDictionaryLearning + def batch_svd(batch): + batch = batch[TENSOR_COLUMN_NAME] + batch = _unwrap_ndarray_object_type_if_needed(batch) + dict = DictionaryLearning( + n_components = self._nb_components, + max_iter = 10, + transform_algorithm = 'lasso_lars', + ) + dict.fit(batch) + return {'dictonnary' : [dict.components_]} + components = [] + if self._nb_features > self._nb_components: + if os.path.isfile(self._file): + components = np.array(load_Xy_data(self._file)) + else: svd = ds.map_batches(batch_svd, batch_format = 'numpy') print(svd.to_pandas()) for row in svd.iter_rows(): - components.append(row['V']) - # components = np.vstack(components) - components = np.sum(components, axis = 0) + components.append(row['dictonnary']) + components = np.mean(components, axis = 0) + print(components.shape) save_Xy_data(components, self._file) - + """ # Incremental # components = None # singular_values = None @@ -102,10 +131,6 @@ def batch_svd(batch): # components = VT # singular_values = Sigma - self.stats_ = {'components' : components} - else: - warn('No features reduction to do because the number of features is already lower than the required 
number of components') - self.stats_ = {'components' : False} return self diff --git a/src/models/kerasTF/build_neural_networks.py b/src/models/kerasTF/build_neural_networks.py index 81751dc..80bdc07 100644 --- a/src/models/kerasTF/build_neural_networks.py +++ b/src/models/kerasTF/build_neural_networks.py @@ -13,14 +13,14 @@ __all__ = ['build_attention','build_LSTM','build_deepLSTM','build_LSTM_attention','build_CNN','build_wideCNN'] # Self-aware binary classifier -def build_attention(nb_kmers): +def build_attention(nb_features): """ Function extracted from module virnet/NNClassifier.py of VirNet package [Abdelkareem et al. 2018] https://github.com/alyosama/virnet/blob/master/NNClassifier.py """ - inputs = Input(shape = (nb_kmers,)) - x = Embedding(nb_kmers, 128)(inputs) + inputs = Input(shape = (nb_features,)) + x = Embedding(nb_features, 128)(inputs) x = LSTM(128, return_sequences = True, dropout = 0.1, recurrent_dropout = 0.1 )(x) x = LSTM(128, return_sequences = True, dropout = 0.1, recurrent_dropout = 0.1 )(x) @@ -36,15 +36,15 @@ def build_attention(nb_kmers): return model # Recurrent binary classifier -def build_LSTM(nb_kmers): +def build_LSTM(nb_features): """ Function extracted from module seeker/train_model/train_model.py of Seeker package [Auslander et al. 2020] https://github.com/gussow/seeker/blob/master/train_model/train_model.py """ - inputs = Input(shape = (nb_kmers,)) - x = Embedding(nb_kmers, 128)(inputs) + inputs = Input(shape = (nb_features,)) + x = Embedding(nb_features, 128)(inputs) x = LSTM(128, recurrent_dropout = 0.1, dropout = 0.1)(x) @@ -56,16 +56,16 @@ def build_LSTM(nb_kmers): return model # Deep recurrent binary classifier -def build_deepLSTM(nb_kmers): +def build_deepLSTM(nb_features): """ Function adapted from module deeplasmid/classifier/dl/DL_Model.py of Deeplasmid package [Andreopoulos et al. 
2021] https://github.com/wandreopoulos/deeplasmid/blob/docker/classifier/dl/DL_Model.py """ - inputs = Input(shape=(nb_kmers,)) + inputs = Input(shape=(nb_features,)) - netA = Embedding(nb_kmers, 128)(inputs) + netA = Embedding(nb_features, 128)(inputs) netA = LSTM(40, activation='tanh',recurrent_dropout=0.05,dropout=0.1,name='A_%d'%40,return_sequences=True) (netA) netA = LSTM(40, activation='tanh',recurrent_dropout=0.05,dropout=0.1,name='B_%d'%40) (netA) @@ -87,7 +87,7 @@ def build_deepLSTM(nb_kmers): return model # Recurrent self-aware multiclass classifier -def build_LSTM_attention(nb_kmers, nb_classes): +def build_LSTM_attention(nb_features, nb_classes): """ Function adapted in keras from module DeepMicrobes/models/embed_lstm_attention.py and default values for layers in script DeepMicrobes/models/define_flags.py of @@ -95,12 +95,12 @@ def build_LSTM_attention(nb_kmers, nb_classes): https://github.com/MicrobeLab/DeepMicrobes/blob/master/models/embed_lstm_attention.py """ - inputs = Input(shape = (nb_kmers,)) - net = Embedding(nb_kmers, 100)(inputs) + inputs = Input(shape = (nb_features,)) + net = Embedding(nb_features, 100)(inputs) net = Bidirectional(LSTM(300, return_sequences=True))(net) net = Attention(dropout = 0.2)([net,net]) # MLP - net = Dense((nb_kmers * 300 * 2), activation = 'relu')(net) + net = Dense((nb_features * 300 * 2), activation = 'relu')(net) net = Dropout(0.2)(net) net = Dense(nb_classes, activation = 'relu')(net) net = Dropout(0.2)(net) @@ -113,7 +113,7 @@ def build_LSTM_attention(nb_kmers, nb_classes): return model # Convolutional multiclass classifier -def build_CNN(nb_kmers, nb_classes): +def build_CNN(nb_features, nb_classes): """ Function extracted from module MetagenomicDC/models/CNN.py of MetagenomicDC package [Fiannaca et al. 2018] @@ -121,7 +121,7 @@ def build_CNN(nb_kmers, nb_classes): """ model = Sequential() - model.add(Conv1D(5,5, input_shape = (nb_kmers, 1), padding = 'valid')) #input_dim + model.add(Conv1D(5,5, input_shape = (nb_features, 1), padding = 'valid')) #input_dim model.add(Activation('relu')) model.add(MaxPooling1D(pool_size = 2, padding = 'valid')) model.add(Conv1D(10, 5, padding = 'valid')) @@ -139,28 +139,28 @@ def build_CNN(nb_kmers, nb_classes): return model # Wide convolutional multiclass classifier -def build_wideCNN(nb_kmers, nb_classes): +def build_wideCNN(nb_features, nb_classes): """ Function adapted in keras from module CHEER/Classifier/model/Wcnn.py of CHEER package [Shang et al. 
2021] https://github.com/KennthShang/CHEER/blob/master/Classifier/model/Wcnn.py """ - inputs = Input(shape = (nb_kmers,)) + inputs = Input(shape = (nb_features,)) embed = Embedding(248, 100)(inputs) - embed = Reshape((nb_kmers, -1, 1))(embed) + embed = Reshape((nb_features, -1, 1))(embed) conv1 = Conv2D(256, 3, activation = 'relu')(embed) - conv1 = MaxPooling2D(pool_size = (1,1), strides = nb_kmers)(conv1) + conv1 = MaxPooling2D(pool_size = (1,1), strides = nb_features)(conv1) conv2 = Conv2D(256, 7, activation = 'relu')(embed) - conv2 = MaxPooling2D(pool_size = (1,1), strides = nb_kmers)(conv2) + conv2 = MaxPooling2D(pool_size = (1,1), strides = nb_features)(conv2) conv3 = Conv2D(256, 11, activation = 'relu')(embed) - conv3 = MaxPooling2D(pool_size = (1,1), strides = nb_kmers)(conv3) + conv3 = MaxPooling2D(pool_size = (1,1), strides = nb_features)(conv3) conv4 = Conv2D(256, 15, activation = 'relu')(embed) - conv4 = MaxPooling2D(pool_size = (1,1), strides = nb_kmers)(conv4) + conv4 = MaxPooling2D(pool_size = (1,1), strides = nb_features)(conv4) net = Concatenate(axis = 1)([conv1,conv2,conv3,conv4]) net = Flatten()(net) diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index 8d0d992..9187e69 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -152,8 +152,9 @@ def preprocess(self, ds, reductor_file): self._scaler = TensorTfIdfTransformer(self.kmers) self._encoder.fit(ds) - self._scaler.fit(ds) - self._reductor = TensorCountHashing(self.kmers, 10000) + ds = self._scaler.fit_transform(ds) + self._reductor = TensorTruncatedSVDDecomposition(self.kmers, 10000, reductor_file) + # self._reductor = TensorCountHashing(self.kmers, 10000) self._reductor.fit(ds) # Labels mapping if self._nb_classes == 2: @@ -182,6 +183,7 @@ def fit(self, datasets): ds = self._scaler.transform(ds) # ds = self._preprocessor.transform(ds) ds = self._reductor.transform(ds) + self._nb_features = self._reductor._nb_components # Trigger the preprocessing computations before ingest in trainer # Otherwise, it would be executed at each epoch ds = ds.materialize() @@ -191,7 +193,7 @@ def fit(self, datasets): self._train_params = { 'batch_size': self.batch_size, 'epochs': self._training_epochs, - 'size': self._nb_kmers, + 'size': self._nb_features, 'nb_cls': self._nb_classes, 'model': self.classifier } @@ -228,12 +230,12 @@ def predict(self, ds, threshold=0.8): # Preprocess ds = self._scaler.transform(ds) - # ds = self._preprocessor.transform(ds) + ds = self._reductor.transform(ds) self._predictor = BatchPredictor.from_checkpoint( self._model_ckpt, TensorflowPredictor, - model_definition = lambda: build_model(self.classifier, self._nb_classes, len(self.kmers)) + model_definition = lambda: build_model(self.classifier, self._nb_classes, self._nb_features) ) predictions = self._predictor.predict( data = ds, diff --git a/src/models/models_utils.py b/src/models/models_utils.py index 7e6e50f..2eeb0a9 100644 --- a/src/models/models_utils.py +++ b/src/models/models_utils.py @@ -68,12 +68,12 @@ def __init__( self._training_epochs = training_epochs # Initialize empty self._labels_map = None - # Initialize Ray variables self._clf = None self._encoder = None self._scaler = None self._preprocessor = None self._reductor = None + self._nb_features = None self._model_ckpt = None self._trainer = None self._train_params = {} diff --git a/src/models/sklearn/models.py b/src/models/sklearn/models.py index f74fc8e..9199c4e 100644 --- a/src/models/sklearn/models.py +++ b/src/models/sklearn/models.py @@ 
-110,9 +110,10 @@ def preprocess(self, ds, reductor_file): self._scaler = TensorTfIdfTransformer(self.kmers) self._encoder.fit(ds) - self._scaler.fit(ds) + ds = self._scaler.fit_transform(ds) - self._reductor = TensorCountHashing(self.kmers, 10000) + self._reductor = TensorTruncatedSVDDecomposition(self.kmers, 10000, reductor_file) + # self._reductor = TensorCountHashing(self.kmers, 10000) self._reductor.fit(ds) # Labels mapping @@ -179,6 +180,7 @@ def fit(self, datasets): ds = self._scaler.transform(ds) # ds = self._preprocessor.transform(ds) ds = self._reductor.transform(ds) + self._nb_features = self._reductor._nb_components # Trigger the preprocessing computations before ingest in trainer # Otherwise, it would be executed at each epoch ds = ds.materialize() @@ -218,7 +220,6 @@ def predict(self, ds, threshold = 0.8): print('predict') if ds.count() > 0: ds = self._scaler.transform(ds) - # ds = self._preprocessor.transform(ds) ds = self._reductor.transform(ds) predict_kwargs = {'features':self.kmers, 'num_estimator_cpus':-1} self._predictor = BatchPredictor.from_checkpoint(self._model_ckpt, SklearnTensorPredictor) diff --git a/src/models/sklearn/partial_trainer.py b/src/models/sklearn/partial_trainer.py index 021d9ce..046c88c 100644 --- a/src/models/sklearn/partial_trainer.py +++ b/src/models/sklearn/partial_trainer.py @@ -7,6 +7,7 @@ import numpy as np import pandas as pd +from tqdm import tqdm from joblib import parallel_backend from sklearn.metrics import check_scoring @@ -201,18 +202,18 @@ def training_loop(self): _set_cpu_params(self.estimator, num_cpus) - for epoch_X, epoch_y in zip(X_train.iter_epochs(), y_train.iter_epochs()): + for epoch_X, epoch_y in tqdm(zip(X_train.iter_epochs(), y_train.iter_epochs())): with parallel_backend("ray", n_jobs=num_cpus): start_time = time() for batch_X, batch_y in zip( epoch_X.iter_batches( - # batch_size = self._batch_size, - batch_size = 1, + batch_size = self._batch_size, + # batch_size = 1, batch_format = 'numpy' ), epoch_y.iter_batches( - # batch_size = self._batch_size, - batch_size = 1, + batch_size = self._batch_size, + # batch_size = 1, batch_format = 'numpy' ) ): From 23b6ddba6be8e67ed2b041354ac2d1db7131be12 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Thu, 16 Nov 2023 06:59:56 -0500 Subject: [PATCH 38/92] remove random sample in TruncatedSVD --- src/data/reduction/truncated_svd_decomposition.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/data/reduction/truncated_svd_decomposition.py b/src/data/reduction/truncated_svd_decomposition.py index e48c2c5..0a67cd0 100644 --- a/src/data/reduction/truncated_svd_decomposition.py +++ b/src/data/reduction/truncated_svd_decomposition.py @@ -69,8 +69,8 @@ def batch_svd(batch): else: # sampl = ds.random_sample(0.1) # svd = sampl.map_batches(batch_svd, batch_format = 'numpy') - svd = ds.map_batches(batch_svd, batch_size = 1, batch_format = 'numpy') - components = svd.random_shuffle().limit(self._nb_components).to_pandas()['V'] + svd = ds.map_batches(batch_svd, batch_format = 'numpy') + components = svd.limit(self._nb_components).to_pandas()['V'] components = _unwrap_ndarray_object_type_if_needed(components) save_Xy_data(components, self._file) From 40835bb7e46e45045fc684a73ccadaa213dc6dba Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Fri, 17 Nov 2023 20:49:16 -0500 Subject: [PATCH 39/92] req for CCDB + dataset name in classif --- requirements.txt | 18 ++++++++---------- src/Caribou_classification.py | 2 +- src/Caribou_classification_train_cv.py | 4 
++-- 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/requirements.txt b/requirements.txt index a8d4ad5..4a6ea4b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -26,8 +26,8 @@ google-auth-oauthlib==1.0.0 google-pasta==0.2.0 googleapis-common-protos==1.60.0 gpustat==1.1 -grpcio==1.48.2 -h5py==3.9.0 +grpcio==1.57.0 +h5py==3.8.0 idna==3.4 importlib-metadata==6.8.0 importlib-resources==6.0.0 @@ -41,7 +41,7 @@ Markdown==3.4.4 MarkupSafe==2.1.3 msgpack==1.0.5 multidict==6.0.4 -numpy==1.24.3 +numpy==1.25.2 nvidia-ml-py==12.535.77 oauthlib==3.2.2 opencensus==0.11.2 @@ -55,7 +55,7 @@ prometheus-client==0.13.1 protobuf==4.23.4 psutil==5.9.5 py-spy==0.3.14 -pyarrow==12.0.1 +pyarrow==12.0.0 pyasn1==0.5.0 pyasn1-modules==0.3.0 pydantic==1.10.12 @@ -67,7 +67,7 @@ ray==2.6.3 referencing==0.30.2 requests==2.31.0 requests-oauthlib==1.3.1 -rpds-py==0.9.2 +rpds-py==0.10.0 rsa==4.9 scikit-learn==1.3.0 scipy==1.10.1 @@ -77,9 +77,9 @@ tabulate==0.9.0 tensorboard==2.13.0 tensorboard-data-server==0.7.1 tensorboardX==2.6.2 -tensorflow==2.13.0 +tensorflow==2.14.0 tensorflow-estimator==2.13.0 -tensorflow-io-gcs-filesystem==0.33.0 +tensorflow-io-gcs-filesystem==0.32.0 termcolor==2.3.0 threadpoolctl==3.2.0 tqdm==4.65.0 @@ -92,6 +92,4 @@ wcwidth==0.2.6 Werkzeug==2.3.6 wrapt==1.15.0 yarl==1.9.2 -zipp==3.16.2 -xgboost==2.0.1 -xgboost_ray==0.1.18 \ No newline at end of file +zipp==3.16.2 \ No newline at end of file diff --git a/src/Caribou_classification.py b/src/Caribou_classification.py index 0c4b460..da9dbf7 100644 --- a/src/Caribou_classification.py +++ b/src/Caribou_classification.py @@ -47,7 +47,7 @@ def bacteria_classification(opt): if 'domain' in lst_taxas: lst_taxas.remove('domain') - val_ds, val_data = split_sim_dataset(db_ds, db_data, VALIDATION_DATASET_NAME) + val_ds, val_data = split_sim_dataset(db_ds, db_data, f"{VALIDATION_DATASET_NAME}_{opt['database_name']}") datasets = { TRAINING_DATASET_NAME : db_ds, diff --git a/src/Caribou_classification_train_cv.py b/src/Caribou_classification_train_cv.py index f6832a8..00fb97f 100644 --- a/src/Caribou_classification_train_cv.py +++ b/src/Caribou_classification_train_cv.py @@ -53,8 +53,8 @@ def bacteria_classification_train_cv(opt): for taxa in lst_taxas: - test_ds, test_data = split_sim_dataset(db_ds, db_data, TEST_DATASET_NAME) - val_ds, val_data = split_sim_dataset(db_ds, db_data, VALIDATION_DATASET_NAME) + test_ds, test_data = split_sim_dataset(db_ds, db_data, f"{TEST_DATASET_NAME}_{opt['database_name']}") + val_ds, val_data = split_sim_dataset(db_ds, db_data, f"{VALIDATION_DATASET_NAME}_{opt['database_name']}") datasets = { TRAINING_DATASET_NAME : db_ds, From 556c30268ea8d5c62159551f7240ee69419da4b0 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Sat, 18 Nov 2023 13:08:03 -0500 Subject: [PATCH 40/92] class weights + adjust requirements to be more fluid --- requirements copy.txt | 95 ++++++++++++++++++ requirements.txt | 97 ++----------------- src/models/classification.py | 8 +- src/models/encoders/model_label_encoder.py | 4 +- src/models/kerasTF/models.py | 21 ++-- src/models/models_utils.py | 1 + .../preprocessors/compute_class_weights.py | 49 ++++++++++ src/models/sklearn/models.py | 32 +++--- 8 files changed, 183 insertions(+), 124 deletions(-) create mode 100644 requirements copy.txt create mode 100644 src/models/preprocessors/compute_class_weights.py diff --git a/requirements copy.txt b/requirements copy.txt new file mode 100644 index 0000000..caf68a4 --- /dev/null +++ b/requirements copy.txt @@ -0,0 +1,95 @@ +absl-py==1.4.0 
+aiohttp==3.8.5 +aiohttp-cors==0.7.0 +aiosignal==1.3.1 +astunparse==1.6.3 +async-timeout==4.0.2 +attrs==23.1.0 +biopython==1.78 +blessed==1.20.0 +cachetools==5.3.1 +certifi==2023.7.22 +charset-normalizer==3.2.0 +click==8.1.6 +cloudpickle==2.2.1 +colorful==0.5.5 +Cython==3.0.0 +distlib==0.3.7 +filelock==3.12.2 +flatbuffers==23.5.26 +frozenlist==1.4.0 +future==0.18.3 +gast==0.4.0 +google-api-core==2.11.1 +google-auth==2.22.0 +google-auth-oauthlib==1.0.0 +google-pasta==0.2.0 +googleapis-common-protos==1.60.0 +gpustat==1.1 +grpcio==1.47.0 +h5py==3.8.0 +idna==3.4 +importlib-metadata==6.8.0 +importlib-resources==6.0.0 +InSilicoSeq==1.5.4 +joblib==1.3.1 +jsonschema==4.18.6 +jsonschema-specifications==2023.7.1 +keras==2.13.1 +libclang==16.0.6 +Markdown==3.4.4 +MarkupSafe==2.1.3 +msgpack==1.0.5 +multidict==6.0.4 +numpy==1.25.2 +nvidia-ml-py==12.535.77 +oauthlib==3.2.2 +opencensus==0.11.2 +opencensus-context==0.1.3 +opt-einsum==3.3.0 +packaging==23.1 +pandas==2.0.3 +pkgutil_resolve_name==1.3.10 +platformdirs==3.10.0 +prometheus-client==0.13.1 +protobuf==4.23.4 +psutil==5.9.5 +py-spy==0.3.14 +pyarrow==12.0.0 +pyasn1==0.5.0 +pyasn1-modules==0.3.0 +pydantic==1.10.12 +pysam==0.21.0 +python-dateutil==2.8.2 +pytz==2023.3 +PyYAML==6.0.1 +ray==2.6.3 +referencing==0.30.2 +requests==2.31.0 +requests-oauthlib==1.3.1 +rpds-py==0.10.0 +rsa==4.9 +scikit-learn==1.3.0 +scipy==1.10.1 +six==1.16.0 +smart-open==6.3.0 +tabulate==0.9.0 +tensorboard==2.13.0 +tensorboard-data-server==0.7.1 +tensorboardX==2.6.2 +tensorflow==2.14.0 +tensorflow-estimator==2.13.0 +tensorflow-io-gcs-filesystem==0.32.0 +termcolor==2.3.0 +threadpoolctl==3.2.0 +tqdm==4.65.0 +tune-sklearn==0.4.6 +typing_extensions==4.5.0 +tzdata==2023.3 +urllib3==1.26.16 +virtualenv==20.24.2 +wcwidth==0.2.6 +Werkzeug==2.3.6 +wrapt==1.15.0 +yarl==1.9.2 +zipp==3.16.2 \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 4a6ea4b..d409b51 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,95 +1,10 @@ -absl-py==1.4.0 -aiohttp==3.8.5 -aiohttp-cors==0.7.0 -aiosignal==1.3.1 -astunparse==1.6.3 -async-timeout==4.0.2 -attrs==23.1.0 biopython==1.78 -blessed==1.20.0 -cachetools==5.3.1 -certifi==2023.7.22 -charset-normalizer==3.2.0 -click==8.1.6 -cloudpickle==2.2.1 -colorful==0.5.5 -Cython==3.0.0 -distlib==0.3.7 -filelock==3.12.2 -flatbuffers==23.5.26 -frozenlist==1.4.0 -future==0.18.3 -gast==0.4.0 -google-api-core==2.11.1 -google-auth==2.22.0 -google-auth-oauthlib==1.0.0 -google-pasta==0.2.0 -googleapis-common-protos==1.60.0 -gpustat==1.1 -grpcio==1.57.0 -h5py==3.8.0 -idna==3.4 -importlib-metadata==6.8.0 -importlib-resources==6.0.0 +cloudpickle>=2.2.1 InSilicoSeq==1.5.4 -joblib==1.3.1 -jsonschema==4.18.6 -jsonschema-specifications==2023.7.1 -keras==2.13.1 -libclang==16.0.6 -Markdown==3.4.4 -MarkupSafe==2.1.3 -msgpack==1.0.5 -multidict==6.0.4 -numpy==1.25.2 -nvidia-ml-py==12.535.77 -oauthlib==3.2.2 -opencensus==0.11.2 -opencensus-context==0.1.3 -opt-einsum==3.3.0 -packaging==23.1 -pandas==2.0.3 -pkgutil_resolve_name==1.3.10 -platformdirs==3.10.0 -prometheus-client==0.13.1 -protobuf==4.23.4 -psutil==5.9.5 -py-spy==0.3.14 -pyarrow==12.0.0 -pyasn1==0.5.0 -pyasn1-modules==0.3.0 -pydantic==1.10.12 -pysam==0.21.0 -python-dateutil==2.8.2 -pytz==2023.3 -PyYAML==6.0.1 +keras==2.14 +numpy>=1.2 +pandas>=2.0 ray==2.6.3 -referencing==0.30.2 -requests==2.31.0 -requests-oauthlib==1.3.1 -rpds-py==0.10.0 -rsa==4.9 scikit-learn==1.3.0 -scipy==1.10.1 -six==1.16.0 -smart-open==6.3.0 -tabulate==0.9.0 -tensorboard==2.13.0 -tensorboard-data-server==0.7.1 
-tensorboardX==2.6.2 -tensorflow==2.14.0 -tensorflow-estimator==2.13.0 -tensorflow-io-gcs-filesystem==0.32.0 -termcolor==2.3.0 -threadpoolctl==3.2.0 -tqdm==4.65.0 -tune-sklearn==0.4.6 -typing_extensions==4.5.0 -tzdata==2023.3 -urllib3==1.26.16 -virtualenv==20.24.2 -wcwidth==0.2.6 -Werkzeug==2.3.6 -wrapt==1.15.0 -yarl==1.9.2 -zipp==3.16.2 \ No newline at end of file +tensorflow==2.14 +pyarrow==12.0 \ No newline at end of file diff --git a/src/models/classification.py b/src/models/classification.py index eaa682c..99a4a9b 100644 --- a/src/models/classification.py +++ b/src/models/classification.py @@ -103,19 +103,19 @@ def cross_validation(self, datasets): """ Public function to call the cross-validation method after validation of parameters Executes cross-validation of a model by fitting it and predicting over a test dataset - """ - + """ if isinstance(self._taxas, str): self._valid_assign_taxas() tax_map = self._verify_model_trained() test_ds = datasets.pop(TEST_DATASET_NAME) y_true, test_ds = self._get_true_classif(test_ds, self._taxas) - + self._fit(datasets, tax_map) model_mapping = self._verify_load_model() y_pred = self._cv_predict(test_ds, model_mapping) + cv_scores = self._score_cv(y_true, y_pred, self._taxas[0]) return cv_scores @@ -168,7 +168,7 @@ def _cv_predict(self, ds, model_map): """ mapping = {} for taxa, model in model_map.items(): - mapping[taxa] = model.predict(ds) # np.array + mapping[taxa] = model.predict(ds) # np.array return mapping # Private training secondary functions diff --git a/src/models/encoders/model_label_encoder.py b/src/models/encoders/model_label_encoder.py index 7084b2b..b635108 100644 --- a/src/models/encoders/model_label_encoder.py +++ b/src/models/encoders/model_label_encoder.py @@ -9,7 +9,7 @@ from ray.data import Dataset from ray.data.preprocessor import Preprocessor -from ray.data.preprocessors.encoder import _get_unique_value_indices, _validate_df +from ray.data.preprocessors.encoder import _get_unique_value_indices LABELS_COLUMN_NAME = 'labels' @@ -25,8 +25,6 @@ def _fit(self, dataset: Dataset) -> Preprocessor: return self def _transform_pandas(self, df: pd.DataFrame): - _validate_df(df, self.label_column) - def column_label_encoder(s: pd.Series): s_values = self.stats_[f"unique_values({s.name})"] return s.map(s_values) diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index 9187e69..08746c5 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -1,24 +1,18 @@ import os import gc -import ray import warnings import numpy as np import pandas as pd -from glob import glob -from shutil import rmtree - # Dimensions reduction -from data.reduction.count_hashing import TensorCountHashing from models.preprocessors.tfidf_transformer import TensorTfIdfTransformer -from data.reduction.rdf_features_selection import TensorRDFFeaturesSelection from data.reduction.truncated_svd_decomposition import TensorTruncatedSVDDecomposition # Preprocessing from ray.data.preprocessors import LabelEncoder, Chain -from models.preprocessors.min_max_scaler import TensorMinMaxScaler from models.encoders.model_label_encoder import ModelLabelEncoder from models.encoders.one_hot_tensor_encoder import OneHotTensorEncoder +from models.preprocessors.compute_class_weights import ComputeClassWeights # Parent class / models from models.models_utils import ModelsUtils @@ -27,11 +21,10 @@ # Training import tensorflow as tf from ray.air import session -from ray.train import DataConfig # from ray.air.integrations.keras import Callback +from 
ray.air.config import ScalingConfig from ray.air.integrations.keras import ReportCheckpointCallback -from ray.air.config import ScalingConfig #DatasetConfig -from ray.train.tensorflow import TensorflowTrainer, TensorflowCheckpoint, prepare_dataset_shard +from ray.train.tensorflow import TensorflowTrainer, TensorflowCheckpoint # Tuning from ray.air.config import RunConfig @@ -152,6 +145,11 @@ def preprocess(self, ds, reductor_file): self._scaler = TensorTfIdfTransformer(self.kmers) self._encoder.fit(ds) + + self._weights = ComputeClassWeights(LABELS_COLUMN_NAME) + self._weights.fit(ds) + self._weights = self._weights.stats_ + ds = self._scaler.fit_transform(ds) self._reductor = TensorTruncatedSVDDecomposition(self.kmers, 10000, reductor_file) # self._reductor = TensorCountHashing(self.kmers, 10000) @@ -183,7 +181,7 @@ def fit(self, datasets): ds = self._scaler.transform(ds) # ds = self._preprocessor.transform(ds) ds = self._reductor.transform(ds) - self._nb_features = self._reductor._nb_components + self._nb_features = self._reductor._nb_components if self._reductor._nb_components < self._nb_kmers else self._nb_kmers # Trigger the preprocessing computations before ingest in trainer # Otherwise, it would be executed at each epoch ds = ds.materialize() @@ -231,6 +229,7 @@ def predict(self, ds, threshold=0.8): # Preprocess ds = self._scaler.transform(ds) ds = self._reductor.transform(ds) + ds = ds.materialize() self._predictor = BatchPredictor.from_checkpoint( self._model_ckpt, diff --git a/src/models/models_utils.py b/src/models/models_utils.py index 2eeb0a9..6a97587 100644 --- a/src/models/models_utils.py +++ b/src/models/models_utils.py @@ -67,6 +67,7 @@ def __init__( self._nb_kmers = len(kmers_list) self._training_epochs = training_epochs # Initialize empty + self._weights = [] self._labels_map = None self._clf = None self._encoder = None diff --git a/src/models/preprocessors/compute_class_weights.py b/src/models/preprocessors/compute_class_weights.py new file mode 100644 index 0000000..43b4c5d --- /dev/null +++ b/src/models/preprocessors/compute_class_weights.py @@ -0,0 +1,49 @@ + +import numpy as np +import pandas as pd + +from ray.data.dataset import Dataset +from ray.data.preprocessor import Preprocessor + +TENSOR_COLUMN_NAME = '__value__' + +class ComputeClassWeights(Preprocessor): + """ + Custom implementation of Class Weight Computation inspired by sklearn.utils.class_weight.compute_class_weight to be used as a Ray preprocessor. + https://scikit-learn.org/stable/modules/generated/sklearn.utils.class_weight.compute_class_weight.html + This permits to estimate balanced class weights for an unbalanced dataset. 
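For reference, the 'balanced' rule this preprocessor reproduces is weight(c) = n_samples / (n_classes * count(c)), the same convention as sklearn.utils.class_weight.compute_class_weight. A minimal NumPy-only sketch, with invented labels and without the Ray aggregation, would be:

# Balanced class weights: weight_c = n_samples / (n_classes * count_c)
# Toy labels only; ComputeClassWeights derives the counts from the Ray dataset instead.
import numpy as np

labels = np.array(['bacteria'] * 90 + ['host'] * 10)
classes, counts = np.unique(labels, return_counts=True)
weights = len(labels) / (len(classes) * counts.astype(np.float64))
class_weight = dict(zip(classes, weights))
print(class_weight)  # {'bacteria': 0.555..., 'host': 5.0}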
+ """ + + def __init__(self, class_col): + # Parameters + self._col = class_col + self._cls = [] + self._counts_map = {} + + def _fit(self, ds: Dataset) -> Preprocessor: + def get_cls_counts(df): + mapping = {} + counts = df[self._col].value_counts() + for cls in self._cls: + if cls in counts.index: + mapping[str(cls)] = [counts[cls]] + else: + mapping[str(cls)] = [0] + return mapping + + self._cls = ds.unique(self._col) + + counts = ds.map_batches(get_cls_counts, batch_format = 'pandas') + + for cls in self._cls: + self._counts_map[str(cls)] = counts.sum(str(cls)) + + freqs = ds.count() / (len(self._cls) * np.array(list(self._counts_map.values())).astype(np.float64)) + + self.stats_ = {} + for i, cls in enumerate(self._cls): + self.stats_[cls] = freqs[i] + + return self + + diff --git a/src/models/sklearn/models.py b/src/models/sklearn/models.py index 9199c4e..4f7bc76 100644 --- a/src/models/sklearn/models.py +++ b/src/models/sklearn/models.py @@ -4,19 +4,14 @@ import numpy as np import pandas as pd -from glob import glob -from shutil import rmtree - # Dimensions reduction -from data.reduction.count_hashing import TensorCountHashing from models.preprocessors.tfidf_transformer import TensorTfIdfTransformer -from data.reduction.rdf_features_selection import TensorRDFFeaturesSelection from data.reduction.truncated_svd_decomposition import TensorTruncatedSVDDecomposition # Preprocessing from models.encoders.model_label_encoder import ModelLabelEncoder -from models.preprocessors.min_max_scaler import TensorMinMaxScaler from models.encoders.onesvm_label_encoder import OneClassSVMLabelEncoder +from models.preprocessors.compute_class_weights import ComputeClassWeights # Training from ray.air.config import ScalingConfig @@ -24,14 +19,13 @@ from sklearn.linear_model import SGDClassifier from models.sklearn.partial_trainer import SklearnPartialTrainer from models.sklearn.scoring_one_svm import ScoringSGDOneClassSVM -from models.sklearn.tensor_predictor import SklearnTensorPredictor # Tuning from ray.air.config import RunConfig # Predicting from ray.train.batch_predictor import BatchPredictor -from models.sklearn.probability_predictor import SklearnTensorProbaPredictor +from models.sklearn.tensor_predictor import SklearnTensorPredictor # Parent class from models.models_utils import ModelsUtils @@ -95,8 +89,6 @@ def __init__( ) # Parameters self._encoded = [] - # Computes - self._build() def preprocess(self, ds, reductor_file): print('preprocess') @@ -109,7 +101,12 @@ def preprocess(self, ds, reductor_file): self._scaler = TensorTfIdfTransformer(self.kmers) - self._encoder.fit(ds) + ds = self._encoder.fit_transform(ds) + + self._weights = ComputeClassWeights(LABELS_COLUMN_NAME) + self._weights.fit(ds) + self._weights = self._weights.stats_ + ds = self._scaler.fit_transform(ds) self._reductor = TensorTruncatedSVDDecomposition(self.kmers, 10000, reductor_file) @@ -123,7 +120,7 @@ def preprocess(self, ds, reductor_file): labels = np.append(labels, 'unknown') self._encoded = np.append(self._encoded, -1) self._labels_map = zip(labels, self._encoded) - + def _label_decode(self, predict): print('_label_decode') decoded = pd.Series(np.empty(len(predict), dtype=object)) @@ -151,6 +148,7 @@ def _build(self): 'penalty' : 'elasticnet', 'alpha' : 141.6146176, 'learning_rate' : 'adaptive', + 'class_weight' : self._weights, 'eta0' : 0.001, 'n_jobs' : -1 } @@ -162,7 +160,8 @@ def _build(self): 'alpha' : 173.5667373, 'learning_rate' : 'optimal', 'loss': 'modified_huber', - 'penalty' : 'l2' + 'penalty' : 'l2', + 
'class_weight' : self._weights, } elif self.classifier == 'mnb': print('Training multiclass Multinomial Naive Bayes classifier') @@ -174,17 +173,19 @@ def _build(self): def fit(self, datasets): print('_fit_model') + # Define model + self._build() for name, ds in datasets.items(): ds = ds.drop_columns(['id']) ds = self._encoder.transform(ds) ds = self._scaler.transform(ds) - # ds = self._preprocessor.transform(ds) ds = self._reductor.transform(ds) - self._nb_features = self._reductor._nb_components + self._nb_features = self._reductor._nb_components if self._reductor._nb_components < self._nb_kmers else self._nb_kmers # Trigger the preprocessing computations before ingest in trainer # Otherwise, it would be executed at each epoch ds = ds.materialize() datasets[name] = ray.put(ds) + try: training_labels = self._encoded.copy() training_labels = np.delete(training_labels, np.where(training_labels == -1)) @@ -221,6 +222,7 @@ def predict(self, ds, threshold = 0.8): if ds.count() > 0: ds = self._scaler.transform(ds) ds = self._reductor.transform(ds) + ds = ds.materialize() predict_kwargs = {'features':self.kmers, 'num_estimator_cpus':-1} self._predictor = BatchPredictor.from_checkpoint(self._model_ckpt, SklearnTensorPredictor) predictions = self._predictor.predict(ds, batch_size = self.batch_size, feature_columns = [TENSOR_COLUMN_NAME], **predict_kwargs) From 6608365320687f179d156139e7467c1e5da931f1 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Mon, 20 Nov 2023 18:28:49 -0500 Subject: [PATCH 41/92] cls weights + TruncatedSVD in separated step --- setup.cfg | 1 + src/Caribou_classification.py | 6 +- src/Caribou_classification_train_cv.py | 6 +- src/Caribou_dimensions_decomposition.py | 104 +++++++++++++++++++++ src/Caribou_extraction.py | 6 +- src/Caribou_extraction_train_cv.py | 6 +- src/Caribou_reduce_features.py | 56 +++++------ src/models/classification.py | 29 ++++-- src/models/encoders/model_label_encoder.py | 2 +- src/models/kerasTF/models.py | 53 +++++------ src/models/models_utils.py | 37 ++++++-- src/models/sklearn/models.py | 40 +++----- src/utils.py | 10 +- 13 files changed, 246 insertions(+), 110 deletions(-) create mode 100644 src/Caribou_dimensions_decomposition.py diff --git a/setup.cfg b/setup.cfg index fce114a..a9f82f6 100644 --- a/setup.cfg +++ b/setup.cfg @@ -35,6 +35,7 @@ scripts = src/Caribou_kmers.py src/Caribou_reduce_features.py src/Caribou_simulate_test_val.py + src/Caribou_dimensions_decomposition.py src/Caribou_extraction.py src/Caribou_classification.py src/Caribou_extraction_train_cv.py diff --git a/src/Caribou_classification.py b/src/Caribou_classification.py index da9dbf7..1e1da9c 100644 --- a/src/Caribou_classification.py +++ b/src/Caribou_classification.py @@ -47,6 +47,9 @@ def bacteria_classification(opt): if 'domain' in lst_taxas: lst_taxas.remove('domain') + # Verify need for scaling + scaling = verify_need_scaling(db_data) + val_ds, val_data = split_sim_dataset(db_ds, db_data, f"{VALIDATION_DATASET_NAME}_{opt['database_name']}") datasets = { @@ -66,7 +69,8 @@ def bacteria_classification(opt): clf_multiclass = opt['model_type'], taxa = 'domain', batch_size = opt['batch_size'], - training_epochs = opt['training_epochs'] + training_epochs = opt['training_epochs'], + scaling = scaling ) # Execution of bacteria taxonomic classification on metagenome + save results diff --git a/src/Caribou_classification_train_cv.py b/src/Caribou_classification_train_cv.py index 00fb97f..1f707b9 100644 --- a/src/Caribou_classification_train_cv.py +++ 
b/src/Caribou_classification_train_cv.py @@ -51,6 +51,9 @@ def bacteria_classification_train_cv(opt): if 'domain' in lst_taxas: lst_taxas.remove('domain') + # Verify need for scaling + scaling = verify_need_scaling(db_data) + for taxa in lst_taxas: test_ds, test_data = split_sim_dataset(db_ds, db_data, f"{TEST_DATASET_NAME}_{opt['database_name']}") @@ -72,7 +75,8 @@ def bacteria_classification_train_cv(opt): clf_multiclass = opt['model_type'], taxa = taxa, batch_size = opt['batch_size'], - training_epochs = opt['training_epochs'] + training_epochs = opt['training_epochs'], + scaling = scaling ) t_s = time() diff --git a/src/Caribou_dimensions_decomposition.py b/src/Caribou_dimensions_decomposition.py new file mode 100644 index 0000000..52ccff6 --- /dev/null +++ b/src/Caribou_dimensions_decomposition.py @@ -0,0 +1,104 @@ +#!/usr/bin python3 + +import ray +import os.path +import argparse + +import numpy as np + +from utils import * +from time import time +from glob import glob +from pathlib import Path + +from ray.data.preprocessors import Chain +from models.preprocessors.tfidf_transformer import TensorTfIdfTransformer +from data.reduction.truncated_svd_decomposition import TensorTruncatedSVDDecomposition + +__author__ = "Nicolas de Montigny" + +__all__ = ['dimensions_decomposition'] + +""" +This script computes dimensions decomposition via TruncatedSVD and saves a reduced version of the dataset. +""" + +# Initialisation / validation of parameters from CLI +################################################################################ +def dimensions_decomposition(opt): + + # Verify existence of files and load data + data = verify_load_data(opt['dataset']) + + # Verification of k length + k_length = len(data['kmers'][0]) + verify_file(opt['kmers_list']) + k_length, kmers = verify_kmers_list_length(k_length, opt['kmers_list']) + + outdirs = define_create_outdirs(opt['outdir']) + + # Initialize cluster + init_ray_cluster(opt['workdir']) + +# Dimensions decomposition +################################################################################ + + # Define new file + path, ext = os.path.splitext(opt['dataset']) + data_file = f'{path}_decomposed{ext}' + + if not os.path.exists(data_file): + if opt['nb_features'] < len(kmers): + # Load data + files_lst = glob(os.path.join(data['profile'],'*.parquet')) + ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + + reductor_file = os.path.join(outdirs['models_dir'], 'TruncatedSVD_components.npz') + + # Compute the decomposition + preprocessor = Chain( + TensorTfIdfTransformer( + features = kmers + ), + TensorTruncatedSVDDecomposition( + features = kmers, + nb_components = opt['nb_components'], + file = reductor_file + ) + ) + t_s = time() + ds = preprocessor.fit_transform(ds) + t_decomposition = time() - t_s + + # Save decomposed dataset + data['profile'] = f"{data['profile']}_decomposed" + data['kmers'] = [f'feature_{i}' for i in np.arange(preprocessor._nb_components)] + ds.write_parquet(data['profile']) + + # Save decomposed data + save_Xy_data(data, data_file) + + print(f"Caribou finished decomposing the features of {opt['dataset_name']} in {t_decomposition} seconds.") + else: + print('Caribou did not decompose the features because the number to extract is bigger than the actual number of features') + else: + print("Caribou did not decompose the features because the file already exists") + +# Argument parsing from CLI +################################################################################ + +if __name__ == 
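As a point of reference for the chained preprocessors this script builds, a rough in-memory scikit-learn equivalent of the TF-IDF scaling followed by TruncatedSVD decomposition looks as follows (toy data and sizes, not the distributed Ray implementation used above):

import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import TruncatedSVD

# Illustrative k-mer count matrix: 100 samples x 2000 features.
rng = np.random.default_rng(0)
X = rng.poisson(1.0, size=(100, 2000))

pipeline = make_pipeline(
    TfidfTransformer(),             # scale counts by inverse document frequency
    TruncatedSVD(n_components=50),  # project onto 50 latent components
)
X_decomposed = pipeline.fit_transform(X)
print(X_decomposed.shape)           # (100, 50)
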
"__main__": + parser = argparse.ArgumentParser(description='This script computes features reduction to a given K-mers dataset and then applies it.') + # Dataset + parser.add_argument('-db','--dataset', required=True, type=Path, help='PATH to a npz file containing the data corresponding to the k-mers profile for the bacteria database') + parser.add_argument('-dt','--dataset_name', default='dataset', help='Name of the dataset used to name files') + parser.add_argument('-l','--kmers_list', default=None, type=Path, help='PATH to a file containing a list of k-mers that will be reduced') + # Parameters + parser.add_argument('-n','--nb_components', default=1000, type=int, help='Number of components to decompose data into') + parser.add_argument('-o','--outdir', required=True, type=Path, help='PATH to a directory on file where outputs will be saved') + parser.add_argument('-wd','--workdir', default='/tmp/spill', type=Path, help='Optional. Path to a working directory where tuning data will be spilled') + args = parser.parse_args() + + opt = vars(args) + + dimensions_decomposition(opt) diff --git a/src/Caribou_extraction.py b/src/Caribou_extraction.py index 3876f2b..a2168fb 100644 --- a/src/Caribou_extraction.py +++ b/src/Caribou_extraction.py @@ -56,6 +56,9 @@ def bacteria_extraction(opt): db_data, db_ds = verify_load_db(opt['data_bacteria']) db_name = opt['dataset_name'] + # Verify need for scaling + scaling = verify_need_scaling(db_data) + datasets = { TRAINING_DATASET_NAME : db_ds, VALIDATION_DATASET_NAME : val_ds @@ -73,7 +76,8 @@ def bacteria_extraction(opt): clf_binary = opt['model_type'], taxa = 'domain', batch_size = opt['batch_size'], - training_epochs = opt['training_epochs'] + training_epochs = opt['training_epochs'], + scaling = scaling ) # Execution of bacteria extraction / host removal on metagenome + save results diff --git a/src/Caribou_extraction_train_cv.py b/src/Caribou_extraction_train_cv.py index 1c73cad..2547886 100644 --- a/src/Caribou_extraction_train_cv.py +++ b/src/Caribou_extraction_train_cv.py @@ -52,6 +52,9 @@ def bacteria_extraction_train_cv(opt): db_data, db_ds = verify_load_db(opt['data_bacteria']) db_name = opt['database_name'] + # Verify need for scaling + scaling = verify_need_scaling(db_data) + datasets = { TRAINING_DATASET_NAME : db_ds, TEST_DATASET_NAME : test_ds, @@ -69,7 +72,8 @@ def bacteria_extraction_train_cv(opt): clf_binary = opt['model_type'], taxa = 'domain', batch_size = opt['batch_size'], - training_epochs = opt['training_epochs'] + training_epochs = opt['training_epochs'], + scaling = scaling ) t_s = time() diff --git a/src/Caribou_reduce_features.py b/src/Caribou_reduce_features.py index 412b8bc..c95861d 100644 --- a/src/Caribou_reduce_features.py +++ b/src/Caribou_reduce_features.py @@ -58,35 +58,39 @@ def features_reduction(opt): 2. 
TruncatedSVD decomposition (map the features to 10 000 decomposed features if there is still more) """ - # Load data - files_lst = glob(os.path.join(data['profile'],'*.parquet')) - export_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - train_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - # Time the computation of transformations - t_start = time() - # Features scaling - train_ds = tfidf_transform(train_ds, kmers) - # Brute force features exclusion - train_ds, export_ds, kmers = occurence_exclusion(train_ds, export_ds, kmers) - train_ds, export_ds, kmers = low_var_selection(train_ds, export_ds, kmers) - # Statistical features selection - train_ds, export_ds, data['kmers'] = features_selection(train_ds, export_ds, kmers, opt['taxa']) - # Time the computation of transformations - t_end = time() - t_reduction = t_end - t_start - # Save reduced dataset - data['profile'] = f"{data['profile']}_reduced" - export_ds.write_parquet(data['profile']) - # Save reduced K-mers - with open(os.path.join(outdirs["data_dir"],'kmers_list_reduced.txt'),'w') as handle: - handle.writelines("%s\n" % item for item in data['kmers']) - # Save reduced data + # Define new file path, ext = os.path.splitext(opt['dataset']) data_file = f'{path}_reduced{ext}' - save_Xy_data(data, data_file) - - print(f"Caribou finished reducing k-mers features of {opt['dataset_name']} in {t_reduction} seconds.") + if not os.path.exists(data_file): + # Load data + files_lst = glob(os.path.join(data['profile'],'*.parquet')) + export_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + train_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + # Time the computation of transformations + t_start = time() + # Features scaling + train_ds = tfidf_transform(train_ds, kmers) + # Brute force features exclusion + train_ds, export_ds, kmers = occurence_exclusion(train_ds, export_ds, kmers) + train_ds, export_ds, kmers = low_var_selection(train_ds, export_ds, kmers) + # Statistical features selection + train_ds, export_ds, data['kmers'] = features_selection(train_ds, export_ds, kmers, opt['taxa']) + # Time the computation of transformations + t_end = time() + t_reduction = t_end - t_start + # Save reduced dataset + data['profile'] = f"{data['profile']}_reduced" + export_ds.write_parquet(data['profile']) + # Save reduced K-mers + with open(os.path.join(outdirs["data_dir"],'kmers_list_reduced.txt'),'w') as handle: + handle.writelines("%s\n" % item for item in data['kmers']) + # Save reduced data + save_Xy_data(data, data_file) + + print(f"Caribou finished reducing k-mers features of {opt['dataset_name']} in {t_reduction} seconds.") + else: + print("Caribou did not reduce features because the file already exists") # TF-IDF scaling of the features def tfidf_transform(ds, kmers): preprocessor = TensorTfIdfTransformer( diff --git a/src/models/classification.py b/src/models/classification.py index 99a4a9b..a9ce5b1 100644 --- a/src/models/classification.py +++ b/src/models/classification.py @@ -48,18 +48,20 @@ def __init__( clf_multiclass: str = None, taxa: [str, List] = None, batch_size: int = 32, - training_epochs: int = 100 + training_epochs: int = 100, + scaling = False ): # Parameters self._taxas = taxa self._outdirs = outdirs + self._scaling = scaling self._database = db_name self._database_data = db_data self._classifier_binary = clf_binary self._classifier_multiclass = clf_multiclass self._batch_size = batch_size self._training_epochs = training_epochs - 
# Init not fitted + # Init False self.is_fitted = False # Public functions @@ -130,6 +132,7 @@ def _fit(self, datasets, tax_map): """ Fit the given model to the training dataset """ + for taxa, file in tax_map.items(): if taxa in ['domain','bacteria','host']: self._binary_training(datasets, taxa, file) @@ -183,7 +186,8 @@ def _binary_training(self, datasets, taxa, file): self._batch_size, self._training_epochs, taxa, - self._database_data['kmers'] + self._database_data['kmers'], + self._database_data['csv'] ) elif self._classifier_binary == 'linearsvm': model = SklearnModel( @@ -192,7 +196,8 @@ def _binary_training(self, datasets, taxa, file): self._batch_size, self._training_epochs, taxa, - self._database_data['kmers'] + self._database_data['kmers'], + self._database_data['csv'] ) else: model = KerasTFModel( @@ -201,9 +206,10 @@ def _binary_training(self, datasets, taxa, file): self._batch_size, self._training_epochs, taxa, - self._database_data['kmers'] + self._database_data['kmers'], + self._database_data['csv'] ) - model.preprocess(datasets[TRAINING_DATASET_NAME], os.path.join(self._outdirs['models_dir'], f'TruncatedSVD_components.npz')) + model.preprocess(datasets[TRAINING_DATASET_NAME], self._scaling) model.fit(datasets) self._save_model(model, file) @@ -217,7 +223,8 @@ def _multiclass_training(self, datasets, taxa, file): self._batch_size, self._training_epochs, taxa, - self._database_data['kmers'] + self._database_data['kmers'], + self._database_data['csv'] ) else: model = KerasTFModel( @@ -226,9 +233,10 @@ def _multiclass_training(self, datasets, taxa, file): self._batch_size, self._training_epochs, taxa, - self._database_data['kmers'] + self._database_data['kmers'], + self._database_data['csv'] ) - model.preprocess(datasets[TRAINING_DATASET_NAME], os.path.join(self._outdirs['models_dir'], f'TruncatedSVD_components.npz')) + model.preprocess(datasets[TRAINING_DATASET_NAME], self._scaling) model.fit(datasets) self._save_model(model, file) @@ -424,4 +432,5 @@ def _save_dataset(self, ds, taxa): model = self._classifier_multiclass file = os.path.join(self._outdirs['results'], f'data_classified_{model}_{taxa}.parquet') ds.write_parquet(file) - return file \ No newline at end of file + return file + \ No newline at end of file diff --git a/src/models/encoders/model_label_encoder.py b/src/models/encoders/model_label_encoder.py index b635108..3317257 100644 --- a/src/models/encoders/model_label_encoder.py +++ b/src/models/encoders/model_label_encoder.py @@ -25,8 +25,8 @@ def _fit(self, dataset: Dataset) -> Preprocessor: return self def _transform_pandas(self, df: pd.DataFrame): + s_values = self.stats_ def column_label_encoder(s: pd.Series): - s_values = self.stats_[f"unique_values({s.name})"] return s.map(s_values) df[self.label_column] = df[self.label_column].transform(column_label_encoder) diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index 08746c5..827225f 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -4,15 +4,11 @@ import numpy as np import pandas as pd -# Dimensions reduction -from models.preprocessors.tfidf_transformer import TensorTfIdfTransformer -from data.reduction.truncated_svd_decomposition import TensorTruncatedSVDDecomposition - # Preprocessing from ray.data.preprocessors import LabelEncoder, Chain from models.encoders.model_label_encoder import ModelLabelEncoder from models.encoders.one_hot_tensor_encoder import OneHotTensorEncoder -from models.preprocessors.compute_class_weights import ComputeClassWeights +from 
models.preprocessors.tfidf_transformer import TensorTfIdfTransformer # Parent class / models from models.models_utils import ModelsUtils @@ -84,7 +80,8 @@ def __init__( batch_size, training_epochs, taxa, - kmers_list + kmers_list, + csv ): super().__init__( classifier, @@ -92,7 +89,8 @@ def __init__( batch_size, training_epochs, taxa, - kmers_list + kmers_list, + csv ) # Parameters # Initialize hidden @@ -127,7 +125,7 @@ def __init__( elif self.classifier == 'widecnn': print('Training multiclass classifier based on Wide CNN Network') - def preprocess(self, ds, reductor_file): + def preprocess(self, ds, scaling = False): print('preprocess') labels = [] encoded = [] @@ -136,24 +134,19 @@ def preprocess(self, ds, reductor_file): self._nb_classes = len(np.unique(labels)) if self._nb_classes == 2: self._encoder = ModelLabelEncoder(self.taxa) - self._scaler = TensorTfIdfTransformer(self.kmers) + if scaling: + self._scaler = TensorTfIdfTransformer(self.kmers) else: self._encoder = Chain( LabelEncoder(self.taxa), OneHotTensorEncoder(self.taxa) ) - self._scaler = TensorTfIdfTransformer(self.kmers) + if scaling: + self._scaler = TensorTfIdfTransformer(self.kmers) self._encoder.fit(ds) - - self._weights = ComputeClassWeights(LABELS_COLUMN_NAME) - self._weights.fit(ds) - self._weights = self._weights.stats_ - - ds = self._scaler.fit_transform(ds) - self._reductor = TensorTruncatedSVDDecomposition(self.kmers, 10000, reductor_file) - # self._reductor = TensorCountHashing(self.kmers, 10000) - self._reductor.fit(ds) + if scaling: + self._scaler.fit(ds) # Labels mapping if self._nb_classes == 2: labels = list(self._encoder.stats_[f'unique_values({self.taxa})'].keys()) @@ -163,6 +156,7 @@ def preprocess(self, ds, reductor_file): labels = np.append(labels, 'unknown') encoded = np.append(encoded, -1) self._labels_map = zip(labels, encoded) + self._compute_weights() def _label_decode(self, predict): print('_label_decode') @@ -178,10 +172,8 @@ def fit(self, datasets): for name, ds in datasets.items(): ds = ds.drop_columns(['id']) ds = self._encoder.transform(ds) - ds = self._scaler.transform(ds) - # ds = self._preprocessor.transform(ds) - ds = self._reductor.transform(ds) - self._nb_features = self._reductor._nb_components if self._reductor._nb_components < self._nb_kmers else self._nb_kmers + if self._scaler is not None: + ds = self._scaler.transform(ds) # Trigger the preprocessing computations before ingest in trainer # Otherwise, it would be executed at each epoch ds = ds.materialize() @@ -191,9 +183,10 @@ def fit(self, datasets): self._train_params = { 'batch_size': self.batch_size, 'epochs': self._training_epochs, - 'size': self._nb_features, + 'size': self._nb_kmers, 'nb_cls': self._nb_classes, - 'model': self.classifier + 'model': self.classifier, + 'weights': self._weights } # Define trainer / tuner @@ -227,14 +220,14 @@ def predict(self, ds, threshold=0.8): ds = ds.drop_columns(col_2_drop) # Preprocess - ds = self._scaler.transform(ds) - ds = self._reductor.transform(ds) + if self._scaler is not None: + ds = self._scaler.transform(ds) ds = ds.materialize() self._predictor = BatchPredictor.from_checkpoint( self._model_ckpt, TensorflowPredictor, - model_definition = lambda: build_model(self.classifier, self._nb_classes, self._nb_features) + model_definition = lambda: build_model(self.classifier, self._nb_classes, self._nb_kmers) ) predictions = self._predictor.predict( data = ds, @@ -308,8 +301,7 @@ def train_func(config): size = config.get('size') nb_cls = config.get('nb_cls') model = 
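The class weights carried in the train config feed Keras' standard cost-sensitive training hook: Model.fit(..., class_weight=...) expects a dict mapping each encoded class index to a multiplier applied to that class' loss. A self-contained toy example (model, data and weight values are made up for illustration):

import numpy as np
from tensorflow.keras import layers, models

# Imbalanced toy data: 90 samples of class 0, 10 samples of class 1.
X = np.random.rand(100, 20).astype('float32')
y = np.array([0] * 90 + [1] * 10)

toy_model = models.Sequential([
    layers.Dense(16, activation='relu', input_shape=(20,)),
    layers.Dense(1, activation='sigmoid'),
])
toy_model.compile(optimizer='adam', loss='binary_crossentropy')

# Up-weight the rare class so both classes contribute comparably to the loss.
toy_model.fit(X, y, epochs=2, batch_size=16,
              class_weight={0: 100 / (2 * 90), 1: 100 / (2 * 10)},
              verbose=0)
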
config.get('model') - - + weights = config.get('weights') # Model construction model = build_model(model, nb_cls, size) @@ -336,6 +328,7 @@ def train_func(config): x = batch_train, validation_data = batch_val, callbacks = [ReportCheckpointCallback()], + class_weight = weights, verbose = 0 ) session.report({ diff --git a/src/models/models_utils.py b/src/models/models_utils.py index 6a97587..3bf2e5e 100644 --- a/src/models/models_utils.py +++ b/src/models/models_utils.py @@ -1,10 +1,14 @@ import os import warnings +import numpy as np import pandas as pd # Class construction from abc import ABC, abstractmethod +# Class weights +from sklearn.utils.class_weight import compute_class_weight + __author__ = 'Nicolas de Montigny' __all__ = ['ModelsUtils'] @@ -56,7 +60,8 @@ def __init__( batch_size, training_epochs, taxa, - kmers_list + kmers_list, + csv ): # Parameters self.classifier = classifier @@ -64,21 +69,21 @@ def __init__( self.taxa = taxa self.kmers = kmers_list # Initialize hidden + self._csv = csv self._nb_kmers = len(kmers_list) self._training_epochs = training_epochs # Initialize empty - self._weights = [] - self._labels_map = None self._clf = None - self._encoder = None + self._weights = {} self._scaler = None - self._preprocessor = None + self._encoder = None + self._trainer = None self._reductor = None - self._nb_features = None + self._predictor = None + self._labels_map = None self._model_ckpt = None - self._trainer = None self._train_params = {} - self._predictor = None + self._preprocessor = None self._workdir = outdir_model @abstractmethod @@ -104,4 +109,18 @@ def _prob_2_cls(self): @abstractmethod def _label_decode(self): """ - """ \ No newline at end of file + """ + + def _compute_weights(self): + """ + Set class weights depending on their abundance in data-associated classes csv + """ + cls = pd.read_csv(self._csv) + classes = list(cls[self.taxa].unique()) + weights = compute_class_weight( + class_weight = 'balanced', + classes = classes, + y = cls[self.taxa] + ) + for lab, encoded in self._labels_map: + self._weights[encoded] = weights[classes.index(lab)] \ No newline at end of file diff --git a/src/models/sklearn/models.py b/src/models/sklearn/models.py index 4f7bc76..4d61f65 100644 --- a/src/models/sklearn/models.py +++ b/src/models/sklearn/models.py @@ -4,14 +4,10 @@ import numpy as np import pandas as pd -# Dimensions reduction -from models.preprocessors.tfidf_transformer import TensorTfIdfTransformer -from data.reduction.truncated_svd_decomposition import TensorTruncatedSVDDecomposition - # Preprocessing from models.encoders.model_label_encoder import ModelLabelEncoder from models.encoders.onesvm_label_encoder import OneClassSVMLabelEncoder -from models.preprocessors.compute_class_weights import ComputeClassWeights +from models.preprocessors.tfidf_transformer import TensorTfIdfTransformer # Training from ray.air.config import ScalingConfig @@ -77,7 +73,8 @@ def __init__( batch_size, training_epochs, taxa, - kmers_list + kmers_list, + csv ): super().__init__( classifier, @@ -85,12 +82,13 @@ def __init__( batch_size, training_epochs, taxa, - kmers_list + kmers_list, + csv ) # Parameters self._encoded = [] - def preprocess(self, ds, reductor_file): + def preprocess(self, ds, scaling = False): print('preprocess') if self.classifier == 'onesvm': self._encoder = OneClassSVMLabelEncoder(self.taxa) @@ -99,19 +97,11 @@ def preprocess(self, ds, reductor_file): else: self._encoder = ModelLabelEncoder(self.taxa) - self._scaler = TensorTfIdfTransformer(self.kmers) - - ds = 
self._encoder.fit_transform(ds) - - self._weights = ComputeClassWeights(LABELS_COLUMN_NAME) - self._weights.fit(ds) - self._weights = self._weights.stats_ - - ds = self._scaler.fit_transform(ds) + self._encoder.fit(ds) - self._reductor = TensorTruncatedSVDDecomposition(self.kmers, 10000, reductor_file) - # self._reductor = TensorCountHashing(self.kmers, 10000) - self._reductor.fit(ds) + if scaling: + self._scaler = TensorTfIdfTransformer(self.kmers) + self._scaler.fit(ds) # Labels mapping if self.classifier != 'onesvm': @@ -120,6 +110,7 @@ def preprocess(self, ds, reductor_file): labels = np.append(labels, 'unknown') self._encoded = np.append(self._encoded, -1) self._labels_map = zip(labels, self._encoded) + self._compute_weights() def _label_decode(self, predict): print('_label_decode') @@ -178,9 +169,8 @@ def fit(self, datasets): for name, ds in datasets.items(): ds = ds.drop_columns(['id']) ds = self._encoder.transform(ds) - ds = self._scaler.transform(ds) - ds = self._reductor.transform(ds) - self._nb_features = self._reductor._nb_components if self._reductor._nb_components < self._nb_kmers else self._nb_kmers + if self._scaler is not None: + ds = self._scaler.transform(ds) # Trigger the preprocessing computations before ingest in trainer # Otherwise, it would be executed at each epoch ds = ds.materialize() @@ -220,8 +210,8 @@ def fit(self, datasets): def predict(self, ds, threshold = 0.8): print('predict') if ds.count() > 0: - ds = self._scaler.transform(ds) - ds = self._reductor.transform(ds) + if self._scaler is not None: + ds = self._scaler.transform(ds) ds = ds.materialize() predict_kwargs = {'features':self.kmers, 'num_estimator_cpus':-1} self._predictor = BatchPredictor.from_checkpoint(self._model_ckpt, SklearnTensorPredictor) diff --git a/src/utils.py b/src/utils.py index 5e7924f..b45b0b1 100644 --- a/src/utils.py +++ b/src/utils.py @@ -33,6 +33,7 @@ 'verify_kmers_list_length', 'verify_load_data', 'verify_concordance_klength', + 'verify_need_scaling', 'verify_taxas', 'verify_load_preclassified', 'merge_save_data', @@ -162,6 +163,9 @@ def verify_concordance_klength(klen1 : int, klen2 : int): raise ValueError("K length between datasets is inconsistent ! 
Exiting\n" + f"K length of bacteria dataset is {klen1} while K length from host is {klen2}") +def verify_need_scaling(data : dict): + return False if 'decomposed' in data['profile'] else True + # Verif + handling ######################################################################################################### @@ -199,10 +203,6 @@ def verify_load_data(data_file: Path): verify_file(data_file) data = load_Xy_data(data_file) verify_data_path(data['profile']) - if not isinstance(data['ids'], list): - raise ValueError("Invalid data file !") - elif not isinstance(data['kmers'], list): - raise ValueError("Invalid data file !") return data def verify_taxas(taxas : str, db_taxas : list): @@ -249,7 +249,7 @@ def merge_classified_data( clf_ids.extend(clf_data['unknown_ids']) clf_data['unknown_ids'] = list(np.unique(clf_ids)) # classes - dct_diff = {k : v for k,v in db_data.items() if k not in clf_data.keys()} + dct_diff = {k : v for k, v in db_data.items() if k not in clf_data.keys()} clf_data = {**clf_data,**dct_diff} return clf_data From 59625e907e83671a19ad884993d74a618cff78b5 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Mon, 20 Nov 2023 18:33:07 -0500 Subject: [PATCH 42/92] correction in dimension decomp --- src/Caribou_dimensions_decomposition.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/Caribou_dimensions_decomposition.py b/src/Caribou_dimensions_decomposition.py index 52ccff6..627877b 100644 --- a/src/Caribou_dimensions_decomposition.py +++ b/src/Caribou_dimensions_decomposition.py @@ -78,7 +78,7 @@ def dimensions_decomposition(opt): # Save decomposed data save_Xy_data(data, data_file) - print(f"Caribou finished decomposing the features of {opt['dataset_name']} in {t_decomposition} seconds.") + print(f"Caribou finished decomposing the features in {t_decomposition} seconds.") else: print('Caribou did not decompose the features because the number to extract is bigger than the actual number of features') else: @@ -91,7 +91,6 @@ def dimensions_decomposition(opt): parser = argparse.ArgumentParser(description='This script computes features reduction to a given K-mers dataset and then applies it.') # Dataset parser.add_argument('-db','--dataset', required=True, type=Path, help='PATH to a npz file containing the data corresponding to the k-mers profile for the bacteria database') - parser.add_argument('-dt','--dataset_name', default='dataset', help='Name of the dataset used to name files') parser.add_argument('-l','--kmers_list', default=None, type=Path, help='PATH to a file containing a list of k-mers that will be reduced') # Parameters parser.add_argument('-n','--nb_components', default=1000, type=int, help='Number of components to decompose data into') From 799b39f956b65d086a44a2341e8f83949b2a130c Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Mon, 20 Nov 2023 18:51:21 -0500 Subject: [PATCH 43/92] dim decomp local debug --- src/Caribou_dimensions_decomposition.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Caribou_dimensions_decomposition.py b/src/Caribou_dimensions_decomposition.py index 627877b..286861d 100644 --- a/src/Caribou_dimensions_decomposition.py +++ b/src/Caribou_dimensions_decomposition.py @@ -48,7 +48,7 @@ def dimensions_decomposition(opt): data_file = f'{path}_decomposed{ext}' if not os.path.exists(data_file): - if opt['nb_features'] < len(kmers): + if opt['nb_components'] < len(kmers): # Load data files_lst = glob(os.path.join(data['profile'],'*.parquet')) ds = 
ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) @@ -72,7 +72,7 @@ def dimensions_decomposition(opt): # Save decomposed dataset data['profile'] = f"{data['profile']}_decomposed" - data['kmers'] = [f'feature_{i}' for i in np.arange(preprocessor._nb_components)] + data['kmers'] = [f'feature_{i}' for i in np.arange(preprocessor.preprocessors[1]._nb_components)] ds.write_parquet(data['profile']) # Save decomposed data @@ -88,7 +88,7 @@ def dimensions_decomposition(opt): ################################################################################ if __name__ == "__main__": - parser = argparse.ArgumentParser(description='This script computes features reduction to a given K-mers dataset and then applies it.') + parser = argparse.ArgumentParser(description='This script computes features decomposition to a given K-mers dataset and then applies it.') # Dataset parser.add_argument('-db','--dataset', required=True, type=Path, help='PATH to a npz file containing the data corresponding to the k-mers profile for the bacteria database') parser.add_argument('-l','--kmers_list', default=None, type=Path, help='PATH to a file containing a list of k-mers that will be reduced') From bb25d283c0fecc8f892075e4570d98a5170e27f2 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Mon, 20 Nov 2023 18:56:15 -0500 Subject: [PATCH 44/92] tf-idf unwrap batch to avoid 0 div --- src/models/preprocessors/tfidf_transformer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/models/preprocessors/tfidf_transformer.py b/src/models/preprocessors/tfidf_transformer.py index 88d899c..3732526 100644 --- a/src/models/preprocessors/tfidf_transformer.py +++ b/src/models/preprocessors/tfidf_transformer.py @@ -30,6 +30,7 @@ def _fit(self, ds: Dataset) -> Preprocessor: occurences = np.zeros(self._nb_features) for batch in ds.iter_batches(batch_format = 'numpy'): batch = batch[TENSOR_COLUMN_NAME] + batch = _unwrap_ndarray_object_type_if_needed(batch) occurences += np.count_nonzero(batch, axis = 0) idf = np.log(nb_samples / occurences) + 1 From 2246cd0c3cffb749cba86d40f634be3c2b433cb8 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Tue, 21 Nov 2023 09:44:44 -0500 Subject: [PATCH 45/92] decomposition script rectify + load tf-idf in prepro --- src/Caribou_dimensions_decomposition.py | 15 +++++- src/Caribou_reduce_features.py | 3 +- .../reduction/truncated_svd_decomposition.py | 6 +-- src/models/classification.py | 12 ++++- src/models/kerasTF/models.py | 6 +-- src/models/preprocessors/tfidf_transformer.py | 47 ++++++++++++------- src/models/sklearn/models.py | 4 +- 7 files changed, 62 insertions(+), 31 deletions(-) diff --git a/src/Caribou_dimensions_decomposition.py b/src/Caribou_dimensions_decomposition.py index 286861d..8c539b6 100644 --- a/src/Caribou_dimensions_decomposition.py +++ b/src/Caribou_dimensions_decomposition.py @@ -36,7 +36,7 @@ def dimensions_decomposition(opt): k_length, kmers = verify_kmers_list_length(k_length, opt['kmers_list']) outdirs = define_create_outdirs(opt['outdir']) - + # Initialize cluster init_ray_cluster(opt['workdir']) @@ -53,12 +53,14 @@ def dimensions_decomposition(opt): files_lst = glob(os.path.join(data['profile'],'*.parquet')) ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + scaler_file = os.path.join(outdirs['models_dir'], 'TF-IDF_diag.npz') reductor_file = os.path.join(outdirs['models_dir'], 'TruncatedSVD_components.npz') # Compute the decomposition preprocessor = Chain( TensorTfIdfTransformer( - features = kmers + features = kmers, + 
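One note on the unwrap call added to the TF-IDF transformer in the commit above: Ray can hand the tensor column back as a 1-D object array of per-row arrays, while per-feature statistics need a genuine 2-D numeric array. A small numpy illustration, with np.stack standing in for the Ray-internal helper:

import numpy as np

# A batch delivered as a list of per-row tensors...
rows = [np.array([0, 2, 0]), np.array([1, 0, 3])]

# ...is first stacked into a real 2-D array so that per-feature statistics,
# such as the document counts feeding the IDF, can be taken along axis 0.
dense = np.stack(rows)
doc_counts = np.count_nonzero(dense, axis=0)
print(doc_counts)   # [1 1 1]
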
file = scaler_file ), TensorTruncatedSVDDecomposition( features = kmers, @@ -101,3 +103,12 @@ def dimensions_decomposition(opt): opt = vars(args) dimensions_decomposition(opt) + +# Test params +opt = { + 'dataset':'/home/nicdemon/results/data/Xy_genome_cucurbita_data_K10.npz', + 'kmers_list':'/home/nicdemon/results/data/kmers_list_reduced.txt', + 'nb_components':10000, + 'outdir':'/home/nicdemon/results/', + 'workdir':'/home/nicdemon/ray/', +} diff --git a/src/Caribou_reduce_features.py b/src/Caribou_reduce_features.py index c95861d..5fe02cd 100644 --- a/src/Caribou_reduce_features.py +++ b/src/Caribou_reduce_features.py @@ -75,7 +75,7 @@ def features_reduction(opt): train_ds, export_ds, kmers = occurence_exclusion(train_ds, export_ds, kmers) train_ds, export_ds, kmers = low_var_selection(train_ds, export_ds, kmers) # Statistical features selection - train_ds, export_ds, data['kmers'] = features_selection(train_ds, export_ds, kmers, opt['taxa']) + train_ds, export_ds, kmers = features_selection(train_ds, export_ds, kmers, opt['taxa']) # Time the computation of transformations t_end = time() t_reduction = t_end - t_start @@ -83,6 +83,7 @@ def features_reduction(opt): data['profile'] = f"{data['profile']}_reduced" export_ds.write_parquet(data['profile']) # Save reduced K-mers + data['kmers'] = kmers with open(os.path.join(outdirs["data_dir"],'kmers_list_reduced.txt'),'w') as handle: handle.writelines("%s\n" % item for item in data['kmers']) # Save reduced data diff --git a/src/data/reduction/truncated_svd_decomposition.py b/src/data/reduction/truncated_svd_decomposition.py index 0a67cd0..8c27ac5 100644 --- a/src/data/reduction/truncated_svd_decomposition.py +++ b/src/data/reduction/truncated_svd_decomposition.py @@ -1,4 +1,3 @@ -import os import numpy as np import pandas as pd @@ -6,6 +5,7 @@ from tqdm import tqdm from typing import List from warnings import warn +from os.path import isfile from ray.data import Dataset from utils import save_Xy_data, load_Xy_data @@ -64,7 +64,7 @@ def batch_svd(batch): components = [] if self._nb_features > self._nb_components: - if os.path.isfile(self._file): + if isfile(self._file): components = np.array(load_Xy_data(self._file)) else: # sampl = ds.random_sample(0.1) @@ -147,7 +147,7 @@ def _transform_pandas(self, df: pd.DataFrame) -> pd.DataFrame: return df def __repr__(self): - return (f"{self.__class__.__name__}(features={self._nb_features!r}, taxa={self.taxa!r}, threshold={self.threshold!r})") + return (f"{self.__class__.__name__}(features={self._nb_features!r}, file={self._file!r})") def _validate_df(df: pd.DataFrame, column: str, nb_features: int) -> None: if len(df.loc[0, column]) != nb_features: diff --git a/src/models/classification.py b/src/models/classification.py index a9ce5b1..30747a4 100644 --- a/src/models/classification.py +++ b/src/models/classification.py @@ -209,7 +209,11 @@ def _binary_training(self, datasets, taxa, file): self._database_data['kmers'], self._database_data['csv'] ) - model.preprocess(datasets[TRAINING_DATASET_NAME], self._scaling) + model.preprocess( + datasets[TRAINING_DATASET_NAME], + self._scaling, + os.path.join(self._outdirs['models_dir'], 'TF-IDF_diag.npz') + ) model.fit(datasets) self._save_model(model, file) @@ -236,7 +240,11 @@ def _multiclass_training(self, datasets, taxa, file): self._database_data['kmers'], self._database_data['csv'] ) - model.preprocess(datasets[TRAINING_DATASET_NAME], self._scaling) + model.preprocess( + datasets[TRAINING_DATASET_NAME], + self._scaling, + 
os.path.join(self._outdirs['models_dir'], 'TF-IDF_diag.npz') + ) model.fit(datasets) self._save_model(model, file) diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index 827225f..28a5767 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -125,7 +125,7 @@ def __init__( elif self.classifier == 'widecnn': print('Training multiclass classifier based on Wide CNN Network') - def preprocess(self, ds, scaling = False): + def preprocess(self, ds, scaling = False, scaler_file = None): print('preprocess') labels = [] encoded = [] @@ -135,14 +135,14 @@ def preprocess(self, ds, scaling = False): if self._nb_classes == 2: self._encoder = ModelLabelEncoder(self.taxa) if scaling: - self._scaler = TensorTfIdfTransformer(self.kmers) + self._scaler = TensorTfIdfTransformer(self.kmers, scaler_file) else: self._encoder = Chain( LabelEncoder(self.taxa), OneHotTensorEncoder(self.taxa) ) if scaling: - self._scaler = TensorTfIdfTransformer(self.kmers) + self._scaler = TensorTfIdfTransformer(self.kmers, scaler_file) self._encoder.fit(ds) if scaling: diff --git a/src/models/preprocessors/tfidf_transformer.py b/src/models/preprocessors/tfidf_transformer.py index 3732526..de5c2ec 100644 --- a/src/models/preprocessors/tfidf_transformer.py +++ b/src/models/preprocessors/tfidf_transformer.py @@ -4,8 +4,10 @@ import scipy.sparse as sp +from os.path import isfile from ray.data.dataset import Dataset from sklearn.preprocessing import normalize +from utils import save_Xy_data, load_Xy_data from ray.data.preprocessor import Preprocessor from ray.air.util.data_batch_conversion import _unwrap_ndarray_object_type_if_needed @@ -18,31 +20,37 @@ class TensorTfIdfTransformer(Preprocessor): TF-IDF transformation is used to scale down the impact of tokens that occur very frequently and scale up the impact of those that occur very rarely. 
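The state this transformer caches is the IDF vector stored as a sparse diagonal matrix, so a batch can be scaled with a single matrix product and then L2-normalized. The core computation, reduced to plain numpy/scipy on a toy count matrix (values are illustrative only):

import numpy as np
import scipy.sparse as sp
from sklearn.preprocessing import normalize

X = np.array([[3, 0, 1],
              [2, 2, 0],
              [0, 1, 0]], dtype=np.float64)      # 3 samples x 3 features

n_samples = X.shape[0]
df = np.count_nonzero(X, axis=0)                 # document frequency per feature
idf = np.log(n_samples / df) + 1                 # same form as in the preprocessor

idf_diag = sp.diags(idf, offsets=0, format='csr')
X_tfidf = normalize(sp.csr_matrix(X) @ idf_diag, norm='l2')
print(X_tfidf.toarray())
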
""" - def __init__(self, features): + def __init__(self, features, file: str = ''): # Parameters self._features = features self._nb_features = len(features) + self._file = file def _fit(self, ds: Dataset) -> Preprocessor: - nb_samples = ds.count() + if isfile(self._file): + idf_diag = load_Xy_data(self._file) + else: + nb_samples = ds.count() - # Nb of occurences - occurences = np.zeros(self._nb_features) - for batch in ds.iter_batches(batch_format = 'numpy'): - batch = batch[TENSOR_COLUMN_NAME] - batch = _unwrap_ndarray_object_type_if_needed(batch) - occurences += np.count_nonzero(batch, axis = 0) + # Nb of occurences + occurences = np.zeros(self._nb_features) + for batch in ds.iter_batches(batch_format = 'numpy'): + batch = batch[TENSOR_COLUMN_NAME] + batch = _unwrap_ndarray_object_type_if_needed(batch) + occurences += np.count_nonzero(batch, axis = 0) - idf = np.log(nb_samples / occurences) + 1 - - idf_diag = sp.diags( - idf, - offsets=0, - shape=(self._nb_features, self._nb_features), - format="csr", - dtype=np.float64, - ) - + idf = np.log(nb_samples / occurences) + 1 + + idf_diag = sp.diags( + idf, + offsets=0, + shape=(self._nb_features, self._nb_features), + format="csr", + dtype=np.float64, + ) + + save_Xy_data(idf_diag, self._file) + self.stats_ = {'idf_diag' : idf_diag} return self @@ -62,6 +70,9 @@ def _transform_pandas(self, batch: pd.DataFrame) -> pd.DataFrame: return batch + def __repr__(self): + return (f"{self.__class__.__name__}(features={self._nb_features!r}, file={self._file!r})") + def _validate_df(df: pd.DataFrame, column: str, nb_features: int) -> None: if len(df.loc[0, column]) != nb_features: raise ValueError('Discordant number of features in the tensor column with the one from the dataframe used for fitting') \ No newline at end of file diff --git a/src/models/sklearn/models.py b/src/models/sklearn/models.py index 4d61f65..6a7ef3f 100644 --- a/src/models/sklearn/models.py +++ b/src/models/sklearn/models.py @@ -88,7 +88,7 @@ def __init__( # Parameters self._encoded = [] - def preprocess(self, ds, scaling = False): + def preprocess(self, ds, scaling = False, scaler_file = None): print('preprocess') if self.classifier == 'onesvm': self._encoder = OneClassSVMLabelEncoder(self.taxa) @@ -100,7 +100,7 @@ def preprocess(self, ds, scaling = False): self._encoder.fit(ds) if scaling: - self._scaler = TensorTfIdfTransformer(self.kmers) + self._scaler = TensorTfIdfTransformer(self.kmers, scaler_file) self._scaler.fit(ds) # Labels mapping From f7cc7a11645f1bbad34c5593155299b9ceb999ce Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Wed, 22 Nov 2023 18:08:50 -0500 Subject: [PATCH 46/92] NMF for decomposition + debug weights --- ...ements copy.txt => frozen_requirements.txt | 0 requirements.txt | 6 +- setup.cfg | 27 +++--- src/Caribou_classification.py | 9 +- src/Caribou_classification_train_cv.py | 15 +++- src/Caribou_dimensions_decomposition.py | 8 +- src/Caribou_extraction.py | 23 ++++- src/Caribou_extraction_train_cv.py | 36 ++++++-- .../reduction/dictionnary_decomposition.py | 83 +++++++++++++++++++ src/data/reduction/nmf_decomposition.py | 81 ++++++++++++++++++ .../reduction/truncated_svd_decomposition.py | 3 +- src/models/classification.py | 5 +- src/models/encoders/model_label_encoder.py | 2 +- src/models/kerasTF/models.py | 11 ++- src/models/models_utils.py | 13 ++- src/models/reads_simulation.py | 8 +- src/models/sklearn/models.py | 12 +-- src/models/sklearn/partial_trainer.py | 3 +- src/utils.py | 13 +-- 19 files changed, 285 insertions(+), 73 deletions(-) 
rename requirements copy.txt => frozen_requirements.txt (100%) create mode 100644 src/data/reduction/dictionnary_decomposition.py create mode 100644 src/data/reduction/nmf_decomposition.py diff --git a/requirements copy.txt b/frozen_requirements.txt similarity index 100% rename from requirements copy.txt rename to frozen_requirements.txt diff --git a/requirements.txt b/requirements.txt index d409b51..b51fcee 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,10 +1,10 @@ -biopython==1.78 +biopython>=1.79 cloudpickle>=2.2.1 -InSilicoSeq==1.5.4 +InSilicoSeq==1.6.0 keras==2.14 numpy>=1.2 pandas>=2.0 ray==2.6.3 scikit-learn==1.3.0 tensorflow==2.14 -pyarrow==12.0 \ No newline at end of file +pyarrow==12.0.1 \ No newline at end of file diff --git a/setup.cfg b/setup.cfg index a9f82f6..fe79d4b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -11,23 +11,16 @@ long_description = file: README.md [options] install_requires = - setuptools - wheel - grpcio==1.48.2 - ray[default]==2.6.3 - pydantic<2 - pyarrow>=6.0.1,!=7 - keras>=2.0.0 - tensorflow>=2.0.0 - numpy>=1.16 - pandas>=1.3.0 - scikit-learn>=1.1.2 - scipy - insilicoseq - biopython==1.78 - tqdm - cloudpickle - tune-sklearn + ray==2.6.3 + numpy>=1.2 + pandas>=2.0 + pyarrow==12.0.1 + cloudpickle>=2.2.1 + keras==2.14 + tensorflow==2.14 + scikit-learn==1.3.0 + biopython>=1.79 + InSilicoSeq==1.6.0 include_package_data = True python_requires = >=3.8 scripts = diff --git a/src/Caribou_classification.py b/src/Caribou_classification.py index 1e1da9c..9a9473c 100644 --- a/src/Caribou_classification.py +++ b/src/Caribou_classification.py @@ -50,7 +50,10 @@ def bacteria_classification(opt): # Verify need for scaling scaling = verify_need_scaling(db_data) - val_ds, val_data = split_sim_dataset(db_ds, db_data, f"{VALIDATION_DATASET_NAME}_{opt['database_name']}") + if opt['validation'] is not None: + val_data, val_ds = verify_load_metagenome(opt['validation']) + else: + val_data, val_ds = split_sim_dataset(db_ds, db_data, f"{VALIDATION_DATASET_NAME}_{opt['database_name']}") datasets = { TRAINING_DATASET_NAME : db_ds, @@ -102,9 +105,11 @@ def bacteria_classification(opt): # Dataset parser.add_argument('-mg','--data_metagenome', required=True, type=Path, help='PATH to a npz file containing the data corresponding to the k-mers profile for the metagenome to classify') parser.add_argument('-mn','--metagenome_name', required=True, help='Name of the metagenome to classify used to name files') + # Optional datasets + parser.add_argument('-v','--validation', default=None, type=Path, help='PATH to a npz file containing the k-mers profile for the validation dataset') # Parameters parser.add_argument('-model','--model_type', default='lstm_attention', choices=['sgd','mnb','lstm_attention','cnn','widecnn'], help='The type of model to train') - parser.add_argument('-t','--taxa', default=None, help='The taxonomic level to use for the classification, defaults to species. Can be one level or a list of levels separated by commas.') + parser.add_argument('-tx','--taxa', default=None, help='The taxonomic level to use for the classification, defaults to species. 
Can be one level or a list of levels separated by commas.') parser.add_argument('-bs','--batch_size', default=32, type=int, help='Size of the batch size to use, defaults to 32') parser.add_argument('-e','--training_epochs', default=100, type=int, help='The number of training iterations for the neural networks models if one ise chosen, defaults to 100') parser.add_argument('-o','--outdir', required=True, type=Path, help='PATH to a directory on file where outputs will be saved') diff --git a/src/Caribou_classification_train_cv.py b/src/Caribou_classification_train_cv.py index 1f707b9..f6d1422 100644 --- a/src/Caribou_classification_train_cv.py +++ b/src/Caribou_classification_train_cv.py @@ -56,8 +56,14 @@ def bacteria_classification_train_cv(opt): for taxa in lst_taxas: - test_ds, test_data = split_sim_dataset(db_ds, db_data, f"{TEST_DATASET_NAME}_{opt['database_name']}") - val_ds, val_data = split_sim_dataset(db_ds, db_data, f"{VALIDATION_DATASET_NAME}_{opt['database_name']}") + if opt['test'] is not None: + test_data, test_ds = verify_load_metagenome(opt['test']) + else: + test_data, test_ds = split_sim_dataset(db_ds, db_data, f"{TEST_DATASET_NAME}_{opt['database_name']}") + if opt['validation'] is not None: + val_data, val_ds = verify_load_metagenome(opt['validation']) + else: + val_data, val_ds = split_sim_dataset(db_ds, db_data, f"{VALIDATION_DATASET_NAME}_{opt['database_name']}") datasets = { TRAINING_DATASET_NAME : db_ds, @@ -94,9 +100,12 @@ def bacteria_classification_train_cv(opt): # Database parser.add_argument('-db','--data_bacteria', required=True, type=Path, help='PATH to a npz file containing the data corresponding to the k-mers profile for the bacteria database') parser.add_argument('-dn','--database_name', required=True, help='Name of the bacteria database used to name files') + # Optional datasets + parser.add_argument('-v','--validation', default=None, type=Path, help='PATH to a npz file containing the k-mers profile for the validation dataset') + parser.add_argument('-t','--test', default=None, type=Path, help='PATH to a npz file containing the k-mers profile for the test dataset') # Parameters parser.add_argument('-model','--model_type', default='lstm_attention', choices=['sgd','mnb','lstm_attention','cnn','widecnn'], help='The type of model to train') - parser.add_argument('-t','--taxa', default=None, help='The taxonomic level to use for the classification, defaults to None. Can be one level or a list of levels separated by commas.') + parser.add_argument('-tx','--taxa', default=None, help='The taxonomic level to use for the classification, defaults to None. 
Can be one level or a list of levels separated by commas.') parser.add_argument('-bs','--batch_size', default=32, type=int, help='Size of the batch size to use, defaults to 32') parser.add_argument('-e','--training_epochs', default=100, type=int, help='The number of training iterations for the neural networks models if one ise chosen, defaults to 100') parser.add_argument('-o','--outdir', required=True, type=Path, help='PATH to a directory on file where outputs will be saved') diff --git a/src/Caribou_dimensions_decomposition.py b/src/Caribou_dimensions_decomposition.py index 8c539b6..0e9b75d 100644 --- a/src/Caribou_dimensions_decomposition.py +++ b/src/Caribou_dimensions_decomposition.py @@ -12,7 +12,9 @@ from pathlib import Path from ray.data.preprocessors import Chain +from data.reduction.nmf_decomposition import TensorNMFDecomposition from models.preprocessors.tfidf_transformer import TensorTfIdfTransformer +from data.reduction.dictionnary_decomposition import TensorDictionnaryDecomposition from data.reduction.truncated_svd_decomposition import TensorTruncatedSVDDecomposition __author__ = "Nicolas de Montigny" @@ -54,7 +56,7 @@ def dimensions_decomposition(opt): ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) scaler_file = os.path.join(outdirs['models_dir'], 'TF-IDF_diag.npz') - reductor_file = os.path.join(outdirs['models_dir'], 'TruncatedSVD_components.npz') + reductor_file = os.path.join(outdirs['models_dir'], 'decomposed_components.npz') # Compute the decomposition preprocessor = Chain( @@ -62,7 +64,7 @@ def dimensions_decomposition(opt): features = kmers, file = scaler_file ), - TensorTruncatedSVDDecomposition( + TensorNMFDecomposition( features = kmers, nb_components = opt['nb_components'], file = reductor_file @@ -70,12 +72,12 @@ def dimensions_decomposition(opt): ) t_s = time() ds = preprocessor.fit_transform(ds) - t_decomposition = time() - t_s # Save decomposed dataset data['profile'] = f"{data['profile']}_decomposed" data['kmers'] = [f'feature_{i}' for i in np.arange(preprocessor.preprocessors[1]._nb_components)] ds.write_parquet(data['profile']) + t_decomposition = time() - t_s # Save decomposed data save_Xy_data(data, data_file) diff --git a/src/Caribou_extraction.py b/src/Caribou_extraction.py index a2168fb..6228230 100644 --- a/src/Caribou_extraction.py +++ b/src/Caribou_extraction.py @@ -40,18 +40,30 @@ def bacteria_extraction(opt): if opt['model_type'] != 'onesvm': if opt['data_host'] is not None: - db_data, db_ds = verify_load_host_merge(opt['data_bacteria'], opt['data_host']) + if opt['merged'] is not None: + db_data, db_ds = verify_load_db(opt['merged']) + else: + db_data, db_ds = verify_load_host_merge(opt['data_bacteria'], opt['data_host']) db_name = 'host_merged' else: db_data, db_ds = verify_load_db(opt['data_bacteria']) db_name = opt['dataset_name'] - val_ds, val_data = split_sim_dataset(db_ds, db_data, f'{VALIDATION_DATASET_NAME}_{db_name}') + if opt['validation'] is not None: + val_data, val_ds = verify_load_db(opt['validation']) + else: + val_data, val_ds = split_sim_dataset(db_ds, db_data, f'{VALIDATION_DATASET_NAME}_{db_name}') else: - db_data, db_ds = verify_load_host_merge(opt['data_bacteria'], opt['data_host']) + if opt['merged'] is not None: + db_data, db_ds = verify_load_db(opt['merged']) + else: + db_data, db_ds = verify_load_host_merge(opt['data_bacteria'], opt['data_host']) db_name = 'host_merged' - val_ds, val_data = split_sim_dataset(db_ds, db_data, f'{VALIDATION_DATASET_NAME}_{db_name}') + if opt['validation'] is not 
None: + val_data, val_ds = verify_load_db(opt['validation']) + else: + val_data, val_ds = split_sim_dataset(db_ds, db_data, f'{VALIDATION_DATASET_NAME}_{db_name}') db_data, db_ds = verify_load_db(opt['data_bacteria']) db_name = opt['dataset_name'] @@ -111,6 +123,9 @@ def bacteria_extraction(opt): # Dataset parser.add_argument('-dm','--data_metagenome', required=True, type=Path, help='PATH to a npz file containing the data corresponding to the k-mers profile for the metagenome to classify') parser.add_argument('-mn','--metagenome_name', required=True, help='Name of the metagenome to classify used to name files') + # Optional datasets + parser.add_argument('-m','--merged', default=None, type=Path, help='PATH to a npz file containing the k-mers profile for the merged bacteria and host databases') + parser.add_argument('-v','--validation', default=None, type=Path, help='PATH to a npz file containing the k-mers profile for the validation dataset') # Parameters parser.add_argument('-model','--model_type', default=None, choices=[None,'onesvm','linearsvm','attention','lstm','deeplstm'], help='The type of model to train') parser.add_argument('-bs','--batch_size', default=32, type=int, help='Size of the batch size to use, defaults to 32') diff --git a/src/Caribou_extraction_train_cv.py b/src/Caribou_extraction_train_cv.py index 2547886..8576535 100644 --- a/src/Caribou_extraction_train_cv.py +++ b/src/Caribou_extraction_train_cv.py @@ -34,20 +34,38 @@ def bacteria_extraction_train_cv(opt): if opt['model_type'] != 'onesvm': if opt['data_host'] is not None: - db_data, db_ds = verify_load_host_merge(opt['data_bacteria'], opt['data_host']) + if opt['merged'] is not None: + db_data, db_ds = verify_load_db(opt['merged']) + else: + db_data, db_ds = verify_load_host_merge(opt['data_bacteria'], opt['data_host']) db_name = 'host_merged' else: db_data, db_ds = verify_load_db(opt['data_bacteria']) db_name = opt['database_name'] - test_ds, test_data = split_sim_dataset(db_ds, db_data, f'{TEST_DATASET_NAME}_{db_name}') - val_ds, val_data = split_sim_dataset(db_ds, db_data, f'{VALIDATION_DATASET_NAME}_{db_name}') + if opt['test'] is not None: + test_data, test_ds = verify_load_db(opt['test']) + else: + test_data, test_ds = split_sim_dataset(db_ds, db_data, f'{TEST_DATASET_NAME}_{db_name}') + if opt['validation'] is not None: + val_data, val_ds = verify_load_db(opt['validation']) + else: + val_data, val_ds = split_sim_dataset(db_ds, db_data, f'{VALIDATION_DATASET_NAME}_{db_name}') else: - db_data, db_ds = verify_load_host_merge(opt['data_bacteria'], opt['data_host']) + if opt['merged'] is not None: + db_data, db_ds = verify_load_db(opt['merged']) + else: + db_data, db_ds = verify_load_host_merge(opt['data_bacteria'], opt['data_host']) db_name = 'host_merged' - test_ds, test_data = split_sim_dataset(db_ds, db_data, f'{TEST_DATASET_NAME}_{db_name}') - val_ds, val_data = split_sim_dataset(db_ds, db_data, f'{VALIDATION_DATASET_NAME}_{db_name}') + if opt['test'] is not None: + test_data, test_ds = verify_load_db(opt['test']) + else: + test_data, test_ds = split_sim_dataset(db_ds, db_data, f'{TEST_DATASET_NAME}_{db_name}') + if opt['validation'] is not None: + val_data, val_ds = verify_load_db(opt['validation']) + else: + val_data, val_ds = split_sim_dataset(db_ds, db_data, f'{VALIDATION_DATASET_NAME}_{db_name}') db_data, db_ds = verify_load_db(opt['data_bacteria']) db_name = opt['database_name'] @@ -92,8 +110,12 @@ def bacteria_extraction_train_cv(opt): parser.add_argument('-dh','--data_host', default=None, type=Path, 
help='PATH to a npz file containing the data corresponding to the k-mers profile for the host') parser.add_argument('-dn','--database_name', required=True, help='Name of the bacteria database used to name files') parser.add_argument('-hn','--host_name', default=None, help='Name of the host database used to name files') + # Optional datasets + parser.add_argument('-m','--merged', default=None, type=Path, help='PATH to a npz file containing the k-mers profile for the merged bacteria and host databases') + parser.add_argument('-v','--validation', default=None, type=Path, help='PATH to a npz file containing the k-mers profile for the validation dataset') + parser.add_argument('-t','--test', default=None, type=Path, help='PATH to a npz file containing the k-mers profile for the test dataset') # Parameters - parser.add_argument('-model','--model_type', required = True, choices=['onesvm','linearsvm','attention','lstm','deeplstm'], help='The type of model to train') + parser.add_argument('-model','--model_type', required=True, choices=['onesvm','linearsvm','attention','lstm','deeplstm'], help='The type of model to train') parser.add_argument('-bs','--batch_size', default=32, type=int, help='Size of the batch size to use, defaults to 32') parser.add_argument('-e','--training_epochs', default=100, type=int, help='The number of training iterations for the neural networks models if one is chosen, defaults to 100') parser.add_argument('-o','--outdir', required=True, type=Path, help='PATH to a directory on file where outputs will be saved') diff --git a/src/data/reduction/dictionnary_decomposition.py b/src/data/reduction/dictionnary_decomposition.py new file mode 100644 index 0000000..5bbc4b5 --- /dev/null +++ b/src/data/reduction/dictionnary_decomposition.py @@ -0,0 +1,83 @@ + +import numpy as np +import pandas as pd + +from typing import List +from warnings import warn +from os.path import isfile +from ray.data import Dataset +from utils import save_Xy_data, load_Xy_data + +from sklearn.utils.extmath import randomized_svd +from sklearn.decomposition import DictionaryLearning +from sklearn.decomposition._dict_learning import _sparse_encode + +from ray.data.preprocessor import Preprocessor +from ray.air.util.data_batch_conversion import _unwrap_ndarray_object_type_if_needed + +TENSOR_COLUMN_NAME = '__value__' + +class TensorDictionnaryDecomposition(Preprocessor): + """ + Custom class for using Mini-Batch Dictionnary Learning as a Ray preprocessor. + This is inspired by sklearn.decomposition.DictionaryLearning and is fitted on batches before keeping the consensus components matrix. + Consensus components matrix is attained following the logic from sklearn.decomposition.MiniBatchDictionnaryLearning. 
+ https://scikit-learn.org/stable/modules/decomposition.html#nmf + https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.NMF.html + https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.MiniBatchNMF.html + """ + def __init__(self, features: List[str], nb_components: int = 10000, file: str = ''): + # Parameters + self.features = features + self._nb_features = len(features) + self._nb_components = nb_components + self._file = file + + def _fit(self, ds: Dataset) -> Preprocessor: + def batch_dict(batch): + batch = batch[TENSOR_COLUMN_NAME] + batch = _unwrap_ndarray_object_type_if_needed(batch) + dict = DictionaryLearning( + n_components = self._nb_components, + max_iter = 10, + transform_algorithm = 'cd', + ) + dict.fit(batch) + return {'components' : [dict.components_]} + + components = [] + if self._nb_features > self._nb_components: + if isfile(self._file): + components = np.array(load_Xy_data(self._file)) + else: + dct = ds.map_batches(batch_dict, batch_format = 'numpy') + + for row in dct.iter_rows(): + components.append(row['components']) + components = np.mean(components, axis = 0) + + save_Xy_data(components, self._file) + + self.stats_ = {'components' : components} + else: + warn('No features reduction to do because the number of features is already lower than the required number of components') + self.stats_ = {'components' : False} + + def _transform_pandas(self, df: pd.DataFrame) -> pd.DataFrame: + # _validate_df(df, TENSOR_COLUMN_NAME, self._nb_features) + components = self.stats_['components'] + + if components is not False: + tensor_col = df[TENSOR_COLUMN_NAME] + tensor_col = _unwrap_ndarray_object_type_if_needed(tensor_col) + tensor_col = np.dot(tensor_col, components.T) + df[TENSOR_COLUMN_NAME] = pd.Series(list(tensor_col)) + + return df + + def __repr__(self): + return (f"{self.__class__.__name__}(features={self._nb_features!r}, file={self._file!r})") + +def _validate_df(df: pd.DataFrame, column: str, nb_features: int) -> None: + if len(df.loc[0, column]) != nb_features: + raise ValueError('Discordant number of features in the tensor column with the one from the dataframe used for fitting') diff --git a/src/data/reduction/nmf_decomposition.py b/src/data/reduction/nmf_decomposition.py new file mode 100644 index 0000000..85abe03 --- /dev/null +++ b/src/data/reduction/nmf_decomposition.py @@ -0,0 +1,81 @@ + +import numpy as np +import pandas as pd + +from typing import List +from warnings import warn +from os.path import isfile +from ray.data import Dataset +from utils import save_Xy_data, load_Xy_data + +from sklearn.utils.extmath import randomized_svd +from sklearn.decomposition import DictionaryLearning, NMF, MiniBatchNMF + +from ray.data.preprocessor import Preprocessor +from ray.air.util.data_batch_conversion import _unwrap_ndarray_object_type_if_needed + +TENSOR_COLUMN_NAME = '__value__' + +class TensorNMFDecomposition(Preprocessor): + """ + Custom class for using Mini-Batch Non-Negative Matrix Factorization (NMF) as a Ray preprocessor. + This is inspired by sklearn.decomposition.NMF and is fitted on batches before keeping the consensus components matrix. + Consensus components matrix is attained following the logic from sklearn.decomposition.MiniBatchNMF. 
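The consensus strategy this docstring describes can be sketched with plain scikit-learn: fit an NMF on every batch, average the resulting components_ matrices, then project data onto the averaged basis. A toy approximation (random non-negative data and hypothetical sizes):

import numpy as np
from sklearn.decomposition import NMF

rng = np.random.default_rng(42)
batches = [rng.random((50, 200)) for _ in range(3)]   # 3 toy batches, 200 features
n_components = 10

# Fit one NMF per batch and keep each basis matrix H (components_).
components = [
    NMF(n_components=n_components, init='random', max_iter=200, random_state=0)
    .fit(batch).components_
    for batch in batches
]

# "Consensus" basis: element-wise mean of the per-batch bases.
H_consensus = np.mean(components, axis=0)             # shape (10, 200)

# New samples are reduced by projecting onto the transposed basis.
X_new = rng.random((5, 200))
X_reduced = X_new @ H_consensus.T                     # shape (5, 10)
print(X_reduced.shape)

Averaging independently fitted bases is a heuristic: component order is not guaranteed to line up across batches, so the result only approximates a single full-data factorization.
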
+ https://scikit-learn.org/stable/modules/decomposition.html#nmf + https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.NMF.html + https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.MiniBatchNMF.html + """ + def __init__(self, features: List[str], nb_components: int = 10000, file: str = ''): + # Parameters + self.features = features + self._nb_features = len(features) + self._nb_components = nb_components + self._file = file + + def _fit(self, ds: Dataset) -> Preprocessor: + def batch_nmf(batch): + batch = batch[TENSOR_COLUMN_NAME] + batch = _unwrap_ndarray_object_type_if_needed(batch) + model = NMF( + n_components = self._nb_components, + init = 'random' + ) + model.fit(batch) + return {'components' : [model.components_]} + + components = [] + if self._nb_features > self._nb_components: + if isfile(self._file): + components = np.array(load_Xy_data(self._file)) + else: + nmf = ds.map_batches(batch_nmf, batch_format = 'numpy') + + for row in nmf.iter_rows(): + components.append(row['components']) + components = np.mean(components, axis = 0) + + save_Xy_data(components, self._file) + + self.stats_ = {'components' : components} + else: + warn('No features reduction to do because the number of features is already lower than the required number of components') + self.stats_ = {'components' : False} + + def _transform_pandas(self, df: pd.DataFrame) -> pd.DataFrame: + # _validate_df(df, TENSOR_COLUMN_NAME, self._nb_features) + components = self.stats_['components'] + + if components is not False: + tensor_col = df[TENSOR_COLUMN_NAME] + tensor_col = _unwrap_ndarray_object_type_if_needed(tensor_col) + tensor_col = np.dot(tensor_col, components.T) + df[TENSOR_COLUMN_NAME] = pd.Series(list(tensor_col)) + + return df + + def __repr__(self): + return (f"{self.__class__.__name__}(features={self._nb_features!r}, file={self._file!r})") + +def _validate_df(df: pd.DataFrame, column: str, nb_features: int) -> None: + if len(df.loc[0, column]) != nb_features: + raise ValueError('Discordant number of features in the tensor column with the one from the dataframe used for fitting') diff --git a/src/data/reduction/truncated_svd_decomposition.py b/src/data/reduction/truncated_svd_decomposition.py index 8c27ac5..ca0eed4 100644 --- a/src/data/reduction/truncated_svd_decomposition.py +++ b/src/data/reduction/truncated_svd_decomposition.py @@ -35,7 +35,6 @@ def __init__(self, features: List[str], nb_components: int = 10000, file: str = self._file = file def _fit(self, ds: Dataset) -> Preprocessor: - # Parallel """ Possibilities for parallel TruncatedSVD * sklearn minibatch PCA -> PCA / SVD mostly equivalent @@ -93,7 +92,7 @@ def batch_svd(batch): return {'dictonnary' : [dict.components_]} components = [] if self._nb_features > self._nb_components: - if os.path.isfile(self._file): + if isfile(self._file): components = np.array(load_Xy_data(self._file)) else: svd = ds.map_batches(batch_svd, batch_format = 'numpy') diff --git a/src/models/classification.py b/src/models/classification.py index 30747a4..5da3f9c 100644 --- a/src/models/classification.py +++ b/src/models/classification.py @@ -280,7 +280,7 @@ def _get_true_classif(self, ds, taxas): Extract the true classification of the dataset used for cross-validation """ classif = {taxa : [] for taxa in taxas} - + cols2drop = [col for col in ds.schema().names if col not in ['id', taxas[0]]] classif_ds = ds.drop_columns(cols2drop) @@ -304,13 +304,10 @@ def _score_cv(self, y_true, y_pred, taxa): cv_csv = 
os.path.join(self._outdirs['results_dir'],f'{self._database}_{model}_{taxa}_cv_scores.csv') - y_compare = pd.DataFrame({ 'y_true': y_true[taxa], 'y_pred': y_pred[taxa] }) - y_compare['y_true'] = y_compare['y_true'].str.lower() - y_compare['y_pred'] = y_compare['y_pred'].str.lower() y_compare.to_csv(os.path.join(self._outdirs['models_dir'], f'y_compare_{self._database}_{model}_{taxa}.csv')) support = precision_recall_fscore_support( diff --git a/src/models/encoders/model_label_encoder.py b/src/models/encoders/model_label_encoder.py index 3317257..b635108 100644 --- a/src/models/encoders/model_label_encoder.py +++ b/src/models/encoders/model_label_encoder.py @@ -25,8 +25,8 @@ def _fit(self, dataset: Dataset) -> Preprocessor: return self def _transform_pandas(self, df: pd.DataFrame): - s_values = self.stats_ def column_label_encoder(s: pd.Series): + s_values = self.stats_[f"unique_values({s.name})"] return s.map(s_values) df[self.label_column] = df[self.label_column].transform(column_label_encoder) diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index 28a5767..62538f4 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -127,8 +127,6 @@ def __init__( def preprocess(self, ds, scaling = False, scaler_file = None): print('preprocess') - labels = [] - encoded = [] for row in ds.iter_rows(): labels.append(row[self.taxa]) self._nb_classes = len(np.unique(labels)) @@ -152,16 +150,17 @@ def preprocess(self, ds, scaling = False, scaler_file = None): labels = list(self._encoder.stats_[f'unique_values({self.taxa})'].keys()) else: labels = list(self._encoder.preprocessors[0].stats_[f'unique_values({self.taxa})'].keys()) - encoded = np.arange(len(labels)) + self._encoded = np.arange(len(labels)) labels = np.append(labels, 'unknown') - encoded = np.append(encoded, -1) - self._labels_map = zip(labels, encoded) + self._encoded = np.append(self._encoded, -1) + for (label, encoded) in zip(labels, self._encoded): + self._labels_map[label] = encoded self._compute_weights() def _label_decode(self, predict): print('_label_decode') decoded = pd.Series(np.empty(len(predict), dtype=object)) - for label, encoded in self._labels_map: + for label, encoded in self._labels_map.items(): decoded[predict == encoded] = label return np.array(decoded) diff --git a/src/models/models_utils.py b/src/models/models_utils.py index 3bf2e5e..a7fbdb7 100644 --- a/src/models/models_utils.py +++ b/src/models/models_utils.py @@ -76,11 +76,12 @@ def __init__( self._clf = None self._weights = {} self._scaler = None + self._encoded = [] self._encoder = None self._trainer = None self._reductor = None self._predictor = None - self._labels_map = None + self._labels_map = {} self._model_ckpt = None self._train_params = {} self._preprocessor = None @@ -115,12 +116,18 @@ def _compute_weights(self): """ Set class weights depending on their abundance in data-associated classes csv """ + if isinstance(self._csv, tuple): + cls = pd.concat([pd.read_csv(self._csv[0]),pd.read_csv(self._csv[1])], axis = 0, join = 'inner', ignore_index = True) cls = pd.read_csv(self._csv) + if self.taxa == 'domain': + cls.loc[cls['domain'].str.lower() == 'archaea', 'domain'] = 'Bacteria' classes = list(cls[self.taxa].unique()) weights = compute_class_weight( class_weight = 'balanced', classes = classes, y = cls[self.taxa] ) - for lab, encoded in self._labels_map: - self._weights[encoded] = weights[classes.index(lab)] \ No newline at end of file + + for lab, encoded in self._labels_map.items(): + if lab != 'unknown': + 
self._weights[encoded] = weights[classes.index(lab)] \ No newline at end of file diff --git a/src/models/reads_simulation.py b/src/models/reads_simulation.py index 463c077..197a3f4 100644 --- a/src/models/reads_simulation.py +++ b/src/models/reads_simulation.py @@ -218,7 +218,7 @@ def _verify_sim_arguments(self, k, kmers_list): def split_sim_dataset(ds, data, name): splitted_path = os.path.join(os.path.dirname(data['profile']), f'Xy_genome_simulation_{name}_data_K{len(data["kmers"][0])}.npz') - if os.path.exists(splitted_path): + if os.path.isfile(splitted_path): warnings.warn(f'The {name} dataset already exists, skipping simulation and loading the dataset') splitted_data = load_Xy_data(splitted_path) files_lst = glob(os.path.join(splitted_data['profile'],'*.parquet')) @@ -229,8 +229,8 @@ def split_sim_dataset(ds, data, name): if splitted_ds.count() == 0: nb_samples = round(ds.count() * 0.1) splitted_ds = ds.random_shuffle().limit(nb_samples) - splitted_ds, splitted_data = sim_dataset(splitted_ds, data, name) - return splitted_ds, splitted_data + splitted_data, splitted_ds = sim_dataset(splitted_ds, data, name) + return splitted_data, splitted_ds def sim_dataset(ds, data, name): """ @@ -247,4 +247,4 @@ def sim_dataset(ds, data, name): sim_data = cv_sim.simulation(k, data['kmers']) files_lst = glob(os.path.join(sim_data['profile'], '*.parquet')) sim_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - return sim_ds, sim_data \ No newline at end of file + return sim_data, sim_ds \ No newline at end of file diff --git a/src/models/sklearn/models.py b/src/models/sklearn/models.py index 6a7ef3f..39af3de 100644 --- a/src/models/sklearn/models.py +++ b/src/models/sklearn/models.py @@ -85,9 +85,7 @@ def __init__( kmers_list, csv ) - # Parameters - self._encoded = [] - + def preprocess(self, ds, scaling = False, scaler_file = None): print('preprocess') if self.classifier == 'onesvm': @@ -109,13 +107,15 @@ def preprocess(self, ds, scaling = False, scaler_file = None): self._encoded = np.arange(len(labels)) labels = np.append(labels, 'unknown') self._encoded = np.append(self._encoded, -1) - self._labels_map = zip(labels, self._encoded) - self._compute_weights() + for (label, encoded) in zip(labels, self._encoded): + self._labels_map[label] = encoded + if self.classifier != 'onesvm': + self._compute_weights() def _label_decode(self, predict): print('_label_decode') decoded = pd.Series(np.empty(len(predict), dtype=object)) - for label, encoded in self._labels_map: + for label, encoded in self._labels_map.items(): decoded[predict == encoded] = label return np.array(decoded) diff --git a/src/models/sklearn/partial_trainer.py b/src/models/sklearn/partial_trainer.py index 046c88c..9545581 100644 --- a/src/models/sklearn/partial_trainer.py +++ b/src/models/sklearn/partial_trainer.py @@ -7,7 +7,6 @@ import numpy as np import pandas as pd -from tqdm import tqdm from joblib import parallel_backend from sklearn.metrics import check_scoring @@ -202,7 +201,7 @@ def training_loop(self): _set_cpu_params(self.estimator, num_cpus) - for epoch_X, epoch_y in tqdm(zip(X_train.iter_epochs(), y_train.iter_epochs())): + for epoch_X, epoch_y in zip(X_train.iter_epochs(), y_train.iter_epochs()): with parallel_backend("ray", n_jobs=num_cpus): start_time = time() for batch_X, batch_y in zip( diff --git a/src/utils.py b/src/utils.py index b45b0b1..9194f63 100644 --- a/src/utils.py +++ b/src/utils.py @@ -86,20 +86,20 @@ def save_Xy_data(data, Xy_file): 
######################################################################################################### def verify_file(file : Path): - if file is not None and not os.path.exists(file): + if file is not None and not os.path.isfile(file): raise ValueError(f'Cannot find file {file} !') def verify_fasta(file : Path): - if not os.path.isfile(file) and not os.path.isdir(file): + if not os.path.exists(file): raise ValueError('Fasta must be an interleaved fasta file or a directory containing fasta files.') def verify_data_path(dir : Path): - if not os.path.exists(dir): + if not os.path.isdir(dir): raise ValueError(f"Cannot find data folder {dir} ! Exiting") def verify_saving_path(dir : Path): path, folder = os.path.split(dir) - if not os.path.exists(path): + if not os.path.isdir(path): raise ValueError("Cannot find where to create output folder !") def verify_host(host : str): @@ -341,7 +341,7 @@ def merge_db_host(db_data, host_data): merged_db_host = {} merged_db_host_file = f"{db_data['profile']}_host_merged.npz" - if os.path.exists(merged_db_host_file): + if os.path.isfile(merged_db_host_file): merged_db_host = load_Xy_data(merged_db_host_file) files_lst = glob(os.path.join(merged_db_host['profile'], '*.parquet')) merged_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) @@ -366,7 +366,8 @@ def merge_db_host(db_data, host_data): merged_db_host['kmers'] = db_data['kmers'] # Features merged_db_host['taxas'] = ['domain'] # Known taxas for classification merged_db_host['fasta'] = (db_data['fasta'], host_data['fasta']) # Fasta file needed for reads simulation - + merged_db_host['csv'] = (db_data['csv'], host_data['csv']) # csv file needed for classes weights + save_Xy_data(merged_db_host, merged_db_host_file) return merged_db_host, merged_ds From 7511e7561a3b80773ff54edfd3e9fa8edeba39d5 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Thu, 23 Nov 2023 06:14:25 -0500 Subject: [PATCH 47/92] decomposition revert back to TruncatedSVD --- src/Caribou_dimensions_decomposition.py | 4 ++-- src/data/reduction/dictionnary_decomposition.py | 12 +++++++----- src/data/reduction/nmf_decomposition.py | 8 ++++---- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/src/Caribou_dimensions_decomposition.py b/src/Caribou_dimensions_decomposition.py index 0e9b75d..0141abb 100644 --- a/src/Caribou_dimensions_decomposition.py +++ b/src/Caribou_dimensions_decomposition.py @@ -56,7 +56,7 @@ def dimensions_decomposition(opt): ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) scaler_file = os.path.join(outdirs['models_dir'], 'TF-IDF_diag.npz') - reductor_file = os.path.join(outdirs['models_dir'], 'decomposed_components.npz') + reductor_file = os.path.join(outdirs['models_dir'], 'TruncatedSVD_components.npz') # Compute the decomposition preprocessor = Chain( @@ -64,7 +64,7 @@ def dimensions_decomposition(opt): features = kmers, file = scaler_file ), - TensorNMFDecomposition( + TensorTruncatedSVDDecomposition( features = kmers, nb_components = opt['nb_components'], file = reductor_file diff --git a/src/data/reduction/dictionnary_decomposition.py b/src/data/reduction/dictionnary_decomposition.py index 5bbc4b5..27e9f67 100644 --- a/src/data/reduction/dictionnary_decomposition.py +++ b/src/data/reduction/dictionnary_decomposition.py @@ -8,9 +8,7 @@ from ray.data import Dataset from utils import save_Xy_data, load_Xy_data -from sklearn.utils.extmath import randomized_svd -from sklearn.decomposition import DictionaryLearning -from 
sklearn.decomposition._dict_learning import _sparse_encode +from sklearn.decomposition import DictionaryLearning, MiniBatchDictionaryLearning from ray.data.preprocessor import Preprocessor from ray.air.util.data_batch_conversion import _unwrap_ndarray_object_type_if_needed @@ -37,10 +35,14 @@ def _fit(self, ds: Dataset) -> Preprocessor: def batch_dict(batch): batch = batch[TENSOR_COLUMN_NAME] batch = _unwrap_ndarray_object_type_if_needed(batch) - dict = DictionaryLearning( + dict = MiniBatchDictionaryLearning( n_components = self._nb_components, max_iter = 10, - transform_algorithm = 'cd', + fit_algorithm = 'cd', + transform_algorithm = 'lars', + positive_code = True, + positive_dict = True, + batch_size = 10 ) dict.fit(batch) return {'components' : [dict.components_]} diff --git a/src/data/reduction/nmf_decomposition.py b/src/data/reduction/nmf_decomposition.py index 85abe03..6048cbb 100644 --- a/src/data/reduction/nmf_decomposition.py +++ b/src/data/reduction/nmf_decomposition.py @@ -8,8 +8,7 @@ from ray.data import Dataset from utils import save_Xy_data, load_Xy_data -from sklearn.utils.extmath import randomized_svd -from sklearn.decomposition import DictionaryLearning, NMF, MiniBatchNMF +from sklearn.decomposition import NMF, MiniBatchNMF from ray.data.preprocessor import Preprocessor from ray.air.util.data_batch_conversion import _unwrap_ndarray_object_type_if_needed @@ -36,9 +35,10 @@ def _fit(self, ds: Dataset) -> Preprocessor: def batch_nmf(batch): batch = batch[TENSOR_COLUMN_NAME] batch = _unwrap_ndarray_object_type_if_needed(batch) - model = NMF( + model = MiniBatchNMF( n_components = self._nb_components, - init = 'random' + init = 'random', + batch_size = 10 ) model.fit(batch) return {'components' : [model.components_]} From fa5495095d5b70f88bc5289461635b28c0557186 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Thu, 23 Nov 2023 13:43:16 -0500 Subject: [PATCH 48/92] debug onesvm labels encoding --- src/models/classification.py | 5 ++++- src/models/encoders/onesvm_label_encoder.py | 4 ++-- src/models/sklearn/models.py | 4 ++-- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/models/classification.py b/src/models/classification.py index 5da3f9c..1d0e3c8 100644 --- a/src/models/classification.py +++ b/src/models/classification.py @@ -289,7 +289,10 @@ def _get_true_classif(self, ds, taxas): for row in classif_ds.iter_rows(): for taxa in taxas: - classif[taxa].append(row[taxa]) + if self._classifier_binary == 'onesvm' and row[taxa] not in ['Bacteria','bacteria','bact']: + classif[taxa].append('Unknown') + else: + classif[taxa].append(row[taxa]) return classif, ds diff --git a/src/models/encoders/onesvm_label_encoder.py b/src/models/encoders/onesvm_label_encoder.py index 1743f95..23b121d 100644 --- a/src/models/encoders/onesvm_label_encoder.py +++ b/src/models/encoders/onesvm_label_encoder.py @@ -20,14 +20,14 @@ def __init__(self, label_column: str): def _fit(self, dataset : Dataset) -> Preprocessor: self.stats_ = OrderedDict() self.stats_[f"unique_values({self.label_column})"] = { - 'bacteria' : 1, + 'Bacteria' : 1 } return self def _transform_pandas(self, df: pd.DataFrame): _validate_df(df, self.label_column) mapping = self.stats_[f"unique_values({self.label_column})"] - df[self.label_column] = df[self.label_column].str.lower() + df[self.label_column] = df[self.label_column] df[self.label_column] = df[self.label_column].map(mapping) df[self.label_column] = df[self.label_column].fillna(-1) diff --git a/src/models/sklearn/models.py 
b/src/models/sklearn/models.py index 39af3de..1be9a15 100644 --- a/src/models/sklearn/models.py +++ b/src/models/sklearn/models.py @@ -91,7 +91,7 @@ def preprocess(self, ds, scaling = False, scaler_file = None): if self.classifier == 'onesvm': self._encoder = OneClassSVMLabelEncoder(self.taxa) self._encoded = np.array([1,-1], dtype = np.int32) - labels = np.array(['bacteria', 'unknown'], dtype = object) + labels = np.array(['Bacteria', 'Unknown'], dtype = object) else: self._encoder = ModelLabelEncoder(self.taxa) @@ -105,7 +105,7 @@ def preprocess(self, ds, scaling = False, scaler_file = None): if self.classifier != 'onesvm': labels = list(self._encoder.stats_[f'unique_values({self.taxa})'].keys()) self._encoded = np.arange(len(labels)) - labels = np.append(labels, 'unknown') + labels = np.append(labels, 'Unknown') self._encoded = np.append(self._encoded, -1) for (label, encoded) in zip(labels, self._encoded): self._labels_map[label] = encoded From 3fdaf17e05c18e2dedbd0e9a5fdf0de57726f61f Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Sat, 25 Nov 2023 12:21:15 -0500 Subject: [PATCH 49/92] handle import error for parquet reading --- src/Caribou_dimensions_decomposition.py | 5 ++-- src/Caribou_reduce_features.py | 5 ++-- src/models/models_utils.py | 2 +- src/models/reads_simulation.py | 6 ++--- src/utils.py | 31 +++++++++++++++++-------- 5 files changed, 28 insertions(+), 21 deletions(-) diff --git a/src/Caribou_dimensions_decomposition.py b/src/Caribou_dimensions_decomposition.py index 0141abb..3c4a343 100644 --- a/src/Caribou_dimensions_decomposition.py +++ b/src/Caribou_dimensions_decomposition.py @@ -51,9 +51,8 @@ def dimensions_decomposition(opt): if not os.path.exists(data_file): if opt['nb_components'] < len(kmers): - # Load data - files_lst = glob(os.path.join(data['profile'],'*.parquet')) - ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + # Load data + ds = read_parquet_files(data['profile']) scaler_file = os.path.join(outdirs['models_dir'], 'TF-IDF_diag.npz') reductor_file = os.path.join(outdirs['models_dir'], 'TruncatedSVD_components.npz') diff --git a/src/Caribou_reduce_features.py b/src/Caribou_reduce_features.py index 5fe02cd..efe88db 100644 --- a/src/Caribou_reduce_features.py +++ b/src/Caribou_reduce_features.py @@ -64,9 +64,8 @@ def features_reduction(opt): if not os.path.exists(data_file): # Load data - files_lst = glob(os.path.join(data['profile'],'*.parquet')) - export_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - train_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + export_ds = read_parquet_files(data['profile']) + train_ds = read_parquet_files(data['profile']) # Time the computation of transformations t_start = time() # Features scaling diff --git a/src/models/models_utils.py b/src/models/models_utils.py index a7fbdb7..c9990bd 100644 --- a/src/models/models_utils.py +++ b/src/models/models_utils.py @@ -129,5 +129,5 @@ def _compute_weights(self): ) for lab, encoded in self._labels_map.items(): - if lab != 'unknown': + if lab.lower() != 'unknown': self._weights[encoded] = weights[classes.index(lab)] \ No newline at end of file diff --git a/src/models/reads_simulation.py b/src/models/reads_simulation.py index 197a3f4..cf4c8eb 100644 --- a/src/models/reads_simulation.py +++ b/src/models/reads_simulation.py @@ -221,8 +221,7 @@ def split_sim_dataset(ds, data, name): if os.path.isfile(splitted_path): warnings.warn(f'The {name} dataset already exists, skipping simulation and loading the 
dataset') splitted_data = load_Xy_data(splitted_path) - files_lst = glob(os.path.join(splitted_data['profile'],'*.parquet')) - splitted_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + splitted_ds = read_parquet_files(splitted_data['profile']) return splitted_ds, splitted_data else: splitted_ds = ds.random_sample(0.1) @@ -245,6 +244,5 @@ def sim_dataset(ds, data, name): sim_outdir = os.path.dirname(data['profile']) cv_sim = readsSimulation(data['fasta'], cls, list(cls['id']), 'miseq', sim_outdir, name) sim_data = cv_sim.simulation(k, data['kmers']) - files_lst = glob(os.path.join(sim_data['profile'], '*.parquet')) - sim_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + sim_ds = read_parquet_files(sim_data['profile']) return sim_data, sim_ds \ No newline at end of file diff --git a/src/utils.py b/src/utils.py index 9194f63..83e84b2 100644 --- a/src/utils.py +++ b/src/utils.py @@ -5,18 +5,21 @@ import numpy as np import pandas as pd import pyarrow as pa +import pyarrow.parquet as pq from glob import glob from pathlib import Path from warnings import warn from psutil import virtual_memory + __author__ = "Nicolas de Montigny" __all__ = [ 'init_ray_cluster', 'load_Xy_data', 'save_Xy_data', + 'read_parquet_files', 'verify_file', 'verify_fasta', 'verify_data_path', @@ -82,6 +85,19 @@ def load_Xy_data(Xy_file): def save_Xy_data(data, Xy_file): np.savez(Xy_file, data = data) +# Read parquet files and handle FileSystem build ImportError +def read_parquet_files(profile): + files_lst = glob(os.path.join(profile, '*.parquet')) + try: + ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + except ImportError: + tables_lst = [] + for file in files_lst: + tables_lst.append(pq.read_table(file)) + ds = ray.data.from_arrow(tables_lst) + + return ds + # User arguments verification ######################################################################################################### @@ -306,8 +322,7 @@ def verify_load_metagenome(data): Wrapper function for verifying and loading the metagenome dataset """ data = verify_load_data(data) - files_lst = glob(os.path.join(data['profile'], '*.parquet')) - ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + ds = read_parquet_files(data['profile']) return data, ds @@ -317,8 +332,7 @@ def verify_load_db(db_data): Wrapper function for verifying and loading the db dataset """ db_data = verify_load_data(db_data) - files_lst = glob(os.path.join(db_data['profile'], '*.parquet')) - db_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + db_ds = read_parquet_files(db_data['profile']) db_ds = db_ds.map_batches(convert_archaea_bacteria, batch_format = 'pandas') return db_data, db_ds @@ -343,14 +357,11 @@ def merge_db_host(db_data, host_data): if os.path.isfile(merged_db_host_file): merged_db_host = load_Xy_data(merged_db_host_file) - files_lst = glob(os.path.join(merged_db_host['profile'], '*.parquet')) - merged_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + merged_ds = read_parquet_files(merge_db_host['profile']) else: merged_db_host['profile'] = f"{db_data['profile']}_host_merged" - files_lst = glob(os.path.join(db_data['profile'], '*.parquet')) - db_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) - files_lst = glob(os.path.join(host_data['profile'], '*.parquet')) - host_ds = ray.data.read_parquet_bulk(files_lst, parallelism = len(files_lst)) + db_ds = read_parquet_files(db_data['profile']) + host_ds = 
read_parquet_files(host_data['profile']) cols2drop = [col for col in db_ds.schema().names if col not in ['id','domain',TENSOR_COLUMN_NAME]] db_ds = db_ds.drop_columns(cols2drop) From 3964306d4c1f3397b19d80c01f837367de050984 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Sat, 25 Nov 2023 20:55:09 -0500 Subject: [PATCH 50/92] debug keras for cv --- src/models/classification.py | 2 + src/models/kerasTF/build_neural_networks.py | 52 +++++++++---------- src/models/kerasTF/models.py | 56 ++++++++++----------- src/models/models_utils.py | 2 +- src/utils.py | 51 +++++++++++++------ 5 files changed, 92 insertions(+), 71 deletions(-) diff --git a/src/models/classification.py b/src/models/classification.py index 1d0e3c8..94455ad 100644 --- a/src/models/classification.py +++ b/src/models/classification.py @@ -172,6 +172,7 @@ def _cv_predict(self, ds, model_map): mapping = {} for taxa, model in model_map.items(): mapping[taxa] = model.predict(ds) # np.array + return mapping # Private training secondary functions @@ -311,6 +312,7 @@ def _score_cv(self, y_true, y_pred, taxa): 'y_true': y_true[taxa], 'y_pred': y_pred[taxa] }) + y_compare.to_csv(os.path.join(self._outdirs['models_dir'], f'y_compare_{self._database}_{model}_{taxa}.csv')) support = precision_recall_fscore_support( diff --git a/src/models/kerasTF/build_neural_networks.py b/src/models/kerasTF/build_neural_networks.py index 80bdc07..b037d24 100644 --- a/src/models/kerasTF/build_neural_networks.py +++ b/src/models/kerasTF/build_neural_networks.py @@ -19,19 +19,19 @@ def build_attention(nb_features): VirNet package [Abdelkareem et al. 2018] https://github.com/alyosama/virnet/blob/master/NNClassifier.py """ - inputs = Input(shape = (nb_features,)) - x = Embedding(nb_features, 128)(inputs) + inputs = Input(shape = (nb_features,1)) + # x = Embedding(nb_features, 128)(inputs) - x = LSTM(128, return_sequences = True, dropout = 0.1, recurrent_dropout = 0.1 )(x) + x = LSTM(128, return_sequences = True, dropout = 0.1, recurrent_dropout = 0.1 )(inputs) x = LSTM(128, return_sequences = True, dropout = 0.1, recurrent_dropout = 0.1 )(x) x = AttentionWeightedAverage()(x) x = Dense(128, activation = "relu")(x) x = Dropout(0.1)(x) - x = Dense(1, activation = "tanh")(x) + x = Dense(1, activation = "sigmoid")(x) model = Model(inputs = inputs, outputs = x) - model.compile(loss = BinaryCrossentropy(from_logits = False), optimizer = 'adam', metrics = ['accuracy'], jit_compile = True) + model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'], jit_compile = True) return model @@ -43,15 +43,15 @@ def build_LSTM(nb_features): https://github.com/gussow/seeker/blob/master/train_model/train_model.py """ - inputs = Input(shape = (nb_features,)) - x = Embedding(nb_features, 128)(inputs) + inputs = Input(shape = (nb_features,1)) + # x = Embedding(nb_features, 128)(inputs) - x = LSTM(128, recurrent_dropout = 0.1, dropout = 0.1)(x) + x = LSTM(128, recurrent_dropout = 0.1, dropout = 0.1)(inputs) x = Dense(1, activation = 'tanh')(x) model = Model(inputs = inputs, outputs = x) - model.compile(loss=BinaryCrossentropy(from_logits = False), optimizer='adam', metrics=['accuracy'], jit_compile = True) + model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'], jit_compile = True) return model @@ -63,10 +63,10 @@ def build_deepLSTM(nb_features): https://github.com/wandreopoulos/deeplasmid/blob/docker/classifier/dl/DL_Model.py """ - inputs = Input(shape=(nb_features,)) + inputs = Input(shape=(nb_features,1)) - netA = 
Embedding(nb_features, 128)(inputs) - netA = LSTM(40, activation='tanh',recurrent_dropout=0.05,dropout=0.1,name='A_%d'%40,return_sequences=True) (netA) + # netA = Embedding(nb_features, 128)(inputs) + netA = LSTM(40, activation='tanh',recurrent_dropout=0.05,dropout=0.1,name='A_%d'%40,return_sequences=True) (inputs) netA = LSTM(40, activation='tanh',recurrent_dropout=0.05,dropout=0.1,name='B_%d'%40) (netA) netB = Dense(100, activation='tanh',name='G_%d'%40) (inputs) @@ -82,7 +82,7 @@ def build_deepLSTM(nb_features): outputs = Dense(1, activation='sigmoid', name='score')(net) model = Model(inputs=inputs, outputs=outputs) - model.compile(loss=BinaryCrossentropy(from_logits = False), optimizer='adam', metrics=['accuracy'], jit_compile = True) + model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'], jit_compile = True) return model @@ -95,9 +95,9 @@ def build_LSTM_attention(nb_features, nb_classes): https://github.com/MicrobeLab/DeepMicrobes/blob/master/models/embed_lstm_attention.py """ - inputs = Input(shape = (nb_features,)) - net = Embedding(nb_features, 100)(inputs) - net = Bidirectional(LSTM(300, return_sequences=True))(net) + inputs = Input(shape = (nb_features,1)) + # net = Embedding(nb_features, 100)(inputs) + net = Bidirectional(LSTM(300, return_sequences=True))(inputs) net = Attention(dropout = 0.2)([net,net]) # MLP net = Dense((nb_features * 300 * 2), activation = 'relu')(net) @@ -108,7 +108,7 @@ def build_LSTM_attention(nb_features, nb_classes): net = Dense(nb_classes)(net) outputs = Activation('softmax')(net) model = Model(inputs = inputs, outputs = outputs) - model.compile(loss=CategoricalCrossentropy(), optimizer='adam', metrics=['accuracy'], jit_compile = True) + model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'], jit_compile = True) return model @@ -134,7 +134,7 @@ def build_CNN(nb_features, nb_classes): model.add(Dropout(0.5)) model.add(Dense(nb_classes)) model.add(Activation('softmax')) - model.compile(loss=CategoricalCrossentropy(), optimizer='adam', metrics=['accuracy'], jit_compile = True) + model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'], jit_compile = True) return model @@ -146,20 +146,20 @@ def build_wideCNN(nb_features, nb_classes): https://github.com/KennthShang/CHEER/blob/master/Classifier/model/Wcnn.py """ - inputs = Input(shape = (nb_features,)) - embed = Embedding(248, 100)(inputs) - embed = Reshape((nb_features, -1, 1))(embed) + inputs = Input(shape = (nb_features,1)) + # embed = Embedding(248, 100)(inputs) + # embed = Reshape((nb_features, -1, 1))(embed) - conv1 = Conv2D(256, 3, activation = 'relu')(embed) + conv1 = Conv2D(256, 3, activation = 'relu')(inputs) conv1 = MaxPooling2D(pool_size = (1,1), strides = nb_features)(conv1) - conv2 = Conv2D(256, 7, activation = 'relu')(embed) + conv2 = Conv2D(256, 7, activation = 'relu')(inputs) conv2 = MaxPooling2D(pool_size = (1,1), strides = nb_features)(conv2) - conv3 = Conv2D(256, 11, activation = 'relu')(embed) + conv3 = Conv2D(256, 11, activation = 'relu')(inputs) conv3 = MaxPooling2D(pool_size = (1,1), strides = nb_features)(conv3) - conv4 = Conv2D(256, 15, activation = 'relu')(embed) + conv4 = Conv2D(256, 15, activation = 'relu')(inputs) conv4 = MaxPooling2D(pool_size = (1,1), strides = nb_features)(conv4) net = Concatenate(axis = 1)([conv1,conv2,conv3,conv4]) @@ -172,6 +172,6 @@ def build_wideCNN(nb_features, nb_classes): net = Dense(nb_classes)(net) outputs = Activation('softmax')(net) model = Model(inputs = 
inputs, outputs = outputs) - model.compile(loss=CategoricalCrossentropy(), optimizer='adam', metrics=['accuracy'], jit_compile = True) + model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'], jit_compile = True) return model diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index 62538f4..ebb37f2 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -127,31 +127,22 @@ def __init__( def preprocess(self, ds, scaling = False, scaler_file = None): print('preprocess') - for row in ds.iter_rows(): - labels.append(row[self.taxa]) - self._nb_classes = len(np.unique(labels)) - if self._nb_classes == 2: - self._encoder = ModelLabelEncoder(self.taxa) - if scaling: - self._scaler = TensorTfIdfTransformer(self.kmers, scaler_file) - else: + self._encoder = ModelLabelEncoder(self.taxa) + if scaling: + self._scaler = TensorTfIdfTransformer(self.kmers, scaler_file) + self._scaler.fit(ds) + self._encoder.fit(ds) + labels = list(self._encoder.stats_[f'unique_values({self.taxa})'].keys()) + self._nb_classes = len(self._encoder.stats_[f'unique_values({self.taxa})']) + if self._nb_classes > 2 : self._encoder = Chain( - LabelEncoder(self.taxa), + self._encoder, OneHotTensorEncoder(self.taxa) ) - if scaling: - self._scaler = TensorTfIdfTransformer(self.kmers, scaler_file) - - self._encoder.fit(ds) - if scaling: - self._scaler.fit(ds) - # Labels mapping - if self._nb_classes == 2: - labels = list(self._encoder.stats_[f'unique_values({self.taxa})'].keys()) - else: - labels = list(self._encoder.preprocessors[0].stats_[f'unique_values({self.taxa})'].keys()) + self._encoder.fit(ds) + self._encoded = np.arange(len(labels)) - labels = np.append(labels, 'unknown') + labels = np.append(labels, 'Unknown') self._encoded = np.append(self._encoded, -1) for (label, encoded) in zip(labels, self._encoded): self._labels_map[label] = encoded @@ -161,6 +152,7 @@ def _label_decode(self, predict): print('_label_decode') decoded = pd.Series(np.empty(len(predict), dtype=object)) for label, encoded in self._labels_map.items(): + print(predict == encoded) decoded[predict == encoded] = label return np.array(decoded) @@ -207,7 +199,7 @@ def fit(self, datasets): ), datasets=datasets, ) - + training_result = self._trainer.fit() self._model_ckpt = training_result.best_checkpoints[0][0] @@ -218,6 +210,7 @@ def predict(self, ds, threshold=0.8): col_2_drop = [col for col in ds.schema().names if col != TENSOR_COLUMN_NAME] ds = ds.drop_columns(col_2_drop) + # Preprocess if self._scaler is not None: ds = self._scaler.transform(ds) @@ -230,12 +223,15 @@ def predict(self, ds, threshold=0.8): ) predictions = self._predictor.predict( data = ds, - batch_size = self.batch_size + feature_columns = [TENSOR_COLUMN_NAME], + batch_size = self.batch_size, + num_cpus_per_worker = self._nb_CPU_per_worker, + num_gpus_per_worker = self._nb_GPU_per_worker ) # Convert predictions to labels predictions = self._prob_2_cls(predictions, threshold) - + return self._label_decode(predictions) else: raise ValueError('No data to predict') @@ -245,8 +241,8 @@ def _prob_2_cls(self, predictions, threshold): print('_prob_2_cls') def map_predicted_label_binary(ds, threshold): ds = np.ravel(ds['predictions']) - lower_threshold = 0.5 - (threshold * 0.5) - upper_threshold = 0.5 + (threshold * 0.5) + lower_threshold = 0.5 #- (threshold * 0.5) + upper_threshold = 0.5 #+ (threshold * 0.5) predict = pd.DataFrame({ 'proba': ds, 'predicted_label': np.full(len(ds), -1) @@ -265,12 +261,12 @@ def 
map_predicted_label_multiclass(ds, threshold): return {'predictions' : pred['predicted_label'].to_numpy(dtype = np.int32)} - if self._nb_classes == 2: - print('map_predicted_label_binary') - fn = map_predicted_label_binary - else: + if self._nb_classes > 2: print('map_predicted_label_multiclass') fn = map_predicted_label_multiclass + else: + print('map_predicted_label_binary') + fn = map_predicted_label_binary predict = [] predictions = predictions.map_batches( diff --git a/src/models/models_utils.py b/src/models/models_utils.py index c9990bd..17d5c44 100644 --- a/src/models/models_utils.py +++ b/src/models/models_utils.py @@ -130,4 +130,4 @@ def _compute_weights(self): for lab, encoded in self._labels_map.items(): if lab.lower() != 'unknown': - self._weights[encoded] = weights[classes.index(lab)] \ No newline at end of file + self._weights[int(encoded)] = weights[classes.index(lab)] \ No newline at end of file diff --git a/src/utils.py b/src/utils.py index 83e84b2..74ca704 100644 --- a/src/utils.py +++ b/src/utils.py @@ -11,7 +11,7 @@ from pathlib import Path from warnings import warn from psutil import virtual_memory - +from tensorflow.config import list_physical_devices __author__ = "Nicolas de Montigny" @@ -49,6 +49,8 @@ 'merge_db_host' ] +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' + # Constants ######################################################################################################### @@ -59,19 +61,40 @@ # Initialize ray cluster def init_ray_cluster(workdir): - mem = virtual_memory().total - frac = 0.8 - while not ray.is_initialized(): - try: - ray.init( - object_store_memory = mem * frac, - _temp_dir = str(workdir), - ) - logging.getLogger("ray").setLevel(logging.WARNING) - ray.data.DataContext.get_current().execution_options.verbose_progress = True - except ValueError : - ray.shutdown() - frac -= 0.05 + """ + 1. Get physical material available + Number of available CPUs and GPUs + 2. Get host IP from OS + Defaults to 172.24.94.34 + 3. 
Start the ray cluster at OS level + """ + nb_CPU = os.cpu_count() + nb_GPU = len(list_physical_devices('GPU')) + + try: + host_ip = os.environ['HOST_IP'] + except KeyError: + host_ip = '172.24.94.34' + + cmd = f'ray start --head --node-ip-address {host_ip} --port 34567 --num-cpus {nb_CPU} --num-gpus {nb_GPU} --temp-dir {workdir}' + os.system(cmd) + + ray.init() + logging.getLogger("ray").setLevel(logging.WARNING) + ray.data.DataContext.get_current().execution_options.verbose_progress = True + # mem = virtual_memory().total + # frac = 0.8 + # while not ray.is_initialized(): + # try: + # ray.init( + # object_store_memory = mem * frac, + # _temp_dir = str(workdir), + # ) + # logging.getLogger("ray").setLevel(logging.WARNING) + # ray.data.DataContext.get_current().execution_options.verbose_progress = True + # except ValueError : + # ray.shutdown() + # frac -= 0.05 # Data I/O ######################################################################################################### From 8dcfdf3ef083544bef42851e57bc90e434e37069 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Thu, 30 Nov 2023 12:13:37 -0500 Subject: [PATCH 51/92] NN debug for CCDB --- .../reduction/truncated_svd_decomposition.py | 3 + src/models/classification.py | 19 +- src/models/kerasTF/binary_models.py | 73 ++++++ src/models/kerasTF/build_neural_networks.py | 2 +- src/models/kerasTF/models.py | 87 +++++-- src/models/kerasTF/multiclass_models.py | 73 ++++++ src/models/models_utils.py | 10 +- src/models/multiclass_utils.py | 127 +++++++++ src/models/sklearn/binary_models.py | 216 ++++++++++++++++ src/models/sklearn/models.py | 200 +++------------ src/models/sklearn/multiclass_models.py | 240 ++++++++++++++++++ 11 files changed, 856 insertions(+), 194 deletions(-) create mode 100644 src/models/kerasTF/binary_models.py create mode 100644 src/models/kerasTF/multiclass_models.py create mode 100644 src/models/multiclass_utils.py create mode 100644 src/models/sklearn/binary_models.py create mode 100644 src/models/sklearn/multiclass_models.py diff --git a/src/data/reduction/truncated_svd_decomposition.py b/src/data/reduction/truncated_svd_decomposition.py index ca0eed4..74a4a0b 100644 --- a/src/data/reduction/truncated_svd_decomposition.py +++ b/src/data/reduction/truncated_svd_decomposition.py @@ -36,6 +36,9 @@ def __init__(self, features: List[str], nb_components: int = 10000, file: str = def _fit(self, ds: Dataset) -> Preprocessor: """ + TODO: adapt by using the metho from PySpark SVD + https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.mllib.linalg.distributed.SingularValueDecomposition.html?highlight=svd + Possibilities for parallel TruncatedSVD * sklearn minibatch PCA -> PCA / SVD mostly equivalent * implement parallel based on other library diff --git a/src/models/classification.py b/src/models/classification.py index 94455ad..3c62ddf 100644 --- a/src/models/classification.py +++ b/src/models/classification.py @@ -7,8 +7,11 @@ from warnings import warn from typing import Dict, List -from models.sklearn.models import SklearnModel -from models.kerasTF.models import KerasTFModel +from models.kerasTF.models import KerasTFModels +from models.sklearn.binary_models import SklearnBinaryModels +# from models.kerasTF.binary_models import KerasTFBinaryModels +from models.sklearn.multiclass_models import SklearnMulticlassModels +# from models.kerasTF.multiclass_models import KerasTFMulticlassModels # CV metrics from sklearn.metrics import precision_recall_fscore_support @@ -149,7 +152,7 @@ def _predict(self, ds, 
model_map): if self.is_fitted: try: for taxa, model in model_map.items(): - predictions = model.predict(ds) # np.array + predictions = model.predict_proba(ds) # np.array ds, predictions, ids = self._remove_unknown(ds, predictions) file = self._save_dataset(ds, taxa) mapping[taxa] = { @@ -181,7 +184,7 @@ def _cv_predict(self, ds, model_map): def _binary_training(self, datasets, taxa, file): print('_binary_training') if self._classifier_binary == 'onesvm': - model = SklearnModel( + model = SklearnBinaryModels( self._classifier_binary, self._outdirs['models_dir'], self._batch_size, @@ -191,7 +194,7 @@ def _binary_training(self, datasets, taxa, file): self._database_data['csv'] ) elif self._classifier_binary == 'linearsvm': - model = SklearnModel( + model = SklearnBinaryModels( self._classifier_binary, self._outdirs['models_dir'], self._batch_size, @@ -201,7 +204,7 @@ def _binary_training(self, datasets, taxa, file): self._database_data['csv'] ) else: - model = KerasTFModel( + model = KerasTFModels( self._classifier_binary, self._outdirs['models_dir'], self._batch_size, @@ -222,7 +225,7 @@ def _binary_training(self, datasets, taxa, file): def _multiclass_training(self, datasets, taxa, file): print('_multiclass_training') if self._classifier_multiclass in ['sgd','mnb']: - model = SklearnModel( + model = SklearnMulticlassModels( self._classifier_multiclass, self._outdirs['models_dir'], self._batch_size, @@ -232,7 +235,7 @@ def _multiclass_training(self, datasets, taxa, file): self._database_data['csv'] ) else: - model = KerasTFModel( + model = KerasTFModels( self._classifier_multiclass, self._outdirs['models_dir'], self._batch_size, diff --git a/src/models/kerasTF/binary_models.py b/src/models/kerasTF/binary_models.py new file mode 100644 index 0000000..bc79f35 --- /dev/null +++ b/src/models/kerasTF/binary_models.py @@ -0,0 +1,73 @@ +import os +import gc +import warnings +import numpy as np +import pandas as pd + +# Preprocessing +from ray.data.preprocessors import LabelEncoder, Chain +from models.encoders.model_label_encoder import ModelLabelEncoder +from models.encoders.one_hot_tensor_encoder import OneHotTensorEncoder +from models.preprocessors.tfidf_transformer import TensorTfIdfTransformer + +# Parent class / models +from models.kerasTF.models import KerasTFModels +from models.kerasTF.build_neural_networks import * + +# Training +import tensorflow as tf +from ray.air import session +# from ray.air.integrations.keras import Callback +from ray.air.config import ScalingConfig +from ray.air.integrations.keras import ReportCheckpointCallback +from ray.train.tensorflow import TensorflowTrainer, TensorflowCheckpoint + +# Tuning +from ray.air.config import RunConfig + +# Predicting +from ray.train.tensorflow import TensorflowPredictor +from ray.train.batch_predictor import BatchPredictor + +__author__ = 'Nicolas de Montigny' + +__all__ = ['KerasTFModel'] + +TENSOR_COLUMN_NAME = '__value__' +LABELS_COLUMN_NAME = 'labels' + +# Ignore warnings to have a more comprehensible output on stdout +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' +warnings.filterwarnings('ignore') + +class KerasTFBinaryModels(KerasTFModels): + """ + Class used to build, train and predict models using Ray with Keras Tensorflow backend + + ---------- + Attributes + ---------- + + clf_file : string + Path to a file containing the trained model for this object + + nb_classes : int + Number of classes for learning + + ---------- + Methods + ---------- + + preprocess : preprocess the data before training and splitting the original dataset 
in case of cross-validation + + train : train a model using the given datasets + + predict : predict the classes of a dataset + ds : ray.data.Dataset + Dataset containing K-mers profiles of sequences to be classified + + threshold : float + Minimum percentage of probability to effectively classify. + Sequences will be classified as 'unknown' if the probability is under this threshold. + Defaults to 80% + """ \ No newline at end of file diff --git a/src/models/kerasTF/build_neural_networks.py b/src/models/kerasTF/build_neural_networks.py index b037d24..8294110 100644 --- a/src/models/kerasTF/build_neural_networks.py +++ b/src/models/kerasTF/build_neural_networks.py @@ -72,7 +72,7 @@ def build_deepLSTM(nb_features): netB = Dense(100, activation='tanh',name='G_%d'%40) (inputs) netB = Dense(40, activation='tanh',name='H_%d'%40) (netB) - net = Concatenate()([netA,netB]) + net = Concatenate()([netA,netB]) # A `Concatenate` layer requires inputs with matching shapes except for the concatenation axis. Received: input_shape=[(None, 40), (None, 1000, 40)] net = Dense(200, activation='relu', name='C_%d'%(10*2))(net) net = Dropout(0.1,name='fr_%.1f'%0.1)(net) diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index ebb37f2..989f934 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -4,6 +4,9 @@ import numpy as np import pandas as pd +# Class construction +from abc import ABC, abstractmethod + # Preprocessing from ray.data.preprocessors import LabelEncoder, Chain from models.encoders.model_label_encoder import ModelLabelEncoder @@ -40,7 +43,7 @@ os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' warnings.filterwarnings('ignore') -class KerasTFModel(ModelsUtils): +class KerasTFModels(ModelsUtils): """ Class used to build, train and predict models using Ray with Keras Tensorflow backend @@ -70,7 +73,6 @@ class KerasTFModel(ModelsUtils): Minimum percentage of probability to effectively classify. Sequences will be classified as 'unknown' if the probability is under this threshold. 
Defaults to 80% - """ def __init__( @@ -152,7 +154,6 @@ def _label_decode(self, predict): print('_label_decode') decoded = pd.Series(np.empty(len(predict), dtype=object)) for label, encoded in self._labels_map.items(): - print(predict == encoded) decoded[predict == encoded] = label return np.array(decoded) @@ -203,14 +204,34 @@ def fit(self, datasets): training_result = self._trainer.fit() self._model_ckpt = training_result.best_checkpoints[0][0] - def predict(self, ds, threshold=0.8): + def predict(self, ds): print('predict') + # Predict with model + predictions = self._make_predictions(ds) + + # Convert predictions to labels for cross-validation of classification + predictions = self._get_abs_pred(predictions) + + # Return decoded labels + return self._label_decode(predictions) + + def predict_proba(self, ds, threshold = 0.8): + print('predict_proba') + # Predict with model + predictions = self._make_predictions(ds) + + # Convert predictions to labels with threshold for top-down classification + predictions = self._get_threshold_pred(predictions, threshold) + + # Return decoded labels + return self._label_decode(predictions) + + def _make_predictions(self, ds): if ds.count() > 0: if len(ds.schema().names) > 1: col_2_drop = [col for col in ds.schema().names if col != TENSOR_COLUMN_NAME] ds = ds.drop_columns(col_2_drop) - # Preprocess if self._scaler is not None: ds = self._scaler.transform(ds) @@ -228,21 +249,57 @@ def predict(self, ds, threshold=0.8): num_cpus_per_worker = self._nb_CPU_per_worker, num_gpus_per_worker = self._nb_GPU_per_worker ) - - # Convert predictions to labels - predictions = self._prob_2_cls(predictions, threshold) - - return self._label_decode(predictions) + return predictions else: raise ValueError('No data to predict') + + def _get_abs_pred(self, predictions): + print('_get_abs_pred') + def map_predicted_label_binary(ds): + ds = np.ravel(ds['predictions']) + lower_threshold = 0.5 + upper_threshold = 0.5 + predict = pd.DataFrame({ + 'proba': ds, + 'predicted_label': np.full(len(ds), -1) + }) + predict.loc[predict['proba'] >= upper_threshold, 'predicted_label'] = 1 + predict.loc[predict['proba'] <= lower_threshold, 'predicted_label'] = 0 + return {'predictions' : predict['predicted_label'].to_numpy(dtype = np.int32)} + + def map_predicted_label_multiclass(ds): + ds = ds['predictions'] + pred = pd.DataFrame({ + 'best_proba': [np.max(arr) for arr in ds], + 'predicted_label' : [np.argmax(arr) for arr in ds] + }) + + return {'predictions' : pred['predicted_label'].to_numpy(dtype = np.int32)} + + if self._nb_classes > 2: + print('map_predicted_label_multiclass') + fn = map_predicted_label_multiclass + else: + print('map_predicted_label_binary') + fn = map_predicted_label_binary + + predict = [] + predictions = predictions.map_batches( + lambda batch : fn(batch), + batch_format = 'numpy', + batch_size = self.batch_size + ) + for row in predictions.iter_rows(): + predict.append(row['predictions']) + + return predict - # Iterate over batches of predictions to transform probabilities to labels without mapping - def _prob_2_cls(self, predictions, threshold): - print('_prob_2_cls') + def _get_threshold_pred(self, predictions, threshold): + print('_get_threshold_pred') def map_predicted_label_binary(ds, threshold): ds = np.ravel(ds['predictions']) - lower_threshold = 0.5 #- (threshold * 0.5) - upper_threshold = 0.5 #+ (threshold * 0.5) + lower_threshold = 0.5 - (threshold * 0.5) + upper_threshold = 0.5 + (threshold * 0.5) predict = pd.DataFrame({ 'proba': ds, 'predicted_label': 
np.full(len(ds), -1) diff --git a/src/models/kerasTF/multiclass_models.py b/src/models/kerasTF/multiclass_models.py new file mode 100644 index 0000000..b422ff8 --- /dev/null +++ b/src/models/kerasTF/multiclass_models.py @@ -0,0 +1,73 @@ +import os +import gc +import warnings +import numpy as np +import pandas as pd + +# Preprocessing +from ray.data.preprocessors import LabelEncoder, Chain +from models.encoders.model_label_encoder import ModelLabelEncoder +from models.encoders.one_hot_tensor_encoder import OneHotTensorEncoder +from models.preprocessors.tfidf_transformer import TensorTfIdfTransformer + +# Parent class / models +from models.models_utils import ModelsUtils +from models.kerasTF.build_neural_networks import * + +# Training +import tensorflow as tf +from ray.air import session +# from ray.air.integrations.keras import Callback +from ray.air.config import ScalingConfig +from ray.air.integrations.keras import ReportCheckpointCallback +from ray.train.tensorflow import TensorflowTrainer, TensorflowCheckpoint + +# Tuning +from ray.air.config import RunConfig + +# Predicting +from ray.train.tensorflow import TensorflowPredictor +from ray.train.batch_predictor import BatchPredictor + +__author__ = 'Nicolas de Montigny' + +__all__ = ['KerasTFModel'] + +TENSOR_COLUMN_NAME = '__value__' +LABELS_COLUMN_NAME = 'labels' + +# Ignore warnings to have a more comprehensible output on stdout +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' +warnings.filterwarnings('ignore') + +class KerasTFMulticlassModels(ModelsUtils): + """ + Class used to build, train and predict models using Ray with Keras Tensorflow backend + + ---------- + Attributes + ---------- + + clf_file : string + Path to a file containing the trained model for this object + + nb_classes : int + Number of classes for learning + + ---------- + Methods + ---------- + + preprocess : preprocess the data before training and splitting the original dataset in case of cross-validation + + train : train a model using the given datasets + + predict : predict the classes of a dataset + ds : ray.data.Dataset + Dataset containing K-mers profiles of sequences to be classified + + threshold : float + Minimum percentage of probability to effectively classify. + Sequences will be classified as 'unknown' if the probability is under this threshold. + Defaults to 80% + """ \ No newline at end of file diff --git a/src/models/models_utils.py b/src/models/models_utils.py index 17d5c44..c665f0e 100644 --- a/src/models/models_utils.py +++ b/src/models/models_utils.py @@ -19,7 +19,7 @@ class ModelsUtils(ABC): """ - Utilities for preprocessing data and doing cross validation using ray + Abstract class for both frameworks to initialize their attributes. 
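[Editor's note] The probability-to-label mapping in `_get_abs_pred` / `_get_threshold_pred` a few hunks above can be hard to follow inside the `map_batches` plumbing. The sketch below reduces it to two plain NumPy functions: the binary branch keeps only calls far enough from the 0.5 decision point, the multiclass branch keeps the argmax only when its probability clears the threshold. The multiclass thresholding reflects my reading of the intended behaviour and is not a verbatim copy of the patch.

    import numpy as np

    def threshold_binary(probs: np.ndarray, threshold: float = 0.8) -> np.ndarray:
        # Sigmoid outputs: anything between the two bounds stays 'unknown' (-1)
        lower, upper = 0.5 - threshold * 0.5, 0.5 + threshold * 0.5
        labels = np.full(len(probs), -1, dtype=np.int32)
        labels[probs >= upper] = 1
        labels[probs <= lower] = 0
        return labels

    def threshold_multiclass(probs: np.ndarray, threshold: float = 0.8) -> np.ndarray:
        # Softmax outputs: keep the argmax only when the best probability is confident enough
        best = probs.max(axis=1)
        labels = probs.argmax(axis=1).astype(np.int32)
        labels[best < threshold] = -1
        return labels

    print(threshold_binary(np.array([0.95, 0.5, 0.02])))              # [ 1 -1  0]
    print(threshold_multiclass(np.array([[0.9, 0.1], [0.6, 0.4]])))   # [ 0 -1]
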
---------- Attributes @@ -103,7 +103,7 @@ def predict(self): """ @abstractmethod - def _prob_2_cls(self): + def _get_threshold_pred(self): """ """ @@ -116,13 +116,14 @@ def _compute_weights(self): """ Set class weights depending on their abundance in data-associated classes csv """ + weights = {} if isinstance(self._csv, tuple): cls = pd.concat([pd.read_csv(self._csv[0]),pd.read_csv(self._csv[1])], axis = 0, join = 'inner', ignore_index = True) cls = pd.read_csv(self._csv) if self.taxa == 'domain': cls.loc[cls['domain'].str.lower() == 'archaea', 'domain'] = 'Bacteria' classes = list(cls[self.taxa].unique()) - weights = compute_class_weight( + cls_weights = compute_class_weight( class_weight = 'balanced', classes = classes, y = cls[self.taxa] @@ -130,4 +131,5 @@ def _compute_weights(self): for lab, encoded in self._labels_map.items(): if lab.lower() != 'unknown': - self._weights[int(encoded)] = weights[classes.index(lab)] \ No newline at end of file + weights[int(encoded)] = cls_weights[classes.index(lab)] + return weights \ No newline at end of file diff --git a/src/models/multiclass_utils.py b/src/models/multiclass_utils.py new file mode 100644 index 0000000..ad07191 --- /dev/null +++ b/src/models/multiclass_utils.py @@ -0,0 +1,127 @@ +import os +import ray +import warnings +import numpy as np +import pandas as pd + +# Class construction +from abc import ABC, abstractmethod + +from ray.air.util.data_batch_conversion import _unwrap_ndarray_object_type_if_needed + +__author__ = 'Nicolas de Montigny' + +__all__ = ['ModelsUtils'] + +TENSOR_COLUMN_NAME = '__value__' + +class MulticlassUtils(ABC): + """ + Abstract class to provide utilities for multiclass classification models. + These methods are meant to be used when decomposing data into taxonomic groups before training one model per group + ----------------------- + Mixture-of-Experts (MoE) + ----------------------- + 1. Train each expert on their task-associated data + * Split training data into 80/20% splits + * Train/val over multiple epochs + 2. Train a gating network on the whole task + * Perceptron NN for gating + * Train on whole training ds + * Validation on simulated reads ds + * CV on test simulated reads ds + https://medium.com/@bensalemh300/harnessing-the-best-of-both-worlds-how-mixture-of-experts-meets-pyspark-for-mnist-mastery-315f82e65a0e + https://machinelearningmastery.com/mixture-of-experts/ + + 1. Cluster Data Split: Data within each cluster is divided into training and testing sets. + 2. Decision Tree Classifiers: For clusters where there’s more than one unique class in the training data, we train Decision Tree classifiers. These classifiers can distinguish between different classes within the cluster. + 3. Storing Expert Models: Trained Decision Tree models are stored in a dictionary, where each expert corresponds to a specific cluster. + 4. Performance Evaluation: The performance of each expert model is assessed by evaluating its accuracy on the corresponding test data. 
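[Editor's note] As a concrete counterpart to steps 1-4 above, here is a minimal "one expert per group" sketch using plain pandas and scikit-learn on toy data. The column names ('order', 'family'), features and the use of `DecisionTreeClassifier` are illustrative assumptions, not the pipeline's actual models; the point is only the grouping-then-routing pattern.

    import pandas as pd
    from sklearn.tree import DecisionTreeClassifier

    df = pd.DataFrame({
        'feat_a': [0.1, 0.4, 0.8, 0.3, 0.9, 0.2],
        'feat_b': [1.0, 0.7, 0.2, 0.8, 0.1, 0.9],
        'order':  ['o1', 'o1', 'o1', 'o2', 'o2', 'o2'],  # previous taxonomic level (routing key)
        'family': ['f1', 'f2', 'f1', 'f3', 'f4', 'f3'],  # current level to predict
    })

    experts = {}
    for group, sub in df.groupby('order'):               # one expert per previous-level label
        if sub['family'].nunique() > 1:                   # only train where there is something to separate
            experts[group] = DecisionTreeClassifier().fit(sub[['feat_a', 'feat_b']], sub['family'])

    # At prediction time, the previous-level label routes each sample to its expert
    sample = df.iloc[[0]]
    print(experts[sample['order'].iloc[0]].predict(sample[['feat_a', 'feat_b']]))
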
+ + Sklearn LogisticRegression : https://github.com/zermelozf/esn-lm/blob/master/esnlm/readouts/smoe.py + Keras/TF : https://abdulkaderhelwan.medium.com/mixture-of-experts-introduction-39f244a4ff05 + Keras/TF on article 2018 : https://github.com/drawbridge/keras-mmoe + Keras/TF 2018 : https://github.com/eminorhan/mixture-of-experts + Detailed example : https://mattgorb.github.io/moe + Detailed example : https://towardsdatascience.com/how-to-build-a-wide-and-deep-model-using-keras-in-tensorflow-2-0-2f7a236b5a4b + Keras example : https://keras.io/examples/nlp/text_classification_with_switch_transformer/ + Keras example : https://stackoverflow.com/questions/77551865/how-to-extend-keras-gpt2-model-moe-example + FastMoE PyTorch : https://fastmoe.ai/ + Tutel PyTorch : https://www.microsoft.com/en-us/research/blog/tutel-an-efficient-mixture-of-experts-implementation-for-large-dnn-model-training/ + """ + + def _split_dataset(self, ds, taxa, csv): + """ + Splits the dataset's taxa column into a collection of smaller datasets according to the previous taxonomic level labels + + Makes assumption that classes are order specific -> broad in csv columns + + Ray data GroupBy https://www.anyscale.com/blog/training-one-million-machine-learning-models-in-record-time-with-ray#approach-2:-using-ray-data-(grouping-data-by-key) + 1. GroupBy previous taxa + 2. Fx for model training (train_fx) + 3. ds.map_groups(train_fx) to exec the training of models in parallel + 4. Write results to file / save models + """ + ds_collection = {} + # cls = pd.read_csv(csv) + # prev_tax = list(cls.columns) + # prev_tax = prev_tax[prev_tax.index(taxa) + 1] + # unique_labs = cls[prev_tax].unique() + + + # for lab in unique_labs: + + # def map_split(ds): + # logging.getLogger("ray").info(ds[ds[prev_tax] == lab]) + # return ds[ds[prev_tax] == lab] + + # test = ds.map(map_split) + + # partial_ds = ds.map_batches(map_split, batch_format = 'pandas') + # file = '/home/nick/github/test' + # partial_ds.write_parquet(file) + # ds_collection[lab] = partial_ds + + # for k, v in ds_collection.items(): + # # print(v.to_pandas()) + # print(v) + """ + for lab in unique_labs: + ds_collection[lab] = [] + + for batch in ds.iter_batches(batch_format = 'pandas'): + labs_batch = batch[prev_tax].unique() + for lab in labs_batch: + ds_collection[lab].append(batch[batch[prev_tax] == lab]) + + for lab in unique_labs: + ds_collection[lab] = pd.concat(ds_collection[lab]) + """ + return ds_collection + + def _predictions_cv(self, predictions): + """ + Brings back together the predictions made by multiple models trained on subclasses of the original dataset + + If multiple sub-models classify a sample with same probability, use a soft voting logic to determine which one to classify to + + ---------- + Cross-validation + ---------- + * We know the classes from the previous taxa, can make each model CV on their subpart + * Metrics for CV overall per taxa ~k-fold strategy (mean / mode) + """ + + + def _predictions_classif(self, predictions): + """ + Brings back together the predictions made by multiple models trained on subclasses of the original dataset + + If multiple sub-models classify a sample with same probability, use a soft voting logic to determine which one to classify to + + ---------- + Classification + ---------- + * Since we know the previous taxa classified per sequence, we can run this specific model to classify at the current level + * See multi-stage classification + """ diff --git a/src/models/sklearn/binary_models.py 
b/src/models/sklearn/binary_models.py new file mode 100644 index 0000000..90d3679 --- /dev/null +++ b/src/models/sklearn/binary_models.py @@ -0,0 +1,216 @@ +import os +import ray +import warnings +import numpy as np +import pandas as pd + +# Preprocessing +from models.encoders.model_label_encoder import ModelLabelEncoder +from models.encoders.onesvm_label_encoder import OneClassSVMLabelEncoder +from models.preprocessors.tfidf_transformer import TensorTfIdfTransformer + +# Training +from ray.air.config import ScalingConfig +from sklearn.naive_bayes import MultinomialNB +from sklearn.linear_model import SGDClassifier +from models.sklearn.partial_trainer import SklearnPartialTrainer +from models.sklearn.scoring_one_svm import ScoringSGDOneClassSVM + +# Tuning +from ray.air.config import RunConfig + +# Predicting +from ray.train.batch_predictor import BatchPredictor +from models.sklearn.tensor_predictor import SklearnTensorPredictor +from models.sklearn.probability_predictor import SklearnTensorProbaPredictor + +# Parent class +from models.sklearn.models import SklearnModels + +TENSOR_COLUMN_NAME = '__value__' +LABELS_COLUMN_NAME = 'labels' + +__author__ = 'Nicolas de Montigny' + +__all__ = ['SklearnModel'] + +# Ignore warnings to have a more comprehensible output on stdout +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' +warnings.filterwarnings('ignore') + +class SklearnBinaryModels(SklearnModels): + """ + Class used to build, train and predict binary models using Ray with Scikit-learn backend + + ---------- + Attributes + ---------- + + clf_file : string + Path to a file containing the trained model for this object + + ---------- + Methods + ---------- + + preprocess : preprocess the data before training and splitting the original dataset in case of cross-validation + + train : train a model using the given datasets + + predict : predict the classes of a dataset + ds : ray.data.Dataset + Dataset containing K-mers profiles of sequences to be classified + + threshold : float + Minimum percentage of probability to effectively classify. + Sequences will be classified as 'unknown' if the probability is under this threshold. 
+ Defaults to 80% + """ + def __init__( + self, + classifier, + outdir_model, + batch_size, + training_epochs, + taxa, + kmers_list, + csv + ): + super().__init__( + classifier, + outdir_model, + batch_size, + training_epochs, + taxa, + kmers_list, + csv + ) + + def preprocess(self, ds, scaling = False, scaler_file = None): + print('preprocess') + if self.classifier == 'onesvm': + self._encoder = OneClassSVMLabelEncoder(self.taxa) + self._encoded = np.array([1,-1], dtype = np.int32) + labels = np.array(['Bacteria', 'Unknown'], dtype = object) + self._encoder.fit(ds) + else: + self._encoder = ModelLabelEncoder(self.taxa) + self._encoder.fit(ds) + labels = list(self._encoder.stats_[f'unique_values({self.taxa})'].keys()) + self._encoded = np.arange(len(labels)) + labels = np.append(labels, 'Unknown') + self._encoded = np.append(self._encoded, -1) + self._weights = self._compute_weights() + + if scaling: + self._scaler = TensorTfIdfTransformer(self.kmers, scaler_file) + self._scaler.fit(ds) + + # Labels mapping + for (label, encoded) in zip(labels, self._encoded): + self._labels_map[label] = encoded + + def _build(self): + print('_build') + if self.classifier == 'onesvm': + print('Training bacterial extractor with One Class SVM') + self._clf = ScoringSGDOneClassSVM() + self._train_params = { + 'nu' : 0.026441491, + 'learning_rate' : 'constant', + 'tol' : 1e-3, + 'eta0' : 0.001 + } + else : + print('Training bacterial / host classifier with SGD') + self._clf = SGDClassifier() + self._train_params = { + 'loss' : 'hinge', + 'penalty' : 'elasticnet', + 'alpha' : 141.6146176, + 'learning_rate' : 'adaptive', + 'class_weight' : self._weights, + 'eta0' : 0.001, + 'n_jobs' : -1 + } + + def fit(self, datasets): + print('_fit_model') + # Define model + self._build() + for name, ds in datasets.items(): + ds = ds.drop_columns(['id']) + ds = self._encoder.transform(ds) + if self._scaler is not None: + ds = self._scaler.transform(ds) + # Trigger the preprocessing computations before ingest in trainer + # Otherwise, it would be executed at each epoch + ds = ds.materialize() + datasets[name] = ray.put(ds) + + try: + training_labels = self._encoded.copy() + training_labels = np.delete(training_labels, np.where(training_labels == -1)) + except: + pass + + # Define trainer + self._trainer = SklearnPartialTrainer( + estimator=self._clf, + labels_list=training_labels, + features_list=self.kmers, + params=self._train_params, + datasets=datasets, + batch_size=self.batch_size, + training_epochs=self._training_epochs, + set_estimator_cpus=True, + scaling_config=ScalingConfig( + trainer_resources={ + 'CPU': int(os.cpu_count()*0.6) + } + ), + run_config=RunConfig( + name=self.classifier, + local_dir=self._workdir + ), + ) + + # Training execution + training_result = self._trainer.fit() + self._model_ckpt = training_result.checkpoint + + def predict(self, ds): + print('predict') + if ds.count() > 0: + if self._scaler is not None: + ds = self._scaler.transform(ds) + ds = ds.materialize() + predict_kwargs = {'features':self.kmers, 'num_estimator_cpus':-1} + self._predictor = BatchPredictor.from_checkpoint(self._model_ckpt, SklearnTensorPredictor) + predictions = self._predictor.predict(ds, batch_size = self.batch_size, feature_columns = [TENSOR_COLUMN_NAME], **predict_kwargs) + predictions = np.array(predictions.to_pandas()).reshape(-1) + return self._label_decode(predictions) + else: + raise ValueError('No data to predict') + + def predict_proba(self, ds, threshold = 0.8): + print('predict_proba') + return self.predict(ds) 
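+    # The batched training wrapped by SklearnPartialTrainer above can be pictured, very
+    # roughly, as repeated calls to partial_fit; a hedged, Ray-free sketch, where
+    # `batch_iter` and `classes` are illustrative placeholders rather than project APIs:
+    #
+    #     from sklearn.linear_model import SGDClassifier
+    #
+    #     def incremental_sgd(batch_iter, classes, epochs=100):
+    #         # batch_iter: callable returning a fresh iterable of (X, y) numpy batches per epoch
+    #         clf = SGDClassifier(loss='hinge', penalty='elasticnet',
+    #                             learning_rate='adaptive', eta0=0.001)
+    #         first = True
+    #         for _ in range(epochs):
+    #             for X, y in batch_iter():
+    #                 if first:
+    #                     clf.partial_fit(X, y, classes=classes)  # classes needed on the first call only
+    #                     first = False
+    #                 else:
+    #                     clf.partial_fit(X, y)
+    #         return clf
+    #
+    # The hyperparameters echo some of the SGD settings used above (alpha and class
+    # weights omitted for brevity); whether they suit a given k-mer profile is left to tuning.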
+ + def _get_threshold_pred(self, predict, nb_cls, threshold): + print('_get_threshold_pred') + def map_predicted_label(ds : pd.DataFrame): + predict = pd.DataFrame({ + 'best_proba': [max(ds.iloc[i].values) for i in range(len(ds))], + 'predicted_label': [np.argmax(ds.iloc[i].values) for i in range(len(ds))] + }) + predict.loc[predict['best_proba'] < threshold, 'predicted_label'] = -1 + return pd.DataFrame(predict['predicted_label']) + + if nb_cls == 1: + predict = np.round(abs(np.concatenate(predict.to_pandas()['predictions']))) + else: + predict = predict.map_batches(map_predicted_label, batch_format = 'pandas') + predict = np.ravel(np.array(predict.to_pandas())) + + return predict \ No newline at end of file diff --git a/src/models/sklearn/models.py b/src/models/sklearn/models.py index 1be9a15..eb379b0 100644 --- a/src/models/sklearn/models.py +++ b/src/models/sklearn/models.py @@ -1,27 +1,11 @@ import os -import ray import warnings + import numpy as np import pandas as pd -# Preprocessing -from models.encoders.model_label_encoder import ModelLabelEncoder -from models.encoders.onesvm_label_encoder import OneClassSVMLabelEncoder -from models.preprocessors.tfidf_transformer import TensorTfIdfTransformer - -# Training -from ray.air.config import ScalingConfig -from sklearn.naive_bayes import MultinomialNB -from sklearn.linear_model import SGDClassifier -from models.sklearn.partial_trainer import SklearnPartialTrainer -from models.sklearn.scoring_one_svm import ScoringSGDOneClassSVM - -# Tuning -from ray.air.config import RunConfig - -# Predicting -from ray.train.batch_predictor import BatchPredictor -from models.sklearn.tensor_predictor import SklearnTensorPredictor +# Class construction +from abc import ABC, abstractmethod # Parent class from models.models_utils import ModelsUtils @@ -37,7 +21,7 @@ os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' warnings.filterwarnings('ignore') -class SklearnModel(ModelsUtils): +class SklearnModels(ModelsUtils, ABC): """ Class used to build, train and predict models using Ray with Scikit-learn backend @@ -64,7 +48,6 @@ class SklearnModel(ModelsUtils): Minimum percentage of probability to effectively classify. Sequences will be classified as 'unknown' if the probability is under this threshold. 
Defaults to 80% - """ def __init__( self, @@ -86,155 +69,40 @@ def __init__( csv ) - def preprocess(self, ds, scaling = False, scaler_file = None): - print('preprocess') - if self.classifier == 'onesvm': - self._encoder = OneClassSVMLabelEncoder(self.taxa) - self._encoded = np.array([1,-1], dtype = np.int32) - labels = np.array(['Bacteria', 'Unknown'], dtype = object) - else: - self._encoder = ModelLabelEncoder(self.taxa) - - self._encoder.fit(ds) - - if scaling: - self._scaler = TensorTfIdfTransformer(self.kmers, scaler_file) - self._scaler.fit(ds) - - # Labels mapping - if self.classifier != 'onesvm': - labels = list(self._encoder.stats_[f'unique_values({self.taxa})'].keys()) - self._encoded = np.arange(len(labels)) - labels = np.append(labels, 'Unknown') - self._encoded = np.append(self._encoded, -1) - for (label, encoded) in zip(labels, self._encoded): - self._labels_map[label] = encoded - if self.classifier != 'onesvm': - self._compute_weights() + @abstractmethod + def preprocess(self): + """ + """ + @abstractmethod + def _build(self): + """ + """ + + @abstractmethod + def fit(self, datasets): + """ + """ + + @abstractmethod + def predict(self, ds): + """ + """ + + @abstractmethod + def predict_proba(self): + """ + """ + + @abstractmethod + def _get_threshold_pred(self): + """ + """ + def _label_decode(self, predict): print('_label_decode') decoded = pd.Series(np.empty(len(predict), dtype=object)) for label, encoded in self._labels_map.items(): decoded[predict == encoded] = label - return np.array(decoded) - - def _build(self): - print('_build') - if self.classifier == 'onesvm': - print('Training bacterial extractor with One Class SVM') - self._clf = ScoringSGDOneClassSVM() - self._train_params = { - 'nu' : 0.026441491, - 'learning_rate' : 'constant', - 'tol' : 1e-3, - 'eta0' : 0.001 - } - elif self.classifier == 'linearsvm': - print('Training bacterial / host classifier with SGD') - self._clf = SGDClassifier() - self._train_params = { - 'loss' : 'hinge', - 'penalty' : 'elasticnet', - 'alpha' : 141.6146176, - 'learning_rate' : 'adaptive', - 'class_weight' : self._weights, - 'eta0' : 0.001, - 'n_jobs' : -1 - } -# TODO: Test performances for classifiers, if need more accuracy -> sklearn.multiclass.OneVsRestClassifier for multiple binary problems - elif self.classifier == 'sgd': - print('Training multiclass SGD classifier') - self._clf = SGDClassifier() - self._train_params = { - 'alpha' : 173.5667373, - 'learning_rate' : 'optimal', - 'loss': 'modified_huber', - 'penalty' : 'l2', - 'class_weight' : self._weights, - } - elif self.classifier == 'mnb': - print('Training multiclass Multinomial Naive Bayes classifier') - self._clf = MultinomialNB() - self._train_params = { - 'alpha' : 0.243340248, - 'fit_prior' : True - } - - def fit(self, datasets): - print('_fit_model') - # Define model - self._build() - for name, ds in datasets.items(): - ds = ds.drop_columns(['id']) - ds = self._encoder.transform(ds) - if self._scaler is not None: - ds = self._scaler.transform(ds) - # Trigger the preprocessing computations before ingest in trainer - # Otherwise, it would be executed at each epoch - ds = ds.materialize() - datasets[name] = ray.put(ds) - - try: - training_labels = self._encoded.copy() - training_labels = np.delete(training_labels, np.where(training_labels == -1)) - except: - pass - - # Define trainer - self._trainer = SklearnPartialTrainer( - estimator=self._clf, - labels_list=training_labels, - features_list=self.kmers, - params=self._train_params, - datasets=datasets, - 
batch_size=self.batch_size, - training_epochs=self._training_epochs, - set_estimator_cpus=True, - scaling_config=ScalingConfig( - trainer_resources={ - 'CPU': int(os.cpu_count()*0.6) - } - ), - run_config=RunConfig( - name=self.classifier, - local_dir=self._workdir - ), - ) - - # Training execution - training_result = self._trainer.fit() - self._model_ckpt = training_result.checkpoint - - def predict(self, ds, threshold = 0.8): - print('predict') - if ds.count() > 0: - if self._scaler is not None: - ds = self._scaler.transform(ds) - ds = ds.materialize() - predict_kwargs = {'features':self.kmers, 'num_estimator_cpus':-1} - self._predictor = BatchPredictor.from_checkpoint(self._model_ckpt, SklearnTensorPredictor) - predictions = self._predictor.predict(ds, batch_size = self.batch_size, feature_columns = [TENSOR_COLUMN_NAME], **predict_kwargs) - predictions = np.array(predictions.to_pandas()).reshape(-1) - return self._label_decode(predictions) - else: - raise ValueError('No data to predict') - - def _prob_2_cls(self, predict, nb_cls, threshold): - print('_prob_2_cls') - def map_predicted_label(ds : pd.DataFrame): - predict = pd.DataFrame({ - 'best_proba': [max(ds.iloc[i].values) for i in range(len(ds))], - 'predicted_label': [np.argmax(ds.iloc[i].values) for i in range(len(ds))] - }) - predict.loc[predict['best_proba'] < threshold, 'predicted_label'] = -1 - return pd.DataFrame(predict['predicted_label']) - - if nb_cls == 1: - predict = np.round(abs(np.concatenate(predict.to_pandas()['predictions']))) - else: - predict = predict.map_batches(map_predicted_label, batch_format = 'pandas') - predict = np.ravel(np.array(predict.to_pandas())) - - return predict \ No newline at end of file + return np.array(decoded) \ No newline at end of file diff --git a/src/models/sklearn/multiclass_models.py b/src/models/sklearn/multiclass_models.py new file mode 100644 index 0000000..59926f8 --- /dev/null +++ b/src/models/sklearn/multiclass_models.py @@ -0,0 +1,240 @@ +import os +import ray +import warnings +import numpy as np +import pandas as pd + +# Preprocessing +from models.encoders.model_label_encoder import ModelLabelEncoder +from models.encoders.onesvm_label_encoder import OneClassSVMLabelEncoder +from models.preprocessors.tfidf_transformer import TensorTfIdfTransformer + +# Training +from ray.air.config import ScalingConfig +from sklearn.naive_bayes import MultinomialNB +from sklearn.linear_model import SGDClassifier +from models.sklearn.partial_trainer import SklearnPartialTrainer +from models.sklearn.scoring_one_svm import ScoringSGDOneClassSVM + +# Tuning +from ray.air.config import RunConfig + +# Predicting +from ray.train.batch_predictor import BatchPredictor +from models.sklearn.tensor_predictor import SklearnTensorPredictor +from models.sklearn.probability_predictor import SklearnTensorProbaPredictor + +# Parent classes +from models.sklearn.models import SklearnModels +from models.multiclass_utils import MulticlassUtils + +TENSOR_COLUMN_NAME = '__value__' +LABELS_COLUMN_NAME = 'labels' + +__author__ = 'Nicolas de Montigny' + +__all__ = ['SklearnModel'] + +# Ignore warnings to have a more comprehensible output on stdout +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' +warnings.filterwarnings('ignore') + +class SklearnMulticlassModels(SklearnModels, MulticlassUtils): + """ + Class used to build, train and predict multiclass models using Ray with Scikit-learn backend + + ---------- + Attributes + ---------- + + clf_file : string + Path to a file containing the trained model for this object + + 
---------- + Methods + ---------- + + preprocess : preprocess the data before training and splitting the original dataset in case of cross-validation + + train : train a model using the given datasets + + predict : predict the classes of a dataset + ds : ray.data.Dataset + Dataset containing K-mers profiles of sequences to be classified + + threshold : float + Minimum percentage of probability to effectively classify. + Sequences will be classified as 'unknown' if the probability is under this threshold. + Defaults to 80% + """ + def __init__( + self, + classifier, + outdir_model, + batch_size, + training_epochs, + taxa, + kmers_list, + csv + ): + super().__init__( + classifier, + outdir_model, + batch_size, + training_epochs, + taxa, + kmers_list, + csv + ) + self._training_collection = {} + self._encoder = {} + self._trainer = {} + self._model_ckpt = {} + self._predictor = {} + + def preprocess(self, ds, scaling = False, scaler_file = None): + print('preprocess') + + if scaling: + self._scaler = TensorTfIdfTransformer(self.kmers, scaler_file) + self._scaler.fit(ds) + + self._training_collection = self._split_dataset(ds, self.taxa, self._csv) + + for prev_taxa, ds in self._training_collection.items(): + self._encoder[prev_taxa] = ModelLabelEncoder(self.taxa) + self._encoder[prev_taxa].fit(ds) + + # Labels mapping + labels = list(self._encoder[prev_taxa].stats_[f'unique_values({self.taxa})'].keys()) + encoded = np.arange(len(labels)) + labels = np.append(labels, 'Unknown') + encoded = np.append(encoded, -1) + + self._labels_map[prev_taxa] = {} + for (label, encode) in zip(labels, encoded): + self._labels_map[prev_taxa][label] = encode + + # self._weights[prev_taxa] = self._compute_weights() + + def _build(self): + print('_build') +# TODO: Test performances for classifiers, if need more accuracy -> sklearn.multiclass.OneVsRestClassifier for multiple binary problems + # if self.classifier == 'sgd': + print('Training multiclass SGD classifier') + self._clf = SGDClassifier() + self._train_params = { + 'alpha' : 173.5667373, + 'learning_rate' : 'optimal', + 'loss': 'modified_huber', + 'penalty' : 'l2', + # 'class_weight' : self._weights, + } + # elif self.classifier == 'mnb': + # print('Training multiclass Multinomial Naive Bayes classifier') + # self._clf = MultinomialNB() + # self._train_params = { + # 'alpha' : 0.243340248, + # 'fit_prior' : True + # } + + def fit(self, datasets): + print('_fit_model') + # Define model + self._build() + training_result = {} + for prev_taxa, ds in self._training_collection.items(): + ds = ds.drop_columns(['id']) + ds = self._encoder.transform(ds) + if self._scaler is not None: + ds = self._scaler.transform(ds) + # Trigger the preprocessing computations before ingest in trainer + # Otherwise, it would be executed at each epoch + ds = ds.materialize() + datasets['train'] = ray.put(ds) + + try: + training_labels = list(self._labels_map[prev_taxa].values()) + training_labels = np.delete(training_labels, np.where(training_labels == -1)) + except: + pass + + # Define trainer + self._trainer[prev_taxa] = SklearnPartialTrainer( + estimator=self._clf, + labels_list=training_labels, + features_list=self.kmers, + params=self._train_params, + datasets=datasets, + batch_size=self.batch_size, + training_epochs=self._training_epochs, + set_estimator_cpus=True, + scaling_config=ScalingConfig( + trainer_resources={ + 'CPU': int(os.cpu_count()*0.6) + } + ), + run_config=RunConfig( + name=self.classifier, + local_dir=self._workdir + ), + ) + + # Training execution + 
training_result[prev_taxa] = self._trainer.fit() + self._model_ckpt[prev_taxa] = training_result[prev_taxa].checkpoint + + def predict(self, ds): + print('predict') + if ds.count() > 0: + if self._scaler is not None: + ds = self._scaler.transform(ds) + + ds = ds.materialize() + predict_kwargs = {'features':self.kmers, 'num_estimator_cpus':-1} + + for prev_taxa, ckpt in self._model_ckpt.items(): + self._predictor[prev_taxa] = BatchPredictor.from_checkpoint(ckpt, SklearnTensorProbaPredictor) + predictions = self._predictor[prev_taxa].predict(ds, batch_size = self.batch_size, feature_columns = [TENSOR_COLUMN_NAME], **predict_kwargs) + predictions = self._predictions_grouping(predictions) + return self._label_decode(predictions) + else: + raise ValueError('No data to predict') + + def predict_proba(self, ds, threshold = 0.8): + print('predict_proba') + print('predict') + if ds.count() > 0: + if self._scaler is not None: + ds = self._scaler.transform(ds) + ds = ds.materialize() + predict_kwargs = {'features':self.kmers, 'num_estimator_cpus':-1} + self._predictor = BatchPredictor.from_checkpoint(self._model_ckpt, SklearnTensorProbaPredictor) + predictions = self._predictor.predict(ds, batch_size = self.batch_size, feature_columns = [TENSOR_COLUMN_NAME], **predict_kwargs) + predictions = np.array(predictions.to_pandas()).reshape(-1) + return self._label_decode(predictions) + else: + raise ValueError('No data to predict') + + def _get_threshold_pred(self, predict, threshold): + print('_get_threshold_pred') + def map_predicted_label(ds : pd.DataFrame): + predict = pd.DataFrame({ + 'best_proba': [max(ds.iloc[i].values) for i in range(len(ds))], + 'predicted_label': [np.argmax(ds.iloc[i].values) for i in range(len(ds))] + }) + predict.loc[predict['best_proba'] < threshold, 'predicted_label'] = -1 + return pd.DataFrame(predict['predicted_label']) + + predict = predict.map_batches(map_predicted_label, batch_format = 'pandas') + predict = np.ravel(np.array(predict.to_pandas())) + + return predict + + def _label_decode(self, predict): + print('_label_decode') + decoded = pd.Series(np.empty(len(predict), dtype=object)) + for label, encoded in self._labels_map.items(): + decoded[predict == encoded] = label + + return np.array(decoded) \ No newline at end of file From 3b2e1506f21729a8c36f1c9d15e56d8502a9ce59 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Sat, 2 Dec 2023 19:23:51 -0500 Subject: [PATCH 52/92] sklearn multiclass mini-models + bagging strategy --- src/models/models_utils.py | 3 + src/models/multiclass_utils.py | 78 ++++---- src/models/sklearn/models.py | 5 - src/models/sklearn/multiclass_models.py | 241 +++++++++++++----------- 4 files changed, 166 insertions(+), 161 deletions(-) diff --git a/src/models/models_utils.py b/src/models/models_utils.py index c665f0e..55c1d12 100644 --- a/src/models/models_utils.py +++ b/src/models/models_utils.py @@ -73,6 +73,7 @@ def __init__( self._nb_kmers = len(kmers_list) self._training_epochs = training_epochs # Initialize empty + # TODO: remove the variable that are not required to be kept throughout the classes self._clf = None self._weights = {} self._scaler = None @@ -87,6 +88,8 @@ def __init__( self._preprocessor = None self._workdir = outdir_model + + @abstractmethod def preprocess(self, ds): """ diff --git a/src/models/multiclass_utils.py b/src/models/multiclass_utils.py index ad07191..d0db7f6 100644 --- a/src/models/multiclass_utils.py +++ b/src/models/multiclass_utils.py @@ -18,7 +18,18 @@ class MulticlassUtils(ABC): """ Abstract class to 
provide utilities for multiclass classification models. + These methods are meant to be used when decomposing data into taxonomic groups before training one model per group + + ----------------------- + Ray data GroupBy + ----------------------- + https://www.anyscale.com/blog/training-one-million-machine-learning-models-in-record-time-with-ray#approach-2:-using-ray-data-(grouping-data-by-key) + 1. GroupBy previous taxa + 2. Fx for model training (train_fx) + 3. ds.map_groups(train_fx) to exec the training of models in parallel + 4. Write results to file / save models + ----------------------- Mixture-of-Experts (MoE) ----------------------- @@ -50,54 +61,36 @@ class MulticlassUtils(ABC): Tutel PyTorch : https://www.microsoft.com/en-us/research/blog/tutel-an-efficient-mixture-of-experts-implementation-for-large-dnn-model-training/ """ - def _split_dataset(self, ds, taxa, csv): + def _get_count_previous_taxa(self, taxa, csv): """ - Splits the dataset's taxa column into a collection of smaller datasets according to the previous taxonomic level labels - - Makes assumption that classes are order specific -> broad in csv columns + Fetch the previous taxa and computes the number of classes in it - Ray data GroupBy https://www.anyscale.com/blog/training-one-million-machine-learning-models-in-record-time-with-ray#approach-2:-using-ray-data-(grouping-data-by-key) - 1. GroupBy previous taxa - 2. Fx for model training (train_fx) - 3. ds.map_groups(train_fx) to exec the training of models in parallel - 4. Write results to file / save models + Makes assumption that classes are ordered ``specific -> broad`` in csv columns + + Used to determine if the dataset should be splitted according to the previous taxonomic level labels """ - ds_collection = {} - # cls = pd.read_csv(csv) - # prev_tax = list(cls.columns) - # prev_tax = prev_tax[prev_tax.index(taxa) + 1] - # unique_labs = cls[prev_tax].unique() - - - # for lab in unique_labs: - - # def map_split(ds): - # logging.getLogger("ray").info(ds[ds[prev_tax] == lab]) - # return ds[ds[prev_tax] == lab] + prev_taxa = None + cls = pd.read_csv(csv) + cols = list(cls.columns) + prev_taxa = cols[cols.index(taxa) + 1] - # test = ds.map(map_split) + return prev_taxa, len(cls[prev_taxa].unique()) - # partial_ds = ds.map_batches(map_split, batch_format = 'pandas') - # file = '/home/nick/github/test' - # partial_ds.write_parquet(file) - # ds_collection[lab] = partial_ds - - # for k, v in ds_collection.items(): - # # print(v.to_pandas()) - # print(v) + def _prev_taxa_split_dataset(self, ds, prev_taxa): """ - for lab in unique_labs: - ds_collection[lab] = [] - - for batch in ds.iter_batches(batch_format = 'pandas'): - labs_batch = batch[prev_tax].unique() - for lab in labs_batch: - ds_collection[lab].append(batch[batch[prev_tax] == lab]) - - for lab in unique_labs: - ds_collection[lab] = pd.concat(ds_collection[lab]) + Splits the dataset's taxa column into a collection of smaller datasets according to the previous taxonomic level labels """ - return ds_collection + return ds.groupby(prev_taxa) + + def _random_split_dataset(self, ds): + """ + Assigns random numbers to a new column and group samples by it to form a collection of smaller random datasets + + Used when there is not enough labels in previous taxa for splitting according to the previous taxonomic level labels + """ + nb_clusters = int(ds.count() / 10) + ds = ds.repartition(nb_clusters).add_column('cluster', lambda df: df.index % nb_clusters) + return ds.groupby('cluster') def _predictions_cv(self, predictions): 
""" @@ -110,8 +103,8 @@ def _predictions_cv(self, predictions): ---------- * We know the classes from the previous taxa, can make each model CV on their subpart * Metrics for CV overall per taxa ~k-fold strategy (mean / mode) + TODO : WRITE THE CONCATENATION METHODS AND TEST ALL STAGES \W ds.GroupBy() """ - def _predictions_classif(self, predictions): """ @@ -124,4 +117,5 @@ def _predictions_classif(self, predictions): ---------- * Since we know the previous taxa classified per sequence, we can run this specific model to classify at the current level * See multi-stage classification + TODO : WRITE THE CONCATENATION METHODS AND TEST ALL STAGES \W ds.GroupBy() """ diff --git a/src/models/sklearn/models.py b/src/models/sklearn/models.py index eb379b0..386f684 100644 --- a/src/models/sklearn/models.py +++ b/src/models/sklearn/models.py @@ -74,11 +74,6 @@ def preprocess(self): """ """ - @abstractmethod - def _build(self): - """ - """ - @abstractmethod def fit(self, datasets): """ diff --git a/src/models/sklearn/multiclass_models.py b/src/models/sklearn/multiclass_models.py index 59926f8..7a7e634 100644 --- a/src/models/sklearn/multiclass_models.py +++ b/src/models/sklearn/multiclass_models.py @@ -10,6 +10,7 @@ from models.preprocessors.tfidf_transformer import TensorTfIdfTransformer # Training +import ray.cloudpickle as cpickle from ray.air.config import ScalingConfig from sklearn.naive_bayes import MultinomialNB from sklearn.linear_model import SGDClassifier @@ -28,6 +29,9 @@ from models.sklearn.models import SklearnModels from models.multiclass_utils import MulticlassUtils +# Data +from ray.air.util.data_batch_conversion import _unwrap_ndarray_object_type_if_needed + TENSOR_COLUMN_NAME = '__value__' LABELS_COLUMN_NAME = 'labels' @@ -99,137 +103,146 @@ def preprocess(self, ds, scaling = False, scaler_file = None): self._scaler = TensorTfIdfTransformer(self.kmers, scaler_file) self._scaler.fit(ds) - self._training_collection = self._split_dataset(ds, self.taxa, self._csv) + self._encoder = ModelLabelEncoder(self.taxa) + self._encoder.fit(ds) + + # Labels mapping + labels = list(self._encoder.stats_[f'unique_values({self.taxa})'].keys()) + encoded = np.arange(len(labels)) + labels = np.append(labels, 'Unknown') + encoded = np.append(encoded, -1) - for prev_taxa, ds in self._training_collection.items(): - self._encoder[prev_taxa] = ModelLabelEncoder(self.taxa) - self._encoder[prev_taxa].fit(ds) - - # Labels mapping - labels = list(self._encoder[prev_taxa].stats_[f'unique_values({self.taxa})'].keys()) - encoded = np.arange(len(labels)) - labels = np.append(labels, 'Unknown') - encoded = np.append(encoded, -1) - - self._labels_map[prev_taxa] = {} - for (label, encode) in zip(labels, encoded): - self._labels_map[prev_taxa][label] = encode - - # self._weights[prev_taxa] = self._compute_weights() - - def _build(self): - print('_build') -# TODO: Test performances for classifiers, if need more accuracy -> sklearn.multiclass.OneVsRestClassifier for multiple binary problems - # if self.classifier == 'sgd': - print('Training multiclass SGD classifier') - self._clf = SGDClassifier() - self._train_params = { - 'alpha' : 173.5667373, - 'learning_rate' : 'optimal', - 'loss': 'modified_huber', - 'penalty' : 'l2', - # 'class_weight' : self._weights, - } - # elif self.classifier == 'mnb': - # print('Training multiclass Multinomial Naive Bayes classifier') - # self._clf = MultinomialNB() - # self._train_params = { - # 'alpha' : 0.243340248, - # 'fit_prior' : True - # } + self._labels_map = {} + for (label, encode) 
in zip(labels, encoded): + self._labels_map[label] = encode + + # self._weights = self._compute_weights() def fit(self, datasets): - print('_fit_model') - # Define model - self._build() - training_result = {} - for prev_taxa, ds in self._training_collection.items(): - ds = ds.drop_columns(['id']) - ds = self._encoder.transform(ds) - if self._scaler is not None: - ds = self._scaler.transform(ds) - # Trigger the preprocessing computations before ingest in trainer - # Otherwise, it would be executed at each epoch - ds = ds.materialize() - datasets['train'] = ray.put(ds) - - try: - training_labels = list(self._labels_map[prev_taxa].values()) - training_labels = np.delete(training_labels, np.where(training_labels == -1)) - except: - pass - - # Define trainer - self._trainer[prev_taxa] = SklearnPartialTrainer( - estimator=self._clf, - labels_list=training_labels, - features_list=self.kmers, - params=self._train_params, - datasets=datasets, - batch_size=self.batch_size, - training_epochs=self._training_epochs, - set_estimator_cpus=True, - scaling_config=ScalingConfig( - trainer_resources={ - 'CPU': int(os.cpu_count()*0.6) - } - ), - run_config=RunConfig( - name=self.classifier, - local_dir=self._workdir - ), + print('fit') + # TODO: remove validation from datasets + # train / val on training ds, CV on test ds + ds = datasets['train'] + ds = ds.drop_columns(['id']) + ds = self._encoder.transform(ds) + if self._scaler is not None: + ds = self._scaler.transform(ds) + + # One sub-model per artificial cluster of samples + ds = self._random_split_dataset(ds) + # checkpointing directory + model_dir = os.path.join(self._workdir, self.classifier) + if not os.path.isdir(model_dir): + os.mkdir(model_dir) + + # Model-specific training functions + def build_fit_sgd(data): + X = data[TENSOR_COLUMN_NAME] + y = data[LABELS_COLUMN_NAME] + prev_label = data['cluster'][0] + model = SGDClassifier( + alpha = 173.5667373, + learning_rate = 'optimal', + loss = 'modified_huber', + penalty = 'l2', + # 'class_weight' : self._weights, + ) + model.fit(X, y) + + model_file = os.path.join(model_dir, f'{prev_label}.pkl') + + with open(model_file, "wb") as file: + cpickle.dump(model, file) + + return { + 'cluster' : [prev_label], + 'file' : [model_file] + } + + def build_fit_mnb(data): + X = data[TENSOR_COLUMN_NAME] + y = data[LABELS_COLUMN_NAME] + prev_label = data['cluster'][0] + model = SGDClassifier( + alpha = 173.5667373, + learning_rate = 'optimal', + loss = 'modified_huber', + penalty = 'l2', + # 'class_weight' : self._weights, ) + model.fit(X, y) - # Training execution - training_result[prev_taxa] = self._trainer.fit() - self._model_ckpt[prev_taxa] = training_result[prev_taxa].checkpoint + model_file = os.path.join(model_dir, f'{prev_label}.pkl') + + with open(model_file, "wb") as file: + cpickle.dump(model, file) + + return { + 'cluster' : [prev_label], + 'file' : [model_file] + } + + if self.classifier == 'sgd': + print('Training multiclass SGD classifier') + training_result = ds.map_groups(build_fit_sgd, batch_format = 'numpy') + elif self.classifier == 'mnb': + print('Training multiclass Multinomial Naive Bayes classifier') + training_result = ds.map_groups(build_fit_mnb, batch_format = 'numpy') + + training_result = training_result.to_pandas().to_dict('records') + for record in training_result: + self._model_ckpt[record['cluster']] = record['file'] def predict(self, ds): print('predict') - if ds.count() > 0: - if self._scaler is not None: - ds = self._scaler.transform(ds) - - ds = ds.materialize() - predict_kwargs 
= {'features':self.kmers, 'num_estimator_cpus':-1} - - for prev_taxa, ckpt in self._model_ckpt.items(): - self._predictor[prev_taxa] = BatchPredictor.from_checkpoint(ckpt, SklearnTensorProbaPredictor) - predictions = self._predictor[prev_taxa].predict(ds, batch_size = self.batch_size, feature_columns = [TENSOR_COLUMN_NAME], **predict_kwargs) - predictions = self._predictions_grouping(predictions) - return self._label_decode(predictions) - else: - raise ValueError('No data to predict') + probabilities = self._predict_proba(ds) + predictions = np.argmax(probabilities, axis = 1) + predictions = self._label_decode(predictions) + return predictions def predict_proba(self, ds, threshold = 0.8): print('predict_proba') - print('predict') + probabilities = self._predict_proba(ds) + predictions = self._get_threshold_pred(probabilities, threshold) + return self._label_decode(predictions) + + def _predict_proba(self, ds): if ds.count() > 0: if self._scaler is not None: ds = self._scaler.transform(ds) - ds = ds.materialize() - predict_kwargs = {'features':self.kmers, 'num_estimator_cpus':-1} - self._predictor = BatchPredictor.from_checkpoint(self._model_ckpt, SklearnTensorProbaPredictor) - predictions = self._predictor.predict(ds, batch_size = self.batch_size, feature_columns = [TENSOR_COLUMN_NAME], **predict_kwargs) - predictions = np.array(predictions.to_pandas()).reshape(-1) - return self._label_decode(predictions) - else: - raise ValueError('No data to predict') + # ds = ds.materialize() + + def predict_func(data): + X = _unwrap_ndarray_object_type_if_needed(data[TENSOR_COLUMN_NAME]) + pred = np.zeros((len(X), len(self._labels_map))) + for cluster, model_file in self._model_ckpt.items(): + with open(model_file, 'rb') as file: + model = cpickle.load(file) + proba = model.predict_proba(X) + for i, cls in enumerate(model.classes_): + pred[:, cls] += proba[:, i] + pred = pred / len(self._model_ckpt) + return {'predictions' : pred} + + probabilities = ds.map_batches(predict_func, batch_format = 'numpy') + probabilities = _unwrap_ndarray_object_type_if_needed(probabilities.to_pandas()['predictions']) + + return probabilities def _get_threshold_pred(self, predict, threshold): print('_get_threshold_pred') - def map_predicted_label(ds : pd.DataFrame): - predict = pd.DataFrame({ - 'best_proba': [max(ds.iloc[i].values) for i in range(len(ds))], - 'predicted_label': [np.argmax(ds.iloc[i].values) for i in range(len(ds))] - }) - predict.loc[predict['best_proba'] < threshold, 'predicted_label'] = -1 - return pd.DataFrame(predict['predicted_label']) - - predict = predict.map_batches(map_predicted_label, batch_format = 'pandas') - predict = np.ravel(np.array(predict.to_pandas())) + proba_predict = { + 'best_proba' : [], + 'predicted_label' : [] + } + for line in predict: + proba_predict['best_proba'].append(line[np.argmax(line)]), + proba_predict['predicted_label'].append(np.argmax(line)) + + proba_predict = pd.DataFrame(proba_predict) + proba_predict.loc[proba_predict['best_proba'] < threshold, 'predicted_label'] = -1 - return predict + return proba_predict['predicted_label'] def _label_decode(self, predict): print('_label_decode') @@ -237,4 +250,4 @@ def _label_decode(self, predict): for label, encoded in self._labels_map.items(): decoded[predict == encoded] = label - return np.array(decoded) \ No newline at end of file + return np.array(decoded) From e0809661e404b2d245f6add277b188059da1eba4 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Sat, 2 Dec 2023 19:40:54 -0500 Subject: [PATCH 53/92] debug ray 
cluster start --- src/utils.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/utils.py b/src/utils.py index 74ca704..b025361 100644 --- a/src/utils.py +++ b/src/utils.py @@ -71,12 +71,12 @@ def init_ray_cluster(workdir): nb_CPU = os.cpu_count() nb_GPU = len(list_physical_devices('GPU')) - try: - host_ip = os.environ['HOST_IP'] - except KeyError: - host_ip = '172.24.94.34' + # try: + # host_ip = os.environ['HOST_IP'] + # except KeyError: + # host_ip = '$(hostname -i)' - cmd = f'ray start --head --node-ip-address {host_ip} --port 34567 --num-cpus {nb_CPU} --num-gpus {nb_GPU} --temp-dir {workdir}' + cmd = f'ray start --head --node-ip-address $(hostname -i) --num-cpus {nb_CPU} --num-gpus {nb_GPU} --temp-dir {workdir}' os.system(cmd) ray.init() From fe0381e4b72b8532d2a6c7ec4d686e10a4d23372 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Tue, 5 Dec 2023 11:03:29 -0500 Subject: [PATCH 54/92] bigger batches for sub-models --- src/models/models_utils.py | 1 + src/models/multiclass_utils.py | 37 +++++-------------------- src/models/sklearn/multiclass_models.py | 6 ++-- 3 files changed, 12 insertions(+), 32 deletions(-) diff --git a/src/models/models_utils.py b/src/models/models_utils.py index 55c1d12..9931d55 100644 --- a/src/models/models_utils.py +++ b/src/models/models_utils.py @@ -135,4 +135,5 @@ def _compute_weights(self): for lab, encoded in self._labels_map.items(): if lab.lower() != 'unknown': weights[int(encoded)] = cls_weights[classes.index(lab)] + print(weights) return weights \ No newline at end of file diff --git a/src/models/multiclass_utils.py b/src/models/multiclass_utils.py index d0db7f6..f098ef3 100644 --- a/src/models/multiclass_utils.py +++ b/src/models/multiclass_utils.py @@ -6,6 +6,7 @@ # Class construction from abc import ABC, abstractmethod +from models.models_utils import ModelsUtils from ray.air.util.data_batch_conversion import _unwrap_ndarray_object_type_if_needed @@ -15,7 +16,7 @@ TENSOR_COLUMN_NAME = '__value__' -class MulticlassUtils(ABC): +class MulticlassUtils(ModelsUtils, ABC): """ Abstract class to provide utilities for multiclass classification models. 
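    The Ray Data group-by-key pattern this class builds on can be exercised end to end on a toy
    dataset, roughly as follows (toy column names and values; assumes a Ray 2.x installation with
    ray.data available):

        import numpy as np
        import ray

        # Toy dataset: 30 rows spread over 3 clusters (ray.init() is triggered on first use)
        ds = ray.data.from_items(
            [{'cluster': i % 3, 'x': float(i), 'label': i % 2} for i in range(30)]
        )

        def train_one_group(batch):
            # `batch` holds every row sharing one 'cluster' value (numpy batch format)
            cluster = int(batch['cluster'][0])
            # ... a sub-model would be fit and persisted on batch['x'], batch['label'] here ...
            return {'cluster': np.array([cluster]), 'n_rows': np.array([len(batch['x'])])}

        results = ds.groupby('cluster').map_groups(train_one_group, batch_format='numpy')
        print(results.to_pandas())

    Each group is handed to train_one_group as a whole, which is what allows one sub-model to be
    trained per previous-taxa label or per artificial cluster.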
@@ -76,10 +77,12 @@ def _get_count_previous_taxa(self, taxa, csv): return prev_taxa, len(cls[prev_taxa].unique()) - def _prev_taxa_split_dataset(self, ds, prev_taxa): + def _prev_taxa_split_dataset(self, ds, prev_taxa = None): """ Splits the dataset's taxa column into a collection of smaller datasets according to the previous taxonomic level labels """ + if prev_taxa is None: + prev_taxa, nb_classes = self._get_count_previous_taxa(self.taxa,self._csv) return ds.groupby(prev_taxa) def _random_split_dataset(self, ds): @@ -88,34 +91,8 @@ def _random_split_dataset(self, ds): Used when there is not enough labels in previous taxa for splitting according to the previous taxonomic level labels """ - nb_clusters = int(ds.count() / 10) + nb_clusters = int(ds.count() / 100) ds = ds.repartition(nb_clusters).add_column('cluster', lambda df: df.index % nb_clusters) return ds.groupby('cluster') - - def _predictions_cv(self, predictions): - """ - Brings back together the predictions made by multiple models trained on subclasses of the original dataset - - If multiple sub-models classify a sample with same probability, use a soft voting logic to determine which one to classify to - - ---------- - Cross-validation - ---------- - * We know the classes from the previous taxa, can make each model CV on their subpart - * Metrics for CV overall per taxa ~k-fold strategy (mean / mode) - TODO : WRITE THE CONCATENATION METHODS AND TEST ALL STAGES \W ds.GroupBy() - """ - def _predictions_classif(self, predictions): - """ - Brings back together the predictions made by multiple models trained on subclasses of the original dataset - - If multiple sub-models classify a sample with same probability, use a soft voting logic to determine which one to classify to - - ---------- - Classification - ---------- - * Since we know the previous taxa classified per sequence, we can run this specific model to classify at the current level - * See multi-stage classification - TODO : WRITE THE CONCATENATION METHODS AND TEST ALL STAGES \W ds.GroupBy() - """ + \ No newline at end of file diff --git a/src/models/sklearn/multiclass_models.py b/src/models/sklearn/multiclass_models.py index 7a7e634..e479f27 100644 --- a/src/models/sklearn/multiclass_models.py +++ b/src/models/sklearn/multiclass_models.py @@ -145,7 +145,7 @@ def build_fit_sgd(data): learning_rate = 'optimal', loss = 'modified_huber', penalty = 'l2', - # 'class_weight' : self._weights, + # class_weight = self._weights, ) model.fit(X, y) @@ -168,7 +168,7 @@ def build_fit_mnb(data): learning_rate = 'optimal', loss = 'modified_huber', penalty = 'l2', - # 'class_weight' : self._weights, + # class_weight = self._weights, ) model.fit(X, y) @@ -226,6 +226,8 @@ def predict_func(data): probabilities = ds.map_batches(predict_func, batch_format = 'numpy') probabilities = _unwrap_ndarray_object_type_if_needed(probabilities.to_pandas()['predictions']) + else: + raise ValueError('Empty dataset, cannot execute predictions!') return probabilities From 0197d6d03dcad587e4450ca35a56a852e857139b Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Tue, 5 Dec 2023 12:30:56 -0500 Subject: [PATCH 55/92] rectify ray cluster init --- src/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/utils.py b/src/utils.py index b025361..8c21dbd 100644 --- a/src/utils.py +++ b/src/utils.py @@ -76,10 +76,10 @@ def init_ray_cluster(workdir): # except KeyError: # host_ip = '$(hostname -i)' - cmd = f'ray start --head --node-ip-address $(hostname -i) --num-cpus {nb_CPU} --num-gpus 
{nb_GPU} --temp-dir {workdir}' + cmd = f'ray start --head --node-ip-address $(hostname -i) --num-cpus {nb_CPU} --num-gpus {nb_GPU} --temp-dir {workdir} --storage {workdir}' os.system(cmd) - ray.init() + ray.init(_temp_dir = str(workdir)) logging.getLogger("ray").setLevel(logging.WARNING) ray.data.DataContext.get_current().execution_options.verbose_progress = True # mem = virtual_memory().total From 1dbcd18998c3065a2348e67337e2138267aa6caa Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Tue, 5 Dec 2023 18:51:20 -0500 Subject: [PATCH 56/92] ray cluster init me + storage management --- src/utils.py | 52 +++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 43 insertions(+), 9 deletions(-) diff --git a/src/utils.py b/src/utils.py index 8c21dbd..89c74ca 100644 --- a/src/utils.py +++ b/src/utils.py @@ -1,5 +1,6 @@ import os import ray +import json import logging import numpy as np @@ -71,15 +72,48 @@ def init_ray_cluster(workdir): nb_CPU = os.cpu_count() nb_GPU = len(list_physical_devices('GPU')) - # try: - # host_ip = os.environ['HOST_IP'] - # except KeyError: - # host_ip = '$(hostname -i)' - - cmd = f'ray start --head --node-ip-address $(hostname -i) --num-cpus {nb_CPU} --num-gpus {nb_GPU} --temp-dir {workdir} --storage {workdir}' - os.system(cmd) - - ray.init(_temp_dir = str(workdir)) + mem = ray._private.utils.get_shared_memory_bytes() - 10 + + workdir='/home/nicdemon/ray/' + + if 'HOST_IP' in list(os.environ.keys()): + ray.init( + _node_ip_address = os.environ['HOST_IP'], + num_cpus = nb_CPU, + num_gpus = nb_GPU, + _temp_dir = str(workdir), + object_store_memory = mem, + _system_config={ + "object_spilling_config": json.dumps({ + "type": "filesystem", + "params": { + "directory_path": str(workdir) + }, + }) + }, + ) + # cmd = f"ray start --head --node-ip-address {os.environ['HOST_IP']} --num-cpus {nb_CPU} --num-gpus {nb_GPU} --temp-dir {workdir} --object-store-memory {mem}" + else: + ray.init( + num_cpus = nb_CPU, + num_gpus = nb_GPU, + _temp_dir = str(workdir), + object_store_memory = mem, + _system_config={ + "object_spilling_config": json.dumps({ + "type": "filesystem", + "params": { + "directory_path": str(workdir) + }, + }) + }, + ) + + # cmd = f"ray start --head --num-cpus {nb_CPU} --num-gpus {nb_GPU} --temp-dir {workdir} --object-store-memory {mem}" + + # os.system(cmd) + + # ray.init() logging.getLogger("ray").setLevel(logging.WARNING) ray.data.DataContext.get_current().execution_options.verbose_progress = True # mem = virtual_memory().total From e2cb50065a60a65ef00e80bbe07e53d6bc017a5e Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Wed, 6 Dec 2023 18:22:07 -0500 Subject: [PATCH 57/92] debug cluster + use weights for sklearn multiclass --- src/models/models_utils.py | 2 +- src/models/sklearn/multiclass_models.py | 11 ++++++----- src/utils.py | 2 -- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/src/models/models_utils.py b/src/models/models_utils.py index 9931d55..3f42d9e 100644 --- a/src/models/models_utils.py +++ b/src/models/models_utils.py @@ -135,5 +135,5 @@ def _compute_weights(self): for lab, encoded in self._labels_map.items(): if lab.lower() != 'unknown': weights[int(encoded)] = cls_weights[classes.index(lab)] - print(weights) + return weights \ No newline at end of file diff --git a/src/models/sklearn/multiclass_models.py b/src/models/sklearn/multiclass_models.py index e479f27..c9a393b 100644 --- a/src/models/sklearn/multiclass_models.py +++ b/src/models/sklearn/multiclass_models.py @@ -116,7 +116,7 @@ def preprocess(self, 
ds, scaling = False, scaler_file = None): for (label, encode) in zip(labels, encoded): self._labels_map[label] = encode - # self._weights = self._compute_weights() + self._weights = self._compute_weights() def fit(self, datasets): print('fit') @@ -145,7 +145,7 @@ def build_fit_sgd(data): learning_rate = 'optimal', loss = 'modified_huber', penalty = 'l2', - # class_weight = self._weights, + class_weight = self._weights, ) model.fit(X, y) @@ -168,7 +168,7 @@ def build_fit_mnb(data): learning_rate = 'optimal', loss = 'modified_huber', penalty = 'l2', - # class_weight = self._weights, + class_weight = self._weights, ) model.fit(X, y) @@ -221,15 +221,16 @@ def predict_func(data): proba = model.predict_proba(X) for i, cls in enumerate(model.classes_): pred[:, cls] += proba[:, i] - pred = pred / len(self._model_ckpt) + # pred = pred / len(self._model_ckpt) return {'predictions' : pred} probabilities = ds.map_batches(predict_func, batch_format = 'numpy') probabilities = _unwrap_ndarray_object_type_if_needed(probabilities.to_pandas()['predictions']) + + return probabilities else: raise ValueError('Empty dataset, cannot execute predictions!') - return probabilities def _get_threshold_pred(self, predict, threshold): print('_get_threshold_pred') diff --git a/src/utils.py b/src/utils.py index 89c74ca..3f1c5fe 100644 --- a/src/utils.py +++ b/src/utils.py @@ -74,8 +74,6 @@ def init_ray_cluster(workdir): mem = ray._private.utils.get_shared_memory_bytes() - 10 - workdir='/home/nicdemon/ray/' - if 'HOST_IP' in list(os.environ.keys()): ray.init( _node_ip_address = os.environ['HOST_IP'], From 2e35a0bdb476da6b659f440c0eac33cf40758c25 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Thu, 7 Dec 2023 17:47:25 -0500 Subject: [PATCH 58/92] rectify sgd classif --- src/models/multiclass_utils.py | 15 ++++++++++---- src/models/sklearn/multiclass_models.py | 26 +++++++++++-------------- 2 files changed, 22 insertions(+), 19 deletions(-) diff --git a/src/models/multiclass_utils.py b/src/models/multiclass_utils.py index f098ef3..dd6f362 100644 --- a/src/models/multiclass_utils.py +++ b/src/models/multiclass_utils.py @@ -3,6 +3,7 @@ import warnings import numpy as np import pandas as pd +import pyarrow as pa # Class construction from abc import ABC, abstractmethod @@ -91,8 +92,14 @@ def _random_split_dataset(self, ds): Used when there is not enough labels in previous taxa for splitting according to the previous taxonomic level labels """ + def map_clusters(batch): + clusters = np.arange(len(batch)) + batch['cluster'] = clusters + return batch + nb_clusters = int(ds.count() / 100) - ds = ds.repartition(nb_clusters).add_column('cluster', lambda df: df.index % nb_clusters) - return ds.groupby('cluster') - - \ No newline at end of file + + ds = ds.repartition(100) + ds = ds.map_batches(map_clusters, batch_size = nb_clusters, batch_format = 'pandas') + + return ds.groupby('cluster') \ No newline at end of file diff --git a/src/models/sklearn/multiclass_models.py b/src/models/sklearn/multiclass_models.py index c9a393b..1414675 100644 --- a/src/models/sklearn/multiclass_models.py +++ b/src/models/sklearn/multiclass_models.py @@ -128,10 +128,12 @@ def fit(self, datasets): if self._scaler is not None: ds = self._scaler.transform(ds) + # One sub-model per artificial cluster of samples ds = self._random_split_dataset(ds) + # checkpointing directory - model_dir = os.path.join(self._workdir, self.classifier) + model_dir = os.path.join(self._workdir, f'{self.classifier}_{self.taxa}') if not os.path.isdir(model_dir): 
os.mkdir(model_dir) @@ -139,9 +141,9 @@ def fit(self, datasets): def build_fit_sgd(data): X = data[TENSOR_COLUMN_NAME] y = data[LABELS_COLUMN_NAME] - prev_label = data['cluster'][0] + cluster = data['cluster'][0] model = SGDClassifier( - alpha = 173.5667373, + # alpha = 173.5667373, learning_rate = 'optimal', loss = 'modified_huber', penalty = 'l2', @@ -149,36 +151,30 @@ def build_fit_sgd(data): ) model.fit(X, y) - model_file = os.path.join(model_dir, f'{prev_label}.pkl') + model_file = os.path.join(model_dir, f'{cluster}.pkl') with open(model_file, "wb") as file: cpickle.dump(model, file) return { - 'cluster' : [prev_label], + 'cluster' : [cluster], 'file' : [model_file] } def build_fit_mnb(data): X = data[TENSOR_COLUMN_NAME] y = data[LABELS_COLUMN_NAME] - prev_label = data['cluster'][0] - model = SGDClassifier( - alpha = 173.5667373, - learning_rate = 'optimal', - loss = 'modified_huber', - penalty = 'l2', - class_weight = self._weights, - ) + cluster = data['cluster'][0] + model = MultinomialNB() model.fit(X, y) - model_file = os.path.join(model_dir, f'{prev_label}.pkl') + model_file = os.path.join(model_dir, f'{cluster}.pkl') with open(model_file, "wb") as file: cpickle.dump(model, file) return { - 'cluster' : [prev_label], + 'cluster' : [cluster], 'file' : [model_file] } From 5576aa89500bb386907bfc8457cdc89f87577de1 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Fri, 8 Dec 2023 15:41:59 -0500 Subject: [PATCH 59/92] sklearn calibrated classifier --- src/models/kerasTF/binary_models.py | 224 +++++++++++++++++++- src/models/kerasTF/models.py | 260 ++---------------------- src/models/kerasTF/multiclass_models.py | 218 +++++++++++++++++++- src/models/models_utils.py | 10 +- src/models/multiclass_utils.py | 33 +-- src/models/sklearn/binary_models.py | 68 ++++--- src/models/sklearn/models.py | 10 +- src/models/sklearn/multiclass_models.py | 94 ++++++--- src/utils.py | 10 +- 9 files changed, 578 insertions(+), 349 deletions(-) diff --git a/src/models/kerasTF/binary_models.py b/src/models/kerasTF/binary_models.py index bc79f35..603434b 100644 --- a/src/models/kerasTF/binary_models.py +++ b/src/models/kerasTF/binary_models.py @@ -19,6 +19,7 @@ from ray.air import session # from ray.air.integrations.keras import Callback from ray.air.config import ScalingConfig +from models.kerasTF.models import train_func, build_model from ray.air.integrations.keras import ReportCheckpointCallback from ray.train.tensorflow import TensorflowTrainer, TensorflowCheckpoint @@ -70,4 +71,225 @@ class KerasTFBinaryModels(KerasTFModels): Minimum percentage of probability to effectively classify. Sequences will be classified as 'unknown' if the probability is under this threshold. 
Defaults to 80% - """ \ No newline at end of file + """ + + def __init__( + self, + classifier, + outdir_model, + batch_size, + training_epochs, + taxa, + kmers_list, + csv + ): + super().__init__( + classifier, + outdir_model, + batch_size, + training_epochs, + taxa, + kmers_list, + csv + ) + # Parameters + # Initialize hidden + self._nb_CPU_data = int(os.cpu_count() * 0.2) + self._nb_CPU_training = int(os.cpu_count() - self._nb_CPU_data) + self._nb_GPU = len(tf.config.list_physical_devices('GPU')) + # Initialize empty + self._nb_classes = 2 + self._nb_CPU_per_worker = 0 + self._nb_GPU_per_worker = 0 + # Computing variables + if self._nb_GPU > 0: + self._use_gpu = True + self._n_workers = self._nb_GPU + self._nb_CPU_per_worker = int(self._nb_CPU_training / self._n_workers) + self._nb_GPU_per_worker = 1 + else: + self._use_gpu = False + self._n_workers = int(self._nb_CPU_training * 0.2) + self._nb_CPU_per_worker = int(int(self._nb_CPU_training * 0.8) / self._n_workers) + + if self.classifier == 'attention': + print('Training bacterial / host classifier based on Attention Weighted Neural Network') + elif self.classifier == 'lstm': + print('Training bacterial / host classifier based on Shallow LSTM Neural Network') + elif self.classifier == 'deeplstm': + print('Training bacterial / host classifier based on Deep LSTM Neural Network') + + # Data preprocessing + ######################################################################################################### + + def preprocess(self, ds, scaling = False, scaler_file = None): + print('preprocess') + # Labels encoding + self._encoder = ModelLabelEncoder(self.taxa) + self._encoder.fit(ds) + + # Labels mapping + labels = list(self._encoder.stats_[f'unique_values({self.taxa})'].keys()) + self._nb_classes = len(labels) + self._encoded = np.arange(len(labels)) + labels = np.append(labels, 'Unknown') + self._encoded = np.append(self._encoded, -1) + + for (label, encoded) in zip(labels, self._encoded): + self._labels_map[label] = encoded + + # Class weights + self._weights = self._compute_weights() + + # Scaling + if scaling: + self._scaler = TensorTfIdfTransformer(self.kmers, scaler_file) + self._scaler.fit(ds) + + # Model training + ######################################################################################################### + + def fit(self, datasets): + print('fit') + # Preprocessing loop + for name, ds in datasets.items(): + # ds = ds.drop_columns(['id']) + ds = self._encoder.transform(ds) + if self._scaler is not None: + ds = self._scaler.transform(ds) + ds = ds.materialize() + datasets[name] = ds + + # Training parameters + train_params = { + 'batch_size': self.batch_size, + 'epochs': self._training_epochs, + 'size': self._nb_kmers, + 'nb_cls': self._nb_classes, + 'model': self.classifier, + 'weights': self._weights + } + + # Define trainer / tuner + self._trainer = TensorflowTrainer( + train_loop_per_worker=train_func, + train_loop_config=train_params, + scaling_config=ScalingConfig( + trainer_resources={'CPU': self._nb_CPU_data}, + num_workers=self._n_workers, + use_gpu=self._use_gpu, + resources_per_worker={ + 'CPU': self._nb_CPU_per_worker, + 'GPU': self._nb_GPU_per_worker + } + ), + run_config=RunConfig( + name=self.classifier, + local_dir=self._workdir, + ), + datasets=datasets, + ) + + training_result = self._trainer.fit() + self._model_ckpt = training_result.best_checkpoints[0][0] + + # Model predicting + ######################################################################################################### + + def 
predict(self, ds): + print('predict') + # Predict with model + predictions = self._predict_proba(ds) + + # Convert predictions to labels + predictions = self._get_abs_pred(predictions) + + # Return decoded labels + return self._label_decode(predictions) + + def predict_proba(self, ds, threshold = 0.8): + print('predict_proba') + # Predict with model + predictions = self._predict_proba(ds) + + # Convert predictions to labels with threshold + predictions = self._get_threshold_pred(predictions, threshold) + + # Return decoded labels + return self._label_decode(predictions) + + def _predict_proba(self, ds): + if ds.count() > 0: + if len(ds.schema().names) > 1: + col_2_drop = [col for col in ds.schema().names if col != TENSOR_COLUMN_NAME] + ds = ds.drop_columns(col_2_drop) + + # Preprocess + if self._scaler is not None: + ds = self._scaler.transform(ds) + ds = ds.materialize() + + self._predictor = BatchPredictor.from_checkpoint( + self._model_ckpt, + TensorflowPredictor, + model_definition = lambda: build_model(self.classifier, self._nb_classes, self._nb_kmers) + ) + predictions = self._predictor.predict( + data = ds, + feature_columns = [TENSOR_COLUMN_NAME], + batch_size = self.batch_size, + num_cpus_per_worker = self._nb_CPU_per_worker, + num_gpus_per_worker = self._nb_GPU_per_worker + ) + return predictions + else: + raise ValueError('No data to predict') + + def _get_abs_pred(self, predictions): + print('_get_abs_pred') + def map_predicted_label(ds): + ds = np.ravel(ds['predictions']) + threshold = 0.5 + predict = pd.DataFrame({ + 'proba': ds, + 'predicted_label': np.full(len(ds), -1) + }) + predict.loc[predict['proba'] > threshold, 'predicted_label'] = 1 + predict.loc[predict['proba'] < threshold, 'predicted_label'] = 0 + return {'predictions' : predict['predicted_label'].to_numpy(dtype = np.int32)} + + predict = [] + predictions = predictions.map_batches( + lambda batch : map_predicted_label(batch), + batch_format = 'numpy', + batch_size = self.batch_size + ) + for row in predictions.iter_rows(): + predict.append(row['predictions']) + + return predict + + def _get_threshold_pred(self, predictions, threshold): + print('_get_threshold_pred') + def map_predicted_label(ds, threshold): + ds = np.ravel(ds['predictions']) + lower_threshold = 0.5 - (threshold * 0.5) + upper_threshold = 0.5 + (threshold * 0.5) + predict = pd.DataFrame({ + 'proba': ds, + 'predicted_label': np.full(len(ds), -1) + }) + predict.loc[predict['proba'] >= upper_threshold, 'predicted_label'] = 1 + predict.loc[predict['proba'] <= lower_threshold, 'predicted_label'] = 0 + return {'predictions' : predict['predicted_label'].to_numpy(dtype = np.int32)} + + predict = [] + predictions = predictions.map_batches( + lambda batch : map_predicted_label(batch, threshold), + batch_format = 'numpy', + batch_size = self.batch_size + ) + for row in predictions.iter_rows(): + predict.append(row['predictions']) + + return predict \ No newline at end of file diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index 989f934..ba1c3a4 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -43,7 +43,7 @@ os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' warnings.filterwarnings('ignore') -class KerasTFModels(ModelsUtils): +class KerasTFModels(ModelsUtils, ABC): """ Class used to build, train and predict models using Ray with Keras Tensorflow backend @@ -94,247 +94,31 @@ def __init__( kmers_list, csv ) - # Parameters - # Initialize hidden - self._nb_CPU_data = int(os.cpu_count() * 0.2) - self._nb_CPU_training = 
int(os.cpu_count() - self._nb_CPU_data) - self._nb_GPU = len(tf.config.list_physical_devices('GPU')) - # Initialize empty - self._nb_classes = None - self._nb_CPU_per_worker = 0 - self._nb_GPU_per_worker = 0 - # Computing variables - if self._nb_GPU > 0: - self._use_gpu = True - self._n_workers = self._nb_GPU - self._nb_CPU_per_worker = int(self._nb_CPU_training / self._n_workers) - self._nb_GPU_per_worker = 1 - else: - self._use_gpu = False - self._n_workers = int(self._nb_CPU_training * 0.2) - self._nb_CPU_per_worker = int(int(self._nb_CPU_training * 0.8) / self._n_workers) - - if self.classifier == 'attention': - print('Training bacterial / host classifier based on Attention Weighted Neural Network') - elif self.classifier == 'lstm': - print('Training bacterial / host classifier based on Shallow LSTM Neural Network') - elif self.classifier == 'deeplstm': - print('Training bacterial / host classifier based on Deep LSTM Neural Network') - elif self.classifier == 'lstm_attention': - print('Training multiclass classifier based on Deep Neural Network hybrid between LSTM and Attention') - elif self.classifier == 'cnn': - print('Training multiclass classifier based on CNN Neural Network') - elif self.classifier == 'widecnn': - print('Training multiclass classifier based on Wide CNN Network') - - def preprocess(self, ds, scaling = False, scaler_file = None): - print('preprocess') - self._encoder = ModelLabelEncoder(self.taxa) - if scaling: - self._scaler = TensorTfIdfTransformer(self.kmers, scaler_file) - self._scaler.fit(ds) - self._encoder.fit(ds) - labels = list(self._encoder.stats_[f'unique_values({self.taxa})'].keys()) - self._nb_classes = len(self._encoder.stats_[f'unique_values({self.taxa})']) - if self._nb_classes > 2 : - self._encoder = Chain( - self._encoder, - OneHotTensorEncoder(self.taxa) - ) - self._encoder.fit(ds) - self._encoded = np.arange(len(labels)) - labels = np.append(labels, 'Unknown') - self._encoded = np.append(self._encoded, -1) - for (label, encoded) in zip(labels, self._encoded): - self._labels_map[label] = encoded - self._compute_weights() - - def _label_decode(self, predict): - print('_label_decode') - decoded = pd.Series(np.empty(len(predict), dtype=object)) - for label, encoded in self._labels_map.items(): - decoded[predict == encoded] = label - - return np.array(decoded) - + @abstractmethod + def preprocess(self): + """ + """ + + @abstractmethod def fit(self, datasets): - print('fit') - # Preprocessing loop - for name, ds in datasets.items(): - ds = ds.drop_columns(['id']) - ds = self._encoder.transform(ds) - if self._scaler is not None: - ds = self._scaler.transform(ds) - # Trigger the preprocessing computations before ingest in trainer - # Otherwise, it would be executed at each epoch - ds = ds.materialize() - datasets[name] = ds - - # Training parameters - self._train_params = { - 'batch_size': self.batch_size, - 'epochs': self._training_epochs, - 'size': self._nb_kmers, - 'nb_cls': self._nb_classes, - 'model': self.classifier, - 'weights': self._weights - } - - # Define trainer / tuner - self._trainer = TensorflowTrainer( - train_loop_per_worker=train_func, - train_loop_config=self._train_params, - scaling_config=ScalingConfig( - trainer_resources={'CPU': self._nb_CPU_data}, - num_workers=self._n_workers, - use_gpu=self._use_gpu, - resources_per_worker={ - 'CPU': self._nb_CPU_per_worker, - 'GPU': self._nb_GPU_per_worker - } - ), - run_config=RunConfig( - name=self.classifier, - local_dir=self._workdir, - ), - datasets=datasets, - ) - - training_result = 
self._trainer.fit() - self._model_ckpt = training_result.best_checkpoints[0][0] + """ + """ + @abstractmethod def predict(self, ds): - print('predict') - # Predict with model - predictions = self._make_predictions(ds) - - # Convert predictions to labels for cross-validation of classification - predictions = self._get_abs_pred(predictions) - - # Return decoded labels - return self._label_decode(predictions) - - def predict_proba(self, ds, threshold = 0.8): - print('predict_proba') - # Predict with model - predictions = self._make_predictions(ds) - - # Convert predictions to labels with threshold for top-down classification - predictions = self._get_threshold_pred(predictions, threshold) - - # Return decoded labels - return self._label_decode(predictions) - - def _make_predictions(self, ds): - if ds.count() > 0: - if len(ds.schema().names) > 1: - col_2_drop = [col for col in ds.schema().names if col != TENSOR_COLUMN_NAME] - ds = ds.drop_columns(col_2_drop) - - # Preprocess - if self._scaler is not None: - ds = self._scaler.transform(ds) - ds = ds.materialize() - - self._predictor = BatchPredictor.from_checkpoint( - self._model_ckpt, - TensorflowPredictor, - model_definition = lambda: build_model(self.classifier, self._nb_classes, self._nb_kmers) - ) - predictions = self._predictor.predict( - data = ds, - feature_columns = [TENSOR_COLUMN_NAME], - batch_size = self.batch_size, - num_cpus_per_worker = self._nb_CPU_per_worker, - num_gpus_per_worker = self._nb_GPU_per_worker - ) - return predictions - else: - raise ValueError('No data to predict') - - def _get_abs_pred(self, predictions): - print('_get_abs_pred') - def map_predicted_label_binary(ds): - ds = np.ravel(ds['predictions']) - lower_threshold = 0.5 - upper_threshold = 0.5 - predict = pd.DataFrame({ - 'proba': ds, - 'predicted_label': np.full(len(ds), -1) - }) - predict.loc[predict['proba'] >= upper_threshold, 'predicted_label'] = 1 - predict.loc[predict['proba'] <= lower_threshold, 'predicted_label'] = 0 - return {'predictions' : predict['predicted_label'].to_numpy(dtype = np.int32)} - - def map_predicted_label_multiclass(ds): - ds = ds['predictions'] - pred = pd.DataFrame({ - 'best_proba': [np.max(arr) for arr in ds], - 'predicted_label' : [np.argmax(arr) for arr in ds] - }) - - return {'predictions' : pred['predicted_label'].to_numpy(dtype = np.int32)} - - if self._nb_classes > 2: - print('map_predicted_label_multiclass') - fn = map_predicted_label_multiclass - else: - print('map_predicted_label_binary') - fn = map_predicted_label_binary - - predict = [] - predictions = predictions.map_batches( - lambda batch : fn(batch), - batch_format = 'numpy', - batch_size = self.batch_size - ) - for row in predictions.iter_rows(): - predict.append(row['predictions']) - - return predict - - def _get_threshold_pred(self, predictions, threshold): - print('_get_threshold_pred') - def map_predicted_label_binary(ds, threshold): - ds = np.ravel(ds['predictions']) - lower_threshold = 0.5 - (threshold * 0.5) - upper_threshold = 0.5 + (threshold * 0.5) - predict = pd.DataFrame({ - 'proba': ds, - 'predicted_label': np.full(len(ds), -1) - }) - predict.loc[predict['proba'] >= upper_threshold, 'predicted_label'] = 1 - predict.loc[predict['proba'] <= lower_threshold, 'predicted_label'] = 0 - return {'predictions' : predict['predicted_label'].to_numpy(dtype = np.int32)} - - def map_predicted_label_multiclass(ds, threshold): - ds = ds['predictions'] - pred = pd.DataFrame({ - 'best_proba': [np.max(arr) for arr in ds], - 'predicted_label' : [np.argmax(arr) for arr 
in ds] - }) - pred.loc[pred['best_proba'] < threshold, 'predicted_label'] = -1 - - return {'predictions' : pred['predicted_label'].to_numpy(dtype = np.int32)} - - if self._nb_classes > 2: - print('map_predicted_label_multiclass') - fn = map_predicted_label_multiclass - else: - print('map_predicted_label_binary') - fn = map_predicted_label_binary - - predict = [] - predictions = predictions.map_batches( - lambda batch : fn(batch, threshold), - batch_format = 'numpy', - batch_size = self.batch_size - ) - for row in predictions.iter_rows(): - predict.append(row['predictions']) - - return predict + """ + """ + + @abstractmethod + def predict_proba(self): + """ + """ + + @abstractmethod + def _get_threshold_pred(self): + """ + """ # Training/building function outside of the class as mentioned on the Ray discussion diff --git a/src/models/kerasTF/multiclass_models.py b/src/models/kerasTF/multiclass_models.py index b422ff8..3b0902a 100644 --- a/src/models/kerasTF/multiclass_models.py +++ b/src/models/kerasTF/multiclass_models.py @@ -11,14 +11,16 @@ from models.preprocessors.tfidf_transformer import TensorTfIdfTransformer # Parent class / models -from models.models_utils import ModelsUtils +from models.kerasTF.models import KerasTFModels from models.kerasTF.build_neural_networks import * +from models.multiclass_utils import MulticlassUtils # Training import tensorflow as tf from ray.air import session # from ray.air.integrations.keras import Callback from ray.air.config import ScalingConfig +from models.kerasTF.models import train_func from ray.air.integrations.keras import ReportCheckpointCallback from ray.train.tensorflow import TensorflowTrainer, TensorflowCheckpoint @@ -40,7 +42,7 @@ os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' warnings.filterwarnings('ignore') -class KerasTFMulticlassModels(ModelsUtils): +class KerasTFMulticlassModels(KerasTFModels, MulticlassUtils): """ Class used to build, train and predict models using Ray with Keras Tensorflow backend @@ -70,4 +72,216 @@ class KerasTFMulticlassModels(ModelsUtils): Minimum percentage of probability to effectively classify. Sequences will be classified as 'unknown' if the probability is under this threshold. 
Defaults to 80% + """ + + def __init__( + self, + classifier, + outdir_model, + batch_size, + training_epochs, + taxa, + kmers_list, + csv + ): + super().__init__( + classifier, + outdir_model, + batch_size, + training_epochs, + taxa, + kmers_list, + csv + ) + # Parameters + # Initialize hidden + self._nb_CPU_data = int(os.cpu_count() * 0.2) + self._nb_CPU_training = int(os.cpu_count() - self._nb_CPU_data) + self._nb_GPU = len(tf.config.list_physical_devices('GPU')) + # Initialize empty + self._nb_classes = None + self._nb_CPU_per_worker = 0 + self._nb_GPU_per_worker = 0 + # Computing variables + if self._nb_GPU > 0: + self._use_gpu = True + self._n_workers = self._nb_GPU + self._nb_CPU_per_worker = int(self._nb_CPU_training / self._n_workers) + self._nb_GPU_per_worker = 1 + else: + self._use_gpu = False + self._n_workers = int(self._nb_CPU_training * 0.2) + self._nb_CPU_per_worker = int(int(self._nb_CPU_training * 0.8) / self._n_workers) + + # Data preprocessing + ######################################################################################################### + + def preprocess(self, ds, scaling = False, scaler_file = None): + print('preprocess') + # Labels encoding + self._encoder = Chain( + ModelLabelEncoder(self.taxa), + OneHotTensorEncoder(LABELS_COLUMN_NAME) + ) + self._encoder.fit(ds) + + # Labels mapping + labels = list(self._encoder.preprocessors[0].stats_[f'unique_values({self.taxa})'].keys()) + self._nb_classes = len(labels) + self._encoded = np.arange(len(labels)) + labels = np.append(labels, 'Unknown') + self._encoded = np.append(self._encoded, -1) + + for (label, encoded) in zip(labels, self._encoded): + self._labels_map[label] = encoded + + # Class weights + self._weights = self._compute_weights() + + # Scaling + if scaling: + self._scaler = TensorTfIdfTransformer(self.kmers, scaler_file) + self._scaler.fit(ds) + + # Models training + ######################################################################################################### + + def fit(self, datasets): + print('fit') + # Preprocessing loop + for name, ds in datasets.items(): + # ds = ds.drop_columns(['id']) + ds = self._encoder.transform(ds) + if self._scaler is not None: + ds = self._scaler.transform(ds) + ds = ds.materialize() + datasets[name] = ds + + # One sub-model per artificial cluster of samples + ds['train'] = self._random_split_dataset(ds['train']) + + # Checkpointing directory + model_dir = os.path.join(self._workdir, f'{self.classifier}_{self.taxa}') + if not os.path.isdir(model_dir): + os.mkdir(model_dir) + +# TODO: train_func per model +# TODO: Confirm how it works in Jupyter Notebook + # Distributed building & training + if self.classifier == 'lstm_attention': + print('Training multiclass classifier based on Deep Neural Network hybrid between LSTM and Attention') + training_result = ds.map_groups(build_fit_lstm_attention, batch_format = 'numpy') + elif self.classifier == 'cnn': + print('Training multiclass classifier based on CNN Neural Network') + training_result = ds.map_groups(build_fit_cnn, batch_format = 'numpy') + elif self.classifier == 'widecnn': + print('Training multiclass classifier based on Wide CNN Network') + training_result = ds.map_groups(build_fit_widecnn, batch_format = 'numpy') + + training_result = training_result.to_pandas().to_dict('records') + for record in training_result: + self._model_ckpt[record['cluster']] = record['file'] + + # Models predicting + ######################################################################################################### + + 
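The fit method above trains one sub-model per artificial cluster by grouping the Ray Dataset on the 'cluster' column and mapping a training function over each group, then keeps the resulting checkpoint paths in a dict. A minimal, self-contained sketch of that pattern follows; the column names ('features', 'labels'), the SGDClassifier choice and the temporary checkpoint directory are illustrative assumptions, not the project's exact code.

# Minimal sketch: one sub-model per cluster with Ray Data groupby/map_groups.
# Column names, model choice and paths are assumptions for illustration only.
import os, pickle, tempfile
import numpy as np
import ray
from sklearn.linear_model import SGDClassifier

ray.init(ignore_reinit_error=True)
model_dir = tempfile.mkdtemp()

rows = [{'features': np.random.rand(10),
         'labels': np.random.randint(0, 3),
         'cluster': i % 4} for i in range(200)]
ds = ray.data.from_items(rows)

def train_one_cluster(batch):
    # batch holds every row of a single cluster (numpy batch format)
    X = np.vstack(batch['features'])
    y = batch['labels']
    cluster = int(batch['cluster'][0])
    model = SGDClassifier(loss='modified_huber').fit(X, y)
    file = os.path.join(model_dir, f'{cluster}.pkl')
    with open(file, 'wb') as handle:
        pickle.dump(model, handle)
    # one record per cluster: where the sub-model checkpoint was written
    return {'cluster': [cluster], 'file': [file]}

results = ds.groupby('cluster').map_groups(train_one_cluster, batch_format='numpy')
model_ckpt = {r['cluster']: r['file'] for r in results.to_pandas().to_dict('records')}
print(model_ckpt)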
def predict(self, ds): + print('predict') + probabilities = self._predict_proba(ds) + predictions = np.argmax(probabilities, axis = 1) + predictions = self._label_decode(predictions) + return predictions + + def predict_proba(self, ds, threshold = 0.8): + print('predict_proba') + probabilities = self._predict_proba(ds) + predictions = self._get_threshold_pred(probabilities, threshold) + return self._label_decode(predictions) + +# TODO: Confirm how it works in Jupyter Notebook + def _predict_proba(self, ds): + print('_predict_proba') + if ds.count() > 0: + if self._scaler is not None: + ds = self._scaler.transform(ds) + # ds = ds.materialize() + + def predict_func(data): + X = _unwrap_ndarray_object_type_if_needed(data[TENSOR_COLUMN_NAME]) + pred = np.zeros((len(X), len(self._labels_map))) + for cluster, model_file in self._model_ckpt.items(): + with open(model_file, 'rb') as file: + model = cpickle.load(file) + proba = model.predict_proba(X) + for i, cls in enumerate(model.classes_): + pred[:, cls] += proba[:, i] + # pred = pred / len(self._model_ckpt) + return {'predictions' : pred} + + probabilities = ds.map_batches(predict_func, batch_format = 'numpy') + probabilities = _unwrap_ndarray_object_type_if_needed(probabilities.to_pandas()['predictions']) + + return probabilities + else: + raise ValueError('Empty dataset, cannot execute predictions!') + + def _get_abs_pred(self, predictions): + print('_get_abs_pred') + def map_predicted_label(ds): + ds = ds['predictions'] + pred = pd.DataFrame({ + 'best_proba': [np.max(arr) for arr in ds], + 'predicted_label' : [np.argmax(arr) for arr in ds] + }) + + return {'predictions' : pred['predicted_label'].to_numpy(dtype = np.int32)} + + predict = [] + predictions = predictions.map_batches( + lambda batch : map_predicted_label(batch), + batch_format = 'numpy', + batch_size = self.batch_size + ) + for row in predictions.iter_rows(): + predict.append(row['predictions']) + + return predict + + def _get_threshold_pred(self, predictions, threshold): + print('_get_threshold_pred') + def map_predicted_label(ds, threshold): + ds = ds['predictions'] + pred = pd.DataFrame({ + 'best_proba': [np.max(arr) for arr in ds], + 'predicted_label' : [np.argmax(arr) for arr in ds] + }) + pred.loc[pred['best_proba'] < threshold, 'predicted_label'] = -1 + + return {'predictions' : pred['predicted_label'].to_numpy(dtype = np.int32)} + + predict = [] + predictions = predictions.map_batches( + lambda batch : map_predicted_label(batch, threshold), + batch_format = 'numpy', + batch_size = self.batch_size + ) + for row in predictions.iter_rows(): + predict.append(row['predictions']) + + return predict + +# TODO: Confirm how it works in Jupyter Notebook +def build_fit_lstm_attention(data): + """ + LSTM-Attention NN training function + """ + +def build_fit_cnn(data): + """ + Convolution NN training function + """ + +def build_fit_widecnn(data): + """ + Wide Convolution NN training function """ \ No newline at end of file diff --git a/src/models/models_utils.py b/src/models/models_utils.py index 3f42d9e..08d7c79 100644 --- a/src/models/models_utils.py +++ b/src/models/models_utils.py @@ -136,4 +136,12 @@ def _compute_weights(self): if lab.lower() != 'unknown': weights[int(encoded)] = cls_weights[classes.index(lab)] - return weights \ No newline at end of file + return weights + + def _label_decode(self, predict): + print('_label_decode') + decoded = pd.Series(np.empty(len(predict), dtype=object)) + for label, encoded in self._labels_map.items(): + decoded[predict == encoded] = label 
+ + return np.array(decoded) \ No newline at end of file diff --git a/src/models/multiclass_utils.py b/src/models/multiclass_utils.py index dd6f362..a142ca3 100644 --- a/src/models/multiclass_utils.py +++ b/src/models/multiclass_utils.py @@ -24,43 +24,14 @@ class MulticlassUtils(ModelsUtils, ABC): These methods are meant to be used when decomposing data into taxonomic groups before training one model per group ----------------------- - Ray data GroupBy + Ray data GroupBy + Bagging meta-estimator ----------------------- https://www.anyscale.com/blog/training-one-million-machine-learning-models-in-record-time-with-ray#approach-2:-using-ray-data-(grouping-data-by-key) + https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html#sklearn.ensemble.BaggingClassifier 1. GroupBy previous taxa 2. Fx for model training (train_fx) 3. ds.map_groups(train_fx) to exec the training of models in parallel 4. Write results to file / save models - - ----------------------- - Mixture-of-Experts (MoE) - ----------------------- - 1. Train each expert on their task-associated data - * Split training data into 80/20% splits - * Train/val over multiple epochs - 2. Train a gating network on the whole task - * Perceptron NN for gating - * Train on whole training ds - * Validation on simulated reads ds - * CV on test simulated reads ds - https://medium.com/@bensalemh300/harnessing-the-best-of-both-worlds-how-mixture-of-experts-meets-pyspark-for-mnist-mastery-315f82e65a0e - https://machinelearningmastery.com/mixture-of-experts/ - - 1. Cluster Data Split: Data within each cluster is divided into training and testing sets. - 2. Decision Tree Classifiers: For clusters where there’s more than one unique class in the training data, we train Decision Tree classifiers. These classifiers can distinguish between different classes within the cluster. - 3. Storing Expert Models: Trained Decision Tree models are stored in a dictionary, where each expert corresponds to a specific cluster. - 4. Performance Evaluation: The performance of each expert model is assessed by evaluating its accuracy on the corresponding test data. 
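At prediction time the per-cluster checkpoints collected by fit are combined by summing each sub-model's class probabilities, in the spirit of the bagging/soft-voting meta-estimator referenced in this docstring. A rough sketch of that aggregation, assuming a model_ckpt dict like the one above and estimators exposing predict_proba with a classes_ attribute (pickle is used here instead of cloudpickle):

# Sketch of soft-voting over per-cluster sub-models (assumes model_ckpt as above).
import pickle
import numpy as np

def ensemble_predict(X, model_ckpt, nb_classes, threshold=0.8):
    pred = np.zeros((len(X), nb_classes))
    for cluster, model_file in model_ckpt.items():
        with open(model_file, 'rb') as handle:
            model = pickle.load(handle)
        proba = model.predict_proba(X)
        # accumulate probabilities in the right columns; a sub-model may not
        # have seen every class, hence the mapping through model.classes_
        for i, cls in enumerate(model.classes_):
            pred[:, cls] += proba[:, i]
    pred /= len(model_ckpt)
    labels = np.argmax(pred, axis=1)
    # below the confidence threshold, fall back to the 'unknown' label (-1)
    labels[np.max(pred, axis=1) < threshold] = -1
    return labels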
- - Sklearn LogisticRegression : https://github.com/zermelozf/esn-lm/blob/master/esnlm/readouts/smoe.py - Keras/TF : https://abdulkaderhelwan.medium.com/mixture-of-experts-introduction-39f244a4ff05 - Keras/TF on article 2018 : https://github.com/drawbridge/keras-mmoe - Keras/TF 2018 : https://github.com/eminorhan/mixture-of-experts - Detailed example : https://mattgorb.github.io/moe - Detailed example : https://towardsdatascience.com/how-to-build-a-wide-and-deep-model-using-keras-in-tensorflow-2-0-2f7a236b5a4b - Keras example : https://keras.io/examples/nlp/text_classification_with_switch_transformer/ - Keras example : https://stackoverflow.com/questions/77551865/how-to-extend-keras-gpt2-model-moe-example - FastMoE PyTorch : https://fastmoe.ai/ - Tutel PyTorch : https://www.microsoft.com/en-us/research/blog/tutel-an-efficient-mixture-of-experts-implementation-for-large-dnn-model-training/ """ def _get_count_previous_taxa(self, taxa, csv): diff --git a/src/models/sklearn/binary_models.py b/src/models/sklearn/binary_models.py index 90d3679..61d0656 100644 --- a/src/models/sklearn/binary_models.py +++ b/src/models/sklearn/binary_models.py @@ -86,8 +86,12 @@ def __init__( csv ) + # Data preprocessing + ######################################################################################################### + def preprocess(self, ds, scaling = False, scaler_file = None): print('preprocess') + # Labels encoding + mapping if self.classifier == 'onesvm': self._encoder = OneClassSVMLabelEncoder(self.taxa) self._encoded = np.array([1,-1], dtype = np.int32) @@ -100,52 +104,31 @@ def preprocess(self, ds, scaling = False, scaler_file = None): self._encoded = np.arange(len(labels)) labels = np.append(labels, 'Unknown') self._encoded = np.append(self._encoded, -1) + # Class weights self._weights = self._compute_weights() + # Labels mapping + for (label, encoded) in zip(labels, self._encoded): + self._labels_map[label] = encoded + + # Scaling if scaling: self._scaler = TensorTfIdfTransformer(self.kmers, scaler_file) self._scaler.fit(ds) - # Labels mapping - for (label, encoded) in zip(labels, self._encoded): - self._labels_map[label] = encoded - def _build(self): - print('_build') - if self.classifier == 'onesvm': - print('Training bacterial extractor with One Class SVM') - self._clf = ScoringSGDOneClassSVM() - self._train_params = { - 'nu' : 0.026441491, - 'learning_rate' : 'constant', - 'tol' : 1e-3, - 'eta0' : 0.001 - } - else : - print('Training bacterial / host classifier with SGD') - self._clf = SGDClassifier() - self._train_params = { - 'loss' : 'hinge', - 'penalty' : 'elasticnet', - 'alpha' : 141.6146176, - 'learning_rate' : 'adaptive', - 'class_weight' : self._weights, - 'eta0' : 0.001, - 'n_jobs' : -1 - } - + # Model training + ######################################################################################################### + def fit(self, datasets): print('_fit_model') # Define model self._build() for name, ds in datasets.items(): - ds = ds.drop_columns(['id']) + # ds = ds.drop_columns(['id']) ds = self._encoder.transform(ds) if self._scaler is not None: ds = self._scaler.transform(ds) - # Trigger the preprocessing computations before ingest in trainer - # Otherwise, it would be executed at each epoch - ds = ds.materialize() datasets[name] = ray.put(ds) try: @@ -179,6 +162,28 @@ def fit(self, datasets): training_result = self._trainer.fit() self._model_ckpt = training_result.checkpoint + def _build(self): + print('_build') + if self.classifier == 'onesvm': + print('Training bacterial 
extractor with One Class SVM') + self._clf = ScoringSGDOneClassSVM() + self._train_params = { + 'learning_rate' : 'optimal' + } + else : + print('Training bacterial / host classifier with SGD') + self._clf = SGDClassifier() + self._train_params = { + 'loss' : 'hinge', + 'penalty' : 'elasticnet', + 'learning_rate' : 'optimal', + 'class_weight' : self._weights, + 'n_jobs' : -1 + } + + # Model predicting + ######################################################################################################### + def predict(self, ds): print('predict') if ds.count() > 0: @@ -195,6 +200,7 @@ def predict(self, ds): def predict_proba(self, ds, threshold = 0.8): print('predict_proba') + # No predict_proba methods implemented for these models return self.predict(ds) def _get_threshold_pred(self, predict, nb_cls, threshold): diff --git a/src/models/sklearn/models.py b/src/models/sklearn/models.py index 386f684..3cba9c3 100644 --- a/src/models/sklearn/models.py +++ b/src/models/sklearn/models.py @@ -92,12 +92,4 @@ def predict_proba(self): @abstractmethod def _get_threshold_pred(self): """ - """ - - def _label_decode(self, predict): - print('_label_decode') - decoded = pd.Series(np.empty(len(predict), dtype=object)) - for label, encoded in self._labels_map.items(): - decoded[predict == encoded] = label - - return np.array(decoded) \ No newline at end of file + """ \ No newline at end of file diff --git a/src/models/sklearn/multiclass_models.py b/src/models/sklearn/multiclass_models.py index 1414675..159c672 100644 --- a/src/models/sklearn/multiclass_models.py +++ b/src/models/sklearn/multiclass_models.py @@ -14,6 +14,7 @@ from ray.air.config import ScalingConfig from sklearn.naive_bayes import MultinomialNB from sklearn.linear_model import SGDClassifier +from sklearn.calibration import CalibratedClassifierCV from models.sklearn.partial_trainer import SklearnPartialTrainer from models.sklearn.scoring_one_svm import ScoringSGDOneClassSVM @@ -96,13 +97,12 @@ def __init__( self._model_ckpt = {} self._predictor = {} + # Data preprocessing + ######################################################################################################### + def preprocess(self, ds, scaling = False, scaler_file = None): print('preprocess') - - if scaling: - self._scaler = TensorTfIdfTransformer(self.kmers, scaler_file) - self._scaler.fit(ds) - + # Labels encoding self._encoder = ModelLabelEncoder(self.taxa) self._encoder.fit(ds) @@ -112,49 +112,70 @@ def preprocess(self, ds, scaling = False, scaler_file = None): labels = np.append(labels, 'Unknown') encoded = np.append(encoded, -1) - self._labels_map = {} for (label, encode) in zip(labels, encoded): self._labels_map[label] = encode + # Class weights self._weights = self._compute_weights() + + # Scaling + if scaling: + self._scaler = TensorTfIdfTransformer(self.kmers, scaler_file) + self._scaler.fit(ds) + + # Models training + ######################################################################################################### def fit(self, datasets): print('fit') - # TODO: remove validation from datasets - # train / val on training ds, CV on test ds - ds = datasets['train'] - ds = ds.drop_columns(['id']) - ds = self._encoder.transform(ds) - if self._scaler is not None: - ds = self._scaler.transform(ds) - + for name, ds in datasets.items(): + # ds = ds.drop_columns(['id']) + ds = self._encoder.transform(ds) + if self._scaler is not None: + ds = self._scaler.transform(ds) + datasets[name] = ds # One sub-model per artificial cluster of samples - ds = 
self._random_split_dataset(ds) + ds_train = self._random_split_dataset(datasets['train']) + ds_val = datasets['validation'] - # checkpointing directory + # Checkpointing directory model_dir = os.path.join(self._workdir, f'{self.classifier}_{self.taxa}') if not os.path.isdir(model_dir): os.mkdir(model_dir) # Model-specific training functions def build_fit_sgd(data): - X = data[TENSOR_COLUMN_NAME] - y = data[LABELS_COLUMN_NAME] + # Training data + X_train = _unwrap_ndarray_object_type_if_needed(data[TENSOR_COLUMN_NAME]) + y_train = np.array(data[LABELS_COLUMN_NAME]) + # Validation data + X_val = ds_val.to_pandas()[TENSOR_COLUMN_NAME] + y_val = ds_val.to_pandas()[LABELS_COLUMN_NAME] + msk_val = y_val.isin(np.unique(y_train)) + X_val = _unwrap_ndarray_object_type_if_needed(X_val[msk_val]) + y_val = np.array(y_val[msk_val]) cluster = data['cluster'][0] model = SGDClassifier( - # alpha = 173.5667373, learning_rate = 'optimal', loss = 'modified_huber', penalty = 'l2', class_weight = self._weights, ) - model.fit(X, y) + model.fit(X_train, y_train) + + calibrator = CalibratedClassifierCV( + estimator = model, + method = 'isotonic', + cv = 'prefit', + ) + + calibrator.fit(X_val,y_val) model_file = os.path.join(model_dir, f'{cluster}.pkl') with open(model_file, "wb") as file: - cpickle.dump(model, file) + cpickle.dump(calibrator, file) return { 'cluster' : [cluster], @@ -162,16 +183,31 @@ def build_fit_sgd(data): } def build_fit_mnb(data): - X = data[TENSOR_COLUMN_NAME] - y = data[LABELS_COLUMN_NAME] + # Training data + X_train = _unwrap_ndarray_object_type_if_needed(data[TENSOR_COLUMN_NAME]) + y_train = np.array(data[LABELS_COLUMN_NAME]) + # Validation data + X_val = ds_val.to_pandas()[TENSOR_COLUMN_NAME] + y_val = ds_val.to_pandas()[LABELS_COLUMN_NAME] + msk_val = y_val.isin(np.unique(y_train)) + X_val = _unwrap_ndarray_object_type_if_needed(X_val[msk_val]) + y_val = np.array(y_val[msk_val]) cluster = data['cluster'][0] model = MultinomialNB() - model.fit(X, y) + model.fit(X_train, y_train) model_file = os.path.join(model_dir, f'{cluster}.pkl') + calibrator = CalibratedClassifierCV( + estimator = model, + method = 'isotonic', + cv = 'prefit', + ) + + calibrator.fit(X_val,y_val) + with open(model_file, "wb") as file: - cpickle.dump(model, file) + cpickle.dump(calibrator, file) return { 'cluster' : [cluster], @@ -180,15 +216,18 @@ def build_fit_mnb(data): if self.classifier == 'sgd': print('Training multiclass SGD classifier') - training_result = ds.map_groups(build_fit_sgd, batch_format = 'numpy') + training_result = ds_train.map_groups(build_fit_sgd, batch_format = 'numpy') elif self.classifier == 'mnb': print('Training multiclass Multinomial Naive Bayes classifier') - training_result = ds.map_groups(build_fit_mnb, batch_format = 'numpy') + training_result = ds_train.map_groups(build_fit_mnb, batch_format = 'numpy') training_result = training_result.to_pandas().to_dict('records') for record in training_result: self._model_ckpt[record['cluster']] = record['file'] + # Models predicting + ######################################################################################################### + def predict(self, ds): print('predict') probabilities = self._predict_proba(ds) @@ -227,7 +266,6 @@ def predict_func(data): else: raise ValueError('Empty dataset, cannot execute predictions!') - def _get_threshold_pred(self, predict, threshold): print('_get_threshold_pred') proba_predict = { diff --git a/src/utils.py b/src/utils.py index 3f1c5fe..4f48bfa 100644 --- a/src/utils.py +++ b/src/utils.py @@ -65,9 
+65,9 @@ def init_ray_cluster(workdir): """ 1. Get physical material available Number of available CPUs and GPUs - 2. Get host IP from OS + 2. Get host IP from OS if available Defaults to 172.24.94.34 - 3. Start the ray cluster at OS level + 3. Start the ray cluster with parameters """ nb_CPU = os.cpu_count() nb_GPU = len(list_physical_devices('GPU')) @@ -90,7 +90,6 @@ def init_ray_cluster(workdir): }) }, ) - # cmd = f"ray start --head --node-ip-address {os.environ['HOST_IP']} --num-cpus {nb_CPU} --num-gpus {nb_GPU} --temp-dir {workdir} --object-store-memory {mem}" else: ray.init( num_cpus = nb_CPU, @@ -107,11 +106,6 @@ def init_ray_cluster(workdir): }, ) - # cmd = f"ray start --head --num-cpus {nb_CPU} --num-gpus {nb_GPU} --temp-dir {workdir} --object-store-memory {mem}" - - # os.system(cmd) - - # ray.init() logging.getLogger("ray").setLevel(logging.WARNING) ray.data.DataContext.get_current().execution_options.verbose_progress = True # mem = virtual_memory().total From 7b9fddf2c472fef5ac8c3d961f176dc1b8ae495f Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Fri, 8 Dec 2023 17:17:06 -0500 Subject: [PATCH 60/92] val ds to pandas outside of training func --- src/models/kerasTF/build_neural_networks.py | 6 ++-- src/models/kerasTF/multiclass_models.py | 11 +++---- src/models/sklearn/multiclass_models.py | 32 ++++++++++----------- 3 files changed, 25 insertions(+), 24 deletions(-) diff --git a/src/models/kerasTF/build_neural_networks.py b/src/models/kerasTF/build_neural_networks.py index 8294110..cdcf08f 100644 --- a/src/models/kerasTF/build_neural_networks.py +++ b/src/models/kerasTF/build_neural_networks.py @@ -108,7 +108,7 @@ def build_LSTM_attention(nb_features, nb_classes): net = Dense(nb_classes)(net) outputs = Activation('softmax')(net) model = Model(inputs = inputs, outputs = outputs) - model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'], jit_compile = True) + model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'], jit_compile = True) return model @@ -134,7 +134,7 @@ def build_CNN(nb_features, nb_classes): model.add(Dropout(0.5)) model.add(Dense(nb_classes)) model.add(Activation('softmax')) - model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'], jit_compile = True) + model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'], jit_compile = True) return model @@ -172,6 +172,6 @@ def build_wideCNN(nb_features, nb_classes): net = Dense(nb_classes)(net) outputs = Activation('softmax')(net) model = Model(inputs = inputs, outputs = outputs) - model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'], jit_compile = True) + model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'], jit_compile = True) return model diff --git a/src/models/kerasTF/multiclass_models.py b/src/models/kerasTF/multiclass_models.py index 3b0902a..a5c48a0 100644 --- a/src/models/kerasTF/multiclass_models.py +++ b/src/models/kerasTF/multiclass_models.py @@ -119,14 +119,15 @@ def __init__( def preprocess(self, ds, scaling = False, scaler_file = None): print('preprocess') # Labels encoding - self._encoder = Chain( - ModelLabelEncoder(self.taxa), - OneHotTensorEncoder(LABELS_COLUMN_NAME) - ) + # self._encoder = Chain( + # ModelLabelEncoder(self.taxa), + # OneHotTensorEncoder(LABELS_COLUMN_NAME) + # ) + self._encoder = ModelLabelEncoder(self.taxa) self._encoder.fit(ds) # Labels mapping - labels = 
list(self._encoder.preprocessors[0].stats_[f'unique_values({self.taxa})'].keys()) + labels = list(self._encoder.stats_[f'unique_values({self.taxa})'].keys()) self._nb_classes = len(labels) self._encoded = np.arange(len(labels)) labels = np.append(labels, 'Unknown') diff --git a/src/models/sklearn/multiclass_models.py b/src/models/sklearn/multiclass_models.py index 159c672..87e9575 100644 --- a/src/models/sklearn/multiclass_models.py +++ b/src/models/sklearn/multiclass_models.py @@ -136,8 +136,8 @@ def fit(self, datasets): datasets[name] = ds # One sub-model per artificial cluster of samples - ds_train = self._random_split_dataset(datasets['train']) - ds_val = datasets['validation'] + train_ds = self._random_split_dataset(datasets['train']) + val_ds = datasets['validation'].to_pandas() # Checkpointing directory model_dir = os.path.join(self._workdir, f'{self.classifier}_{self.taxa}') @@ -145,17 +145,17 @@ def fit(self, datasets): os.mkdir(model_dir) # Model-specific training functions - def build_fit_sgd(data): + def build_fit_sgd(train_data, val_data): # Training data - X_train = _unwrap_ndarray_object_type_if_needed(data[TENSOR_COLUMN_NAME]) - y_train = np.array(data[LABELS_COLUMN_NAME]) + X_train = _unwrap_ndarray_object_type_if_needed(train_data[TENSOR_COLUMN_NAME]) + y_train = np.array(train_data[LABELS_COLUMN_NAME]) # Validation data - X_val = ds_val.to_pandas()[TENSOR_COLUMN_NAME] - y_val = ds_val.to_pandas()[LABELS_COLUMN_NAME] + X_val = val_data[TENSOR_COLUMN_NAME] + y_val = val_data[LABELS_COLUMN_NAME] msk_val = y_val.isin(np.unique(y_train)) X_val = _unwrap_ndarray_object_type_if_needed(X_val[msk_val]) y_val = np.array(y_val[msk_val]) - cluster = data['cluster'][0] + cluster = train_data['cluster'][0] model = SGDClassifier( learning_rate = 'optimal', loss = 'modified_huber', @@ -182,17 +182,17 @@ def build_fit_sgd(data): 'file' : [model_file] } - def build_fit_mnb(data): + def build_fit_mnb(train_data, val_data): # Training data - X_train = _unwrap_ndarray_object_type_if_needed(data[TENSOR_COLUMN_NAME]) - y_train = np.array(data[LABELS_COLUMN_NAME]) + X_train = _unwrap_ndarray_object_type_if_needed(train_data[TENSOR_COLUMN_NAME]) + y_train = np.array(train_data[LABELS_COLUMN_NAME]) # Validation data - X_val = ds_val.to_pandas()[TENSOR_COLUMN_NAME] - y_val = ds_val.to_pandas()[LABELS_COLUMN_NAME] + X_val = val_data[TENSOR_COLUMN_NAME] + y_val = val_data[LABELS_COLUMN_NAME] msk_val = y_val.isin(np.unique(y_train)) X_val = _unwrap_ndarray_object_type_if_needed(X_val[msk_val]) y_val = np.array(y_val[msk_val]) - cluster = data['cluster'][0] + cluster = train_data['cluster'][0] model = MultinomialNB() model.fit(X_train, y_train) @@ -216,10 +216,10 @@ def build_fit_mnb(data): if self.classifier == 'sgd': print('Training multiclass SGD classifier') - training_result = ds_train.map_groups(build_fit_sgd, batch_format = 'numpy') + training_result = train_ds.map_groups(lambda ds: build_fit_sgd(ds, val_ds), batch_format = 'numpy') elif self.classifier == 'mnb': print('Training multiclass Multinomial Naive Bayes classifier') - training_result = ds_train.map_groups(build_fit_mnb, batch_format = 'numpy') + training_result = train_ds.map_groups(lambda ds: build_fit_mnb(ds, val_ds), batch_format = 'numpy') training_result = training_result.to_pandas().to_dict('records') for record in training_result: From e3cc5c068d56d55ea0b30ba051c2a8130271b362 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Sat, 9 Dec 2023 09:08:30 -0500 Subject: [PATCH 61/92] sklearn remove clibrated classifier --- 
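The calibration step introduced above (and reverted in this commit) wraps an already-fitted estimator with CalibratedClassifierCV(cv='prefit') so that a held-out validation split maps decision scores to probabilities. A minimal standalone illustration of that pattern on synthetic data; the split sizes and model settings are assumptions, and the estimator= keyword follows the diff (scikit-learn 1.2+ naming):

# Standalone sketch of prefit calibration on a held-out validation split.
import numpy as np
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split

X = np.random.rand(500, 20)
y = np.random.randint(0, 3, 500)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)

model = SGDClassifier(loss='modified_huber', learning_rate='optimal')
model.fit(X_train, y_train)

# keep only validation samples whose class the model has actually seen,
# mirroring the masking done in fit() above
mask = np.isin(y_val, np.unique(y_train))
calibrator = CalibratedClassifierCV(estimator=model, method='isotonic', cv='prefit')
calibrator.fit(X_val[mask], y_val[mask])

print(calibrator.predict_proba(X_val[:5]))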
src/models/classification.py | 9 +- src/models/kerasTF/binary_models.py | 4 - src/models/kerasTF/build_neural_networks.py | 5 +- src/models/kerasTF/multiclass_models.py | 101 ++++++++++++++++++-- src/models/models_utils.py | 3 +- src/models/multiclass_utils.py | 4 +- src/models/sklearn/multiclass_models.py | 73 +++++++------- 7 files changed, 144 insertions(+), 55 deletions(-) diff --git a/src/models/classification.py b/src/models/classification.py index 3c62ddf..b1f76e6 100644 --- a/src/models/classification.py +++ b/src/models/classification.py @@ -7,11 +7,10 @@ from warnings import warn from typing import Dict, List -from models.kerasTF.models import KerasTFModels from models.sklearn.binary_models import SklearnBinaryModels -# from models.kerasTF.binary_models import KerasTFBinaryModels +from models.kerasTF.binary_models import KerasTFBinaryModels from models.sklearn.multiclass_models import SklearnMulticlassModels -# from models.kerasTF.multiclass_models import KerasTFMulticlassModels +from models.kerasTF.multiclass_models import KerasTFMulticlassModels # CV metrics from sklearn.metrics import precision_recall_fscore_support @@ -204,7 +203,7 @@ def _binary_training(self, datasets, taxa, file): self._database_data['csv'] ) else: - model = KerasTFModels( + model = KerasTFBinaryModels( self._classifier_binary, self._outdirs['models_dir'], self._batch_size, @@ -235,7 +234,7 @@ def _multiclass_training(self, datasets, taxa, file): self._database_data['csv'] ) else: - model = KerasTFModels( + model = KerasTFMulticlassModels( self._classifier_multiclass, self._outdirs['models_dir'], self._batch_size, diff --git a/src/models/kerasTF/binary_models.py b/src/models/kerasTF/binary_models.py index 603434b..0b76a69 100644 --- a/src/models/kerasTF/binary_models.py +++ b/src/models/kerasTF/binary_models.py @@ -200,10 +200,8 @@ def predict(self, ds): print('predict') # Predict with model predictions = self._predict_proba(ds) - # Convert predictions to labels predictions = self._get_abs_pred(predictions) - # Return decoded labels return self._label_decode(predictions) @@ -211,10 +209,8 @@ def predict_proba(self, ds, threshold = 0.8): print('predict_proba') # Predict with model predictions = self._predict_proba(ds) - # Convert predictions to labels with threshold predictions = self._get_threshold_pred(predictions, threshold) - # Return decoded labels return self._label_decode(predictions) diff --git a/src/models/kerasTF/build_neural_networks.py b/src/models/kerasTF/build_neural_networks.py index cdcf08f..7d62eca 100644 --- a/src/models/kerasTF/build_neural_networks.py +++ b/src/models/kerasTF/build_neural_networks.py @@ -72,7 +72,10 @@ def build_deepLSTM(nb_features): netB = Dense(100, activation='tanh',name='G_%d'%40) (inputs) netB = Dense(40, activation='tanh',name='H_%d'%40) (netB) - net = Concatenate()([netA,netB]) # A `Concatenate` layer requires inputs with matching shapes except for the concatenation axis. Received: input_shape=[(None, 40), (None, 1000, 40)] + # TODO: Debug error caught in local and on Narval + # TODO: Finish testing NNs + # A `Concatenate` layer requires inputs with matching shapes except for the concatenation axis. 
Received: input_shape=[(None, 40), (None, 100, 40)] + net = Concatenate()([netA,netB]) net = Dense(200, activation='relu', name='C_%d'%(10*2))(net) net = Dropout(0.1,name='fr_%.1f'%0.1)(net) diff --git a/src/models/kerasTF/multiclass_models.py b/src/models/kerasTF/multiclass_models.py index a5c48a0..1052d33 100644 --- a/src/models/kerasTF/multiclass_models.py +++ b/src/models/kerasTF/multiclass_models.py @@ -20,7 +20,7 @@ from ray.air import session # from ray.air.integrations.keras import Callback from ray.air.config import ScalingConfig -from models.kerasTF.models import train_func +from models.kerasTF.models import train_func, build_model from ray.air.integrations.keras import ReportCheckpointCallback from ray.train.tensorflow import TensorflowTrainer, TensorflowCheckpoint @@ -119,10 +119,6 @@ def __init__( def preprocess(self, ds, scaling = False, scaler_file = None): print('preprocess') # Labels encoding - # self._encoder = Chain( - # ModelLabelEncoder(self.taxa), - # OneHotTensorEncoder(LABELS_COLUMN_NAME) - # ) self._encoder = ModelLabelEncoder(self.taxa) self._encoder.fit(ds) @@ -149,6 +145,10 @@ def preprocess(self, ds, scaling = False, scaler_file = None): def fit(self, datasets): print('fit') + """ + TODO: If Ray AIR training is too long, try using the datasets groupby / Tune for multimodel training + TODO: train_func per model + TODO: Confirm how it works in Jupyter Notebook # Preprocessing loop for name, ds in datasets.items(): # ds = ds.drop_columns(['id']) @@ -166,8 +166,6 @@ def fit(self, datasets): if not os.path.isdir(model_dir): os.mkdir(model_dir) -# TODO: train_func per model -# TODO: Confirm how it works in Jupyter Notebook # Distributed building & training if self.classifier == 'lstm_attention': print('Training multiclass classifier based on Deep Neural Network hybrid between LSTM and Attention') @@ -182,26 +180,89 @@ def fit(self, datasets): training_result = training_result.to_pandas().to_dict('records') for record in training_result: self._model_ckpt[record['cluster']] = record['file'] + """ + + # Preprocessing loop + for name, ds in datasets.items(): + # ds = ds.drop_columns(['id']) + ds = self._encoder.transform(ds) + if self._scaler is not None: + ds = self._scaler.transform(ds) + ds = ds.materialize() + datasets[name] = ds + + # Training parameters + train_params = { + 'batch_size': self.batch_size, + 'epochs': self._training_epochs, + 'size': self._nb_kmers, + 'nb_cls': self._nb_classes, + 'model': self.classifier, + 'weights': self._weights + } + + # Define trainer / tuner + self._trainer = TensorflowTrainer( + train_loop_per_worker=train_func, + train_loop_config=train_params, + scaling_config=ScalingConfig( + trainer_resources={'CPU': self._nb_CPU_data}, + num_workers=self._n_workers, + use_gpu=self._use_gpu, + resources_per_worker={ + 'CPU': self._nb_CPU_per_worker, + 'GPU': self._nb_GPU_per_worker + } + ), + run_config=RunConfig( + name=self.classifier, + local_dir=self._workdir, + ), + datasets=datasets, + ) + + training_result = self._trainer.fit() + self._model_ckpt = training_result.best_checkpoints[0][0] # Models predicting ######################################################################################################### def predict(self, ds): print('predict') + """ + TODO: If Ray AIR training is too long, try using the datasets groupby / Tune for multimodel training probabilities = self._predict_proba(ds) predictions = np.argmax(probabilities, axis = 1) predictions = self._label_decode(predictions) return predictions + """ + # Predict with 
model + predictions = self._predict_proba(ds) + # Convert predictions to labels + predictions = self._get_abs_pred(predictions) + # Return decoded labels + return self._label_decode(predictions) def predict_proba(self, ds, threshold = 0.8): print('predict_proba') + """ + TODO: If Ray AIR training is too long, try using the datasets groupby / Tune for multimodel training probabilities = self._predict_proba(ds) predictions = self._get_threshold_pred(probabilities, threshold) return self._label_decode(predictions) + """ + # Predict with model + predictions = self._predict_proba(ds) + # Convert predictions to labels with threshold + predictions = self._get_threshold_pred(predictions, threshold) + # Return decoded labels + return self._label_decode(predictions) # TODO: Confirm how it works in Jupyter Notebook def _predict_proba(self, ds): print('_predict_proba') + """ + TODO: If Ray AIR training is too long, try using the datasets groupby / Tune for multimodel training if ds.count() > 0: if self._scaler is not None: ds = self._scaler.transform(ds) @@ -225,6 +286,32 @@ def predict_func(data): return probabilities else: raise ValueError('Empty dataset, cannot execute predictions!') + """ + if ds.count() > 0: + if len(ds.schema().names) > 1: + col_2_drop = [col for col in ds.schema().names if col != TENSOR_COLUMN_NAME] + ds = ds.drop_columns(col_2_drop) + + # Preprocess + if self._scaler is not None: + ds = self._scaler.transform(ds) + ds = ds.materialize() + + self._predictor = BatchPredictor.from_checkpoint( + self._model_ckpt, + TensorflowPredictor, + model_definition = lambda: build_model(self.classifier, self._nb_classes, self._nb_kmers) + ) + predictions = self._predictor.predict( + data = ds, + feature_columns = [TENSOR_COLUMN_NAME], + batch_size = self.batch_size, + num_cpus_per_worker = self._nb_CPU_per_worker, + num_gpus_per_worker = self._nb_GPU_per_worker + ) + return predictions + else: + raise ValueError('No data to predict') def _get_abs_pred(self, predictions): print('_get_abs_pred') diff --git a/src/models/models_utils.py b/src/models/models_utils.py index 08d7c79..da4061b 100644 --- a/src/models/models_utils.py +++ b/src/models/models_utils.py @@ -122,7 +122,8 @@ def _compute_weights(self): weights = {} if isinstance(self._csv, tuple): cls = pd.concat([pd.read_csv(self._csv[0]),pd.read_csv(self._csv[1])], axis = 0, join = 'inner', ignore_index = True) - cls = pd.read_csv(self._csv) + else: + cls = pd.read_csv(self._csv) if self.taxa == 'domain': cls.loc[cls['domain'].str.lower() == 'archaea', 'domain'] = 'Bacteria' classes = list(cls[self.taxa].unique()) diff --git a/src/models/multiclass_utils.py b/src/models/multiclass_utils.py index a142ca3..9534670 100644 --- a/src/models/multiclass_utils.py +++ b/src/models/multiclass_utils.py @@ -68,9 +68,9 @@ def map_clusters(batch): batch['cluster'] = clusters return batch - nb_clusters = int(ds.count() / 100) + nb_clusters = int(ds.count() / self.batch_size) - ds = ds.repartition(100) + ds = ds.repartition(self.batch_size) ds = ds.map_batches(map_clusters, batch_size = nb_clusters, batch_format = 'pandas') return ds.groupby('cluster') \ No newline at end of file diff --git a/src/models/sklearn/multiclass_models.py b/src/models/sklearn/multiclass_models.py index 87e9575..3473a75 100644 --- a/src/models/sklearn/multiclass_models.py +++ b/src/models/sklearn/multiclass_models.py @@ -128,16 +128,17 @@ def preprocess(self, ds, scaling = False, scaler_file = None): def fit(self, datasets): print('fit') - for name, ds in datasets.items(): + # 
for name, ds in datasets.items(): # ds = ds.drop_columns(['id']) - ds = self._encoder.transform(ds) - if self._scaler is not None: - ds = self._scaler.transform(ds) - datasets[name] = ds + train_ds = datasets['train'] + train_ds = self._encoder.transform(train_ds) + if self._scaler is not None: + train_ds = self._scaler.transform(train_ds) + # datasets[name] = ds # One sub-model per artificial cluster of samples - train_ds = self._random_split_dataset(datasets['train']) - val_ds = datasets['validation'].to_pandas() + train_ds = self._random_split_dataset(train_ds) + # val_ds = datasets['validation'].to_pandas() # Checkpointing directory model_dir = os.path.join(self._workdir, f'{self.classifier}_{self.taxa}') @@ -145,16 +146,16 @@ def fit(self, datasets): os.mkdir(model_dir) # Model-specific training functions - def build_fit_sgd(train_data, val_data): + def build_fit_sgd(train_data):#, val_data): # Training data X_train = _unwrap_ndarray_object_type_if_needed(train_data[TENSOR_COLUMN_NAME]) y_train = np.array(train_data[LABELS_COLUMN_NAME]) # Validation data - X_val = val_data[TENSOR_COLUMN_NAME] - y_val = val_data[LABELS_COLUMN_NAME] - msk_val = y_val.isin(np.unique(y_train)) - X_val = _unwrap_ndarray_object_type_if_needed(X_val[msk_val]) - y_val = np.array(y_val[msk_val]) + # X_val = val_data[TENSOR_COLUMN_NAME] + # y_val = val_data[LABELS_COLUMN_NAME] + # msk_val = y_val.isin(np.unique(y_train)) + # X_val = _unwrap_ndarray_object_type_if_needed(X_val[msk_val]) + # y_val = np.array(y_val[msk_val]) cluster = train_data['cluster'][0] model = SGDClassifier( learning_rate = 'optimal', @@ -164,50 +165,50 @@ def build_fit_sgd(train_data, val_data): ) model.fit(X_train, y_train) - calibrator = CalibratedClassifierCV( - estimator = model, - method = 'isotonic', - cv = 'prefit', - ) + # calibrator = CalibratedClassifierCV( + # estimator = model, + # method = 'isotonic', + # cv = 'prefit', + # ) - calibrator.fit(X_val,y_val) + # calibrator.fit(X_val,y_val) model_file = os.path.join(model_dir, f'{cluster}.pkl') with open(model_file, "wb") as file: - cpickle.dump(calibrator, file) + cpickle.dump(model, file) return { 'cluster' : [cluster], 'file' : [model_file] } - def build_fit_mnb(train_data, val_data): + def build_fit_mnb(train_data):#, val_data): # Training data X_train = _unwrap_ndarray_object_type_if_needed(train_data[TENSOR_COLUMN_NAME]) y_train = np.array(train_data[LABELS_COLUMN_NAME]) # Validation data - X_val = val_data[TENSOR_COLUMN_NAME] - y_val = val_data[LABELS_COLUMN_NAME] - msk_val = y_val.isin(np.unique(y_train)) - X_val = _unwrap_ndarray_object_type_if_needed(X_val[msk_val]) - y_val = np.array(y_val[msk_val]) + # X_val = val_data[TENSOR_COLUMN_NAME] + # y_val = val_data[LABELS_COLUMN_NAME] + # msk_val = y_val.isin(np.unique(y_train)) + # X_val = _unwrap_ndarray_object_type_if_needed(X_val[msk_val]) + # y_val = np.array(y_val[msk_val]) cluster = train_data['cluster'][0] model = MultinomialNB() model.fit(X_train, y_train) model_file = os.path.join(model_dir, f'{cluster}.pkl') - calibrator = CalibratedClassifierCV( - estimator = model, - method = 'isotonic', - cv = 'prefit', - ) + # calibrator = CalibratedClassifierCV( + # estimator = model, + # method = 'isotonic', + # cv = 'prefit', + # ) - calibrator.fit(X_val,y_val) + # calibrator.fit(X_val,y_val) with open(model_file, "wb") as file: - cpickle.dump(calibrator, file) + cpickle.dump(model, file) return { 'cluster' : [cluster], @@ -216,10 +217,12 @@ def build_fit_mnb(train_data, val_data): if self.classifier == 'sgd': 
print('Training multiclass SGD classifier') - training_result = train_ds.map_groups(lambda ds: build_fit_sgd(ds, val_ds), batch_format = 'numpy') + training_result = train_ds.map_groups(build_fit_sgd, batch_format = 'numpy') + # training_result = train_ds.map_groups(lambda ds: build_fit_sgd(ds, val_ds), batch_format = 'numpy') elif self.classifier == 'mnb': print('Training multiclass Multinomial Naive Bayes classifier') - training_result = train_ds.map_groups(lambda ds: build_fit_mnb(ds, val_ds), batch_format = 'numpy') + training_result = train_ds.map_groups(build_fit_mnb, batch_format = 'numpy') + # training_result = train_ds.map_groups(lambda ds: build_fit_mnb(ds, val_ds), batch_format = 'numpy') training_result = training_result.to_pandas().to_dict('records') for record in training_result: From 31750a81a55f709792761b2c210ea4cb8a8dda30 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Sat, 9 Dec 2023 11:14:33 -0500 Subject: [PATCH 62/92] NN architectures debug --- src/models/kerasTF/build_neural_networks.py | 29 ++++++++++----------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/src/models/kerasTF/build_neural_networks.py b/src/models/kerasTF/build_neural_networks.py index 7d62eca..e69747a 100644 --- a/src/models/kerasTF/build_neural_networks.py +++ b/src/models/kerasTF/build_neural_networks.py @@ -2,7 +2,7 @@ from keras.models import Model, Sequential from tensorflow.keras import mixed_precision from tensorflow.keras.losses import BinaryCrossentropy, CategoricalCrossentropy -from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Conv1D, Conv2D, MaxPooling1D, MaxPooling2D, Concatenate, Flatten, Attention, Activation, Bidirectional, Reshape +from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Conv1D, Conv2D, MaxPooling1D, MaxPooling2D, Concatenate, Flatten, Attention, Activation, Bidirectional, Reshape, AveragePooling1D @@ -70,11 +70,10 @@ def build_deepLSTM(nb_features): netA = LSTM(40, activation='tanh',recurrent_dropout=0.05,dropout=0.1,name='B_%d'%40) (netA) netB = Dense(100, activation='tanh',name='G_%d'%40) (inputs) - netB = Dense(40, activation='tanh',name='H_%d'%40) (netB) + netB = Dense(100, activation='tanh',name='H_%d'%40) (netB) + netB = AveragePooling1D(100) (netB) + netB = Flatten() (netB) - # TODO: Debug error caught in local and on Narval - # TODO: Finish testing NNs - # A `Concatenate` layer requires inputs with matching shapes except for the concatenation axis. 
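The Concatenate error noted in the comment being removed here came from joining a 2-D LSTM output with a 3-D Dense-over-timesteps output; the fix pools and flattens the second branch so both are 2-D before concatenation. A toy reproduction of the repaired branch shapes, with arbitrary layer sizes rather than the project's tuned values:

# Toy check that the two branches agree on rank before Concatenate.
from keras.layers import Input, LSTM, Dense, AveragePooling1D, Flatten, Concatenate
from keras.models import Model

nb_features = 1000
inputs = Input(shape=(nb_features, 1))

netA = LSTM(40, activation='tanh')(inputs)    # -> (None, 40)
netB = Dense(100, activation='tanh')(inputs)  # -> (None, nb_features, 100)
netB = AveragePooling1D(100)(netB)            # -> (None, nb_features // 100, 100)
netB = Flatten()(netB)                        # -> (None, (nb_features // 100) * 100)

net = Concatenate()([netA, netB])             # both branches are now 2-D
model = Model(inputs=inputs, outputs=Dense(1, activation='sigmoid')(net))
model.summary()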
Received: input_shape=[(None, 40), (None, 100, 40)] net = Concatenate()([netA,netB]) net = Dense(200, activation='relu', name='C_%d'%(10*2))(net) @@ -149,21 +148,21 @@ def build_wideCNN(nb_features, nb_classes): https://github.com/KennthShang/CHEER/blob/master/Classifier/model/Wcnn.py """ - inputs = Input(shape = (nb_features,1)) + inputs = Input(shape = (nb_features, 1)) # embed = Embedding(248, 100)(inputs) - # embed = Reshape((nb_features, -1, 1))(embed) + # inputs = Reshape((nb_features, -1, 1))(inputs) - conv1 = Conv2D(256, 3, activation = 'relu')(inputs) - conv1 = MaxPooling2D(pool_size = (1,1), strides = nb_features)(conv1) + conv1 = Conv1D(256, 3, activation = 'relu')(inputs) + conv1 = MaxPooling1D(pool_size = 1, strides = nb_features)(conv1) - conv2 = Conv2D(256, 7, activation = 'relu')(inputs) - conv2 = MaxPooling2D(pool_size = (1,1), strides = nb_features)(conv2) + conv2 = Conv1D(256, 7, activation = 'relu')(inputs) + conv2 = MaxPooling1D(pool_size = 1, strides = nb_features)(conv2) - conv3 = Conv2D(256, 11, activation = 'relu')(inputs) - conv3 = MaxPooling2D(pool_size = (1,1), strides = nb_features)(conv3) + conv3 = Conv1D(256, 11, activation = 'relu')(inputs) + conv3 = MaxPooling1D(pool_size = 1, strides = nb_features)(conv3) - conv4 = Conv2D(256, 15, activation = 'relu')(inputs) - conv4 = MaxPooling2D(pool_size = (1,1), strides = nb_features)(conv4) + conv4 = Conv1D(256, 15, activation = 'relu')(inputs) + conv4 = MaxPooling1D(pool_size = 1, strides = nb_features)(conv4) net = Concatenate(axis = 1)([conv1,conv2,conv3,conv4]) net = Flatten()(net) From 301e2d6bd185ed263dd4d8e9aebd41864a11dd78 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Mon, 11 Dec 2023 11:01:48 -0500 Subject: [PATCH 63/92] min-max scaling for values 0-1 --- src/models/kerasTF/multiclass_models.py | 12 ++--- .../preprocessors/compute_class_weights.py | 49 ------------------- src/models/preprocessors/min_max_scaler.py | 15 +++--- src/models/sklearn/multiclass_models.py | 14 +++--- 4 files changed, 18 insertions(+), 72 deletions(-) delete mode 100644 src/models/preprocessors/compute_class_weights.py diff --git a/src/models/kerasTF/multiclass_models.py b/src/models/kerasTF/multiclass_models.py index 1052d33..23b1cb4 100644 --- a/src/models/kerasTF/multiclass_models.py +++ b/src/models/kerasTF/multiclass_models.py @@ -7,6 +7,7 @@ # Preprocessing from ray.data.preprocessors import LabelEncoder, Chain from models.encoders.model_label_encoder import ModelLabelEncoder +from models.preprocessors.min_max_scaler import TensorMinMaxScaler from models.encoders.one_hot_tensor_encoder import OneHotTensorEncoder from models.preprocessors.tfidf_transformer import TensorTfIdfTransformer @@ -136,9 +137,8 @@ def preprocess(self, ds, scaling = False, scaler_file = None): self._weights = self._compute_weights() # Scaling - if scaling: - self._scaler = TensorTfIdfTransformer(self.kmers, scaler_file) - self._scaler.fit(ds) + self._scaler = TensorMinMaxScaler(self._nb_kmers) + self._scaler.fit(ds) # Models training ######################################################################################################### @@ -186,8 +186,7 @@ def fit(self, datasets): for name, ds in datasets.items(): # ds = ds.drop_columns(['id']) ds = self._encoder.transform(ds) - if self._scaler is not None: - ds = self._scaler.transform(ds) + ds = self._scaler.transform(ds) ds = ds.materialize() datasets[name] = ds @@ -293,8 +292,7 @@ def predict_func(data): ds = ds.drop_columns(col_2_drop) # Preprocess - if self._scaler is not None: - ds = 
self._scaler.transform(ds) + ds = self._scaler.transform(ds) ds = ds.materialize() self._predictor = BatchPredictor.from_checkpoint( diff --git a/src/models/preprocessors/compute_class_weights.py b/src/models/preprocessors/compute_class_weights.py deleted file mode 100644 index 43b4c5d..0000000 --- a/src/models/preprocessors/compute_class_weights.py +++ /dev/null @@ -1,49 +0,0 @@ - -import numpy as np -import pandas as pd - -from ray.data.dataset import Dataset -from ray.data.preprocessor import Preprocessor - -TENSOR_COLUMN_NAME = '__value__' - -class ComputeClassWeights(Preprocessor): - """ - Custom implementation of Class Weight Computation inspired by sklearn.utils.class_weight.compute_class_weight to be used as a Ray preprocessor. - https://scikit-learn.org/stable/modules/generated/sklearn.utils.class_weight.compute_class_weight.html - This permits to estimate balanced class weights for an unbalanced dataset. - """ - - def __init__(self, class_col): - # Parameters - self._col = class_col - self._cls = [] - self._counts_map = {} - - def _fit(self, ds: Dataset) -> Preprocessor: - def get_cls_counts(df): - mapping = {} - counts = df[self._col].value_counts() - for cls in self._cls: - if cls in counts.index: - mapping[str(cls)] = [counts[cls]] - else: - mapping[str(cls)] = [0] - return mapping - - self._cls = ds.unique(self._col) - - counts = ds.map_batches(get_cls_counts, batch_format = 'pandas') - - for cls in self._cls: - self._counts_map[str(cls)] = counts.sum(str(cls)) - - freqs = ds.count() / (len(self._cls) * np.array(list(self._counts_map.values())).astype(np.float64)) - - self.stats_ = {} - for i, cls in enumerate(self._cls): - self.stats_[cls] = freqs[i] - - return self - - diff --git a/src/models/preprocessors/min_max_scaler.py b/src/models/preprocessors/min_max_scaler.py index 1cb6aa0..0f672a6 100644 --- a/src/models/preprocessors/min_max_scaler.py +++ b/src/models/preprocessors/min_max_scaler.py @@ -13,9 +13,9 @@ class TensorMinMaxScaler(Preprocessor): Custom implementation of Ray's MinMax Scaler for usage with tensor column in ray.data.dataset.Dataset. 
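The scaler fits per-feature minima and maxima by streaming over dataset batches, then rescales the tensor column to the [0, 1] range. The same idea in plain NumPy, assuming an iterable of 2-D batches and leaving aside the max == min edge case:

# Plain-NumPy sketch of the streaming min-max fit/transform idea.
import numpy as np

def fit_min_max(batches):
    mins, maxs = None, None
    for batch in batches:  # batch: (n_rows, nb_features) array
        b_min, b_max = batch.min(axis=0), batch.max(axis=0)
        mins = b_min if mins is None else np.minimum(mins, b_min)
        maxs = b_max if maxs is None else np.maximum(maxs, b_max)
    return mins, maxs

def transform_min_max(batch, mins, maxs):
    # values end up in [0, 1]; columns with max == min would need special care
    return (batch - mins) / (maxs - mins)

batches = [np.random.rand(32, 5) * 10 for _ in range(4)]
mins, maxs = fit_min_max(batches)
scaled = transform_min_max(batches[0], mins, maxs)
print(scaled.min(), scaled.max())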
""" - def __init__(self, features): + def __init__(self, nb_features): # Parameters - self._features = features + self.__nb_features = nb_features def _fit(self, ds: Dataset) -> Preprocessor: """ @@ -23,16 +23,15 @@ def _fit(self, ds: Dataset) -> Preprocessor: """ min = [] max = [] - nb_features = len(self._features) def Min(dct): arr = dct[TENSOR_COLUMN_NAME] - min = np.array([arr[:,i].min() for i in range(nb_features)]) + min = np.array([arr[:,i].min() for i in range(self.__nb_features)]) return min def Max(dct): arr = dct[TENSOR_COLUMN_NAME] - max = np.array([arr[:,i].max() for i in range(nb_features)]) + max = np.array([arr[:,i].max() for i in range(self.__nb_features)]) return max for batch in ds.iter_batches(batch_format = 'numpy'): @@ -42,8 +41,8 @@ def Max(dct): min = np.array(min) max = np.array(max) - min = np.array([min[:,i].min() for i in range(nb_features)]) - max = np.array([max[:,i].max() for i in range(nb_features)]) + min = np.array([min[:,i].min() for i in range(self.__nb_features)]) + max = np.array([max[:,i].max() for i in range(self.__nb_features)]) self.stats_ = {'min' : min, 'max' : max} @@ -80,4 +79,4 @@ def _transform_numpy(self, batch: dict): return batch def __repr__(self): - return f"{self.__class__.__name__}(columns={self._features_list!r})" + return f"{self.__class__.__name__}(columns={self._nb_features!r})" diff --git a/src/models/sklearn/multiclass_models.py b/src/models/sklearn/multiclass_models.py index 3473a75..d449e64 100644 --- a/src/models/sklearn/multiclass_models.py +++ b/src/models/sklearn/multiclass_models.py @@ -6,6 +6,7 @@ # Preprocessing from models.encoders.model_label_encoder import ModelLabelEncoder +from models.preprocessors.min_max_scaler import TensorMinMaxScaler from models.encoders.onesvm_label_encoder import OneClassSVMLabelEncoder from models.preprocessors.tfidf_transformer import TensorTfIdfTransformer @@ -119,10 +120,9 @@ def preprocess(self, ds, scaling = False, scaler_file = None): self._weights = self._compute_weights() # Scaling - if scaling: - self._scaler = TensorTfIdfTransformer(self.kmers, scaler_file) - self._scaler.fit(ds) - + self._scaler = TensorMinMaxScaler(self._nb_kmers) + self._scaler.fit(ds) + # Models training ######################################################################################################### @@ -132,8 +132,7 @@ def fit(self, datasets): # ds = ds.drop_columns(['id']) train_ds = datasets['train'] train_ds = self._encoder.transform(train_ds) - if self._scaler is not None: - train_ds = self._scaler.transform(train_ds) + train_ds = self._scaler.transform(train_ds) # datasets[name] = ds # One sub-model per artificial cluster of samples @@ -246,8 +245,7 @@ def predict_proba(self, ds, threshold = 0.8): def _predict_proba(self, ds): if ds.count() > 0: - if self._scaler is not None: - ds = self._scaler.transform(ds) + ds = self._scaler.transform(ds) # ds = ds.materialize() def predict_func(data): From fe9fd362f993c8c6d58ce4a80135df9e555d847a Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Mon, 11 Dec 2023 11:04:21 -0500 Subject: [PATCH 64/92] MinMax scaling for binary models --- src/models/kerasTF/binary_models.py | 14 ++++++-------- src/models/sklearn/binary_models.py | 12 +++++------- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/src/models/kerasTF/binary_models.py b/src/models/kerasTF/binary_models.py index 0b76a69..d7a9ed4 100644 --- a/src/models/kerasTF/binary_models.py +++ b/src/models/kerasTF/binary_models.py @@ -7,6 +7,7 @@ # Preprocessing from ray.data.preprocessors 
import LabelEncoder, Chain from models.encoders.model_label_encoder import ModelLabelEncoder +from models.preprocessors.min_max_scaler import TensorMinMaxScaler from models.encoders.one_hot_tensor_encoder import OneHotTensorEncoder from models.preprocessors.tfidf_transformer import TensorTfIdfTransformer @@ -142,10 +143,9 @@ def preprocess(self, ds, scaling = False, scaler_file = None): self._weights = self._compute_weights() # Scaling - if scaling: - self._scaler = TensorTfIdfTransformer(self.kmers, scaler_file) - self._scaler.fit(ds) - + self._scaler = TensorMinMaxScaler(self._nb_kmers) + self._scaler.fit(ds) + # Model training ######################################################################################################### @@ -155,8 +155,7 @@ def fit(self, datasets): for name, ds in datasets.items(): # ds = ds.drop_columns(['id']) ds = self._encoder.transform(ds) - if self._scaler is not None: - ds = self._scaler.transform(ds) + ds = self._scaler.transform(ds) ds = ds.materialize() datasets[name] = ds @@ -221,8 +220,7 @@ def _predict_proba(self, ds): ds = ds.drop_columns(col_2_drop) # Preprocess - if self._scaler is not None: - ds = self._scaler.transform(ds) + ds = self._scaler.transform(ds) ds = ds.materialize() self._predictor = BatchPredictor.from_checkpoint( diff --git a/src/models/sklearn/binary_models.py b/src/models/sklearn/binary_models.py index 61d0656..ca137d0 100644 --- a/src/models/sklearn/binary_models.py +++ b/src/models/sklearn/binary_models.py @@ -6,6 +6,7 @@ # Preprocessing from models.encoders.model_label_encoder import ModelLabelEncoder +from models.preprocessors.min_max_scaler import TensorMinMaxScaler from models.encoders.onesvm_label_encoder import OneClassSVMLabelEncoder from models.preprocessors.tfidf_transformer import TensorTfIdfTransformer @@ -112,9 +113,8 @@ def preprocess(self, ds, scaling = False, scaler_file = None): self._labels_map[label] = encoded # Scaling - if scaling: - self._scaler = TensorTfIdfTransformer(self.kmers, scaler_file) - self._scaler.fit(ds) + self._scaler = TensorMinMaxScaler(self._nb_kmers) + self._scaler.fit(ds) # Model training @@ -127,8 +127,7 @@ def fit(self, datasets): for name, ds in datasets.items(): # ds = ds.drop_columns(['id']) ds = self._encoder.transform(ds) - if self._scaler is not None: - ds = self._scaler.transform(ds) + ds = self._scaler.transform(ds) datasets[name] = ray.put(ds) try: @@ -187,8 +186,7 @@ def _build(self): def predict(self, ds): print('predict') if ds.count() > 0: - if self._scaler is not None: - ds = self._scaler.transform(ds) + ds = self._scaler.transform(ds) ds = ds.materialize() predict_kwargs = {'features':self.kmers, 'num_estimator_cpus':-1} self._predictor = BatchPredictor.from_checkpoint(self._model_ckpt, SklearnTensorPredictor) From b80f9f43fd1bc9dff364374d7d36111068aed9bd Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Mon, 11 Dec 2023 17:51:16 -0500 Subject: [PATCH 65/92] ray cluster init --- src/utils.py | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/src/utils.py b/src/utils.py index 4f48bfa..44386c4 100644 --- a/src/utils.py +++ b/src/utils.py @@ -75,21 +75,7 @@ def init_ray_cluster(workdir): mem = ray._private.utils.get_shared_memory_bytes() - 10 if 'HOST_IP' in list(os.environ.keys()): - ray.init( - _node_ip_address = os.environ['HOST_IP'], - num_cpus = nb_CPU, - num_gpus = nb_GPU, - _temp_dir = str(workdir), - object_store_memory = mem, - _system_config={ - "object_spilling_config": json.dumps({ - "type": "filesystem", - "params": { - 
"directory_path": str(workdir) - }, - }) - }, - ) + ray.init(address = f"{os.environ['HOST_IP']}:{os.environ['RAY_PORT']}", _node_ip_address = os.environ['HOST_IP']) else: ray.init( num_cpus = nb_CPU, From 275f6d21ba031cc0926037d2e328cff1dafaf9df Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Tue, 12 Dec 2023 17:37:34 -0500 Subject: [PATCH 66/92] resources for BatchPredictor --- src/models/kerasTF/binary_models.py | 21 ++++++++++++++------- src/models/kerasTF/multiclass_models.py | 22 ++++++++++++++-------- 2 files changed, 28 insertions(+), 15 deletions(-) diff --git a/src/models/kerasTF/binary_models.py b/src/models/kerasTF/binary_models.py index d7a9ed4..2d48cec 100644 --- a/src/models/kerasTF/binary_models.py +++ b/src/models/kerasTF/binary_models.py @@ -228,13 +228,20 @@ def _predict_proba(self, ds): TensorflowPredictor, model_definition = lambda: build_model(self.classifier, self._nb_classes, self._nb_kmers) ) - predictions = self._predictor.predict( - data = ds, - feature_columns = [TENSOR_COLUMN_NAME], - batch_size = self.batch_size, - num_cpus_per_worker = self._nb_CPU_per_worker, - num_gpus_per_worker = self._nb_GPU_per_worker - ) + if self._nb_GPU > 0: + predictions = self._predictor.predict( + data = ds, + feature_columns = [TENSOR_COLUMN_NAME], + batch_size = self.batch_size, + num_gpus_per_worker = self._nb_GPU_per_worker + ) + else: + predictions = self._predictor.predict( + data = ds, + feature_columns = [TENSOR_COLUMN_NAME], + batch_size = self.batch_size, + num_cpus_per_worker = self._nb_CPU_per_worker + ) return predictions else: raise ValueError('No data to predict') diff --git a/src/models/kerasTF/multiclass_models.py b/src/models/kerasTF/multiclass_models.py index 23b1cb4..259ac83 100644 --- a/src/models/kerasTF/multiclass_models.py +++ b/src/models/kerasTF/multiclass_models.py @@ -257,7 +257,6 @@ def predict_proba(self, ds, threshold = 0.8): # Return decoded labels return self._label_decode(predictions) -# TODO: Confirm how it works in Jupyter Notebook def _predict_proba(self, ds): print('_predict_proba') """ @@ -300,13 +299,20 @@ def predict_func(data): TensorflowPredictor, model_definition = lambda: build_model(self.classifier, self._nb_classes, self._nb_kmers) ) - predictions = self._predictor.predict( - data = ds, - feature_columns = [TENSOR_COLUMN_NAME], - batch_size = self.batch_size, - num_cpus_per_worker = self._nb_CPU_per_worker, - num_gpus_per_worker = self._nb_GPU_per_worker - ) + if self._nb_GPU > 0: + predictions = self._predictor.predict( + data = ds, + feature_columns = [TENSOR_COLUMN_NAME], + batch_size = self.batch_size, + num_gpus_per_worker = self._nb_GPU_per_worker + ) + else: + predictions = self._predictor.predict( + data = ds, + feature_columns = [TENSOR_COLUMN_NAME], + batch_size = self.batch_size, + num_cpus_per_worker = self._nb_CPU_per_worker + ) return predictions else: raise ValueError('No data to predict') From 810de385565cfccfbe43e9efe4c694fd4d198b64 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Wed, 13 Dec 2023 17:48:28 -0500 Subject: [PATCH 67/92] tf-idf scaler instead of MinMax for usage with only reduced data --- environment.yml | 124 ------------------------ frozen_requirements.txt | 95 ------------------ src/models/kerasTF/binary_models.py | 6 +- src/models/kerasTF/multiclass_models.py | 6 +- src/models/sklearn/binary_models.py | 6 +- src/models/sklearn/multiclass_models.py | 7 +- 6 files changed, 20 insertions(+), 224 deletions(-) delete mode 100644 environment.yml delete mode 100644 frozen_requirements.txt 
diff --git a/environment.yml b/environment.yml deleted file mode 100644 index d48b75b..0000000 --- a/environment.yml +++ /dev/null @@ -1,124 +0,0 @@ -name: caribou -channels: - - defaults -dependencies: - - _libgcc_mutex=0.1=main - - _openmp_mutex=5.1=1_gnu - - ca-certificates=2022.10.11=h06a4308_0 - - certifi=2022.12.7=py38h06a4308_0 - - ld_impl_linux-64=2.38=h1181459_1 - - libffi=3.3=he6710b0_2 - - libgcc-ng=11.2.0=h1234567_1 - - libgomp=11.2.0=h1234567_1 - - libstdcxx-ng=11.2.0=h1234567_1 - - ncurses=6.3=h5eee18b_3 - - openssl=1.1.1s=h7f8727e_0 - - pip=22.3.1=py38h06a4308_0 - - python=3.8.10=h12debd9_8 - - readline=8.2=h5eee18b_0 - - setuptools=65.5.0=py38h06a4308_0 - - sqlite=3.40.0=h5082296_0 - - tk=8.6.12=h1ccaba5_0 - - wheel=0.37.1=pyhd3eb1b0_0 - - xz=5.2.8=h5eee18b_0 - - zlib=1.2.13=h5eee18b_0 - - pip: - - absl-py==1.3.0 - - aiohttp==3.8.3 - - aiohttp-cors==0.7.0 - - aiorwlock==1.3.0 - - aiosignal==1.3.1 - - anyio==3.6.2 - - astunparse==1.6.3 - - async-timeout==4.0.2 - - attrs==22.2.0 - - biopython==1.78 - - blessed==1.19.1 - - cachetools==5.2.0 - - charset-normalizer==2.1.1 - - click==8.1.3 - - cloudpickle==2.2.0 - - colorful==0.5.5 - - distlib==0.3.6 - - fastapi==0.88.0 - - filelock==3.8.2 - - flatbuffers==22.12.6 - - frozenlist==1.3.3 - - fsspec==2022.11.0 - - future==0.18.2 - - gast==0.4.0 - - google-api-core==2.11.0 - - google-auth==2.15.0 - - google-auth-oauthlib==0.4.6 - - google-pasta==0.2.0 - - googleapis-common-protos==1.57.0 - - gpustat==1.0.0 - - grpcio==1.51.1 - - h11==0.14.0 - - h5py==3.7.0 - - idna==3.4 - - importlib-metadata==5.2.0 - - importlib-resources==5.10.1 - - insilicoseq==1.5.4 - - joblib==1.2.0 - - jsonschema==4.17.3 - - keras==2.11.0 - - libclang==14.0.6 - - markdown==3.4.1 - - markupsafe==2.1.1 - - msgpack==1.0.4 - - multidict==6.0.3 - - numpy==1.23.4 - - nvidia-ml-py==11.495.46 - - oauthlib==3.2.2 - - opencensus==0.11.0 - - opencensus-context==0.1.3 - - opt-einsum==3.3.0 - - packaging==22.0 - - pandas==1.5.2 - - pkgutil-resolve-name==1.3.10 - - platformdirs==2.6.0 - - prometheus-client==0.13.1 - - protobuf==3.19.6 - - psutil==5.9.4 - - py-spy==0.3.14 - - pyarrow==6.0.1 - - pyasn1==0.4.8 - - pyasn1-modules==0.2.8 - - pydantic==1.10.2 - - pyrsistent==0.19.2 - - pysam==0.20.0 - - python-dateutil==2.8.2 - - pytz==2022.7 - - pyyaml==6.0 - - ray==2.2.0 - - requests==2.28.1 - - requests-oauthlib==1.3.1 - - rsa==4.9 - - scikit-learn==1.2.0 - - scipy==1.9.3 - - six==1.16.0 - - smart-open==6.3.0 - - sniffio==1.3.0 - - starlette==0.22.0 - - tabulate==0.9.0 - - tensorboard==2.11.0 - - tensorboard-data-server==0.6.1 - - tensorboard-plugin-wit==1.8.1 - - tensorboardx==2.5.1 - - tensorflow==2.11.0 - - tensorflow-estimator==2.11.0 - - tensorflow-io-gcs-filesystem==0.29.0 - - termcolor==2.1.1 - - threadpoolctl==3.1.0 - - tqdm==4.64.1 - - typing-extensions==4.4.0 - - urllib3==1.26.13 - - uvicorn==0.20.0 - - virtualenv==20.17.1 - - wcwidth==0.2.5 - - werkzeug==2.2.2 - - wrapt==1.14.1 - - yarl==1.8.2 - - zipp==3.11.0 -prefix: /root/anaconda3/envs/caribou diff --git a/frozen_requirements.txt b/frozen_requirements.txt deleted file mode 100644 index caf68a4..0000000 --- a/frozen_requirements.txt +++ /dev/null @@ -1,95 +0,0 @@ -absl-py==1.4.0 -aiohttp==3.8.5 -aiohttp-cors==0.7.0 -aiosignal==1.3.1 -astunparse==1.6.3 -async-timeout==4.0.2 -attrs==23.1.0 -biopython==1.78 -blessed==1.20.0 -cachetools==5.3.1 -certifi==2023.7.22 -charset-normalizer==3.2.0 -click==8.1.6 -cloudpickle==2.2.1 -colorful==0.5.5 -Cython==3.0.0 -distlib==0.3.7 -filelock==3.12.2 -flatbuffers==23.5.26 
-frozenlist==1.4.0 -future==0.18.3 -gast==0.4.0 -google-api-core==2.11.1 -google-auth==2.22.0 -google-auth-oauthlib==1.0.0 -google-pasta==0.2.0 -googleapis-common-protos==1.60.0 -gpustat==1.1 -grpcio==1.47.0 -h5py==3.8.0 -idna==3.4 -importlib-metadata==6.8.0 -importlib-resources==6.0.0 -InSilicoSeq==1.5.4 -joblib==1.3.1 -jsonschema==4.18.6 -jsonschema-specifications==2023.7.1 -keras==2.13.1 -libclang==16.0.6 -Markdown==3.4.4 -MarkupSafe==2.1.3 -msgpack==1.0.5 -multidict==6.0.4 -numpy==1.25.2 -nvidia-ml-py==12.535.77 -oauthlib==3.2.2 -opencensus==0.11.2 -opencensus-context==0.1.3 -opt-einsum==3.3.0 -packaging==23.1 -pandas==2.0.3 -pkgutil_resolve_name==1.3.10 -platformdirs==3.10.0 -prometheus-client==0.13.1 -protobuf==4.23.4 -psutil==5.9.5 -py-spy==0.3.14 -pyarrow==12.0.0 -pyasn1==0.5.0 -pyasn1-modules==0.3.0 -pydantic==1.10.12 -pysam==0.21.0 -python-dateutil==2.8.2 -pytz==2023.3 -PyYAML==6.0.1 -ray==2.6.3 -referencing==0.30.2 -requests==2.31.0 -requests-oauthlib==1.3.1 -rpds-py==0.10.0 -rsa==4.9 -scikit-learn==1.3.0 -scipy==1.10.1 -six==1.16.0 -smart-open==6.3.0 -tabulate==0.9.0 -tensorboard==2.13.0 -tensorboard-data-server==0.7.1 -tensorboardX==2.6.2 -tensorflow==2.14.0 -tensorflow-estimator==2.13.0 -tensorflow-io-gcs-filesystem==0.32.0 -termcolor==2.3.0 -threadpoolctl==3.2.0 -tqdm==4.65.0 -tune-sklearn==0.4.6 -typing_extensions==4.5.0 -tzdata==2023.3 -urllib3==1.26.16 -virtualenv==20.24.2 -wcwidth==0.2.6 -Werkzeug==2.3.6 -wrapt==1.15.0 -yarl==1.9.2 -zipp==3.16.2 \ No newline at end of file diff --git a/src/models/kerasTF/binary_models.py b/src/models/kerasTF/binary_models.py index 2d48cec..8eedd51 100644 --- a/src/models/kerasTF/binary_models.py +++ b/src/models/kerasTF/binary_models.py @@ -143,7 +143,11 @@ def preprocess(self, ds, scaling = False, scaler_file = None): self._weights = self._compute_weights() # Scaling - self._scaler = TensorMinMaxScaler(self._nb_kmers) + self._scaler = TensorTfIdfTransformer( + features = self.kmers, + file = scaler_file + ) + # self._scaler = TensorMinMaxScaler(self._nb_kmers) self._scaler.fit(ds) # Model training diff --git a/src/models/kerasTF/multiclass_models.py b/src/models/kerasTF/multiclass_models.py index 259ac83..d1d1651 100644 --- a/src/models/kerasTF/multiclass_models.py +++ b/src/models/kerasTF/multiclass_models.py @@ -137,7 +137,11 @@ def preprocess(self, ds, scaling = False, scaler_file = None): self._weights = self._compute_weights() # Scaling - self._scaler = TensorMinMaxScaler(self._nb_kmers) + self._scaler = TensorTfIdfTransformer( + features = self.kmers, + file = scaler_file + ) + # self._scaler = TensorMinMaxScaler(self._nb_kmers) self._scaler.fit(ds) # Models training diff --git a/src/models/sklearn/binary_models.py b/src/models/sklearn/binary_models.py index ca137d0..9846f47 100644 --- a/src/models/sklearn/binary_models.py +++ b/src/models/sklearn/binary_models.py @@ -113,7 +113,11 @@ def preprocess(self, ds, scaling = False, scaler_file = None): self._labels_map[label] = encoded # Scaling - self._scaler = TensorMinMaxScaler(self._nb_kmers) + self._scaler = TensorTfIdfTransformer( + features = self.kmers, + file = scaler_file + ) + # self._scaler = TensorMinMaxScaler(self._nb_kmers) self._scaler.fit(ds) diff --git a/src/models/sklearn/multiclass_models.py b/src/models/sklearn/multiclass_models.py index d449e64..c464d0d 100644 --- a/src/models/sklearn/multiclass_models.py +++ b/src/models/sklearn/multiclass_models.py @@ -7,7 +7,6 @@ # Preprocessing from models.encoders.model_label_encoder import ModelLabelEncoder from 
models.preprocessors.min_max_scaler import TensorMinMaxScaler -from models.encoders.onesvm_label_encoder import OneClassSVMLabelEncoder from models.preprocessors.tfidf_transformer import TensorTfIdfTransformer # Training @@ -120,7 +119,11 @@ def preprocess(self, ds, scaling = False, scaler_file = None): self._weights = self._compute_weights() # Scaling - self._scaler = TensorMinMaxScaler(self._nb_kmers) + self._scaler = TensorTfIdfTransformer( + features = self.kmers, + file = scaler_file + ) + # self._scaler = TensorMinMaxScaler(self._nb_kmers) self._scaler.fit(ds) # Models training From 291d5a952e6eeb978be029e97ba4cd438c33791f Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Thu, 14 Dec 2023 19:40:44 -0500 Subject: [PATCH 68/92] sklearn SGD default regularization --- src/models/sklearn/binary_models.py | 1 - src/models/sklearn/multiclass_models.py | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/models/sklearn/binary_models.py b/src/models/sklearn/binary_models.py index 9846f47..8a2af15 100644 --- a/src/models/sklearn/binary_models.py +++ b/src/models/sklearn/binary_models.py @@ -178,7 +178,6 @@ def _build(self): self._clf = SGDClassifier() self._train_params = { 'loss' : 'hinge', - 'penalty' : 'elasticnet', 'learning_rate' : 'optimal', 'class_weight' : self._weights, 'n_jobs' : -1 diff --git a/src/models/sklearn/multiclass_models.py b/src/models/sklearn/multiclass_models.py index c464d0d..ef2de2a 100644 --- a/src/models/sklearn/multiclass_models.py +++ b/src/models/sklearn/multiclass_models.py @@ -160,9 +160,8 @@ def build_fit_sgd(train_data):#, val_data): # y_val = np.array(y_val[msk_val]) cluster = train_data['cluster'][0] model = SGDClassifier( - learning_rate = 'optimal', loss = 'modified_huber', - penalty = 'l2', + learning_rate = 'optimal', class_weight = self._weights, ) model.fit(X_train, y_train) From eebbd743fed392fa3e4de9c359960796c05faede Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Thu, 14 Dec 2023 19:42:37 -0500 Subject: [PATCH 69/92] MinMax scaler for usage with decomposed data --- src/models/kerasTF/binary_models.py | 10 +++++----- src/models/kerasTF/multiclass_models.py | 10 +++++----- src/models/sklearn/binary_models.py | 10 +++++----- src/models/sklearn/multiclass_models.py | 10 +++++----- 4 files changed, 20 insertions(+), 20 deletions(-) diff --git a/src/models/kerasTF/binary_models.py b/src/models/kerasTF/binary_models.py index 8eedd51..fe7fb58 100644 --- a/src/models/kerasTF/binary_models.py +++ b/src/models/kerasTF/binary_models.py @@ -143,11 +143,11 @@ def preprocess(self, ds, scaling = False, scaler_file = None): self._weights = self._compute_weights() # Scaling - self._scaler = TensorTfIdfTransformer( - features = self.kmers, - file = scaler_file - ) - # self._scaler = TensorMinMaxScaler(self._nb_kmers) + # self._scaler = TensorTfIdfTransformer( + # features = self.kmers, + # file = scaler_file + # ) + self._scaler = TensorMinMaxScaler(self._nb_kmers) self._scaler.fit(ds) # Model training diff --git a/src/models/kerasTF/multiclass_models.py b/src/models/kerasTF/multiclass_models.py index d1d1651..bc1b963 100644 --- a/src/models/kerasTF/multiclass_models.py +++ b/src/models/kerasTF/multiclass_models.py @@ -137,11 +137,11 @@ def preprocess(self, ds, scaling = False, scaler_file = None): self._weights = self._compute_weights() # Scaling - self._scaler = TensorTfIdfTransformer( - features = self.kmers, - file = scaler_file - ) - # self._scaler = TensorMinMaxScaler(self._nb_kmers) + # self._scaler = 
TensorTfIdfTransformer( + # features = self.kmers, + # file = scaler_file + # ) + self._scaler = TensorMinMaxScaler(self._nb_kmers) self._scaler.fit(ds) # Models training diff --git a/src/models/sklearn/binary_models.py b/src/models/sklearn/binary_models.py index 8a2af15..0ffa104 100644 --- a/src/models/sklearn/binary_models.py +++ b/src/models/sklearn/binary_models.py @@ -113,11 +113,11 @@ def preprocess(self, ds, scaling = False, scaler_file = None): self._labels_map[label] = encoded # Scaling - self._scaler = TensorTfIdfTransformer( - features = self.kmers, - file = scaler_file - ) - # self._scaler = TensorMinMaxScaler(self._nb_kmers) + # self._scaler = TensorTfIdfTransformer( + # features = self.kmers, + # file = scaler_file + # ) + self._scaler = TensorMinMaxScaler(self._nb_kmers) self._scaler.fit(ds) diff --git a/src/models/sklearn/multiclass_models.py b/src/models/sklearn/multiclass_models.py index ef2de2a..d12724b 100644 --- a/src/models/sklearn/multiclass_models.py +++ b/src/models/sklearn/multiclass_models.py @@ -119,11 +119,11 @@ def preprocess(self, ds, scaling = False, scaler_file = None): self._weights = self._compute_weights() # Scaling - self._scaler = TensorTfIdfTransformer( - features = self.kmers, - file = scaler_file - ) - # self._scaler = TensorMinMaxScaler(self._nb_kmers) + # self._scaler = TensorTfIdfTransformer( + # features = self.kmers, + # file = scaler_file + # ) + self._scaler = TensorMinMaxScaler(self._nb_kmers) self._scaler.fit(ds) # Models training From aa888a73a48969e3d165194bf3afbe02e016aa84 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Sun, 17 Dec 2023 12:46:46 -0500 Subject: [PATCH 70/92] weighted multiclass sklearn predict_proba --- src/models/kerasTF/multiclass_models.py | 37 ------------------------- src/models/sklearn/multiclass_models.py | 8 +++++- 2 files changed, 7 insertions(+), 38 deletions(-) diff --git a/src/models/kerasTF/multiclass_models.py b/src/models/kerasTF/multiclass_models.py index bc1b963..55e7421 100644 --- a/src/models/kerasTF/multiclass_models.py +++ b/src/models/kerasTF/multiclass_models.py @@ -149,43 +149,6 @@ def preprocess(self, ds, scaling = False, scaler_file = None): def fit(self, datasets): print('fit') - """ - TODO: If Ray AIR training is too long, try using the datasets groupby / Tune for multimodel training - TODO: train_func per model - TODO: Confirm how it works in Jupyter Notebook - # Preprocessing loop - for name, ds in datasets.items(): - # ds = ds.drop_columns(['id']) - ds = self._encoder.transform(ds) - if self._scaler is not None: - ds = self._scaler.transform(ds) - ds = ds.materialize() - datasets[name] = ds - - # One sub-model per artificial cluster of samples - ds['train'] = self._random_split_dataset(ds['train']) - - # Checkpointing directory - model_dir = os.path.join(self._workdir, f'{self.classifier}_{self.taxa}') - if not os.path.isdir(model_dir): - os.mkdir(model_dir) - - # Distributed building & training - if self.classifier == 'lstm_attention': - print('Training multiclass classifier based on Deep Neural Network hybrid between LSTM and Attention') - training_result = ds.map_groups(build_fit_lstm_attention, batch_format = 'numpy') - elif self.classifier == 'cnn': - print('Training multiclass classifier based on CNN Neural Network') - training_result = ds.map_groups(build_fit_cnn, batch_format = 'numpy') - elif self.classifier == 'widecnn': - print('Training multiclass classifier based on Wide CNN Network') - training_result = ds.map_groups(build_fit_widecnn, batch_format = 'numpy') - - 
training_result = training_result.to_pandas().to_dict('records') - for record in training_result: - self._model_ckpt[record['cluster']] = record['file'] - """ - # Preprocessing loop for name, ds in datasets.items(): # ds = ds.drop_columns(['id']) diff --git a/src/models/sklearn/multiclass_models.py b/src/models/sklearn/multiclass_models.py index d12724b..f6fade1 100644 --- a/src/models/sklearn/multiclass_models.py +++ b/src/models/sklearn/multiclass_models.py @@ -259,11 +259,17 @@ def predict_func(data): proba = model.predict_proba(X) for i, cls in enumerate(model.classes_): pred[:, cls] += proba[:, i] - # pred = pred / len(self._model_ckpt) + pred = pred / len(self._model_ckpt) return {'predictions' : pred} probabilities = ds.map_batches(predict_func, batch_format = 'numpy') probabilities = _unwrap_ndarray_object_type_if_needed(probabilities.to_pandas()['predictions']) + + weights = np.zeros(len(self._weights)) + for encoded, w in self._weights.items(): + weights[encoded] = w + + probabilities = probabilities * weights return probabilities else: From 4cd603f74df5e0ad532f2cf49336cd4aecc74a8d Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Sun, 17 Dec 2023 16:27:01 -0500 Subject: [PATCH 71/92] debug multiclass weighted proba --- src/models/sklearn/multiclass_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/models/sklearn/multiclass_models.py b/src/models/sklearn/multiclass_models.py index f6fade1..0f185d4 100644 --- a/src/models/sklearn/multiclass_models.py +++ b/src/models/sklearn/multiclass_models.py @@ -252,7 +252,7 @@ def _predict_proba(self, ds): def predict_func(data): X = _unwrap_ndarray_object_type_if_needed(data[TENSOR_COLUMN_NAME]) - pred = np.zeros((len(X), len(self._labels_map))) + pred = np.zeros((len(X), len(self._labels_map)-1)) for cluster, model_file in self._model_ckpt.items(): with open(model_file, 'rb') as file: model = cpickle.load(file) From 1391c327c57f1f9d4c078dbfcc2d140752d2eb29 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Mon, 18 Dec 2023 14:45:04 -0500 Subject: [PATCH 72/92] predictions rectify --- src/Caribou_reduce_features.py | 2 +- src/models/kerasTF/multiclass_models.py | 85 ++-------------------- src/models/models_utils.py | 6 +- src/models/preprocessors/min_max_scaler.py | 10 +-- src/models/sklearn/multiclass_models.py | 15 ---- 5 files changed, 13 insertions(+), 105 deletions(-) diff --git a/src/Caribou_reduce_features.py b/src/Caribou_reduce_features.py index efe88db..1f2e5bb 100644 --- a/src/Caribou_reduce_features.py +++ b/src/Caribou_reduce_features.py @@ -151,7 +151,7 @@ def features_selection(train_ds, export_ds, kmers, taxa): parser.add_argument('-dt','--dataset_name', default='dataset', help='Name of the dataset used to name files') parser.add_argument('-l','--kmers_list', default=None, type=Path, help='PATH to a file containing a list of k-mers that will be reduced') # Parameters - parser.add_argument('-t','--taxa', default='phylum', help='The taxonomic level to use for the classification, defaults to Phylum.') + parser.add_argument('-t','--taxa', default='species', help='The taxonomic level to use for the classification, defaults to Phylum.') parser.add_argument('-o','--outdir', required=True, type=Path, help='PATH to a directory on file where outputs will be saved') parser.add_argument('-wd','--workdir', default='/tmp/spill', type=Path, help='Optional. 
Path to a working directory where tuning data will be spilled') args = parser.parse_args() diff --git a/src/models/kerasTF/multiclass_models.py b/src/models/kerasTF/multiclass_models.py index 55e7421..4980023 100644 --- a/src/models/kerasTF/multiclass_models.py +++ b/src/models/kerasTF/multiclass_models.py @@ -32,6 +32,9 @@ from ray.train.tensorflow import TensorflowPredictor from ray.train.batch_predictor import BatchPredictor +# Data +from ray.air.util.data_batch_conversion import _unwrap_ndarray_object_type_if_needed + __author__ = 'Nicolas de Montigny' __all__ = ['KerasTFModel'] @@ -195,28 +198,16 @@ def fit(self, datasets): def predict(self, ds): print('predict') - """ - TODO: If Ray AIR training is too long, try using the datasets groupby / Tune for multimodel training - probabilities = self._predict_proba(ds) - predictions = np.argmax(probabilities, axis = 1) - predictions = self._label_decode(predictions) - return predictions - """ # Predict with model - predictions = self._predict_proba(ds) + probabilities = self._predict_proba(ds) # Convert predictions to labels - predictions = self._get_abs_pred(predictions) + probabilities = _unwrap_ndarray_object_type_if_needed(probabilities.to_pandas()['predictions']) + predictions = np.argmax(probabilities, axis = 1) # Return decoded labels return self._label_decode(predictions) def predict_proba(self, ds, threshold = 0.8): print('predict_proba') - """ - TODO: If Ray AIR training is too long, try using the datasets groupby / Tune for multimodel training - probabilities = self._predict_proba(ds) - predictions = self._get_threshold_pred(probabilities, threshold) - return self._label_decode(predictions) - """ # Predict with model predictions = self._predict_proba(ds) # Convert predictions to labels with threshold @@ -226,32 +217,6 @@ def predict_proba(self, ds, threshold = 0.8): def _predict_proba(self, ds): print('_predict_proba') - """ - TODO: If Ray AIR training is too long, try using the datasets groupby / Tune for multimodel training - if ds.count() > 0: - if self._scaler is not None: - ds = self._scaler.transform(ds) - # ds = ds.materialize() - - def predict_func(data): - X = _unwrap_ndarray_object_type_if_needed(data[TENSOR_COLUMN_NAME]) - pred = np.zeros((len(X), len(self._labels_map))) - for cluster, model_file in self._model_ckpt.items(): - with open(model_file, 'rb') as file: - model = cpickle.load(file) - proba = model.predict_proba(X) - for i, cls in enumerate(model.classes_): - pred[:, cls] += proba[:, i] - # pred = pred / len(self._model_ckpt) - return {'predictions' : pred} - - probabilities = ds.map_batches(predict_func, batch_format = 'numpy') - probabilities = _unwrap_ndarray_object_type_if_needed(probabilities.to_pandas()['predictions']) - - return probabilities - else: - raise ValueError('Empty dataset, cannot execute predictions!') - """ if ds.count() > 0: if len(ds.schema().names) > 1: col_2_drop = [col for col in ds.schema().names if col != TENSOR_COLUMN_NAME] @@ -284,28 +249,6 @@ def predict_func(data): else: raise ValueError('No data to predict') - def _get_abs_pred(self, predictions): - print('_get_abs_pred') - def map_predicted_label(ds): - ds = ds['predictions'] - pred = pd.DataFrame({ - 'best_proba': [np.max(arr) for arr in ds], - 'predicted_label' : [np.argmax(arr) for arr in ds] - }) - - return {'predictions' : pred['predicted_label'].to_numpy(dtype = np.int32)} - - predict = [] - predictions = predictions.map_batches( - lambda batch : map_predicted_label(batch), - batch_format = 'numpy', - batch_size = 
self.batch_size - ) - for row in predictions.iter_rows(): - predict.append(row['predictions']) - - return predict - def _get_threshold_pred(self, predictions, threshold): print('_get_threshold_pred') def map_predicted_label(ds, threshold): @@ -328,19 +271,3 @@ def map_predicted_label(ds, threshold): predict.append(row['predictions']) return predict - -# TODO: Confirm how it works in Jupyter Notebook -def build_fit_lstm_attention(data): - """ - LSTM-Attention NN training function - """ - -def build_fit_cnn(data): - """ - Convolution NN training function - """ - -def build_fit_widecnn(data): - """ - Wide Convolution NN training function - """ \ No newline at end of file diff --git a/src/models/models_utils.py b/src/models/models_utils.py index da4061b..7b7fc6c 100644 --- a/src/models/models_utils.py +++ b/src/models/models_utils.py @@ -110,11 +110,6 @@ def _get_threshold_pred(self): """ """ - @abstractmethod - def _label_decode(self): - """ - """ - def _compute_weights(self): """ Set class weights depending on their abundance in data-associated classes csv @@ -141,6 +136,7 @@ def _compute_weights(self): def _label_decode(self, predict): print('_label_decode') + decoded = pd.Series(np.empty(len(predict), dtype=object)) for label, encoded in self._labels_map.items(): decoded[predict == encoded] = label diff --git a/src/models/preprocessors/min_max_scaler.py b/src/models/preprocessors/min_max_scaler.py index 0f672a6..a311b5e 100644 --- a/src/models/preprocessors/min_max_scaler.py +++ b/src/models/preprocessors/min_max_scaler.py @@ -15,7 +15,7 @@ class TensorMinMaxScaler(Preprocessor): def __init__(self, nb_features): # Parameters - self.__nb_features = nb_features + self._nb_features = nb_features def _fit(self, ds: Dataset) -> Preprocessor: """ @@ -26,12 +26,12 @@ def _fit(self, ds: Dataset) -> Preprocessor: def Min(dct): arr = dct[TENSOR_COLUMN_NAME] - min = np.array([arr[:,i].min() for i in range(self.__nb_features)]) + min = np.array([arr[:,i].min() for i in range(self._nb_features)]) return min def Max(dct): arr = dct[TENSOR_COLUMN_NAME] - max = np.array([arr[:,i].max() for i in range(self.__nb_features)]) + max = np.array([arr[:,i].max() for i in range(self._nb_features)]) return max for batch in ds.iter_batches(batch_format = 'numpy'): @@ -41,8 +41,8 @@ def Max(dct): min = np.array(min) max = np.array(max) - min = np.array([min[:,i].min() for i in range(self.__nb_features)]) - max = np.array([max[:,i].max() for i in range(self.__nb_features)]) + min = np.array([min[:,i].min() for i in range(self._nb_features)]) + max = np.array([max[:,i].max() for i in range(self._nb_features)]) self.stats_ = {'min' : min, 'max' : max} diff --git a/src/models/sklearn/multiclass_models.py b/src/models/sklearn/multiclass_models.py index 0f185d4..7a53787 100644 --- a/src/models/sklearn/multiclass_models.py +++ b/src/models/sklearn/multiclass_models.py @@ -248,7 +248,6 @@ def predict_proba(self, ds, threshold = 0.8): def _predict_proba(self, ds): if ds.count() > 0: ds = self._scaler.transform(ds) - # ds = ds.materialize() def predict_func(data): X = _unwrap_ndarray_object_type_if_needed(data[TENSOR_COLUMN_NAME]) @@ -264,12 +263,6 @@ def predict_func(data): probabilities = ds.map_batches(predict_func, batch_format = 'numpy') probabilities = _unwrap_ndarray_object_type_if_needed(probabilities.to_pandas()['predictions']) - - weights = np.zeros(len(self._weights)) - for encoded, w in self._weights.items(): - weights[encoded] = w - - probabilities = probabilities * weights return probabilities else: @@ -289,11 
+282,3 @@ def _get_threshold_pred(self, predict, threshold): proba_predict.loc[proba_predict['best_proba'] < threshold, 'predicted_label'] = -1 return proba_predict['predicted_label'] - - def _label_decode(self, predict): - print('_label_decode') - decoded = pd.Series(np.empty(len(predict), dtype=object)) - for label, encoded in self._labels_map.items(): - decoded[predict == encoded] = label - - return np.array(decoded) From 5eba525bc3252c3a6c19ba95da9d232dbdb44339 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Mon, 18 Dec 2023 16:54:15 -0500 Subject: [PATCH 73/92] performances tuning --- src/models/classification.py | 3 ++- src/models/kerasTF/binary_models.py | 8 ++++---- src/models/kerasTF/multiclass_models.py | 8 ++++---- src/models/sklearn/binary_models.py | 8 ++++---- src/models/sklearn/multiclass_models.py | 16 +++++++--------- 5 files changed, 21 insertions(+), 22 deletions(-) diff --git a/src/models/classification.py b/src/models/classification.py index b1f76e6..c164bf8 100644 --- a/src/models/classification.py +++ b/src/models/classification.py @@ -320,7 +320,8 @@ def _score_cv(self, y_true, y_pred, taxa): support = precision_recall_fscore_support( y_compare['y_true'], y_compare['y_pred'], - average = 'weighted' + average = 'weighted', + zero_division = np.nan ) scores = pd.DataFrame({ diff --git a/src/models/kerasTF/binary_models.py b/src/models/kerasTF/binary_models.py index fe7fb58..ae30644 100644 --- a/src/models/kerasTF/binary_models.py +++ b/src/models/kerasTF/binary_models.py @@ -147,8 +147,8 @@ def preprocess(self, ds, scaling = False, scaler_file = None): # features = self.kmers, # file = scaler_file # ) - self._scaler = TensorMinMaxScaler(self._nb_kmers) - self._scaler.fit(ds) + # self._scaler = TensorMinMaxScaler(self._nb_kmers) + # self._scaler.fit(ds) # Model training ######################################################################################################### @@ -159,7 +159,7 @@ def fit(self, datasets): for name, ds in datasets.items(): # ds = ds.drop_columns(['id']) ds = self._encoder.transform(ds) - ds = self._scaler.transform(ds) + # ds = self._scaler.transform(ds) ds = ds.materialize() datasets[name] = ds @@ -224,7 +224,7 @@ def _predict_proba(self, ds): ds = ds.drop_columns(col_2_drop) # Preprocess - ds = self._scaler.transform(ds) + # ds = self._scaler.transform(ds) ds = ds.materialize() self._predictor = BatchPredictor.from_checkpoint( diff --git a/src/models/kerasTF/multiclass_models.py b/src/models/kerasTF/multiclass_models.py index 4980023..9807556 100644 --- a/src/models/kerasTF/multiclass_models.py +++ b/src/models/kerasTF/multiclass_models.py @@ -144,8 +144,8 @@ def preprocess(self, ds, scaling = False, scaler_file = None): # features = self.kmers, # file = scaler_file # ) - self._scaler = TensorMinMaxScaler(self._nb_kmers) - self._scaler.fit(ds) + # self._scaler = TensorMinMaxScaler(self._nb_kmers) + # self._scaler.fit(ds) # Models training ######################################################################################################### @@ -156,7 +156,7 @@ def fit(self, datasets): for name, ds in datasets.items(): # ds = ds.drop_columns(['id']) ds = self._encoder.transform(ds) - ds = self._scaler.transform(ds) + # ds = self._scaler.transform(ds) ds = ds.materialize() datasets[name] = ds @@ -223,7 +223,7 @@ def _predict_proba(self, ds): ds = ds.drop_columns(col_2_drop) # Preprocess - ds = self._scaler.transform(ds) + # ds = self._scaler.transform(ds) ds = ds.materialize() self._predictor = BatchPredictor.from_checkpoint( 
diff --git a/src/models/sklearn/binary_models.py b/src/models/sklearn/binary_models.py index 0ffa104..cd5e6c0 100644 --- a/src/models/sklearn/binary_models.py +++ b/src/models/sklearn/binary_models.py @@ -117,8 +117,8 @@ def preprocess(self, ds, scaling = False, scaler_file = None): # features = self.kmers, # file = scaler_file # ) - self._scaler = TensorMinMaxScaler(self._nb_kmers) - self._scaler.fit(ds) + # self._scaler = TensorMinMaxScaler(self._nb_kmers) + # self._scaler.fit(ds) # Model training @@ -131,7 +131,7 @@ def fit(self, datasets): for name, ds in datasets.items(): # ds = ds.drop_columns(['id']) ds = self._encoder.transform(ds) - ds = self._scaler.transform(ds) + # ds = self._scaler.transform(ds) datasets[name] = ray.put(ds) try: @@ -189,7 +189,7 @@ def _build(self): def predict(self, ds): print('predict') if ds.count() > 0: - ds = self._scaler.transform(ds) + # ds = self._scaler.transform(ds) ds = ds.materialize() predict_kwargs = {'features':self.kmers, 'num_estimator_cpus':-1} self._predictor = BatchPredictor.from_checkpoint(self._model_ckpt, SklearnTensorPredictor) diff --git a/src/models/sklearn/multiclass_models.py b/src/models/sklearn/multiclass_models.py index 7a53787..f7e9f10 100644 --- a/src/models/sklearn/multiclass_models.py +++ b/src/models/sklearn/multiclass_models.py @@ -118,13 +118,9 @@ def preprocess(self, ds, scaling = False, scaler_file = None): # Class weights self._weights = self._compute_weights() - # Scaling - # self._scaler = TensorTfIdfTransformer( - # features = self.kmers, - # file = scaler_file - # ) - self._scaler = TensorMinMaxScaler(self._nb_kmers) - self._scaler.fit(ds) + if self.classifier == 'mnb': + self._scaler = TensorMinMaxScaler(self._nb_kmers) + self._scaler.fit(ds) # Models training ######################################################################################################### @@ -135,7 +131,8 @@ def fit(self, datasets): # ds = ds.drop_columns(['id']) train_ds = datasets['train'] train_ds = self._encoder.transform(train_ds) - train_ds = self._scaler.transform(train_ds) + if self.classifier == 'mnb': + train_ds = self._scaler.transform(train_ds) # datasets[name] = ds # One sub-model per artificial cluster of samples @@ -247,7 +244,8 @@ def predict_proba(self, ds, threshold = 0.8): def _predict_proba(self, ds): if ds.count() > 0: - ds = self._scaler.transform(ds) + if self.classifier == 'mnb': + ds = self._scaler.transform(ds) def predict_func(data): X = _unwrap_ndarray_object_type_if_needed(data[TENSOR_COLUMN_NAME]) From 747ae2e9326df5929aaff1a2be05c9ae47defb24 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Mon, 18 Dec 2023 17:09:23 -0500 Subject: [PATCH 74/92] LSTM params to be used with cuDNN --- src/models/kerasTF/build_neural_networks.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/models/kerasTF/build_neural_networks.py b/src/models/kerasTF/build_neural_networks.py index e69747a..12893f3 100644 --- a/src/models/kerasTF/build_neural_networks.py +++ b/src/models/kerasTF/build_neural_networks.py @@ -22,8 +22,8 @@ def build_attention(nb_features): inputs = Input(shape = (nb_features,1)) # x = Embedding(nb_features, 128)(inputs) - x = LSTM(128, return_sequences = True, dropout = 0.1, recurrent_dropout = 0.1 )(inputs) - x = LSTM(128, return_sequences = True, dropout = 0.1, recurrent_dropout = 0.1 )(x) + x = LSTM(128, return_sequences = True, dropout = 0.1)(inputs) + x = LSTM(128, return_sequences = True, dropout = 0.1)(x) x = AttentionWeightedAverage()(x) x = Dense(128, activation = 
"relu")(x) @@ -46,7 +46,7 @@ def build_LSTM(nb_features): inputs = Input(shape = (nb_features,1)) # x = Embedding(nb_features, 128)(inputs) - x = LSTM(128, recurrent_dropout = 0.1, dropout = 0.1)(inputs) + x = LSTM(128, dropout = 0.1)(inputs) x = Dense(1, activation = 'tanh')(x) @@ -66,8 +66,8 @@ def build_deepLSTM(nb_features): inputs = Input(shape=(nb_features,1)) # netA = Embedding(nb_features, 128)(inputs) - netA = LSTM(40, activation='tanh',recurrent_dropout=0.05,dropout=0.1,name='A_%d'%40,return_sequences=True) (inputs) - netA = LSTM(40, activation='tanh',recurrent_dropout=0.05,dropout=0.1,name='B_%d'%40) (netA) + netA = LSTM(40, activation='tanh',dropout=0.1,name='A_%d'%40,return_sequences=True) (inputs) + netA = LSTM(40, activation='tanh',dropout=0.1,name='B_%d'%40) (netA) netB = Dense(100, activation='tanh',name='G_%d'%40) (inputs) netB = Dense(100, activation='tanh',name='H_%d'%40) (netB) From d851ee0905fa87b946b1f6926c6fd10c05609a39 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Tue, 19 Dec 2023 16:49:07 -0500 Subject: [PATCH 75/92] keras debugged for GPU --- src/models/kerasTF/binary_models.py | 25 ++------ src/models/kerasTF/models.py | 78 +++++++++++++++++++++++-- src/models/kerasTF/multiclass_models.py | 25 ++------ 3 files changed, 86 insertions(+), 42 deletions(-) diff --git a/src/models/kerasTF/binary_models.py b/src/models/kerasTF/binary_models.py index ae30644..1a81eb9 100644 --- a/src/models/kerasTF/binary_models.py +++ b/src/models/kerasTF/binary_models.py @@ -20,7 +20,7 @@ from ray.air import session # from ray.air.integrations.keras import Callback from ray.air.config import ScalingConfig -from models.kerasTF.models import train_func, build_model +from models.kerasTF.models import train_func_CPU, train_func_GPU, build_model from ray.air.integrations.keras import ReportCheckpointCallback from ray.train.tensorflow import TensorflowTrainer, TensorflowCheckpoint @@ -93,25 +93,7 @@ def __init__( kmers_list, csv ) - # Parameters - # Initialize hidden - self._nb_CPU_data = int(os.cpu_count() * 0.2) - self._nb_CPU_training = int(os.cpu_count() - self._nb_CPU_data) - self._nb_GPU = len(tf.config.list_physical_devices('GPU')) - # Initialize empty self._nb_classes = 2 - self._nb_CPU_per_worker = 0 - self._nb_GPU_per_worker = 0 - # Computing variables - if self._nb_GPU > 0: - self._use_gpu = True - self._n_workers = self._nb_GPU - self._nb_CPU_per_worker = int(self._nb_CPU_training / self._n_workers) - self._nb_GPU_per_worker = 1 - else: - self._use_gpu = False - self._n_workers = int(self._nb_CPU_training * 0.2) - self._nb_CPU_per_worker = int(int(self._nb_CPU_training * 0.8) / self._n_workers) if self.classifier == 'attention': print('Training bacterial / host classifier based on Attention Weighted Neural Network') @@ -173,6 +155,11 @@ def fit(self, datasets): 'weights': self._weights } + if self._nb_GPU > 0: + train_func = train_func_GPU + else: + train_func = train_func_CPU + # Define trainer / tuner self._trainer = TensorflowTrainer( train_loop_per_worker=train_func, diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index ba1c3a4..d26b28f 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -94,7 +94,25 @@ def __init__( kmers_list, csv ) - + # Parameters + # Initialize hidden + self._nb_CPU_data = int(os.cpu_count() * 0.2) # 6 + self._nb_CPU_training = int(os.cpu_count() - self._nb_CPU_data) # 26 + self._nb_GPU = len(tf.config.list_physical_devices('GPU')) # 6 + # Initialize empty + self._nb_CPU_per_worker = 0 + 
self._nb_GPU_per_worker = 0 + # Computing variables + if self._nb_GPU > 0: + self._use_gpu = True + self._n_workers = self._nb_GPU #6 + self._nb_CPU_per_worker = int(self._nb_CPU_training / self._n_workers) # 4 + self._nb_GPU_per_worker = 1 + else: + self._use_gpu = False + self._n_workers = int(self._nb_CPU_training * 0.2) + self._nb_CPU_per_worker = int(int(self._nb_CPU_training * 0.8) / self._n_workers) + @abstractmethod def preprocess(self): """ @@ -120,7 +138,6 @@ def _get_threshold_pred(self): """ """ - # Training/building function outside of the class as mentioned on the Ray discussion # https://discuss.ray.io/t/statuscode-resource-exhausted/4379/16 ################################################################################ @@ -130,7 +147,7 @@ def _get_threshold_pred(self): # Smaller nb of workers + bigger nb CPU_per_worker + smaller batch_size to avoid memory overload # https://discuss.ray.io/t/ray-sgd-distributed-tensorflow/261/8 -def train_func(config): +def train_func_CPU(config): # Parameters batch_size = config.get('batch_size', 128) epochs = config.get('epochs', 10) @@ -139,7 +156,7 @@ def train_func(config): model = config.get('model') weights = config.get('weights') - # Model construction + # Model construction model = build_model(model, nb_cls, size) train_data = session.get_dataset_shard('train') @@ -181,6 +198,59 @@ def train_func(config): gc.collect() tf.keras.backend.clear_session() +def train_func_GPU(config): + # Parameters + batch_size = config.get('batch_size', 128) + epochs = config.get('epochs', 10) + size = config.get('size') + nb_cls = config.get('nb_cls') + model = config.get('model') + weights = config.get('weights') + + # Model construction + strategy = tf.distribute.MirroredStrategy() + with strategy.scope(): + model = build_model(model, nb_cls, size) + + train_data = session.get_dataset_shard('train') + val_data = session.get_dataset_shard('validation') + + for _ in range(epochs): + batch_train = train_data.to_tf( + feature_columns = TENSOR_COLUMN_NAME, + label_columns = LABELS_COLUMN_NAME, + batch_size = batch_size, + local_shuffle_buffer_size = batch_size, + local_shuffle_seed = int(np.random.randint(1,10000, size = 1)) + ) + batch_val = val_data.to_tf( + feature_columns = TENSOR_COLUMN_NAME, + label_columns = LABELS_COLUMN_NAME, + batch_size = batch_size, + local_shuffle_buffer_size = batch_size, + local_shuffle_seed = int(np.random.randint(1,10000, size = 1)) + ) + history = model.fit( + x = batch_train, + validation_data = batch_val, + callbacks = [ReportCheckpointCallback()], + class_weight = weights, + verbose = 0 + ) + session.report({ + 'accuracy': history.history['accuracy'][0], + 'loss': history.history['loss'][0], + 'val_accuracy': history.history['val_accuracy'][0], + 'val_loss': history.history['val_loss'][0], + }, + checkpoint=TensorflowCheckpoint.from_model(model) + ) + gc.collect() + tf.keras.backend.clear_session() + del model + gc.collect() + tf.keras.backend.clear_session() + def build_model(classifier, nb_cls, nb_kmers): if classifier == 'attention': model = build_attention(nb_kmers) diff --git a/src/models/kerasTF/multiclass_models.py b/src/models/kerasTF/multiclass_models.py index 9807556..68fe190 100644 --- a/src/models/kerasTF/multiclass_models.py +++ b/src/models/kerasTF/multiclass_models.py @@ -21,7 +21,7 @@ from ray.air import session # from ray.air.integrations.keras import Callback from ray.air.config import ScalingConfig -from models.kerasTF.models import train_func, build_model +from models.kerasTF.models import 
train_func_CPU, train_func_GPU, build_model from ray.air.integrations.keras import ReportCheckpointCallback from ray.train.tensorflow import TensorflowTrainer, TensorflowCheckpoint @@ -97,25 +97,7 @@ def __init__( kmers_list, csv ) - # Parameters - # Initialize hidden - self._nb_CPU_data = int(os.cpu_count() * 0.2) - self._nb_CPU_training = int(os.cpu_count() - self._nb_CPU_data) - self._nb_GPU = len(tf.config.list_physical_devices('GPU')) - # Initialize empty self._nb_classes = None - self._nb_CPU_per_worker = 0 - self._nb_GPU_per_worker = 0 - # Computing variables - if self._nb_GPU > 0: - self._use_gpu = True - self._n_workers = self._nb_GPU - self._nb_CPU_per_worker = int(self._nb_CPU_training / self._n_workers) - self._nb_GPU_per_worker = 1 - else: - self._use_gpu = False - self._n_workers = int(self._nb_CPU_training * 0.2) - self._nb_CPU_per_worker = int(int(self._nb_CPU_training * 0.8) / self._n_workers) # Data preprocessing ######################################################################################################### @@ -170,6 +152,11 @@ def fit(self, datasets): 'weights': self._weights } + if self._nb_GPU > 0: + train_func = train_func_GPU + else: + train_func = train_func_CPU + # Define trainer / tuner self._trainer = TensorflowTrainer( train_loop_per_worker=train_func, From 64bd8d8b08dd51b7763906083cec171bfe895011 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Tue, 19 Dec 2023 16:55:29 -0500 Subject: [PATCH 76/92] keras predictions all resources --- src/models/kerasTF/binary_models.py | 19 +++++-------------- src/models/kerasTF/multiclass_models.py | 19 +++++-------------- 2 files changed, 10 insertions(+), 28 deletions(-) diff --git a/src/models/kerasTF/binary_models.py b/src/models/kerasTF/binary_models.py index 1a81eb9..0679396 100644 --- a/src/models/kerasTF/binary_models.py +++ b/src/models/kerasTF/binary_models.py @@ -219,20 +219,11 @@ def _predict_proba(self, ds): TensorflowPredictor, model_definition = lambda: build_model(self.classifier, self._nb_classes, self._nb_kmers) ) - if self._nb_GPU > 0: - predictions = self._predictor.predict( - data = ds, - feature_columns = [TENSOR_COLUMN_NAME], - batch_size = self.batch_size, - num_gpus_per_worker = self._nb_GPU_per_worker - ) - else: - predictions = self._predictor.predict( - data = ds, - feature_columns = [TENSOR_COLUMN_NAME], - batch_size = self.batch_size, - num_cpus_per_worker = self._nb_CPU_per_worker - ) + predictions = self._predictor.predict( + data = ds, + feature_columns = [TENSOR_COLUMN_NAME], + batch_size = self.batch_size, + ) return predictions else: raise ValueError('No data to predict') diff --git a/src/models/kerasTF/multiclass_models.py b/src/models/kerasTF/multiclass_models.py index 68fe190..4ab1b9c 100644 --- a/src/models/kerasTF/multiclass_models.py +++ b/src/models/kerasTF/multiclass_models.py @@ -218,20 +218,11 @@ def _predict_proba(self, ds): TensorflowPredictor, model_definition = lambda: build_model(self.classifier, self._nb_classes, self._nb_kmers) ) - if self._nb_GPU > 0: - predictions = self._predictor.predict( - data = ds, - feature_columns = [TENSOR_COLUMN_NAME], - batch_size = self.batch_size, - num_gpus_per_worker = self._nb_GPU_per_worker - ) - else: - predictions = self._predictor.predict( - data = ds, - feature_columns = [TENSOR_COLUMN_NAME], - batch_size = self.batch_size, - num_cpus_per_worker = self._nb_CPU_per_worker - ) + predictions = self._predictor.predict( + data = ds, + feature_columns = [TENSOR_COLUMN_NAME], + batch_size = self.batch_size, + ) return 
predictions else: raise ValueError('No data to predict') From fb4fc1b354a7daa08af75d1d3cab10fba444d515 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Wed, 20 Dec 2023 12:34:14 -0500 Subject: [PATCH 77/92] keras CPU/GPU strategies --- src/models/kerasTF/binary_models.py | 148 +++++++------- src/models/kerasTF/models.py | 259 +++++++++++++++++++----- src/models/kerasTF/multiclass_models.py | 120 ++++++----- 3 files changed, 358 insertions(+), 169 deletions(-) diff --git a/src/models/kerasTF/binary_models.py b/src/models/kerasTF/binary_models.py index 0679396..b03165b 100644 --- a/src/models/kerasTF/binary_models.py +++ b/src/models/kerasTF/binary_models.py @@ -104,7 +104,7 @@ def __init__( # Data preprocessing ######################################################################################################### - + """ def preprocess(self, ds, scaling = False, scaler_file = None): print('preprocess') # Labels encoding @@ -131,10 +131,11 @@ def preprocess(self, ds, scaling = False, scaler_file = None): # ) # self._scaler = TensorMinMaxScaler(self._nb_kmers) # self._scaler.fit(ds) - + """ # Model training ######################################################################################################### + """ def fit(self, datasets): print('fit') # Preprocessing loop @@ -145,6 +146,12 @@ def fit(self, datasets): ds = ds.materialize() datasets[name] = ds + if self._nb_GPU > 0: + self._fit_GPU(datasets) + else: + self._fit_CPU(datasets) + + def _fit_CPU(self, datasets): # Training parameters train_params = { 'batch_size': self.batch_size, @@ -155,22 +162,16 @@ def fit(self, datasets): 'weights': self._weights } - if self._nb_GPU > 0: - train_func = train_func_GPU - else: - train_func = train_func_CPU - # Define trainer / tuner self._trainer = TensorflowTrainer( - train_loop_per_worker=train_func, + train_loop_per_worker=train_func_CPU, train_loop_config=train_params, scaling_config=ScalingConfig( trainer_resources={'CPU': self._nb_CPU_data}, num_workers=self._n_workers, use_gpu=self._use_gpu, resources_per_worker={ - 'CPU': self._nb_CPU_per_worker, - 'GPU': self._nb_GPU_per_worker + 'CPU': self._nb_CPU_per_worker } ), run_config=RunConfig( @@ -183,24 +184,40 @@ def fit(self, datasets): training_result = self._trainer.fit() self._model_ckpt = training_result.best_checkpoints[0][0] + def _fit_GPU(self, datasets): + # Training parameters + train_params = { + 'batch_size': self.batch_size, + 'epochs': self._training_epochs, + 'size': self._nb_kmers, + 'nb_cls': self._nb_classes, + 'taxa': self.taxa, + 'workdir':self._workdir, + 'model': self.classifier, + 'weights': self._weights + } + + self._model_ckpt = train_func_GPU(datasets, train_params) + """ # Model predicting ######################################################################################################### + """ def predict(self, ds): print('predict') # Predict with model - predictions = self._predict_proba(ds) + probabilities = self._predict_proba(ds) # Convert predictions to labels - predictions = self._get_abs_pred(predictions) + predictions = self._get_abs_pred(probabilities) # Return decoded labels return self._label_decode(predictions) def predict_proba(self, ds, threshold = 0.8): print('predict_proba') # Predict with model - predictions = self._predict_proba(ds) + probabilities = self._predict_proba(ds) # Convert predictions to labels with threshold - predictions = self._get_threshold_pred(predictions, threshold) + predictions = self._get_threshold_pred(probabilities, threshold) # Return decoded labels 
return self._label_decode(predictions) @@ -210,69 +227,64 @@ def _predict_proba(self, ds): col_2_drop = [col for col in ds.schema().names if col != TENSOR_COLUMN_NAME] ds = ds.drop_columns(col_2_drop) - # Preprocess - # ds = self._scaler.transform(ds) ds = ds.materialize() - self._predictor = BatchPredictor.from_checkpoint( - self._model_ckpt, - TensorflowPredictor, - model_definition = lambda: build_model(self.classifier, self._nb_classes, self._nb_kmers) - ) - predictions = self._predictor.predict( - data = ds, - feature_columns = [TENSOR_COLUMN_NAME], - batch_size = self.batch_size, - ) - return predictions + if self._nb_GPU > 0: + probabilities = self._predict_proba_GPU(ds) + else: + probabilities = self._predict_proba_CPU(ds) + + return probabilities + else: raise ValueError('No data to predict') - + def _predict_proba_CPU(self, ds): + print('_predict_proba_CPU') + self._predictor = BatchPredictor.from_checkpoint( + self._model_ckpt, + TensorflowPredictor, + model_definition = lambda: build_model(self.classifier, self._nb_classes, self._nb_kmers) + ) + predictions = self._predictor.predict( + data = ds, + feature_columns = [TENSOR_COLUMN_NAME], + batch_size = self.batch_size, + ) + + probabilities = _unwrap_ndarray_object_type_if_needed(probabilities.to_pandas()['predictions']) + + return predictions + + def _predict_proba_GPU(self, ds): + print('_predict_proba_GPU') + model = load_model(self._model_ckpt) + probabilities = [] + for batch in ds.iter_tf_batches(batch_size = self.batch_size): + probabilities.extend(model.predict(batch[TENSOR_COLUMN_NAME])) + """ def _get_abs_pred(self, predictions): print('_get_abs_pred') - def map_predicted_label(ds): - ds = np.ravel(ds['predictions']) - threshold = 0.5 - predict = pd.DataFrame({ - 'proba': ds, - 'predicted_label': np.full(len(ds), -1) - }) - predict.loc[predict['proba'] > threshold, 'predicted_label'] = 1 - predict.loc[predict['proba'] < threshold, 'predicted_label'] = 0 - return {'predictions' : predict['predicted_label'].to_numpy(dtype = np.int32)} - - predict = [] - predictions = predictions.map_batches( - lambda batch : map_predicted_label(batch), - batch_format = 'numpy', - batch_size = self.batch_size - ) - for row in predictions.iter_rows(): - predict.append(row['predictions']) + return np.round(np.ravel(predictions)) + # predict = pd.DataFrame({ + # 'proba': np.ravel(predictions), + # 'predicted_label' : np.full(len(predictions), -1) + # }) + # predict.loc[predict['proba'] > 0.5, 'predicted_label'] = 1 + # predict.loc[predict['proba'] < 0.5, 'predicted_label'] = 0 - return predict + # return predict def _get_threshold_pred(self, predictions, threshold): print('_get_threshold_pred') - def map_predicted_label(ds, threshold): - ds = np.ravel(ds['predictions']) - lower_threshold = 0.5 - (threshold * 0.5) - upper_threshold = 0.5 + (threshold * 0.5) - predict = pd.DataFrame({ - 'proba': ds, - 'predicted_label': np.full(len(ds), -1) - }) - predict.loc[predict['proba'] >= upper_threshold, 'predicted_label'] = 1 - predict.loc[predict['proba'] <= lower_threshold, 'predicted_label'] = 0 - return {'predictions' : predict['predicted_label'].to_numpy(dtype = np.int32)} + lower_threshold = 0.5 - (threshold * 0.5) + upper_threshold = 0.5 + (threshold * 0.5) - predict = [] - predictions = predictions.map_batches( - lambda batch : map_predicted_label(batch, threshold), - batch_format = 'numpy', - batch_size = self.batch_size - ) - for row in predictions.iter_rows(): - predict.append(row['predictions']) + predict = pd.DataFrame({ + 'proba': 
np.ravel(predictions), + 'label' : np.full(len(predictions), -1) + }) - return predict \ No newline at end of file + predict.loc[predict['proba'] >= upper_threshold, 'label'] = 1 + predict.loc[predict['proba'] <= lower_threshold, 'label'] = 0 + + return predict['label'].to_numpy(dtype = np.int32) \ No newline at end of file diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index d26b28f..93917f6 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -20,18 +20,22 @@ # Training import tensorflow as tf from ray.air import session -# from ray.air.integrations.keras import Callback from ray.air.config import ScalingConfig from ray.air.integrations.keras import ReportCheckpointCallback +from keras.callbacks import CSVLogger, ModelCheckpoint, EarlyStopping from ray.train.tensorflow import TensorflowTrainer, TensorflowCheckpoint # Tuning from ray.air.config import RunConfig # Predicting +from tensorflow.keras.models import load_model from ray.train.tensorflow import TensorflowPredictor from ray.train.batch_predictor import BatchPredictor +# Data +from ray.air.util.data_batch_conversion import _unwrap_ndarray_object_type_if_needed + __author__ = 'Nicolas de Montigny' __all__ = ['KerasTFModel'] @@ -113,31 +117,169 @@ def __init__( self._n_workers = int(self._nb_CPU_training * 0.2) self._nb_CPU_per_worker = int(int(self._nb_CPU_training * 0.8) / self._n_workers) - @abstractmethod - def preprocess(self): - """ - """ + # Data preprocessing + ######################################################################################################### + + def preprocess(self, ds, scaling = False, scaler_file = None): + print('preprocess') + # Labels encoding + self._encoder = ModelLabelEncoder(self.taxa) + self._encoder.fit(ds) + + # Labels mapping + labels = list(self._encoder.stats_[f'unique_values({self.taxa})'].keys()) + self._nb_classes = len(labels) + self._encoded = np.arange(len(labels)) + labels = np.append(labels, 'Unknown') + self._encoded = np.append(self._encoded, -1) + + for (label, encoded) in zip(labels, self._encoded): + self._labels_map[label] = encoded - @abstractmethod + # Class weights + self._weights = self._compute_weights() + + # Models training + ######################################################################################################### + def fit(self, datasets): - """ - """ + print('fit') + # Preprocessing loop + for name, ds in datasets.items(): + # ds = ds.drop_columns(['id']) + ds = self._encoder.transform(ds) + # ds = self._scaler.transform(ds) + ds = ds.materialize() + datasets[name] = ds + + if self._nb_GPU > 0: + self._fit_GPU(datasets) + else: + self._fit_CPU(datasets) + + def _fit_CPU(self, datasets): + # Training parameters + train_params = { + 'batch_size': self.batch_size, + 'epochs': self._training_epochs, + 'size': self._nb_kmers, + 'nb_cls': self._nb_classes, + 'model': self.classifier, + 'weights': self._weights + } + + # Define trainer / tuner + self._trainer = TensorflowTrainer( + train_loop_per_worker=train_func_CPU, + train_loop_config=train_params, + scaling_config=ScalingConfig( + trainer_resources={'CPU': self._nb_CPU_data}, + num_workers=self._n_workers, + use_gpu=self._use_gpu, + resources_per_worker={ + 'CPU': self._nb_CPU_per_worker + } + ), + run_config=RunConfig( + name=self.classifier, + local_dir=self._workdir, + ), + datasets=datasets, + ) + + training_result = self._trainer.fit() + self._model_ckpt = training_result.best_checkpoints[0][0] + + def _fit_GPU(self, datasets): + # Training 
parameters + train_params = { + 'batch_size': self.batch_size, + 'epochs': self._training_epochs, + 'size': self._nb_kmers, + 'nb_cls': self._nb_classes, + 'taxa': self.taxa, + 'workdir':self._workdir, + 'model': self.classifier, + 'weights': self._weights + } + + self._model_ckpt = train_func_GPU(datasets, train_params) + + # Models predicting + ######################################################################################################### - @abstractmethod def predict(self, ds): - """ - """ + print('predict') + # Predict with model + probabilities = self._predict_proba(ds) + # Convert predictions to labels + predictions = self._get_abs_pred(probabilities) + # Return decoded labels + return self._label_decode(predictions) + + def predict_proba(self, ds, threshold = 0.8): + print('predict_proba') + # Predict with model + probabilities = self._predict_proba(ds) + # Convert predictions to labels with threshold + predictions = self._get_threshold_pred(probabilities, threshold) + # Return decoded labels + return self._label_decode(predictions) + + def _predict_proba(self, ds): + print('_predict_proba') + if ds.count() > 0: + if len(ds.schema().names) > 1: + col_2_drop = [col for col in ds.schema().names if col != TENSOR_COLUMN_NAME] + ds = ds.drop_columns(col_2_drop) + + ds = ds.materialize() + + if self._nb_GPU > 0: + probabilities = self._predict_proba_GPU(ds) + else: + probabilities = self._predict_proba_CPU(ds) + + return probabilities + else: + raise ValueError('No data to predict') + + def _predict_proba_CPU(self, ds): + print('_predict_proba_CPU') + self._predictor = BatchPredictor.from_checkpoint( + self._model_ckpt, + TensorflowPredictor, + model_definition = lambda: build_model(self.classifier, self._nb_classes, self._nb_kmers) + ) + predictions = self._predictor.predict( + data = ds, + feature_columns = [TENSOR_COLUMN_NAME], + batch_size = self.batch_size, + ) + + probabilities = _unwrap_ndarray_object_type_if_needed(predictions.to_pandas()['predictions']) + + return probabilities + + def _predict_proba_GPU(self, ds): + print('_predict_proba_GPU') + model = load_model(self._model_ckpt) + probabilities = [] + for batch in ds.iter_tf_batches(batch_size = self.batch_size): + probabilities.extend(model.predict(batch[TENSOR_COLUMN_NAME])) + + return probabilities @abstractmethod - def predict_proba(self): + def _get_abs_pred(self): """ """ - + @abstractmethod def _get_threshold_pred(self): """ """ - + # Training/building function outside of the class as mentioned on the Ray discussion # https://discuss.ray.io/t/statuscode-resource-exhausted/4379/16 ################################################################################ @@ -159,6 +301,7 @@ def train_func_CPU(config): # Model construction model = build_model(model, nb_cls, size) + # Data train_data = session.get_dataset_shard('train') val_data = session.get_dataset_shard('validation') @@ -177,6 +320,7 @@ def train_func_CPU(config): local_shuffle_buffer_size = batch_size, local_shuffle_seed = int(np.random.randint(1,10000, size = 1)) ) + # Training history = model.fit( x = batch_train, validation_data = batch_val, @@ -184,6 +328,7 @@ def train_func_CPU(config): class_weight = weights, verbose = 0 ) + # Checkpointing session.report({ 'accuracy': history.history['accuracy'][0], 'loss': history.history['loss'][0], @@ -198,58 +343,62 @@ def train_func_CPU(config): gc.collect() tf.keras.backend.clear_session() -def train_func_GPU(config): +def train_func_GPU(datasets, config): # Parameters batch_size = config.get('batch_size', 
128) epochs = config.get('epochs', 10) size = config.get('size') nb_cls = config.get('nb_cls') + taxa = config.get('taxa') + workdir = config.get('workdir') model = config.get('model') weights = config.get('weights') + checkpoint = os.path.join(workdir, model) + + # Data + train_ds = datasets['train'] + val_ds = datasets['validation'] + + # Convert datasets to tensorflow ds & generator + train_ds = train_ds.iterator().to_tf( + feature_columns = TENSOR_COLUMN_NAME, + label_columns = LABELS_COLUMN_NAME, + batch_size = batch_size + ) + val_ds = val_ds.to_tf( + feature_columns = TENSOR_COLUMN_NAME, + label_columns = LABELS_COLUMN_NAME, + batch_size = batch_size + ) + # Model construction - strategy = tf.distribute.MirroredStrategy() - with strategy.scope(): - model = build_model(model, nb_cls, size) + model = build_model(model, nb_cls, size) - train_data = session.get_dataset_shard('train') - val_data = session.get_dataset_shard('validation') + # Callbacks + model_file = os.path.join(checkpoint, taxa, '{epoch:03d}.hdf5') + model_csv = os.path.join(checkpoint, taxa, 'training_log.csv') + modelckpt = ModelCheckpoint(filepath=model_file, monitor='val_loss', save_best_only=True, mode='auto') + early = EarlyStopping(monitor='val_loss', mode='auto', patience=10) + csv = CSVLogger(model_csv) + + # Training + hist = model.fit( + train_ds, + epochs = epochs, + validation_data = val_ds, + callbacks = [modelckpt, early, csv], + class_weight = weights, + verbose = 0 + ) + + # Checkpointing + best_model = np.argmin(hist.history['val_loss']) + 1 + best_model = f'{best_model:03d}.hdf5' + best_model = os.path.join(checkpoint, taxa, best_model) + + return best_model - for _ in range(epochs): - batch_train = train_data.to_tf( - feature_columns = TENSOR_COLUMN_NAME, - label_columns = LABELS_COLUMN_NAME, - batch_size = batch_size, - local_shuffle_buffer_size = batch_size, - local_shuffle_seed = int(np.random.randint(1,10000, size = 1)) - ) - batch_val = val_data.to_tf( - feature_columns = TENSOR_COLUMN_NAME, - label_columns = LABELS_COLUMN_NAME, - batch_size = batch_size, - local_shuffle_buffer_size = batch_size, - local_shuffle_seed = int(np.random.randint(1,10000, size = 1)) - ) - history = model.fit( - x = batch_train, - validation_data = batch_val, - callbacks = [ReportCheckpointCallback()], - class_weight = weights, - verbose = 0 - ) - session.report({ - 'accuracy': history.history['accuracy'][0], - 'loss': history.history['loss'][0], - 'val_accuracy': history.history['val_accuracy'][0], - 'val_loss': history.history['val_loss'][0], - }, - checkpoint=TensorflowCheckpoint.from_model(model) - ) - gc.collect() - tf.keras.backend.clear_session() - del model - gc.collect() - tf.keras.backend.clear_session() def build_model(classifier, nb_cls, nb_kmers): if classifier == 'attention': diff --git a/src/models/kerasTF/multiclass_models.py b/src/models/kerasTF/multiclass_models.py index 4ab1b9c..99bf6bd 100644 --- a/src/models/kerasTF/multiclass_models.py +++ b/src/models/kerasTF/multiclass_models.py @@ -21,7 +21,6 @@ from ray.air import session # from ray.air.integrations.keras import Callback from ray.air.config import ScalingConfig -from models.kerasTF.models import train_func_CPU, train_func_GPU, build_model from ray.air.integrations.keras import ReportCheckpointCallback from ray.train.tensorflow import TensorflowTrainer, TensorflowCheckpoint @@ -29,6 +28,7 @@ from ray.air.config import RunConfig # Predicting +from tensorflow.keras.models import load_model from ray.train.tensorflow import TensorflowPredictor from 
ray.train.batch_predictor import BatchPredictor @@ -102,6 +102,7 @@ def __init__( # Data preprocessing ######################################################################################################### + """ def preprocess(self, ds, scaling = False, scaler_file = None): print('preprocess') # Labels encoding @@ -128,10 +129,11 @@ def preprocess(self, ds, scaling = False, scaler_file = None): # ) # self._scaler = TensorMinMaxScaler(self._nb_kmers) # self._scaler.fit(ds) - + """ # Models training ######################################################################################################### + """ def fit(self, datasets): print('fit') # Preprocessing loop @@ -142,6 +144,12 @@ def fit(self, datasets): ds = ds.materialize() datasets[name] = ds + if self._nb_GPU > 0: + self._fit_GPU(datasets) + else: + self._fit_CPU(datasets) + + def _fit_CPU(self, datasets): # Training parameters train_params = { 'batch_size': self.batch_size, @@ -152,22 +160,16 @@ def fit(self, datasets): 'weights': self._weights } - if self._nb_GPU > 0: - train_func = train_func_GPU - else: - train_func = train_func_CPU - # Define trainer / tuner self._trainer = TensorflowTrainer( - train_loop_per_worker=train_func, + train_loop_per_worker=train_func_CPU, train_loop_config=train_params, scaling_config=ScalingConfig( trainer_resources={'CPU': self._nb_CPU_data}, num_workers=self._n_workers, use_gpu=self._use_gpu, resources_per_worker={ - 'CPU': self._nb_CPU_per_worker, - 'GPU': self._nb_GPU_per_worker + 'CPU': self._nb_CPU_per_worker } ), run_config=RunConfig( @@ -179,26 +181,41 @@ def fit(self, datasets): training_result = self._trainer.fit() self._model_ckpt = training_result.best_checkpoints[0][0] + + def _fit_GPU(self, datasets): + # Training parameters + train_params = { + 'batch_size': self.batch_size, + 'epochs': self._training_epochs, + 'size': self._nb_kmers, + 'nb_cls': self._nb_classes, + 'taxa': self.taxa, + 'workdir':self._workdir, + 'model': self.classifier, + 'weights': self._weights + } + self._model_ckpt = train_func_GPU(datasets, train_params) + """ # Models predicting ######################################################################################################### + """ def predict(self, ds): print('predict') # Predict with model probabilities = self._predict_proba(ds) # Convert predictions to labels - probabilities = _unwrap_ndarray_object_type_if_needed(probabilities.to_pandas()['predictions']) - predictions = np.argmax(probabilities, axis = 1) + predictions = self._get_abs_pred(probabilities) # Return decoded labels return self._label_decode(predictions) def predict_proba(self, ds, threshold = 0.8): print('predict_proba') # Predict with model - predictions = self._predict_proba(ds) + probabilities = self._predict_proba(ds) # Convert predictions to labels with threshold - predictions = self._get_threshold_pred(predictions, threshold) + predictions = self._get_threshold_pred(probabilities, threshold) # Return decoded labels return self._label_decode(predictions) @@ -209,43 +226,54 @@ def _predict_proba(self, ds): col_2_drop = [col for col in ds.schema().names if col != TENSOR_COLUMN_NAME] ds = ds.drop_columns(col_2_drop) - # Preprocess - # ds = self._scaler.transform(ds) ds = ds.materialize() - self._predictor = BatchPredictor.from_checkpoint( - self._model_ckpt, - TensorflowPredictor, - model_definition = lambda: build_model(self.classifier, self._nb_classes, self._nb_kmers) - ) - predictions = self._predictor.predict( - data = ds, - feature_columns = [TENSOR_COLUMN_NAME], - 
batch_size = self.batch_size, - ) - return predictions + if self._nb_GPU > 0: + probabilities = self._predict_proba_GPU(ds) + else: + probabilities = self._predict_proba_CPU(ds) + + return probabilities else: raise ValueError('No data to predict') + def _predict_proba_CPU(self, ds): + print('_predict_proba_CPU') + self._predictor = BatchPredictor.from_checkpoint( + self._model_ckpt, + TensorflowPredictor, + model_definition = lambda: build_model(self.classifier, self._nb_classes, self._nb_kmers) + ) + predictions = self._predictor.predict( + data = ds, + feature_columns = [TENSOR_COLUMN_NAME], + batch_size = self.batch_size, + ) + + probabilities = _unwrap_ndarray_object_type_if_needed(probabilities.to_pandas()['predictions']) + + return probabilities + + def _predict_proba_GPU(self, ds): + print('_predict_proba_GPU') + model = load_model(self._model_ckpt) + probabilities = [] + for batch in ds.iter_tf_batches(batch_size = self.batch_size): + probabilities.extend(model.predict(batch[TENSOR_COLUMN_NAME])) + + return probabilities + """ + def _get_abs_pred(self, predictions): + print('_get_abs_pred') + return np.argmax(predictions, axis = 1) + def _get_threshold_pred(self, predictions, threshold): print('_get_threshold_pred') - def map_predicted_label(ds, threshold): - ds = ds['predictions'] - pred = pd.DataFrame({ - 'best_proba': [np.max(arr) for arr in ds], - 'predicted_label' : [np.argmax(arr) for arr in ds] - }) - pred.loc[pred['best_proba'] < threshold, 'predicted_label'] = -1 - - return {'predictions' : pred['predicted_label'].to_numpy(dtype = np.int32)} - - predict = [] - predictions = predictions.map_batches( - lambda batch : map_predicted_label(batch, threshold), - batch_format = 'numpy', - batch_size = self.batch_size - ) - for row in predictions.iter_rows(): - predict.append(row['predictions']) + pred = pd.DataFrame({ + 'proba': [np.max(arr) for arr in predictions], + 'label' : [np.argmax(arr) for arr in predictions] + }) + pred.loc[pred['proba'] < threshold, 'label'] = -1 + + return pred['label'].to_numpy(dtype = np.int32) - return predict From 88a10529a95f496a769c841ebdbacf083b129425 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Wed, 20 Dec 2023 12:48:44 -0500 Subject: [PATCH 78/92] keras fit verbose --- src/models/kerasTF/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index 93917f6..8330925 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -389,7 +389,7 @@ def train_func_GPU(datasets, config): validation_data = val_ds, callbacks = [modelckpt, early, csv], class_weight = weights, - verbose = 0 + verbose = 1 ) # Checkpointing From 4b86498af3382bd64bc3c5b291ff36b5a989fae7 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Wed, 20 Dec 2023 15:24:33 -0500 Subject: [PATCH 79/92] TF-IDF scaling for smaller k experiments --- src/models/kerasTF/models.py | 173 ++++------ src/models/kerasTF/models_linear.py | 423 ++++++++++++++++++++++++ src/models/sklearn/binary_models.py | 15 +- src/models/sklearn/multiclass_models.py | 13 +- 4 files changed, 507 insertions(+), 117 deletions(-) create mode 100644 src/models/kerasTF/models_linear.py diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index 8330925..126a908 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -136,6 +136,10 @@ def preprocess(self, ds, scaling = False, scaler_file = None): for (label, encoded) in zip(labels, self._encoded): 
self._labels_map[label] = encoded + # Features scaling + self._scaler = TensorTfIdfTransformer(features = self.kmers, file = scaler_file) + self._scaler.fit(ds) + # Class weights self._weights = self._compute_weights() @@ -148,16 +152,10 @@ def fit(self, datasets): for name, ds in datasets.items(): # ds = ds.drop_columns(['id']) ds = self._encoder.transform(ds) - # ds = self._scaler.transform(ds) + ds = self._scaler.transform(ds) ds = ds.materialize() datasets[name] = ds - if self._nb_GPU > 0: - self._fit_GPU(datasets) - else: - self._fit_CPU(datasets) - - def _fit_CPU(self, datasets): # Training parameters train_params = { 'batch_size': self.batch_size, @@ -168,16 +166,22 @@ def _fit_CPU(self, datasets): 'weights': self._weights } + if self._nb_GPU > 0: + train_func = train_func_GPU + else: + train_func = train_func_CPU + # Define trainer / tuner self._trainer = TensorflowTrainer( - train_loop_per_worker=train_func_CPU, + train_loop_per_worker=train_func, train_loop_config=train_params, scaling_config=ScalingConfig( trainer_resources={'CPU': self._nb_CPU_data}, num_workers=self._n_workers, use_gpu=self._use_gpu, resources_per_worker={ - 'CPU': self._nb_CPU_per_worker + 'CPU': self._nb_CPU_per_worker, + 'GPU' : self._nb_GPU_per_worker } ), run_config=RunConfig( @@ -190,21 +194,6 @@ def _fit_CPU(self, datasets): training_result = self._trainer.fit() self._model_ckpt = training_result.best_checkpoints[0][0] - def _fit_GPU(self, datasets): - # Training parameters - train_params = { - 'batch_size': self.batch_size, - 'epochs': self._training_epochs, - 'size': self._nb_kmers, - 'nb_cls': self._nb_classes, - 'taxa': self.taxa, - 'workdir':self._workdir, - 'model': self.classifier, - 'weights': self._weights - } - - self._model_ckpt = train_func_GPU(datasets, train_params) - # Models predicting ######################################################################################################### @@ -233,43 +222,27 @@ def _predict_proba(self, ds): col_2_drop = [col for col in ds.schema().names if col != TENSOR_COLUMN_NAME] ds = ds.drop_columns(col_2_drop) + ds = self._scaler.transform(ds) + ds = ds.materialize() - if self._nb_GPU > 0: - probabilities = self._predict_proba_GPU(ds) - else: - probabilities = self._predict_proba_CPU(ds) + self._predictor = BatchPredictor.from_checkpoint( + self._model_ckpt, + TensorflowPredictor, + model_definition = lambda: build_model(self.classifier, self._nb_classes, self._nb_kmers) + ) + predictions = self._predictor.predict( + data = ds, + feature_columns = [TENSOR_COLUMN_NAME], + batch_size = self.batch_size, + ) + + probabilities = _unwrap_ndarray_object_type_if_needed(predictions.to_pandas()['predictions']) return probabilities else: raise ValueError('No data to predict') - def _predict_proba_CPU(self, ds): - print('_predict_proba_CPU') - self._predictor = BatchPredictor.from_checkpoint( - self._model_ckpt, - TensorflowPredictor, - model_definition = lambda: build_model(self.classifier, self._nb_classes, self._nb_kmers) - ) - predictions = self._predictor.predict( - data = ds, - feature_columns = [TENSOR_COLUMN_NAME], - batch_size = self.batch_size, - ) - - probabilities = _unwrap_ndarray_object_type_if_needed(predictions.to_pandas()['predictions']) - - return probabilities - - def _predict_proba_GPU(self, ds): - print('_predict_proba_GPU') - model = load_model(self._model_ckpt) - probabilities = [] - for batch in ds.iter_tf_batches(batch_size = self.batch_size): - probabilities.extend(model.predict(batch[TENSOR_COLUMN_NAME])) - - return probabilities - 
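# The TensorTfIdfTransformer fitted above is the project's own Ray preprocessor
# for k-mer count profiles; the weighting it applies is ordinary TF-IDF. A
# hedged sketch of the same idea with scikit-learn's TfidfTransformer standing
# in (the toy count matrix is an assumption, not project data):
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer

counts = np.array([                        # rows = sequences, columns = k-mers
    [3, 0, 1],
    [2, 0, 0],
    [0, 5, 2],
])
tfidf = TfidfTransformer()
tfidf.fit(counts)                          # fit on the training profiles only
weighted = tfidf.transform(counts).toarray()   # reuse the fitted weights at inference time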
@abstractmethod def _get_abs_pred(self): """ @@ -326,7 +299,7 @@ def train_func_CPU(config): validation_data = batch_val, callbacks = [ReportCheckpointCallback()], class_weight = weights, - verbose = 0 + verbose = 1 ) # Checkpointing session.report({ @@ -343,62 +316,62 @@ def train_func_CPU(config): gc.collect() tf.keras.backend.clear_session() -def train_func_GPU(datasets, config): + +def train_func_GPU(config): # Parameters batch_size = config.get('batch_size', 128) epochs = config.get('epochs', 10) size = config.get('size') nb_cls = config.get('nb_cls') - taxa = config.get('taxa') - workdir = config.get('workdir') model = config.get('model') weights = config.get('weights') - checkpoint = os.path.join(workdir, model) - - # Data - train_ds = datasets['train'] - val_ds = datasets['validation'] - - # Convert datasets to tensorflow ds & generator - train_ds = train_ds.iterator().to_tf( - feature_columns = TENSOR_COLUMN_NAME, - label_columns = LABELS_COLUMN_NAME, - batch_size = batch_size - ) - val_ds = val_ds.to_tf( - feature_columns = TENSOR_COLUMN_NAME, - label_columns = LABELS_COLUMN_NAME, - batch_size = batch_size - ) - # Model construction - model = build_model(model, nb_cls, size) + strategy = tf.distribute.MirroredStrategy() + with strategy.scope(): + model = build_model(model, nb_cls, size) - # Callbacks - model_file = os.path.join(checkpoint, taxa, '{epoch:03d}.hdf5') - model_csv = os.path.join(checkpoint, taxa, 'training_log.csv') - modelckpt = ModelCheckpoint(filepath=model_file, monitor='val_loss', save_best_only=True, mode='auto') - early = EarlyStopping(monitor='val_loss', mode='auto', patience=10) - csv = CSVLogger(model_csv) - - # Training - hist = model.fit( - train_ds, - epochs = epochs, - validation_data = val_ds, - callbacks = [modelckpt, early, csv], - class_weight = weights, - verbose = 1 - ) - - # Checkpointing - best_model = np.argmin(hist.history['val_loss']) + 1 - best_model = f'{best_model:03d}.hdf5' - best_model = os.path.join(checkpoint, taxa, best_model) - - return best_model + # Data + train_data = session.get_dataset_shard('train') + val_data = session.get_dataset_shard('validation') + for _ in range(epochs): + batch_train = train_data.to_tf( + feature_columns = TENSOR_COLUMN_NAME, + label_columns = LABELS_COLUMN_NAME, + batch_size = batch_size, + local_shuffle_buffer_size = batch_size, + local_shuffle_seed = int(np.random.randint(1,10000, size = 1)) + ) + batch_val = val_data.to_tf( + feature_columns = TENSOR_COLUMN_NAME, + label_columns = LABELS_COLUMN_NAME, + batch_size = batch_size, + local_shuffle_buffer_size = batch_size, + local_shuffle_seed = int(np.random.randint(1,10000, size = 1)) + ) + # Training + history = model.fit( + x = batch_train, + validation_data = batch_val, + callbacks = [ReportCheckpointCallback()], + class_weight = weights, + verbose = 0 + ) + # Checkpointing + session.report({ + 'accuracy': history.history['accuracy'][0], + 'loss': history.history['loss'][0], + 'val_accuracy': history.history['val_accuracy'][0], + 'val_loss': history.history['val_loss'][0], + }, + checkpoint=TensorflowCheckpoint.from_model(model) + ) + gc.collect() + tf.keras.backend.clear_session() + del model + gc.collect() + tf.keras.backend.clear_session() def build_model(classifier, nb_cls, nb_kmers): if classifier == 'attention': diff --git a/src/models/kerasTF/models_linear.py b/src/models/kerasTF/models_linear.py new file mode 100644 index 0000000..adc81e3 --- /dev/null +++ b/src/models/kerasTF/models_linear.py @@ -0,0 +1,423 @@ +import os +import gc +import 
warnings +import numpy as np +import pandas as pd + +# Class construction +from abc import ABC, abstractmethod + +# Preprocessing +from ray.data.preprocessors import LabelEncoder, Chain +from models.encoders.model_label_encoder import ModelLabelEncoder +from models.encoders.one_hot_tensor_encoder import OneHotTensorEncoder +from models.preprocessors.tfidf_transformer import TensorTfIdfTransformer + +# Parent class / models +from models.models_utils import ModelsUtils +from models.kerasTF.build_neural_networks import * + +# Training +import tensorflow as tf +from ray.air import session +from ray.air.config import ScalingConfig +from ray.air.integrations.keras import ReportCheckpointCallback +from keras.callbacks import CSVLogger, ModelCheckpoint, EarlyStopping +from ray.train.tensorflow import TensorflowTrainer, TensorflowCheckpoint + +# Tuning +from ray.air.config import RunConfig + +# Predicting +from tensorflow.keras.models import load_model +from ray.train.tensorflow import TensorflowPredictor +from ray.train.batch_predictor import BatchPredictor + +# Data +from ray.air.util.data_batch_conversion import _unwrap_ndarray_object_type_if_needed + +__author__ = 'Nicolas de Montigny' + +__all__ = ['KerasTFModel'] + +TENSOR_COLUMN_NAME = '__value__' +LABELS_COLUMN_NAME = 'labels' + +# Ignore warnings to have a more comprehensible output on stdout +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' +warnings.filterwarnings('ignore') + +class KerasTFModels(ModelsUtils, ABC): + """ + Class used to build, train and predict models using Ray with Keras Tensorflow backend + + ---------- + Attributes + ---------- + + clf_file : string + Path to a file containing the trained model for this object + + nb_classes : int + Number of classes for learning + + ---------- + Methods + ---------- + + preprocess : preprocess the data before training and splitting the original dataset in case of cross-validation + + train : train a model using the given datasets + + predict : predict the classes of a dataset + ds : ray.data.Dataset + Dataset containing K-mers profiles of sequences to be classified + + threshold : float + Minimum percentage of probability to effectively classify. + Sequences will be classified as 'unknown' if the probability is under this threshold. 
+ Defaults to 80% + """ + + def __init__( + self, + classifier, + outdir_model, + batch_size, + training_epochs, + taxa, + kmers_list, + csv + ): + super().__init__( + classifier, + outdir_model, + batch_size, + training_epochs, + taxa, + kmers_list, + csv + ) + # Parameters + # Initialize hidden + self._nb_CPU_data = int(os.cpu_count() * 0.2) # 6 + self._nb_CPU_training = int(os.cpu_count() - self._nb_CPU_data) # 26 + self._nb_GPU = len(tf.config.list_physical_devices('GPU')) # 6 + # Initialize empty + self._nb_CPU_per_worker = 0 + self._nb_GPU_per_worker = 0 + # Computing variables + if self._nb_GPU > 0: + self._use_gpu = True + self._n_workers = self._nb_GPU #6 + self._nb_CPU_per_worker = int(self._nb_CPU_training / self._n_workers) # 4 + self._nb_GPU_per_worker = 1 + else: + self._use_gpu = False + self._n_workers = int(self._nb_CPU_training * 0.2) + self._nb_CPU_per_worker = int(int(self._nb_CPU_training * 0.8) / self._n_workers) + + # Data preprocessing + ######################################################################################################### + + def preprocess(self, ds, scaling = False, scaler_file = None): + print('preprocess') + # Labels encoding + self._encoder = ModelLabelEncoder(self.taxa) + self._encoder.fit(ds) + + # Labels mapping + labels = list(self._encoder.stats_[f'unique_values({self.taxa})'].keys()) + self._nb_classes = len(labels) + self._encoded = np.arange(len(labels)) + labels = np.append(labels, 'Unknown') + self._encoded = np.append(self._encoded, -1) + + for (label, encoded) in zip(labels, self._encoded): + self._labels_map[label] = encoded + + # Features scaling + self._scaler = TensorTfIdfTransformer(features = self.kmers, file = scaler_file) + self._scaler.fit(ds) + + # Class weights + self._weights = self._compute_weights() + + # Models training + ######################################################################################################### + + def fit(self, datasets): + print('fit') + # Preprocessing loop + for name, ds in datasets.items(): + # ds = ds.drop_columns(['id']) + ds = self._encoder.transform(ds) + ds = self._scaler.transform(ds) + ds = ds.materialize() + datasets[name] = ds + + if self._nb_GPU > 0: + self._fit_GPU(datasets) + else: + self._fit_CPU(datasets) + + def _fit_CPU(self, datasets): + # Training parameters + train_params = { + 'batch_size': self.batch_size, + 'epochs': self._training_epochs, + 'size': self._nb_kmers, + 'nb_cls': self._nb_classes, + 'model': self.classifier, + 'weights': self._weights + } + + # Define trainer / tuner + self._trainer = TensorflowTrainer( + train_loop_per_worker=train_func_CPU, + train_loop_config=train_params, + scaling_config=ScalingConfig( + trainer_resources={'CPU': self._nb_CPU_data}, + num_workers=self._n_workers, + use_gpu=self._use_gpu, + resources_per_worker={ + 'CPU': self._nb_CPU_per_worker + } + ), + run_config=RunConfig( + name=self.classifier, + local_dir=self._workdir, + ), + datasets=datasets, + ) + + training_result = self._trainer.fit() + self._model_ckpt = training_result.best_checkpoints[0][0] + + def _fit_GPU(self, datasets): + # Training parameters + train_params = { + 'batch_size': self.batch_size, + 'epochs': self._training_epochs, + 'size': self._nb_kmers, + 'nb_cls': self._nb_classes, + 'taxa': self.taxa, + 'workdir':self._workdir, + 'model': self.classifier, + 'weights': self._weights + } + + self._model_ckpt = train_func_GPU(datasets, train_params) + + # Models predicting + 
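# A worked instance of the CPU/GPU budgeting done in __init__ above, assuming
# a host with 32 CPUs and 6 GPUs (these counts are illustrative only):
cpu_total, gpu_total = 32, 6
nb_cpu_data = int(cpu_total * 0.2)                    # 6 CPUs kept for the data pipeline
nb_cpu_training = cpu_total - nb_cpu_data             # 26 CPUs left for training
if gpu_total > 0:
    n_workers = gpu_total                             # one Ray worker per GPU
    nb_cpu_per_worker = nb_cpu_training // n_workers  # 4
    nb_gpu_per_worker = 1
else:
    n_workers = int(nb_cpu_training * 0.2)                        # 5
    nb_cpu_per_worker = int(nb_cpu_training * 0.8) // n_workers   # 4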
######################################################################################################### + + def predict(self, ds): + print('predict') + # Predict with model + probabilities = self._predict_proba(ds) + # Convert predictions to labels + predictions = self._get_abs_pred(probabilities) + # Return decoded labels + return self._label_decode(predictions) + + def predict_proba(self, ds, threshold = 0.8): + print('predict_proba') + # Predict with model + probabilities = self._predict_proba(ds) + # Convert predictions to labels with threshold + predictions = self._get_threshold_pred(probabilities, threshold) + # Return decoded labels + return self._label_decode(predictions) + + def _predict_proba(self, ds): + print('_predict_proba') + if ds.count() > 0: + if len(ds.schema().names) > 1: + col_2_drop = [col for col in ds.schema().names if col != TENSOR_COLUMN_NAME] + ds = ds.drop_columns(col_2_drop) + + ds = self._scaler.transform(ds) + + ds = ds.materialize() + + if self._nb_GPU > 0: + probabilities = self._predict_proba_GPU(ds) + else: + probabilities = self._predict_proba_CPU(ds) + + return probabilities + else: + raise ValueError('No data to predict') + + def _predict_proba_CPU(self, ds): + print('_predict_proba_CPU') + self._predictor = BatchPredictor.from_checkpoint( + self._model_ckpt, + TensorflowPredictor, + model_definition = lambda: build_model(self.classifier, self._nb_classes, self._nb_kmers) + ) + predictions = self._predictor.predict( + data = ds, + feature_columns = [TENSOR_COLUMN_NAME], + batch_size = self.batch_size, + ) + + probabilities = _unwrap_ndarray_object_type_if_needed(predictions.to_pandas()['predictions']) + + return probabilities + + def _predict_proba_GPU(self, ds): + print('_predict_proba_GPU') + model = load_model(self._model_ckpt) + probabilities = [] + for batch in ds.iter_tf_batches(batch_size = self.batch_size): + probabilities.extend(model.predict(batch[TENSOR_COLUMN_NAME])) + + return probabilities + + @abstractmethod + def _get_abs_pred(self): + """ + """ + + @abstractmethod + def _get_threshold_pred(self): + """ + """ + +# Training/building function outside of the class as mentioned on the Ray discussion +# https://discuss.ray.io/t/statuscode-resource-exhausted/4379/16 +################################################################################ + +# Data streaming in PipelineDataset for larger than memory data, should prevent OOM +# https://docs.ray.io/en/latest/ray-air/check-ingest.html#enabling-streaming-ingest +# Smaller nb of workers + bigger nb CPU_per_worker + smaller batch_size to avoid memory overload +# https://discuss.ray.io/t/ray-sgd-distributed-tensorflow/261/8 + +def train_func_CPU(config): + # Parameters + batch_size = config.get('batch_size', 128) + epochs = config.get('epochs', 10) + size = config.get('size') + nb_cls = config.get('nb_cls') + model = config.get('model') + weights = config.get('weights') + + # Model construction + model = build_model(model, nb_cls, size) + + # Data + train_data = session.get_dataset_shard('train') + val_data = session.get_dataset_shard('validation') + + for _ in range(epochs): + batch_train = train_data.to_tf( + feature_columns = TENSOR_COLUMN_NAME, + label_columns = LABELS_COLUMN_NAME, + batch_size = batch_size, + local_shuffle_buffer_size = batch_size, + local_shuffle_seed = int(np.random.randint(1,10000, size = 1)) + ) + batch_val = val_data.to_tf( + feature_columns = TENSOR_COLUMN_NAME, + label_columns = LABELS_COLUMN_NAME, + batch_size = batch_size, + local_shuffle_buffer_size = 
batch_size, + local_shuffle_seed = int(np.random.randint(1,10000, size = 1)) + ) + # Training + history = model.fit( + x = batch_train, + validation_data = batch_val, + callbacks = [ReportCheckpointCallback()], + class_weight = weights, + verbose = 0 + ) + # Checkpointing + session.report({ + 'accuracy': history.history['accuracy'][0], + 'loss': history.history['loss'][0], + 'val_accuracy': history.history['val_accuracy'][0], + 'val_loss': history.history['val_loss'][0], + }, + checkpoint=TensorflowCheckpoint.from_model(model) + ) + gc.collect() + tf.keras.backend.clear_session() + del model + gc.collect() + tf.keras.backend.clear_session() + +def train_func_GPU(datasets, config): + # Parameters + batch_size = config.get('batch_size', 128) + epochs = config.get('epochs', 10) + size = config.get('size') + nb_cls = config.get('nb_cls') + taxa = config.get('taxa') + workdir = config.get('workdir') + model = config.get('model') + weights = config.get('weights') + + checkpoint = os.path.join(workdir, model) + + # Data + train_ds = datasets['train'] + val_ds = datasets['validation'] + + # Convert datasets to tensorflow ds & generator + train_ds = train_ds.iterator().to_tf( + feature_columns = TENSOR_COLUMN_NAME, + label_columns = LABELS_COLUMN_NAME, + batch_size = batch_size + ) + val_ds = val_ds.to_tf( + feature_columns = TENSOR_COLUMN_NAME, + label_columns = LABELS_COLUMN_NAME, + batch_size = batch_size + ) + + # Model construction + model = build_model(model, nb_cls, size) + + # Callbacks + model_file = os.path.join(checkpoint, taxa, '{epoch:03d}.hdf5') + model_csv = os.path.join(checkpoint, taxa, 'training_log.csv') + modelckpt = ModelCheckpoint(filepath=model_file, monitor='val_loss', save_best_only=True, mode='auto') + early = EarlyStopping(monitor='val_loss', mode='auto', patience=10) + csv = CSVLogger(model_csv) + + # Training + hist = model.fit( + train_ds, + epochs = epochs, + validation_data = val_ds, + callbacks = [modelckpt, early, csv], + class_weight = weights, + verbose = 1 + ) + + # Checkpointing + best_model = np.argmin(hist.history['val_loss']) + 1 + best_model = f'{best_model:03d}.hdf5' + best_model = os.path.join(checkpoint, taxa, best_model) + + return best_model + + +def build_model(classifier, nb_cls, nb_kmers): + if classifier == 'attention': + model = build_attention(nb_kmers) + elif classifier == 'lstm': + model = build_LSTM(nb_kmers) + elif classifier == 'deeplstm': + model = build_deepLSTM(nb_kmers) + elif classifier == 'lstm_attention': + model = build_LSTM_attention(nb_kmers, nb_cls) + elif classifier == 'cnn': + model = build_CNN(nb_kmers, nb_cls) + elif classifier == 'widecnn': + model = build_wideCNN(nb_kmers, nb_cls) + return model + diff --git a/src/models/sklearn/binary_models.py b/src/models/sklearn/binary_models.py index cd5e6c0..9c72baa 100644 --- a/src/models/sklearn/binary_models.py +++ b/src/models/sklearn/binary_models.py @@ -112,14 +112,9 @@ def preprocess(self, ds, scaling = False, scaler_file = None): for (label, encoded) in zip(labels, self._encoded): self._labels_map[label] = encoded - # Scaling - # self._scaler = TensorTfIdfTransformer( - # features = self.kmers, - # file = scaler_file - # ) - # self._scaler = TensorMinMaxScaler(self._nb_kmers) - # self._scaler.fit(ds) - + # Features scaling + self._scaler = TensorTfIdfTransformer(features = self.kmers,file = scaler_file) + self._scaler.fit(ds) # Model training ######################################################################################################### @@ -131,7 +126,7 @@ def 
fit(self, datasets): for name, ds in datasets.items(): # ds = ds.drop_columns(['id']) ds = self._encoder.transform(ds) - # ds = self._scaler.transform(ds) + ds = self._scaler.transform(ds) datasets[name] = ray.put(ds) try: @@ -189,7 +184,7 @@ def _build(self): def predict(self, ds): print('predict') if ds.count() > 0: - # ds = self._scaler.transform(ds) + ds = self._scaler.transform(ds) ds = ds.materialize() predict_kwargs = {'features':self.kmers, 'num_estimator_cpus':-1} self._predictor = BatchPredictor.from_checkpoint(self._model_ckpt, SklearnTensorPredictor) diff --git a/src/models/sklearn/multiclass_models.py b/src/models/sklearn/multiclass_models.py index f7e9f10..798e6aa 100644 --- a/src/models/sklearn/multiclass_models.py +++ b/src/models/sklearn/multiclass_models.py @@ -118,9 +118,10 @@ def preprocess(self, ds, scaling = False, scaler_file = None): # Class weights self._weights = self._compute_weights() - if self.classifier == 'mnb': - self._scaler = TensorMinMaxScaler(self._nb_kmers) - self._scaler.fit(ds) + # Features scaling + self._scaler = TensorTfIdfTransformer(features = self.kmers,file = scaler_file) + self._scaler.fit(ds) + # Models training ######################################################################################################### @@ -131,8 +132,7 @@ def fit(self, datasets): # ds = ds.drop_columns(['id']) train_ds = datasets['train'] train_ds = self._encoder.transform(train_ds) - if self.classifier == 'mnb': - train_ds = self._scaler.transform(train_ds) + train_ds = self._scaler.transform(train_ds) # datasets[name] = ds # One sub-model per artificial cluster of samples @@ -244,8 +244,7 @@ def predict_proba(self, ds, threshold = 0.8): def _predict_proba(self, ds): if ds.count() > 0: - if self.classifier == 'mnb': - ds = self._scaler.transform(ds) + ds = self._scaler.transform(ds) def predict_func(data): X = _unwrap_ndarray_object_type_if_needed(data[TENSOR_COLUMN_NAME]) From 312ea308c775175058d264e70fd69c9fe488b31d Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Wed, 20 Dec 2023 20:24:15 -0500 Subject: [PATCH 80/92] NN no verbose --- src/models/kerasTF/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index 126a908..7429b7a 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -299,7 +299,7 @@ def train_func_CPU(config): validation_data = batch_val, callbacks = [ReportCheckpointCallback()], class_weight = weights, - verbose = 1 + verbose = 0 ) # Checkpointing session.report({ From edd47b359bbc22385b005acefd3bcf8fb01ef877 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Wed, 20 Dec 2023 22:04:47 -0500 Subject: [PATCH 81/92] linear NN training --- src/models/kerasTF/binary_models.py | 4 +- src/models/kerasTF/models.py | 163 ++++++++++------- .../{models_linear.py => models_parallel.py} | 167 +++++++----------- 3 files changed, 167 insertions(+), 167 deletions(-) rename src/models/kerasTF/{models_linear.py => models_parallel.py} (76%) diff --git a/src/models/kerasTF/binary_models.py b/src/models/kerasTF/binary_models.py index b03165b..ae99f5c 100644 --- a/src/models/kerasTF/binary_models.py +++ b/src/models/kerasTF/binary_models.py @@ -12,7 +12,7 @@ from models.preprocessors.tfidf_transformer import TensorTfIdfTransformer # Parent class / models -from models.kerasTF.models import KerasTFModels +from models.kerasTF.models_parallel import KerasTFModels from models.kerasTF.build_neural_networks import * # Training @@ -20,7 +20,7 @@ from 
ray.air import session # from ray.air.integrations.keras import Callback from ray.air.config import ScalingConfig -from models.kerasTF.models import train_func_CPU, train_func_GPU, build_model +from models.kerasTF.models_parallel import train_func_CPU, train_func_GPU, build_model from ray.air.integrations.keras import ReportCheckpointCallback from ray.train.tensorflow import TensorflowTrainer, TensorflowCheckpoint diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index 7429b7a..adc81e3 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -156,6 +156,12 @@ def fit(self, datasets): ds = ds.materialize() datasets[name] = ds + if self._nb_GPU > 0: + self._fit_GPU(datasets) + else: + self._fit_CPU(datasets) + + def _fit_CPU(self, datasets): # Training parameters train_params = { 'batch_size': self.batch_size, @@ -166,22 +172,16 @@ def fit(self, datasets): 'weights': self._weights } - if self._nb_GPU > 0: - train_func = train_func_GPU - else: - train_func = train_func_CPU - # Define trainer / tuner self._trainer = TensorflowTrainer( - train_loop_per_worker=train_func, + train_loop_per_worker=train_func_CPU, train_loop_config=train_params, scaling_config=ScalingConfig( trainer_resources={'CPU': self._nb_CPU_data}, num_workers=self._n_workers, use_gpu=self._use_gpu, resources_per_worker={ - 'CPU': self._nb_CPU_per_worker, - 'GPU' : self._nb_GPU_per_worker + 'CPU': self._nb_CPU_per_worker } ), run_config=RunConfig( @@ -194,6 +194,21 @@ def fit(self, datasets): training_result = self._trainer.fit() self._model_ckpt = training_result.best_checkpoints[0][0] + def _fit_GPU(self, datasets): + # Training parameters + train_params = { + 'batch_size': self.batch_size, + 'epochs': self._training_epochs, + 'size': self._nb_kmers, + 'nb_cls': self._nb_classes, + 'taxa': self.taxa, + 'workdir':self._workdir, + 'model': self.classifier, + 'weights': self._weights + } + + self._model_ckpt = train_func_GPU(datasets, train_params) + # Models predicting ######################################################################################################### @@ -226,23 +241,41 @@ def _predict_proba(self, ds): ds = ds.materialize() - self._predictor = BatchPredictor.from_checkpoint( - self._model_ckpt, - TensorflowPredictor, - model_definition = lambda: build_model(self.classifier, self._nb_classes, self._nb_kmers) - ) - predictions = self._predictor.predict( - data = ds, - feature_columns = [TENSOR_COLUMN_NAME], - batch_size = self.batch_size, - ) - - probabilities = _unwrap_ndarray_object_type_if_needed(predictions.to_pandas()['predictions']) + if self._nb_GPU > 0: + probabilities = self._predict_proba_GPU(ds) + else: + probabilities = self._predict_proba_CPU(ds) return probabilities else: raise ValueError('No data to predict') + def _predict_proba_CPU(self, ds): + print('_predict_proba_CPU') + self._predictor = BatchPredictor.from_checkpoint( + self._model_ckpt, + TensorflowPredictor, + model_definition = lambda: build_model(self.classifier, self._nb_classes, self._nb_kmers) + ) + predictions = self._predictor.predict( + data = ds, + feature_columns = [TENSOR_COLUMN_NAME], + batch_size = self.batch_size, + ) + + probabilities = _unwrap_ndarray_object_type_if_needed(predictions.to_pandas()['predictions']) + + return probabilities + + def _predict_proba_GPU(self, ds): + print('_predict_proba_GPU') + model = load_model(self._model_ckpt) + probabilities = [] + for batch in ds.iter_tf_batches(batch_size = self.batch_size): + 
probabilities.extend(model.predict(batch[TENSOR_COLUMN_NAME])) + + return probabilities + @abstractmethod def _get_abs_pred(self): """ @@ -316,62 +349,62 @@ def train_func_CPU(config): gc.collect() tf.keras.backend.clear_session() - -def train_func_GPU(config): +def train_func_GPU(datasets, config): # Parameters batch_size = config.get('batch_size', 128) epochs = config.get('epochs', 10) size = config.get('size') nb_cls = config.get('nb_cls') + taxa = config.get('taxa') + workdir = config.get('workdir') model = config.get('model') weights = config.get('weights') - # Model construction - strategy = tf.distribute.MirroredStrategy() - with strategy.scope(): - model = build_model(model, nb_cls, size) + checkpoint = os.path.join(workdir, model) # Data - train_data = session.get_dataset_shard('train') - val_data = session.get_dataset_shard('validation') + train_ds = datasets['train'] + val_ds = datasets['validation'] + + # Convert datasets to tensorflow ds & generator + train_ds = train_ds.iterator().to_tf( + feature_columns = TENSOR_COLUMN_NAME, + label_columns = LABELS_COLUMN_NAME, + batch_size = batch_size + ) + val_ds = val_ds.to_tf( + feature_columns = TENSOR_COLUMN_NAME, + label_columns = LABELS_COLUMN_NAME, + batch_size = batch_size + ) + + # Model construction + model = build_model(model, nb_cls, size) + + # Callbacks + model_file = os.path.join(checkpoint, taxa, '{epoch:03d}.hdf5') + model_csv = os.path.join(checkpoint, taxa, 'training_log.csv') + modelckpt = ModelCheckpoint(filepath=model_file, monitor='val_loss', save_best_only=True, mode='auto') + early = EarlyStopping(monitor='val_loss', mode='auto', patience=10) + csv = CSVLogger(model_csv) + + # Training + hist = model.fit( + train_ds, + epochs = epochs, + validation_data = val_ds, + callbacks = [modelckpt, early, csv], + class_weight = weights, + verbose = 1 + ) + + # Checkpointing + best_model = np.argmin(hist.history['val_loss']) + 1 + best_model = f'{best_model:03d}.hdf5' + best_model = os.path.join(checkpoint, taxa, best_model) + + return best_model - for _ in range(epochs): - batch_train = train_data.to_tf( - feature_columns = TENSOR_COLUMN_NAME, - label_columns = LABELS_COLUMN_NAME, - batch_size = batch_size, - local_shuffle_buffer_size = batch_size, - local_shuffle_seed = int(np.random.randint(1,10000, size = 1)) - ) - batch_val = val_data.to_tf( - feature_columns = TENSOR_COLUMN_NAME, - label_columns = LABELS_COLUMN_NAME, - batch_size = batch_size, - local_shuffle_buffer_size = batch_size, - local_shuffle_seed = int(np.random.randint(1,10000, size = 1)) - ) - # Training - history = model.fit( - x = batch_train, - validation_data = batch_val, - callbacks = [ReportCheckpointCallback()], - class_weight = weights, - verbose = 0 - ) - # Checkpointing - session.report({ - 'accuracy': history.history['accuracy'][0], - 'loss': history.history['loss'][0], - 'val_accuracy': history.history['val_accuracy'][0], - 'val_loss': history.history['val_loss'][0], - }, - checkpoint=TensorflowCheckpoint.from_model(model) - ) - gc.collect() - tf.keras.backend.clear_session() - del model - gc.collect() - tf.keras.backend.clear_session() def build_model(classifier, nb_cls, nb_kmers): if classifier == 'attention': diff --git a/src/models/kerasTF/models_linear.py b/src/models/kerasTF/models_parallel.py similarity index 76% rename from src/models/kerasTF/models_linear.py rename to src/models/kerasTF/models_parallel.py index adc81e3..bfa04cb 100644 --- a/src/models/kerasTF/models_linear.py +++ b/src/models/kerasTF/models_parallel.py @@ -109,9 +109,9 
@@ def __init__( # Computing variables if self._nb_GPU > 0: self._use_gpu = True - self._n_workers = self._nb_GPU #6 + self._n_workers = self._nb_GPU / 2 #6 self._nb_CPU_per_worker = int(self._nb_CPU_training / self._n_workers) # 4 - self._nb_GPU_per_worker = 1 + self._nb_GPU_per_worker = 2 else: self._use_gpu = False self._n_workers = int(self._nb_CPU_training * 0.2) @@ -156,12 +156,6 @@ def fit(self, datasets): ds = ds.materialize() datasets[name] = ds - if self._nb_GPU > 0: - self._fit_GPU(datasets) - else: - self._fit_CPU(datasets) - - def _fit_CPU(self, datasets): # Training parameters train_params = { 'batch_size': self.batch_size, @@ -172,16 +166,22 @@ def _fit_CPU(self, datasets): 'weights': self._weights } + if self._nb_GPU > 0: + train_func = train_func_GPU + else: + train_func = train_func_CPU + # Define trainer / tuner self._trainer = TensorflowTrainer( - train_loop_per_worker=train_func_CPU, + train_loop_per_worker=train_func, train_loop_config=train_params, scaling_config=ScalingConfig( trainer_resources={'CPU': self._nb_CPU_data}, num_workers=self._n_workers, use_gpu=self._use_gpu, resources_per_worker={ - 'CPU': self._nb_CPU_per_worker + 'CPU': self._nb_CPU_per_worker, + 'GPU' : self._nb_GPU_per_worker } ), run_config=RunConfig( @@ -194,21 +194,6 @@ def _fit_CPU(self, datasets): training_result = self._trainer.fit() self._model_ckpt = training_result.best_checkpoints[0][0] - def _fit_GPU(self, datasets): - # Training parameters - train_params = { - 'batch_size': self.batch_size, - 'epochs': self._training_epochs, - 'size': self._nb_kmers, - 'nb_cls': self._nb_classes, - 'taxa': self.taxa, - 'workdir':self._workdir, - 'model': self.classifier, - 'weights': self._weights - } - - self._model_ckpt = train_func_GPU(datasets, train_params) - # Models predicting ######################################################################################################### @@ -241,41 +226,23 @@ def _predict_proba(self, ds): ds = ds.materialize() - if self._nb_GPU > 0: - probabilities = self._predict_proba_GPU(ds) - else: - probabilities = self._predict_proba_CPU(ds) + self._predictor = BatchPredictor.from_checkpoint( + self._model_ckpt, + TensorflowPredictor, + model_definition = lambda: build_model(self.classifier, self._nb_classes, self._nb_kmers) + ) + predictions = self._predictor.predict( + data = ds, + feature_columns = [TENSOR_COLUMN_NAME], + batch_size = self.batch_size, + ) + + probabilities = _unwrap_ndarray_object_type_if_needed(predictions.to_pandas()['predictions']) return probabilities else: raise ValueError('No data to predict') - def _predict_proba_CPU(self, ds): - print('_predict_proba_CPU') - self._predictor = BatchPredictor.from_checkpoint( - self._model_ckpt, - TensorflowPredictor, - model_definition = lambda: build_model(self.classifier, self._nb_classes, self._nb_kmers) - ) - predictions = self._predictor.predict( - data = ds, - feature_columns = [TENSOR_COLUMN_NAME], - batch_size = self.batch_size, - ) - - probabilities = _unwrap_ndarray_object_type_if_needed(predictions.to_pandas()['predictions']) - - return probabilities - - def _predict_proba_GPU(self, ds): - print('_predict_proba_GPU') - model = load_model(self._model_ckpt) - probabilities = [] - for batch in ds.iter_tf_batches(batch_size = self.batch_size): - probabilities.extend(model.predict(batch[TENSOR_COLUMN_NAME])) - - return probabilities - @abstractmethod def _get_abs_pred(self): """ @@ -349,62 +316,62 @@ def train_func_CPU(config): gc.collect() tf.keras.backend.clear_session() -def 
train_func_GPU(datasets, config): + +def train_func_GPU(config): # Parameters batch_size = config.get('batch_size', 128) epochs = config.get('epochs', 10) size = config.get('size') nb_cls = config.get('nb_cls') - taxa = config.get('taxa') - workdir = config.get('workdir') model = config.get('model') weights = config.get('weights') - checkpoint = os.path.join(workdir, model) - - # Data - train_ds = datasets['train'] - val_ds = datasets['validation'] - - # Convert datasets to tensorflow ds & generator - train_ds = train_ds.iterator().to_tf( - feature_columns = TENSOR_COLUMN_NAME, - label_columns = LABELS_COLUMN_NAME, - batch_size = batch_size - ) - val_ds = val_ds.to_tf( - feature_columns = TENSOR_COLUMN_NAME, - label_columns = LABELS_COLUMN_NAME, - batch_size = batch_size - ) - # Model construction - model = build_model(model, nb_cls, size) + strategy = tf.distribute.MirroredStrategy() + with strategy.scope(): + model = build_model(model, nb_cls, size) - # Callbacks - model_file = os.path.join(checkpoint, taxa, '{epoch:03d}.hdf5') - model_csv = os.path.join(checkpoint, taxa, 'training_log.csv') - modelckpt = ModelCheckpoint(filepath=model_file, monitor='val_loss', save_best_only=True, mode='auto') - early = EarlyStopping(monitor='val_loss', mode='auto', patience=10) - csv = CSVLogger(model_csv) - - # Training - hist = model.fit( - train_ds, - epochs = epochs, - validation_data = val_ds, - callbacks = [modelckpt, early, csv], - class_weight = weights, - verbose = 1 - ) - - # Checkpointing - best_model = np.argmin(hist.history['val_loss']) + 1 - best_model = f'{best_model:03d}.hdf5' - best_model = os.path.join(checkpoint, taxa, best_model) - - return best_model + # Data + train_data = session.get_dataset_shard('train') + val_data = session.get_dataset_shard('validation') + for _ in range(epochs): + batch_train = train_data.to_tf( + feature_columns = TENSOR_COLUMN_NAME, + label_columns = LABELS_COLUMN_NAME, + batch_size = batch_size, + local_shuffle_buffer_size = batch_size, + local_shuffle_seed = int(np.random.randint(1,10000, size = 1)) + ) + batch_val = val_data.to_tf( + feature_columns = TENSOR_COLUMN_NAME, + label_columns = LABELS_COLUMN_NAME, + batch_size = batch_size, + local_shuffle_buffer_size = batch_size, + local_shuffle_seed = int(np.random.randint(1,10000, size = 1)) + ) + # Training + history = model.fit( + x = batch_train, + validation_data = batch_val, + callbacks = [ReportCheckpointCallback()], + class_weight = weights, + verbose = 0 + ) + # Checkpointing + session.report({ + 'accuracy': history.history['accuracy'][0], + 'loss': history.history['loss'][0], + 'val_accuracy': history.history['val_accuracy'][0], + 'val_loss': history.history['val_loss'][0], + }, + checkpoint=TensorflowCheckpoint.from_model(model) + ) + gc.collect() + tf.keras.backend.clear_session() + del model + gc.collect() + tf.keras.backend.clear_session() def build_model(classifier, nb_cls, nb_kmers): if classifier == 'attention': From d4b1ea4846b1914530900a22c93e3df2a5e9770b Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Wed, 20 Dec 2023 22:12:02 -0500 Subject: [PATCH 82/92] keras bad import --- src/models/kerasTF/binary_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/models/kerasTF/binary_models.py b/src/models/kerasTF/binary_models.py index ae99f5c..05cab3c 100644 --- a/src/models/kerasTF/binary_models.py +++ b/src/models/kerasTF/binary_models.py @@ -12,7 +12,7 @@ from models.preprocessors.tfidf_transformer import TensorTfIdfTransformer # Parent class / models 
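# The "parallel" path kept in models_parallel.py gives each Ray worker two GPUs
# and builds the Keras model under tf.distribute.MirroredStrategy, as in the
# train_func_GPU above. A minimal standalone sketch of that mechanism; the layer
# sizes and compile settings are illustrative assumptions, not the project's
# architectures from build_neural_networks:
import tensorflow as tf

strategy = tf.distribute.MirroredStrategy()    # one replica per visible GPU
with strategy.scope():
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(64, activation='relu', input_shape=(1024,)),
        tf.keras.layers.Dense(1, activation='sigmoid'),
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# model.fit(...) then runs as usual; each batch is split across the replicas
# and the gradients are aggregated before the weight update.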
-from models.kerasTF.models_parallel import KerasTFModels +from models.kerasTF.models import KerasTFModels from models.kerasTF.build_neural_networks import * # Training From f12c6573e86e225c083a29d05a07d64118871595 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Wed, 20 Dec 2023 22:26:05 -0500 Subject: [PATCH 83/92] parallel NN training + more GPU / worker --- src/models/kerasTF/binary_models.py | 2 +- src/models/kerasTF/models.py | 169 +++++++----------- .../{models_parallel.py => models_linear.py} | 167 ++++++++++------- 3 files changed, 169 insertions(+), 169 deletions(-) rename src/models/kerasTF/{models_parallel.py => models_linear.py} (76%) diff --git a/src/models/kerasTF/binary_models.py b/src/models/kerasTF/binary_models.py index 05cab3c..b03165b 100644 --- a/src/models/kerasTF/binary_models.py +++ b/src/models/kerasTF/binary_models.py @@ -20,7 +20,7 @@ from ray.air import session # from ray.air.integrations.keras import Callback from ray.air.config import ScalingConfig -from models.kerasTF.models_parallel import train_func_CPU, train_func_GPU, build_model +from models.kerasTF.models import train_func_CPU, train_func_GPU, build_model from ray.air.integrations.keras import ReportCheckpointCallback from ray.train.tensorflow import TensorflowTrainer, TensorflowCheckpoint diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index adc81e3..1e69445 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -109,9 +109,9 @@ def __init__( # Computing variables if self._nb_GPU > 0: self._use_gpu = True - self._n_workers = self._nb_GPU #6 - self._nb_CPU_per_worker = int(self._nb_CPU_training / self._n_workers) # 4 - self._nb_GPU_per_worker = 1 + self._n_workers = self._nb_GPU / 2 # 3 + self._nb_CPU_per_worker = int(self._nb_CPU_training / self._n_workers) # 8 + self._nb_GPU_per_worker = 2 else: self._use_gpu = False self._n_workers = int(self._nb_CPU_training * 0.2) @@ -156,12 +156,6 @@ def fit(self, datasets): ds = ds.materialize() datasets[name] = ds - if self._nb_GPU > 0: - self._fit_GPU(datasets) - else: - self._fit_CPU(datasets) - - def _fit_CPU(self, datasets): # Training parameters train_params = { 'batch_size': self.batch_size, @@ -172,16 +166,22 @@ def _fit_CPU(self, datasets): 'weights': self._weights } + if self._nb_GPU > 0: + train_func = train_func_GPU + else: + train_func = train_func_CPU + # Define trainer / tuner self._trainer = TensorflowTrainer( - train_loop_per_worker=train_func_CPU, + train_loop_per_worker=train_func, train_loop_config=train_params, scaling_config=ScalingConfig( trainer_resources={'CPU': self._nb_CPU_data}, num_workers=self._n_workers, use_gpu=self._use_gpu, resources_per_worker={ - 'CPU': self._nb_CPU_per_worker + 'CPU': self._nb_CPU_per_worker, + 'GPU' : self._nb_GPU_per_worker } ), run_config=RunConfig( @@ -194,21 +194,6 @@ def _fit_CPU(self, datasets): training_result = self._trainer.fit() self._model_ckpt = training_result.best_checkpoints[0][0] - def _fit_GPU(self, datasets): - # Training parameters - train_params = { - 'batch_size': self.batch_size, - 'epochs': self._training_epochs, - 'size': self._nb_kmers, - 'nb_cls': self._nb_classes, - 'taxa': self.taxa, - 'workdir':self._workdir, - 'model': self.classifier, - 'weights': self._weights - } - - self._model_ckpt = train_func_GPU(datasets, train_params) - # Models predicting ######################################################################################################### @@ -241,41 +226,23 @@ def _predict_proba(self, ds): ds = 
ds.materialize() - if self._nb_GPU > 0: - probabilities = self._predict_proba_GPU(ds) - else: - probabilities = self._predict_proba_CPU(ds) + self._predictor = BatchPredictor.from_checkpoint( + self._model_ckpt, + TensorflowPredictor, + model_definition = lambda: build_model(self.classifier, self._nb_classes, self._nb_kmers) + ) + predictions = self._predictor.predict( + data = ds, + feature_columns = [TENSOR_COLUMN_NAME], + batch_size = self.batch_size, + ) + + probabilities = _unwrap_ndarray_object_type_if_needed(predictions.to_pandas()['predictions']) return probabilities else: raise ValueError('No data to predict') - def _predict_proba_CPU(self, ds): - print('_predict_proba_CPU') - self._predictor = BatchPredictor.from_checkpoint( - self._model_ckpt, - TensorflowPredictor, - model_definition = lambda: build_model(self.classifier, self._nb_classes, self._nb_kmers) - ) - predictions = self._predictor.predict( - data = ds, - feature_columns = [TENSOR_COLUMN_NAME], - batch_size = self.batch_size, - ) - - probabilities = _unwrap_ndarray_object_type_if_needed(predictions.to_pandas()['predictions']) - - return probabilities - - def _predict_proba_GPU(self, ds): - print('_predict_proba_GPU') - model = load_model(self._model_ckpt) - probabilities = [] - for batch in ds.iter_tf_batches(batch_size = self.batch_size): - probabilities.extend(model.predict(batch[TENSOR_COLUMN_NAME])) - - return probabilities - @abstractmethod def _get_abs_pred(self): """ @@ -349,62 +316,62 @@ def train_func_CPU(config): gc.collect() tf.keras.backend.clear_session() -def train_func_GPU(datasets, config): + +def train_func_GPU(config): # Parameters batch_size = config.get('batch_size', 128) epochs = config.get('epochs', 10) size = config.get('size') nb_cls = config.get('nb_cls') - taxa = config.get('taxa') - workdir = config.get('workdir') model = config.get('model') weights = config.get('weights') - checkpoint = os.path.join(workdir, model) - - # Data - train_ds = datasets['train'] - val_ds = datasets['validation'] - - # Convert datasets to tensorflow ds & generator - train_ds = train_ds.iterator().to_tf( - feature_columns = TENSOR_COLUMN_NAME, - label_columns = LABELS_COLUMN_NAME, - batch_size = batch_size - ) - val_ds = val_ds.to_tf( - feature_columns = TENSOR_COLUMN_NAME, - label_columns = LABELS_COLUMN_NAME, - batch_size = batch_size - ) - # Model construction - model = build_model(model, nb_cls, size) + strategy = tf.distribute.MirroredStrategy() + with strategy.scope(): + model = build_model(model, nb_cls, size) - # Callbacks - model_file = os.path.join(checkpoint, taxa, '{epoch:03d}.hdf5') - model_csv = os.path.join(checkpoint, taxa, 'training_log.csv') - modelckpt = ModelCheckpoint(filepath=model_file, monitor='val_loss', save_best_only=True, mode='auto') - early = EarlyStopping(monitor='val_loss', mode='auto', patience=10) - csv = CSVLogger(model_csv) - - # Training - hist = model.fit( - train_ds, - epochs = epochs, - validation_data = val_ds, - callbacks = [modelckpt, early, csv], - class_weight = weights, - verbose = 1 - ) - - # Checkpointing - best_model = np.argmin(hist.history['val_loss']) + 1 - best_model = f'{best_model:03d}.hdf5' - best_model = os.path.join(checkpoint, taxa, best_model) - - return best_model + # Data + train_data = session.get_dataset_shard('train') + val_data = session.get_dataset_shard('validation') + for _ in range(epochs): + batch_train = train_data.to_tf( + feature_columns = TENSOR_COLUMN_NAME, + label_columns = LABELS_COLUMN_NAME, + batch_size = batch_size, + 
local_shuffle_buffer_size = batch_size, + local_shuffle_seed = int(np.random.randint(1,10000, size = 1)) + ) + batch_val = val_data.to_tf( + feature_columns = TENSOR_COLUMN_NAME, + label_columns = LABELS_COLUMN_NAME, + batch_size = batch_size, + local_shuffle_buffer_size = batch_size, + local_shuffle_seed = int(np.random.randint(1,10000, size = 1)) + ) + # Training + history = model.fit( + x = batch_train, + validation_data = batch_val, + callbacks = [ReportCheckpointCallback()], + class_weight = weights, + verbose = 0 + ) + # Checkpointing + session.report({ + 'accuracy': history.history['accuracy'][0], + 'loss': history.history['loss'][0], + 'val_accuracy': history.history['val_accuracy'][0], + 'val_loss': history.history['val_loss'][0], + }, + checkpoint=TensorflowCheckpoint.from_model(model) + ) + gc.collect() + tf.keras.backend.clear_session() + del model + gc.collect() + tf.keras.backend.clear_session() def build_model(classifier, nb_cls, nb_kmers): if classifier == 'attention': diff --git a/src/models/kerasTF/models_parallel.py b/src/models/kerasTF/models_linear.py similarity index 76% rename from src/models/kerasTF/models_parallel.py rename to src/models/kerasTF/models_linear.py index bfa04cb..adc81e3 100644 --- a/src/models/kerasTF/models_parallel.py +++ b/src/models/kerasTF/models_linear.py @@ -109,9 +109,9 @@ def __init__( # Computing variables if self._nb_GPU > 0: self._use_gpu = True - self._n_workers = self._nb_GPU / 2 #6 + self._n_workers = self._nb_GPU #6 self._nb_CPU_per_worker = int(self._nb_CPU_training / self._n_workers) # 4 - self._nb_GPU_per_worker = 2 + self._nb_GPU_per_worker = 1 else: self._use_gpu = False self._n_workers = int(self._nb_CPU_training * 0.2) @@ -156,6 +156,12 @@ def fit(self, datasets): ds = ds.materialize() datasets[name] = ds + if self._nb_GPU > 0: + self._fit_GPU(datasets) + else: + self._fit_CPU(datasets) + + def _fit_CPU(self, datasets): # Training parameters train_params = { 'batch_size': self.batch_size, @@ -166,22 +172,16 @@ def fit(self, datasets): 'weights': self._weights } - if self._nb_GPU > 0: - train_func = train_func_GPU - else: - train_func = train_func_CPU - # Define trainer / tuner self._trainer = TensorflowTrainer( - train_loop_per_worker=train_func, + train_loop_per_worker=train_func_CPU, train_loop_config=train_params, scaling_config=ScalingConfig( trainer_resources={'CPU': self._nb_CPU_data}, num_workers=self._n_workers, use_gpu=self._use_gpu, resources_per_worker={ - 'CPU': self._nb_CPU_per_worker, - 'GPU' : self._nb_GPU_per_worker + 'CPU': self._nb_CPU_per_worker } ), run_config=RunConfig( @@ -194,6 +194,21 @@ def fit(self, datasets): training_result = self._trainer.fit() self._model_ckpt = training_result.best_checkpoints[0][0] + def _fit_GPU(self, datasets): + # Training parameters + train_params = { + 'batch_size': self.batch_size, + 'epochs': self._training_epochs, + 'size': self._nb_kmers, + 'nb_cls': self._nb_classes, + 'taxa': self.taxa, + 'workdir':self._workdir, + 'model': self.classifier, + 'weights': self._weights + } + + self._model_ckpt = train_func_GPU(datasets, train_params) + # Models predicting ######################################################################################################### @@ -226,23 +241,41 @@ def _predict_proba(self, ds): ds = ds.materialize() - self._predictor = BatchPredictor.from_checkpoint( - self._model_ckpt, - TensorflowPredictor, - model_definition = lambda: build_model(self.classifier, self._nb_classes, self._nb_kmers) - ) - predictions = self._predictor.predict( - data = 
ds, - feature_columns = [TENSOR_COLUMN_NAME], - batch_size = self.batch_size, - ) - - probabilities = _unwrap_ndarray_object_type_if_needed(predictions.to_pandas()['predictions']) + if self._nb_GPU > 0: + probabilities = self._predict_proba_GPU(ds) + else: + probabilities = self._predict_proba_CPU(ds) return probabilities else: raise ValueError('No data to predict') + def _predict_proba_CPU(self, ds): + print('_predict_proba_CPU') + self._predictor = BatchPredictor.from_checkpoint( + self._model_ckpt, + TensorflowPredictor, + model_definition = lambda: build_model(self.classifier, self._nb_classes, self._nb_kmers) + ) + predictions = self._predictor.predict( + data = ds, + feature_columns = [TENSOR_COLUMN_NAME], + batch_size = self.batch_size, + ) + + probabilities = _unwrap_ndarray_object_type_if_needed(predictions.to_pandas()['predictions']) + + return probabilities + + def _predict_proba_GPU(self, ds): + print('_predict_proba_GPU') + model = load_model(self._model_ckpt) + probabilities = [] + for batch in ds.iter_tf_batches(batch_size = self.batch_size): + probabilities.extend(model.predict(batch[TENSOR_COLUMN_NAME])) + + return probabilities + @abstractmethod def _get_abs_pred(self): """ @@ -316,62 +349,62 @@ def train_func_CPU(config): gc.collect() tf.keras.backend.clear_session() - -def train_func_GPU(config): +def train_func_GPU(datasets, config): # Parameters batch_size = config.get('batch_size', 128) epochs = config.get('epochs', 10) size = config.get('size') nb_cls = config.get('nb_cls') + taxa = config.get('taxa') + workdir = config.get('workdir') model = config.get('model') weights = config.get('weights') - # Model construction - strategy = tf.distribute.MirroredStrategy() - with strategy.scope(): - model = build_model(model, nb_cls, size) + checkpoint = os.path.join(workdir, model) # Data - train_data = session.get_dataset_shard('train') - val_data = session.get_dataset_shard('validation') + train_ds = datasets['train'] + val_ds = datasets['validation'] + + # Convert datasets to tensorflow ds & generator + train_ds = train_ds.iterator().to_tf( + feature_columns = TENSOR_COLUMN_NAME, + label_columns = LABELS_COLUMN_NAME, + batch_size = batch_size + ) + val_ds = val_ds.to_tf( + feature_columns = TENSOR_COLUMN_NAME, + label_columns = LABELS_COLUMN_NAME, + batch_size = batch_size + ) + + # Model construction + model = build_model(model, nb_cls, size) + + # Callbacks + model_file = os.path.join(checkpoint, taxa, '{epoch:03d}.hdf5') + model_csv = os.path.join(checkpoint, taxa, 'training_log.csv') + modelckpt = ModelCheckpoint(filepath=model_file, monitor='val_loss', save_best_only=True, mode='auto') + early = EarlyStopping(monitor='val_loss', mode='auto', patience=10) + csv = CSVLogger(model_csv) + + # Training + hist = model.fit( + train_ds, + epochs = epochs, + validation_data = val_ds, + callbacks = [modelckpt, early, csv], + class_weight = weights, + verbose = 1 + ) + + # Checkpointing + best_model = np.argmin(hist.history['val_loss']) + 1 + best_model = f'{best_model:03d}.hdf5' + best_model = os.path.join(checkpoint, taxa, best_model) + + return best_model - for _ in range(epochs): - batch_train = train_data.to_tf( - feature_columns = TENSOR_COLUMN_NAME, - label_columns = LABELS_COLUMN_NAME, - batch_size = batch_size, - local_shuffle_buffer_size = batch_size, - local_shuffle_seed = int(np.random.randint(1,10000, size = 1)) - ) - batch_val = val_data.to_tf( - feature_columns = TENSOR_COLUMN_NAME, - label_columns = LABELS_COLUMN_NAME, - batch_size = batch_size, - 
local_shuffle_buffer_size = batch_size, - local_shuffle_seed = int(np.random.randint(1,10000, size = 1)) - ) - # Training - history = model.fit( - x = batch_train, - validation_data = batch_val, - callbacks = [ReportCheckpointCallback()], - class_weight = weights, - verbose = 0 - ) - # Checkpointing - session.report({ - 'accuracy': history.history['accuracy'][0], - 'loss': history.history['loss'][0], - 'val_accuracy': history.history['val_accuracy'][0], - 'val_loss': history.history['val_loss'][0], - }, - checkpoint=TensorflowCheckpoint.from_model(model) - ) - gc.collect() - tf.keras.backend.clear_session() - del model - gc.collect() - tf.keras.backend.clear_session() def build_model(classifier, nb_cls, nb_kmers): if classifier == 'attention': From 11e02a1013ec07d60dde01d1ad11ec44391267b1 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Wed, 20 Dec 2023 22:30:12 -0500 Subject: [PATCH 84/92] adjust nb workers to int --- src/models/kerasTF/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index 1e69445..52527c0 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -109,7 +109,7 @@ def __init__( # Computing variables if self._nb_GPU > 0: self._use_gpu = True - self._n_workers = self._nb_GPU / 2 # 3 + self._n_workers = int(self._nb_GPU / 2) # 3 self._nb_CPU_per_worker = int(self._nb_CPU_training / self._n_workers) # 8 self._nb_GPU_per_worker = 2 else: From 8ba9dbe9f116ce3979ef541a984f0c4103643e95 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Wed, 20 Dec 2023 23:02:55 -0500 Subject: [PATCH 85/92] ise all gpus to avoid oom while training --- src/models/kerasTF/models.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index 52527c0..ac02efd 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -109,9 +109,9 @@ def __init__( # Computing variables if self._nb_GPU > 0: self._use_gpu = True - self._n_workers = int(self._nb_GPU / 2) # 3 - self._nb_CPU_per_worker = int(self._nb_CPU_training / self._n_workers) # 8 - self._nb_GPU_per_worker = 2 + self._n_workers = 1 # int(self._nb_GPU / 2) # 3 + self._nb_CPU_per_worker = self._nb_CPU_training # int(self._nb_CPU_training / self._n_workers) # 8 + self._nb_GPU_per_worker = self._nb_GPU # 1 else: self._use_gpu = False self._n_workers = int(self._nb_CPU_training * 0.2) From 468781a43c6b3a8ff976ad82fbbe514d916fc00a Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Wed, 20 Dec 2023 23:14:10 -0500 Subject: [PATCH 86/92] NN no random shuffle / iter --- src/models/kerasTF/models.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index ac02efd..4790656 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -109,9 +109,9 @@ def __init__( # Computing variables if self._nb_GPU > 0: self._use_gpu = True - self._n_workers = 1 # int(self._nb_GPU / 2) # 3 - self._nb_CPU_per_worker = self._nb_CPU_training # int(self._nb_CPU_training / self._n_workers) # 8 - self._nb_GPU_per_worker = self._nb_GPU # 1 + self._n_workers = self._nb_GPU # 6 + self._nb_CPU_per_worker = int(self._nb_CPU_training / self._n_workers) # 4 + self._nb_GPU_per_worker = 1 else: self._use_gpu = False self._n_workers = int(self._nb_CPU_training * 0.2) @@ -283,15 +283,15 @@ def train_func_CPU(config): feature_columns = TENSOR_COLUMN_NAME, 
label_columns = LABELS_COLUMN_NAME, batch_size = batch_size, - local_shuffle_buffer_size = batch_size, - local_shuffle_seed = int(np.random.randint(1,10000, size = 1)) + # local_shuffle_buffer_size = batch_size, + # local_shuffle_seed = int(np.random.randint(1,10000, size = 1)) ) batch_val = val_data.to_tf( feature_columns = TENSOR_COLUMN_NAME, label_columns = LABELS_COLUMN_NAME, batch_size = batch_size, - local_shuffle_buffer_size = batch_size, - local_shuffle_seed = int(np.random.randint(1,10000, size = 1)) + # local_shuffle_buffer_size = batch_size, + # local_shuffle_seed = int(np.random.randint(1,10000, size = 1)) ) # Training history = model.fit( @@ -340,15 +340,15 @@ def train_func_GPU(config): feature_columns = TENSOR_COLUMN_NAME, label_columns = LABELS_COLUMN_NAME, batch_size = batch_size, - local_shuffle_buffer_size = batch_size, - local_shuffle_seed = int(np.random.randint(1,10000, size = 1)) + # local_shuffle_buffer_size = batch_size, + # local_shuffle_seed = int(np.random.randint(1,10000, size = 1)) ) batch_val = val_data.to_tf( feature_columns = TENSOR_COLUMN_NAME, label_columns = LABELS_COLUMN_NAME, batch_size = batch_size, - local_shuffle_buffer_size = batch_size, - local_shuffle_seed = int(np.random.randint(1,10000, size = 1)) + # local_shuffle_buffer_size = batch_size, + # local_shuffle_seed = int(np.random.randint(1,10000, size = 1)) ) # Training history = model.fit( From 07a223294fc9e85c6c84d2b829168b2ad86ff7ca Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Thu, 21 Dec 2023 20:24:48 -0500 Subject: [PATCH 87/92] NN mixed precision --- src/models/kerasTF/build_neural_networks.py | 24 +++++++++++++-------- src/models/kerasTF/models.py | 11 +++++----- 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/src/models/kerasTF/build_neural_networks.py b/src/models/kerasTF/build_neural_networks.py index 12893f3..1a0fec0 100644 --- a/src/models/kerasTF/build_neural_networks.py +++ b/src/models/kerasTF/build_neural_networks.py @@ -4,10 +4,11 @@ from tensorflow.keras.losses import BinaryCrossentropy, CategoricalCrossentropy from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Conv1D, Conv2D, MaxPooling1D, MaxPooling2D, Concatenate, Flatten, Attention, Activation, Bidirectional, Reshape, AveragePooling1D - - +from tensorflow.keras import mixed_precision from models.kerasTF.attentionLayer import AttentionWeightedAverage +mixed_precision.set_global_policy('mixed_float16') + __author__ = "Nicolas de Montigny" __all__ = ['build_attention','build_LSTM','build_deepLSTM','build_LSTM_attention','build_CNN','build_wideCNN'] @@ -28,7 +29,8 @@ def build_attention(nb_features): x = Dense(128, activation = "relu")(x) x = Dropout(0.1)(x) - x = Dense(1, activation = "sigmoid")(x) + x = Dense(1)(x) + x = Activation(activation = "sigmoid", dtype = 'float32')(x) model = Model(inputs = inputs, outputs = x) model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'], jit_compile = True) @@ -48,8 +50,9 @@ def build_LSTM(nb_features): x = LSTM(128, dropout = 0.1)(inputs) - x = Dense(1, activation = 'tanh')(x) - + x = Dense(1)(x) + x = Activation(activation = "tanh", dtype = 'float32')(x) + model = Model(inputs = inputs, outputs = x) model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'], jit_compile = True) @@ -82,7 +85,9 @@ def build_deepLSTM(nb_features): net = Dense(10, activation='relu', name='D_%d'%10)(net) net = Dropout(0.1,name='fr_same')(net) - outputs = Dense(1, activation='sigmoid', name='score')(net) + net = 
Dense(1)(net) + outputs = Activation(activation = "sigmoid", dtype = 'float32')(net) + model = Model(inputs=inputs, outputs=outputs) model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'], jit_compile = True) @@ -108,7 +113,7 @@ def build_LSTM_attention(nb_features, nb_classes): net = Dropout(0.2)(net) net = Flatten()(net) net = Dense(nb_classes)(net) - outputs = Activation('softmax')(net) + outputs = Activation('softmax', dtype = 'float32')(net) model = Model(inputs = inputs, outputs = outputs) model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'], jit_compile = True) @@ -135,7 +140,7 @@ def build_CNN(nb_features, nb_classes): model.add(Activation('relu')) model.add(Dropout(0.5)) model.add(Dense(nb_classes)) - model.add(Activation('softmax')) + model.add(Activation('softmax', dtype = 'float32')) model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'], jit_compile = True) return model @@ -172,7 +177,8 @@ def build_wideCNN(nb_features, nb_classes): net = Dropout(0.5)(net) net = Dense(nb_classes)(net) - outputs = Activation('softmax')(net) + outputs = Activation('softmax', dtype = 'float32')(net) + model = Model(inputs = inputs, outputs = outputs) model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'], jit_compile = True) diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index 4790656..67d7fb9 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -47,6 +47,7 @@ os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' warnings.filterwarnings('ignore') + class KerasTFModels(ModelsUtils, ABC): """ Class used to build, train and predict models using Ray with Keras Tensorflow backend @@ -100,17 +101,17 @@ def __init__( ) # Parameters # Initialize hidden - self._nb_CPU_data = int(os.cpu_count() * 0.2) # 6 - self._nb_CPU_training = int(os.cpu_count() - self._nb_CPU_data) # 26 - self._nb_GPU = len(tf.config.list_physical_devices('GPU')) # 6 + self._nb_CPU_data = int(os.cpu_count() * 0.2) # 9 + self._nb_CPU_training = int(os.cpu_count() - self._nb_CPU_data) # 39 + self._nb_GPU = len(tf.config.list_physical_devices('GPU')) # 4 # Initialize empty self._nb_CPU_per_worker = 0 self._nb_GPU_per_worker = 0 # Computing variables if self._nb_GPU > 0: self._use_gpu = True - self._n_workers = self._nb_GPU # 6 - self._nb_CPU_per_worker = int(self._nb_CPU_training / self._n_workers) # 4 + self._n_workers = self._nb_GPU # 4 + self._nb_CPU_per_worker = int(self._nb_CPU_training / self._n_workers) # 9 self._nb_GPU_per_worker = 1 else: self._use_gpu = False From 265682300c8b3e57291ae79e984568bddc4cadd1 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Thu, 21 Dec 2023 21:07:22 -0500 Subject: [PATCH 88/92] distribution strategy in TF --- src/models/kerasTF/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index 67d7fb9..5dce3a2 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -328,7 +328,7 @@ def train_func_GPU(config): weights = config.get('weights') # Model construction - strategy = tf.distribute.MirroredStrategy() + strategy = tf.distribute.MultiWorkerMirroredStrategy() with strategy.scope(): model = build_model(model, nb_cls, size) From 01946bcdce3ac5e5aa4f2cef2189ab5e1e329300 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Thu, 21 Dec 2023 23:14:42 -0500 Subject: [PATCH 89/92] Keras NN Mirrored Workers fo CCDB 
GPUs --- src/models/kerasTF/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index 5dce3a2..67d7fb9 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -328,7 +328,7 @@ def train_func_GPU(config): weights = config.get('weights') # Model construction - strategy = tf.distribute.MultiWorkerMirroredStrategy() + strategy = tf.distribute.MirroredStrategy() with strategy.scope(): model = build_model(model, nb_cls, size) From 32bebc53e4af127ec2b7f84355d070994973536c Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Fri, 22 Dec 2023 14:40:59 -0500 Subject: [PATCH 90/92] bagging for NN predictions --- src/models/kerasTF/binary_models.py | 157 ------------------- src/models/kerasTF/build_neural_networks.py | 6 +- src/models/kerasTF/models.py | 61 +++++--- src/models/kerasTF/multiclass_models.py | 161 -------------------- 4 files changed, 45 insertions(+), 340 deletions(-) diff --git a/src/models/kerasTF/binary_models.py b/src/models/kerasTF/binary_models.py index b03165b..0a80e2d 100644 --- a/src/models/kerasTF/binary_models.py +++ b/src/models/kerasTF/binary_models.py @@ -102,166 +102,9 @@ def __init__( elif self.classifier == 'deeplstm': print('Training bacterial / host classifier based on Deep LSTM Neural Network') - # Data preprocessing - ######################################################################################################### - """ - def preprocess(self, ds, scaling = False, scaler_file = None): - print('preprocess') - # Labels encoding - self._encoder = ModelLabelEncoder(self.taxa) - self._encoder.fit(ds) - - # Labels mapping - labels = list(self._encoder.stats_[f'unique_values({self.taxa})'].keys()) - self._nb_classes = len(labels) - self._encoded = np.arange(len(labels)) - labels = np.append(labels, 'Unknown') - self._encoded = np.append(self._encoded, -1) - - for (label, encoded) in zip(labels, self._encoded): - self._labels_map[label] = encoded - - # Class weights - self._weights = self._compute_weights() - - # Scaling - # self._scaler = TensorTfIdfTransformer( - # features = self.kmers, - # file = scaler_file - # ) - # self._scaler = TensorMinMaxScaler(self._nb_kmers) - # self._scaler.fit(ds) - """ - # Model training - ######################################################################################################### - - """ - def fit(self, datasets): - print('fit') - # Preprocessing loop - for name, ds in datasets.items(): - # ds = ds.drop_columns(['id']) - ds = self._encoder.transform(ds) - # ds = self._scaler.transform(ds) - ds = ds.materialize() - datasets[name] = ds - - if self._nb_GPU > 0: - self._fit_GPU(datasets) - else: - self._fit_CPU(datasets) - - def _fit_CPU(self, datasets): - # Training parameters - train_params = { - 'batch_size': self.batch_size, - 'epochs': self._training_epochs, - 'size': self._nb_kmers, - 'nb_cls': self._nb_classes, - 'model': self.classifier, - 'weights': self._weights - } - - # Define trainer / tuner - self._trainer = TensorflowTrainer( - train_loop_per_worker=train_func_CPU, - train_loop_config=train_params, - scaling_config=ScalingConfig( - trainer_resources={'CPU': self._nb_CPU_data}, - num_workers=self._n_workers, - use_gpu=self._use_gpu, - resources_per_worker={ - 'CPU': self._nb_CPU_per_worker - } - ), - run_config=RunConfig( - name=self.classifier, - local_dir=self._workdir, - ), - datasets=datasets, - ) - - training_result = self._trainer.fit() - self._model_ckpt = training_result.best_checkpoints[0][0] 
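# For reference, a hedged sketch of the resource arithmetic that the recent
# ScalingConfig changes converge on: one Ray worker per GPU, roughly 20% of
# the CPUs reserved for Ray Data, and the remaining CPUs split evenly across
# workers. Variable names are illustrative, not project API.
#
#   import os
#   import tensorflow as tf
#   from ray.air.config import ScalingConfig
#
#   nb_cpus = os.cpu_count()
#   nb_gpus = len(tf.config.list_physical_devices('GPU'))
#   nb_cpu_data = int(nb_cpus * 0.2)              # reserved for Ray Data tasks
#   nb_cpu_train = nb_cpus - nb_cpu_data
#   n_workers = nb_gpus if nb_gpus > 0 else max(1, int(nb_cpu_train * 0.2))
#   resources_per_worker = {'CPU': nb_cpu_train // n_workers}
#   if nb_gpus > 0:
#       resources_per_worker['GPU'] = 1           # one GPU per Ray worker
#   scaling = ScalingConfig(
#       trainer_resources={'CPU': nb_cpu_data},
#       num_workers=n_workers,
#       use_gpu=nb_gpus > 0,
#       resources_per_worker=resources_per_worker,
#   )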
- - def _fit_GPU(self, datasets): - # Training parameters - train_params = { - 'batch_size': self.batch_size, - 'epochs': self._training_epochs, - 'size': self._nb_kmers, - 'nb_cls': self._nb_classes, - 'taxa': self.taxa, - 'workdir':self._workdir, - 'model': self.classifier, - 'weights': self._weights - } - - self._model_ckpt = train_func_GPU(datasets, train_params) - """ # Model predicting ######################################################################################################### - - """ - def predict(self, ds): - print('predict') - # Predict with model - probabilities = self._predict_proba(ds) - # Convert predictions to labels - predictions = self._get_abs_pred(probabilities) - # Return decoded labels - return self._label_decode(predictions) - - def predict_proba(self, ds, threshold = 0.8): - print('predict_proba') - # Predict with model - probabilities = self._predict_proba(ds) - # Convert predictions to labels with threshold - predictions = self._get_threshold_pred(probabilities, threshold) - # Return decoded labels - return self._label_decode(predictions) - def _predict_proba(self, ds): - if ds.count() > 0: - if len(ds.schema().names) > 1: - col_2_drop = [col for col in ds.schema().names if col != TENSOR_COLUMN_NAME] - ds = ds.drop_columns(col_2_drop) - - ds = ds.materialize() - - if self._nb_GPU > 0: - probabilities = self._predict_proba_GPU(ds) - else: - probabilities = self._predict_proba_CPU(ds) - - return probabilities - - else: - raise ValueError('No data to predict') - def _predict_proba_CPU(self, ds): - print('_predict_proba_CPU') - self._predictor = BatchPredictor.from_checkpoint( - self._model_ckpt, - TensorflowPredictor, - model_definition = lambda: build_model(self.classifier, self._nb_classes, self._nb_kmers) - ) - predictions = self._predictor.predict( - data = ds, - feature_columns = [TENSOR_COLUMN_NAME], - batch_size = self.batch_size, - ) - - probabilities = _unwrap_ndarray_object_type_if_needed(probabilities.to_pandas()['predictions']) - - return predictions - - def _predict_proba_GPU(self, ds): - print('_predict_proba_GPU') - model = load_model(self._model_ckpt) - probabilities = [] - for batch in ds.iter_tf_batches(batch_size = self.batch_size): - probabilities.extend(model.predict(batch[TENSOR_COLUMN_NAME])) - """ def _get_abs_pred(self, predictions): print('_get_abs_pred') return np.round(np.ravel(predictions)) diff --git a/src/models/kerasTF/build_neural_networks.py b/src/models/kerasTF/build_neural_networks.py index 1a0fec0..eb36d77 100644 --- a/src/models/kerasTF/build_neural_networks.py +++ b/src/models/kerasTF/build_neural_networks.py @@ -1,3 +1,4 @@ +import tensorflow as tf from keras.models import Model, Sequential from tensorflow.keras import mixed_precision @@ -7,7 +8,10 @@ from tensorflow.keras import mixed_precision from models.kerasTF.attentionLayer import AttentionWeightedAverage -mixed_precision.set_global_policy('mixed_float16') +if len(tf.config.list_physical_devices('GPU')) > 0: + mixed_precision.set_global_policy('mixed_float16') +else: + mixed_precision.set_global_policy('mixed_bfloat16') __author__ = "Nicolas de Montigny" diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index 67d7fb9..c916afa 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -4,6 +4,8 @@ import numpy as np import pandas as pd +from glob import glob + # Class construction from abc import ABC, abstractmethod @@ -193,8 +195,13 @@ def fit(self, datasets): ) training_result = self._trainer.fit() - self._model_ckpt 
= training_result.best_checkpoints[0][0] - + # self._model_ckpt = training_result.best_checkpoints[0][0] + self._model_ckpt = glob( + os.path.join( + os.path.dirname(training_result.best_checkpoints[0][0].path),'checkpoint_*' + ) + ) + # Models predicting ######################################################################################################### @@ -219,26 +226,38 @@ def predict_proba(self, ds, threshold = 0.8): def _predict_proba(self, ds): print('_predict_proba') if ds.count() > 0: - if len(ds.schema().names) > 1: - col_2_drop = [col for col in ds.schema().names if col != TENSOR_COLUMN_NAME] - ds = ds.drop_columns(col_2_drop) + # if len(ds.schema().names) > 1: + # col_2_drop = [col for col in ds.schema().names if col != TENSOR_COLUMN_NAME] + # ds = ds.drop_columns(col_2_drop) ds = self._scaler.transform(ds) - ds = ds.materialize() - self._predictor = BatchPredictor.from_checkpoint( - self._model_ckpt, - TensorflowPredictor, - model_definition = lambda: build_model(self.classifier, self._nb_classes, self._nb_kmers) - ) - predictions = self._predictor.predict( - data = ds, - feature_columns = [TENSOR_COLUMN_NAME], - batch_size = self.batch_size, - ) - - probabilities = _unwrap_ndarray_object_type_if_needed(predictions.to_pandas()['predictions']) + def predict_func(data): + X = _unwrap_ndarray_object_type_if_needed(data[TENSOR_COLUMN_NAME]) + pred = np.zeros((len(X), len(self._labels_map)-1)) + for ckpt in self._model_ckpt: + ckpt = TensorflowCheckpoint.from_directory(ckpt) + predictor = TensorflowPredictor().from_checkpoint(ckpt, model_definition = lambda: build_model('cnn', self._nb_classes, self._nb_kmers)) + proba = predictor.predict(X) + pred += proba['predictions'] + pred = pred / len(self._model_ckpt) + return {'predictions' : pred} + + probabilities = ds.map_batches(predict_func, batch_format = 'numpy') + probabilities = _unwrap_ndarray_object_type_if_needed(probabilities.to_pandas()['predictions']) + + # self._predictor = BatchPredictor.from_checkpoint( + # self._model_ckpt, + # TensorflowPredictor, + # model_definition = lambda: build_model(self.classifier, self._nb_classes, self._nb_kmers) + # ) + # predictions = self._predictor.predict( + # data = ds, + # feature_columns = [TENSOR_COLUMN_NAME], + # batch_size = self.batch_size, + # ) + # probabilities = _unwrap_ndarray_object_type_if_needed(predictions.to_pandas()['predictions']) return probabilities else: @@ -370,9 +389,9 @@ def train_func_GPU(config): ) gc.collect() tf.keras.backend.clear_session() - del model - gc.collect() - tf.keras.backend.clear_session() + # del model + # gc.collect() + # tf.keras.backend.clear_session() def build_model(classifier, nb_cls, nb_kmers): if classifier == 'attention': diff --git a/src/models/kerasTF/multiclass_models.py b/src/models/kerasTF/multiclass_models.py index 99bf6bd..4e564da 100644 --- a/src/models/kerasTF/multiclass_models.py +++ b/src/models/kerasTF/multiclass_models.py @@ -99,170 +99,9 @@ def __init__( ) self._nb_classes = None - # Data preprocessing - ######################################################################################################### - - """ - def preprocess(self, ds, scaling = False, scaler_file = None): - print('preprocess') - # Labels encoding - self._encoder = ModelLabelEncoder(self.taxa) - self._encoder.fit(ds) - - # Labels mapping - labels = list(self._encoder.stats_[f'unique_values({self.taxa})'].keys()) - self._nb_classes = len(labels) - self._encoded = np.arange(len(labels)) - labels = np.append(labels, 'Unknown') - self._encoded = 
np.append(self._encoded, -1) - - for (label, encoded) in zip(labels, self._encoded): - self._labels_map[label] = encoded - - # Class weights - self._weights = self._compute_weights() - - # Scaling - # self._scaler = TensorTfIdfTransformer( - # features = self.kmers, - # file = scaler_file - # ) - # self._scaler = TensorMinMaxScaler(self._nb_kmers) - # self._scaler.fit(ds) - """ - # Models training - ######################################################################################################### - - """ - def fit(self, datasets): - print('fit') - # Preprocessing loop - for name, ds in datasets.items(): - # ds = ds.drop_columns(['id']) - ds = self._encoder.transform(ds) - # ds = self._scaler.transform(ds) - ds = ds.materialize() - datasets[name] = ds - - if self._nb_GPU > 0: - self._fit_GPU(datasets) - else: - self._fit_CPU(datasets) - - def _fit_CPU(self, datasets): - # Training parameters - train_params = { - 'batch_size': self.batch_size, - 'epochs': self._training_epochs, - 'size': self._nb_kmers, - 'nb_cls': self._nb_classes, - 'model': self.classifier, - 'weights': self._weights - } - - # Define trainer / tuner - self._trainer = TensorflowTrainer( - train_loop_per_worker=train_func_CPU, - train_loop_config=train_params, - scaling_config=ScalingConfig( - trainer_resources={'CPU': self._nb_CPU_data}, - num_workers=self._n_workers, - use_gpu=self._use_gpu, - resources_per_worker={ - 'CPU': self._nb_CPU_per_worker - } - ), - run_config=RunConfig( - name=self.classifier, - local_dir=self._workdir, - ), - datasets=datasets, - ) - - training_result = self._trainer.fit() - self._model_ckpt = training_result.best_checkpoints[0][0] - - def _fit_GPU(self, datasets): - # Training parameters - train_params = { - 'batch_size': self.batch_size, - 'epochs': self._training_epochs, - 'size': self._nb_kmers, - 'nb_cls': self._nb_classes, - 'taxa': self.taxa, - 'workdir':self._workdir, - 'model': self.classifier, - 'weights': self._weights - } - - self._model_ckpt = train_func_GPU(datasets, train_params) - """ # Models predicting ######################################################################################################### - """ - def predict(self, ds): - print('predict') - # Predict with model - probabilities = self._predict_proba(ds) - # Convert predictions to labels - predictions = self._get_abs_pred(probabilities) - # Return decoded labels - return self._label_decode(predictions) - - def predict_proba(self, ds, threshold = 0.8): - print('predict_proba') - # Predict with model - probabilities = self._predict_proba(ds) - # Convert predictions to labels with threshold - predictions = self._get_threshold_pred(probabilities, threshold) - # Return decoded labels - return self._label_decode(predictions) - - def _predict_proba(self, ds): - print('_predict_proba') - if ds.count() > 0: - if len(ds.schema().names) > 1: - col_2_drop = [col for col in ds.schema().names if col != TENSOR_COLUMN_NAME] - ds = ds.drop_columns(col_2_drop) - - ds = ds.materialize() - - if self._nb_GPU > 0: - probabilities = self._predict_proba_GPU(ds) - else: - probabilities = self._predict_proba_CPU(ds) - - return probabilities - else: - raise ValueError('No data to predict') - - def _predict_proba_CPU(self, ds): - print('_predict_proba_CPU') - self._predictor = BatchPredictor.from_checkpoint( - self._model_ckpt, - TensorflowPredictor, - model_definition = lambda: build_model(self.classifier, self._nb_classes, self._nb_kmers) - ) - predictions = self._predictor.predict( - data = ds, - feature_columns = 
[TENSOR_COLUMN_NAME], - batch_size = self.batch_size, - ) - - probabilities = _unwrap_ndarray_object_type_if_needed(probabilities.to_pandas()['predictions']) - - return probabilities - - def _predict_proba_GPU(self, ds): - print('_predict_proba_GPU') - model = load_model(self._model_ckpt) - probabilities = [] - for batch in ds.iter_tf_batches(batch_size = self.batch_size): - probabilities.extend(model.predict(batch[TENSOR_COLUMN_NAME])) - - return probabilities - """ def _get_abs_pred(self, predictions): print('_get_abs_pred') return np.argmax(predictions, axis = 1) From b685c4c5d3bfbb0f59661b6fe7f26013ec97aa62 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Sat, 23 Dec 2023 20:38:11 -0500 Subject: [PATCH 91/92] scripts default options --- src/Caribou_classification.py | 2 +- src/Caribou_classification_train_cv.py | 2 +- src/Caribou_extraction.py | 2 +- src/Caribou_pipeline.py | 4 ++-- src/models/kerasTF/models.py | 17 ++--------------- 5 files changed, 7 insertions(+), 20 deletions(-) diff --git a/src/Caribou_classification.py b/src/Caribou_classification.py index 9a9473c..eef5813 100644 --- a/src/Caribou_classification.py +++ b/src/Caribou_classification.py @@ -108,7 +108,7 @@ def bacteria_classification(opt): # Optional datasets parser.add_argument('-v','--validation', default=None, type=Path, help='PATH to a npz file containing the k-mers profile for the validation dataset') # Parameters - parser.add_argument('-model','--model_type', default='lstm_attention', choices=['sgd','mnb','lstm_attention','cnn','widecnn'], help='The type of model to train') + parser.add_argument('-model','--model_type', default='sgd', choices=['sgd','mnb','lstm_attention','cnn','widecnn'], help='The type of model to train') parser.add_argument('-tx','--taxa', default=None, help='The taxonomic level to use for the classification, defaults to species. Can be one level or a list of levels separated by commas.') parser.add_argument('-bs','--batch_size', default=32, type=int, help='Size of the batch size to use, defaults to 32') parser.add_argument('-e','--training_epochs', default=100, type=int, help='The number of training iterations for the neural networks models if one ise chosen, defaults to 100') diff --git a/src/Caribou_classification_train_cv.py b/src/Caribou_classification_train_cv.py index f6d1422..09d957f 100644 --- a/src/Caribou_classification_train_cv.py +++ b/src/Caribou_classification_train_cv.py @@ -104,7 +104,7 @@ def bacteria_classification_train_cv(opt): parser.add_argument('-v','--validation', default=None, type=Path, help='PATH to a npz file containing the k-mers profile for the validation dataset') parser.add_argument('-t','--test', default=None, type=Path, help='PATH to a npz file containing the k-mers profile for the test dataset') # Parameters - parser.add_argument('-model','--model_type', default='lstm_attention', choices=['sgd','mnb','lstm_attention','cnn','widecnn'], help='The type of model to train') + parser.add_argument('-model','--model_type', required=True, choices=['sgd','mnb','lstm_attention','cnn','widecnn'], help='The type of model to train') parser.add_argument('-tx','--taxa', default=None, help='The taxonomic level to use for the classification, defaults to None. 
Can be one level or a list of levels separated by commas.') parser.add_argument('-bs','--batch_size', default=32, type=int, help='Size of the batch size to use, defaults to 32') parser.add_argument('-e','--training_epochs', default=100, type=int, help='The number of training iterations for the neural networks models if one ise chosen, defaults to 100') diff --git a/src/Caribou_extraction.py b/src/Caribou_extraction.py index 6228230..90ceebe 100644 --- a/src/Caribou_extraction.py +++ b/src/Caribou_extraction.py @@ -127,7 +127,7 @@ def bacteria_extraction(opt): parser.add_argument('-m','--merged', default=None, type=Path, help='PATH to a npz file containing the k-mers profile for the merged bacteria and host databases') parser.add_argument('-v','--validation', default=None, type=Path, help='PATH to a npz file containing the k-mers profile for the validation dataset') # Parameters - parser.add_argument('-model','--model_type', default=None, choices=[None,'onesvm','linearsvm','attention','lstm','deeplstm'], help='The type of model to train') + parser.add_argument('-model','--model_type', default='linearsvm', choices=[None,'onesvm','linearsvm','attention','lstm','deeplstm'], help='The type of model to train') parser.add_argument('-bs','--batch_size', default=32, type=int, help='Size of the batch size to use, defaults to 32') parser.add_argument('-e','--training_epochs', default=100, type=int, help='The number of training iterations for the neural networks models if one ise chosen, defaults to 100') parser.add_argument('-o','--outdir', required=True, type=Path, help='PATH to a directory on file where outputs will be saved') diff --git a/src/Caribou_pipeline.py b/src/Caribou_pipeline.py index a6fdb0b..97b42f8 100644 --- a/src/Caribou_pipeline.py +++ b/src/Caribou_pipeline.py @@ -43,8 +43,8 @@ def caribou(opt): # settings k_length = config.getint('settings', 'k', fallback = 35) cv = config.getboolean('settings', 'cross_validation', fallback = True) - binary_classifier = config.get('settings', 'host_extractor', fallback = 'attention') - multi_classifier = config.get('settings', 'bacteria_classifier', fallback = 'lstm_attention') + binary_classifier = config.get('settings', 'host_extractor', fallback = 'linearsvm') + multi_classifier = config.get('settings', 'bacteria_classifier', fallback = 'sgd') training_batch_size = config.getint('settings', 'training_batch_size', fallback = 32) training_epochs = config.getint('settings','neural_network_training_iterations', fallback = 100) classif_threshold = config.getfloat('settings', 'classification_threshold', fallback = 0.8) diff --git a/src/models/kerasTF/models.py b/src/models/kerasTF/models.py index c916afa..3bcea47 100644 --- a/src/models/kerasTF/models.py +++ b/src/models/kerasTF/models.py @@ -226,9 +226,6 @@ def predict_proba(self, ds, threshold = 0.8): def _predict_proba(self, ds): print('_predict_proba') if ds.count() > 0: - # if len(ds.schema().names) > 1: - # col_2_drop = [col for col in ds.schema().names if col != TENSOR_COLUMN_NAME] - # ds = ds.drop_columns(col_2_drop) ds = self._scaler.transform(ds) ds = ds.materialize() @@ -247,18 +244,6 @@ def predict_func(data): probabilities = ds.map_batches(predict_func, batch_format = 'numpy') probabilities = _unwrap_ndarray_object_type_if_needed(probabilities.to_pandas()['predictions']) - # self._predictor = BatchPredictor.from_checkpoint( - # self._model_ckpt, - # TensorflowPredictor, - # model_definition = lambda: build_model(self.classifier, self._nb_classes, self._nb_kmers) - # ) - # predictions = 
self._predictor.predict( - # data = ds, - # feature_columns = [TENSOR_COLUMN_NAME], - # batch_size = self.batch_size, - # ) - # probabilities = _unwrap_ndarray_object_type_if_needed(predictions.to_pandas()['predictions']) - return probabilities else: raise ValueError('No data to predict') @@ -314,6 +299,7 @@ def train_func_CPU(config): # local_shuffle_seed = int(np.random.randint(1,10000, size = 1)) ) # Training + # TODO: Move epochs to model.fit instead of in loop? history = model.fit( x = batch_train, validation_data = batch_val, @@ -371,6 +357,7 @@ def train_func_GPU(config): # local_shuffle_seed = int(np.random.randint(1,10000, size = 1)) ) # Training + # TODO: Move epochs to model.fit instead of in loop? history = model.fit( x = batch_train, validation_data = batch_val, From a7dd1ece9490c104b0c37e1d2cca91fcf5021084 Mon Sep 17 00:00:00 2001 From: Nicolas de Montigny Date: Sat, 23 Dec 2023 21:34:32 -0500 Subject: [PATCH 92/92] recurrent predictions debug --- src/Caribou_classification.py | 2 +- src/models/classification.py | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/Caribou_classification.py b/src/Caribou_classification.py index eef5813..1aae153 100644 --- a/src/Caribou_classification.py +++ b/src/Caribou_classification.py @@ -70,7 +70,7 @@ def bacteria_classification(opt): outdirs = outdirs, db_name = opt['database_name'], clf_multiclass = opt['model_type'], - taxa = 'domain', + taxa = lst_taxas, batch_size = opt['batch_size'], training_epochs = opt['training_epochs'], scaling = scaling diff --git a/src/models/classification.py b/src/models/classification.py index c164bf8..a965347 100644 --- a/src/models/classification.py +++ b/src/models/classification.py @@ -76,7 +76,7 @@ def fit(self, datasets): self._valid_assign_taxas() self._valid_classifier() tax_map = self._verify_model_trained() - + self._fit(datasets, tax_map) def predict(self, dataset): @@ -263,12 +263,12 @@ def _remove_unknown(self, ds, predict): 'ids' : ids, 'predictions' : predict }) - mapping = mapping[mapping['predictions'] != -1] + mapping = mapping[mapping['predictions'] != 'Unknown'] ids = mapping['ids'] predict = mapping['predictions'] def remove_unknown(df): - df = df[df['ids'].isin(ids)] + df = df[df['id'].isin(ids)] return df ds = ds.map_batches(remove_unknown, batch_format = 'pandas') @@ -351,6 +351,7 @@ def _valid_assign_taxas(self): self._taxas = [self._taxas] else: raise ValueError("Invalid taxa option, it must either be absent/None, be a list of taxas to extract or a string identifiying a taxa to extract") + self._valid_taxas() self._taxas = [taxa for taxa in self._database_data['taxas'] if taxa in self._taxas] self._taxas.reverse() @@ -443,7 +444,8 @@ def _save_dataset(self, ds, taxa): model = self._classifier_binary else: model = self._classifier_multiclass - file = os.path.join(self._outdirs['results'], f'data_classified_{model}_{taxa}.parquet') + file = os.path.join(self._outdirs['results_dir'], f'data_classified_{model}_{taxa}') + ds.write_parquet(file) return file \ No newline at end of file
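
The bagging introduced in PATCH 90 averages the class probabilities of every per-epoch checkpoint instead of keeping only the single best one. Stripped of the Ray plumbing, the idea reduces to the sketch below. It assumes the per-epoch weights can be restored as Keras models; `checkpoint_paths`, `X` and the -1 code for unclassified reads are illustrative stand-ins for the project's own checkpoint list, feature tensor and label mapping.

import numpy as np
import tensorflow as tf

def ensemble_predict_proba(checkpoint_paths, X, batch_size=128):
    # Average class probabilities over several saved models (simple bagging).
    proba_sum = None
    for path in checkpoint_paths:
        model = tf.keras.models.load_model(path)  # assumes Keras-loadable weights
        proba = model.predict(X, batch_size=batch_size, verbose=0)
        proba_sum = proba if proba_sum is None else proba_sum + proba
    return proba_sum / len(checkpoint_paths)

def threshold_decisions(proba, threshold=0.8):
    # Keep the argmax only when its averaged probability clears the threshold,
    # otherwise flag the read as unclassified (-1), in the spirit of the
    # threshold / 'Unknown' handling used above.
    best = np.argmax(proba, axis=1)
    best_p = proba[np.arange(len(proba)), best]
    return np.where(best_p >= threshold, best, -1)

Averaging over the per-epoch checkpoints acts as a cheap ensemble and smooths out the epoch-to-epoch variance that the one-epoch-at-a-time training loop produces.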