ramosrenzo
diff --git a/‎AAM/.ipynb_checkpoints/model-checkpoint.py
+374 b/‎AAM/.ipynb_checkpoints/model-checkpoint.py
+374
diff --git a/‎AAM/.ipynb_checkpoints/plot_auroc_auprc-checkpoint.py
+59 b/‎AAM/.ipynb_checkpoints/plot_auroc_auprc-checkpoint.py
+59
diff --git a/‎AAM/.ipynb_checkpoints/test-checkpoint.py
+103 b/‎AAM/.ipynb_checkpoints/test-checkpoint.py
+103
diff --git a/‎AAM/.ipynb_checkpoints/training-checkpoint.py
+170 b/‎AAM/.ipynb_checkpoints/training-checkpoint.py
+170
diff --git a/‎AAM/__pycache__/model.cpython-39.pyc
13 KB b/‎AAM/__pycache__/model.cpython-39.pyc
13 KB
diff --git a/‎AAM/__pycache__/plot_auroc_auprc.cpython-39.pyc
2.03 KB b/‎AAM/__pycache__/plot_auroc_auprc.cpython-39.pyc
2.03 KB
diff --git a/‎AAM/__pycache__/test.cpython-39.pyc
3.39 KB b/‎AAM/__pycache__/test.cpython-39.pyc
3.39 KB
diff --git a/‎AAM/__pycache__/training.cpython-39.pyc
4.44 KB b/‎AAM/__pycache__/training.cpython-39.pyc
4.44 KB
diff --git a/‎AAM/training.py
+7-7 b/‎AAM/training.py
+7-7
@@ -0,0 +1,59 @@
+import seaborn as sns
+from sklearn.metrics import roc_curve, auc, precision_recall_curve
+import matplotlib.pyplot as plt
+import numpy as np
+
+def _style_curve_axes(x_label, y_label):
+    """Apply the shared labels, 0-1 tick grid and legend to the current subplot."""
+    plt.xlabel(x_label)
+    plt.ylabel(y_label)
+    # Major ticks every 0.25 and minor ticks every 0.125 on both axes.
+    for step, minor in ((0.25, False), (0.125, True)):
+        ticks = np.arange(0.0, 1.1, step)
+        plt.xticks(ticks, minor=minor)
+        plt.yticks(ticks, minor=minor)
+    # Zero-length minor tick marks: only their grid lines remain visible.
+    plt.tick_params(which="minor", length=0)
+    plt.grid(True, linestyle="-", alpha=0.4)
+    plt.grid(True, which="minor", linestyle="-", alpha=0.4)
+    legend = plt.legend(title="Sample types", framealpha=1, facecolor="white", edgecolor="none", labelspacing=1.3, fontsize="medium")
+    # Left-align the legend contents through the legend's (private) box object.
+    legend._legend_box.align = "left"
+
+
+def plot_auroc_auprc(nares_predictions, forehead_predictions, stool_predictions, inside_floor_predictions):
+    """Plot ROC and precision-recall curves per sample type and save the figure.
+
+    Each argument is indexed with ``[0]`` and that element is unpacked as a
+    ``(y_pred, y_true)`` pair, so every argument must be a sequence whose
+    first item is such a pair.
+
+    The left subplot shows one ROC curve per sample type (legend reports the
+    AUROC), the right subplot one precision-recall curve per sample type
+    (legend reports the AUPRC). The figure is written to
+    'figures/auroc_auprc_aam.png'; the 'figures' directory is created if it
+    is missing. The figure is left open after saving. Returns None.
+    """
+    import os
+
+    sample_data = {
+        "Nares": nares_predictions[0],
+        "Forehead": forehead_predictions[0],
+        "Stool": stool_predictions[0],
+        "Inside floor": inside_floor_predictions[0],
+    }
+
+    # set up: one colour per sample type, in the insertion order of sample_data
+    palette = ["#dc9766", "#d32f88", "#914f1f", "#bf64d7"]
+    colors = sns.color_palette(palette)
+    plt.figure(figsize=(10, 5))
+
+    # AUROC
+    plt.subplot(1, 2, 1)
+    for (sample, (y_pred, y_true)), color in zip(sample_data.items(), colors):
+        fpr, tpr, _ = roc_curve(y_true, y_pred)
+        roc_auc = auc(fpr, tpr)
+        plt.plot(fpr, tpr, color=color, label=f"{sample}: AUROC={roc_auc:.2f}")
+    _style_curve_axes("1 - Specificity", "Sensitivity")
+
+    # AUPRC
+    plt.subplot(1, 2, 2)
+    for (sample, (y_pred, y_true)), color in zip(sample_data.items(), colors):
+        precision, recall, _ = precision_recall_curve(y_true, y_pred)
+        pr_auc = auc(recall, precision)
+        plt.plot(recall, precision, color=color, label=f"{sample}: AUPRC={pr_auc:.2f}")
+    _style_curve_axes("Recall", "Precision")
+
+    # adjust layout
+    plt.tight_layout()
+    plt.subplots_adjust(wspace=0.3)
+
+    # savefig does not create missing parent directories, so make sure it exists.
+    output_fp = 'figures/auroc_auprc_aam.png'
+    os.makedirs(os.path.dirname(output_fp), exist_ok=True)
+    plt.savefig(output_fp)
@@ -0,0 +1,103 @@
+from aam.models.sequence_regressor import SequenceRegressor
+from aam.models.sequence_regressor_v2 import SequenceRegressorV2
+from aam.callbacks import SaveModel
+from keras.callbacks import EarlyStopping
+from AAM.model import GeneratorDataset
+from AAM.model import Classifier
+
+import tensorflow as tf
+import pandas as pd
+import numpy as np
+import seaborn as sns
+from sklearn.metrics import roc_curve, auc, precision_recall_curve
+import matplotlib.pyplot as plt
+from sklearn.model_selection import train_test_split, StratifiedKFold
+import biom
+from biom import Table, load_table
+import os
+import sys
+import warnings
+
+# Enable on-demand memory growth on the first GPU TensorFlow reports, if any.
+gpus = tf.config.list_physical_devices("GPU")
+if gpus:
+    tf.config.experimental.set_memory_growth(gpus[0], True)
+
+# Suppress every Python warning from this point on.
+warnings.filterwarnings('ignore')
+
+# Short alias for the Keras namespace.
+K = tf.keras
+
+def get_sample_type(file_path):
+    """Return the sample type encoded in a ``test_metadata_<type>.<ext>`` file name.
+
+    Only the base name of ``file_path`` is inspected. When it does not start
+    with 'test_metadata_', the string "Unknown" is returned instead.
+    """
+    prefix = 'test_metadata_'
+    filename = os.path.basename(file_path)
+    if not filename.startswith(prefix):
+        return "Unknown"
+    # Remove the 'test_metadata_' prefix and the file extension
+    return os.path.splitext(filename[len(prefix):])[0]
+
+def _stack_predictions(model, datasets):
+    """Run ``model`` over each dataset and horizontally stack predictions and labels."""
+    y_pred, y_true = [], []
+    for ds in datasets:
+        # predict() output is unpacked as three items; the third is not used.
+        y_p, y_t, _ = model.predict(ds, steps=ds.steps_per_epoch)
+        y_pred.append(y_p)
+        y_true.append(y_t)
+    return np.hstack(y_pred), np.hstack(y_true)
+
+
+def test_model(test_fp, model_fp, ensemble=False):
+    """Run saved model(s) on a test metadata file and return stacked predictions.
+
+    Args:
+        test_fp: Path to a tab-separated metadata file (first column is the
+            index) containing the 'study_sample_type' and 'has_covid'
+            columns. The sample type is derived from the file name via
+            ``get_sample_type``.
+        model_fp: If the string contains '.keras', it is loaded as a single
+            model. Otherwise it is treated as a directory holding
+            ``<sample_type>_<i>_model.keras`` for i in 0..4, and the five
+            models' outputs are averaged. If the string contains 'large',
+            the large ASV embedding file is used.
+        ensemble: Not read by this function; the branch is chosen from
+            ``model_fp``. Kept so existing callers keep working.
+
+    Returns:
+        ``((y_pred, y_true), auc_score)``, where ``auc_score`` is always 0.
+    """
+    sample_type = get_sample_type(test_fp)
+    test_metadata = pd.read_csv(test_fp, sep='\t', index_col=0)
+    y_test = test_metadata[['study_sample_type', 'has_covid']]
+
+    rarefy_depth = 4000 if sample_type == 'stool' else 1000
+
+    if 'large' in model_fp:
+        sequence_embeddings = 'data/input/asv_embeddings_large.npy'
+    else:
+        sequence_embeddings = 'data/input/asv_embeddings_aam.npy'
+
+    # One dataset per rarefaction seed (42, 43, ..., 110).
+    gd_test = [
+        GeneratorDataset(
+            table='data/input/merged_biom_table.biom',
+            metadata=y_test,
+            metadata_column='has_covid',
+            shuffle=False,
+            is_categorical=False,
+            shift=0,
+            rarefy_depth=rarefy_depth,
+            scale=1,
+            batch_size=32,
+            epochs=1,
+            sequence_embeddings=sequence_embeddings,
+            sequence_labels='data/input/asv_embeddings_ids.npy',
+            upsample=False,
+            drop_remainder=False,
+            gen_new_table_frequency=1,
+            rarefy_seed=42 + i,
+        )
+        for i in range(69)
+    ]
+
+    if '.keras' in model_fp:  # Test on One Model
+        model = tf.keras.models.load_model(model_fp, compile=False)
+        y_pred, y_true = _stack_predictions(model, gd_test)
+    else:  # Ensemble Method
+        # Load every fold model up front so a missing file fails before any prediction runs.
+        models = [tf.keras.models.load_model(f'{model_fp}/{sample_type}_{i}_model.keras', compile=False) for i in range(5)]
+        per_model = [_stack_predictions(model, gd_test) for model in models]
+        # Average across models (axis 0 indexes the model after vstack).
+        y_pred = np.vstack([pred for pred, _ in per_model]).mean(axis=0)
+        y_true = np.vstack([true for _, true in per_model]).mean(axis=0)
+
+    # No score is computed here; 0 is returned as a placeholder.
+    auc_score = 0
+    return (y_pred, y_true), auc_score
@@ -0,0 +1,170 @@
+from aam.models.sequence_regressor import SequenceRegressor
+from aam.models.sequence_regressor_v2 import SequenceRegressorV2
+from aam.callbacks import SaveModel
+from keras.callbacks import EarlyStopping
+from AAM.model import GeneratorDataset
+from AAM.model import Classifier
+
+import tensorflow as tf
+import pandas as pd
+import numpy as np
+import seaborn as sns
+from sklearn.metrics import roc_curve, auc, precision_recall_curve
+import matplotlib.pyplot as plt
+from sklearn.model_selection import train_test_split, StratifiedKFold
+import biom
+from biom import Table, load_table
+import os
+import sys
+import warnings
+
+# Enable on-demand memory growth on the first GPU TensorFlow reports, if any.
+gpus = tf.config.list_physical_devices("GPU")
+if gpus:
+    tf.config.experimental.set_memory_growth(gpus[0], True)
+
+# Suppress every Python warning from this point on.
+warnings.filterwarnings('ignore')
+
+# Short alias for the Keras namespace.
+K = tf.keras
+
+def get_sample_type(file_path):
+    """Extract the sample type from a ``training_metadata_<type>.<ext>`` file name.
+
+    Looks only at the base name of ``file_path``; returns "Unknown" when the
+    base name does not begin with 'training_metadata_'.
+    """
+    prefix = 'training_metadata_'
+    base_name = os.path.basename(file_path)
+    if not base_name.startswith(prefix):
+        return "Unknown"
+    # What follows the prefix, minus its file extension, is the sample type.
+    stem, _extension = os.path.splitext(base_name[len(prefix):])
+    return stem
+    
+def _cosine_schedule(learning_rate):
+    """Build the CosineDecay learning-rate schedule used by both optimizers."""
+    return tf.keras.optimizers.schedules.CosineDecay(
+        initial_learning_rate=0.0,
+        warmup_target=learning_rate,  # maybe change
+        warmup_steps=0,
+        decay_steps=250000,
+    )
+
+
+#function that creates training and valid split and trains each model
+def train_model(train_fp, opt_type, hidden_dim, num_hidden_layers, dropout_rate, learning_rate, beta_1=None, beta_2=None, weight_decay=None, momentum=None, model_fp=None, large=True, use_cova=False):
+    """Train one model per stratified fold and save each fold plus the best one.
+
+    The metadata at ``train_fp`` is split into 5 stratified folds on
+    'has_covid' (shuffled, random_state=42). For every fold a model is
+    built or loaded, trained with early stopping, and saved together with a
+    loss plot under ``trained_models_aam/<sample_type>/``. After all folds,
+    the model whose fold reached the lowest validation loss is saved as
+    ``<sample_type>_best_model.keras``.
+
+    Args:
+        train_fp: Tab-separated metadata file (first column is the index)
+            with 'study_sample_type' and 'has_covid' columns. The sample
+            type is derived from the file name via ``get_sample_type``.
+        opt_type: 'adam' selects Adam (with ``use_ema=True``); any other
+            value selects the legacy SGD optimizer.
+        hidden_dim, num_hidden_layers, dropout_rate, use_cova: Forwarded to
+            ``Classifier`` when ``model_fp`` is None.
+        learning_rate: ``warmup_target`` of the cosine-decay schedule.
+        beta_1, beta_2, weight_decay: Adam hyperparameters. A value of None
+            is not forwarded, so the optimizer's own default applies.
+        momentum: SGD momentum. None is not forwarded, so the optimizer's
+            own default applies.
+        model_fp: Optional path of a saved model to load for every fold
+            instead of constructing a new ``Classifier``.
+        large: Truthy selects the 'large' ASV embeddings (dimension 512),
+            otherwise the 'aam' embeddings (dimension 256).
+
+    Returns:
+        None. Results are written to disk and a summary line is printed.
+    """
+    training_metadata = pd.read_csv(train_fp, sep='\t', index_col=0)
+    y = training_metadata[['study_sample_type', 'has_covid']]
+    sample_type = get_sample_type(train_fp)
+    dir_path = f'trained_models_aam/{sample_type}'
+    os.makedirs(dir_path, exist_ok=True)
+
+    if large:
+        sequence_embedding_fp = 'data/input/asv_embeddings_large.npy'
+        sequence_embedding_dim = 512
+    else:
+        sequence_embedding_fp = 'data/input/asv_embeddings_aam.npy'
+        sequence_embedding_dim = 256
+
+    # Depends only on the sample type, so it is computed once for all folds.
+    rarefy_depth = 4000 if sample_type == 'stool' else 1000
+
+    # Keyword arguments shared by the training and validation datasets.
+    dataset_kwargs = dict(
+        table='data/input/merged_biom_table.biom',
+        metadata_column='has_covid',
+        is_categorical=False,
+        shift=0,
+        rarefy_depth=rarefy_depth,
+        scale=1,
+        epochs=100000,
+        batch_size=4,
+        sequence_embeddings=sequence_embedding_fp,
+        sequence_labels='data/input/asv_embeddings_ids.npy',
+        upsample=False,
+        drop_remainder=False,
+    )
+
+    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
+
+    curr_best_val_loss = np.inf
+    curr_best_model = None
+    for i, (train_index, valid_index) in enumerate(skf.split(y, y['has_covid'])):
+        y_train = y.iloc[train_index]
+        y_valid = y.iloc[valid_index]
+
+        dataset_train = GeneratorDataset(
+            metadata=y_train,
+            shuffle=True,
+            gen_new_tables=True,  # only in training dataset
+            **dataset_kwargs,
+        )
+        dataset_valid = GeneratorDataset(
+            metadata=y_valid,
+            shuffle=False,
+            rarefy_seed=42,
+            **dataset_kwargs,
+        )
+
+        if model_fp is None:
+            model = Classifier(hidden_dim=hidden_dim, num_hidden_layers=num_hidden_layers, dropout_rate=dropout_rate, use_cova=use_cova)
+        else:
+            model = tf.keras.models.load_model(model_fp, compile=False)
+        token_shape = tf.TensorShape([None, sequence_embedding_dim])
+        batch_indicies = tf.TensorShape([None, 2])
+        indicies_shape = tf.TensorShape([None])
+        count_shape = tf.TensorShape([None, 1])
+        model.build([token_shape, batch_indicies, indicies_shape, count_shape])
+        model.summary()
+
+        schedule = _cosine_schedule(learning_rate)
+        if opt_type == 'adam':
+            # Forward only the hyperparameters the caller actually supplied;
+            # None is this function's "not given" marker, not a usable value.
+            adam_kwargs = {
+                name: value
+                for name, value in (('beta_1', beta_1), ('beta_2', beta_2), ('weight_decay', weight_decay))
+                if value is not None
+            }
+            optimizer = tf.keras.optimizers.Adam(learning_rate=schedule, use_ema=True, **adam_kwargs)
+            early_stop = EarlyStopping(patience=250, start_from_epoch=250, restore_best_weights=False)
+        else:
+            sgd_kwargs = {} if momentum is None else {'momentum': momentum}
+            optimizer = tf.keras.optimizers.legacy.SGD(learning_rate=schedule, **sgd_kwargs)
+            early_stop = EarlyStopping(patience=250, start_from_epoch=250, restore_best_weights=True)
+
+        model.compile(optimizer=optimizer, run_eagerly=False)
+        #switch loss to val loss
+        #pass early stopping for callbacks
+        history = model.fit(
+            dataset_train,
+            validation_data=dataset_valid,
+            validation_steps=dataset_valid.steps_per_epoch,
+            epochs=10000,
+            steps_per_epoch=dataset_train.steps_per_epoch,
+            callbacks=[early_stop],
+        )
+
+        if opt_type == 'adam':
+            # Called only for the Adam branch, which was built with use_ema=True.
+            model.optimizer.finalize_variable_values(model.trainable_variables)
+
+        validation_loss = history.history['val_loss']
+        train_loss = history.history['loss']
+        epochs = np.arange(len(validation_loss))
+
+        # Track the model of the fold with the lowest validation loss so far.
+        min_val_loss = np.min(validation_loss)
+        if min_val_loss < curr_best_val_loss:
+            curr_best_model = model
+            curr_best_val_loss = min_val_loss
+
+        plt.plot(epochs, validation_loss, color='blue')
+        plt.title(f'Validation Loss Per Epoch, Best: {curr_best_val_loss} Final: {min_val_loss}')
+        plt.plot(epochs, train_loss, color='red')
+        plt.savefig(os.path.join(dir_path, f'{sample_type}_{i}_model_loss.png'))
+        plt.close()
+        model.save(os.path.join(dir_path, f'{sample_type}_{i}_model.keras'), save_format='keras')
+    curr_best_model.save(os.path.join(dir_path, f'{sample_type}_best_model.keras'), save_format='keras')
+    print(f"\nAAM: Best model saved for {sample_type} samples {opt_type}.")
@@ -27,13 +27,13 @@
 K = tf.keras
 
 def get_sample_type(file_path):
-            filename = os.path.basename(file_path)
-            # Remove the 'training_metadata_' prefix and the file extension
-            if filename.startswith('training_metadata_'):
-                sample_type = filename[len('training_metadata_'):]
-                sample_type = os.path.splitext(sample_type)[0]
-                return sample_type
-            return "Unknown"
+    # Strip the 'training_metadata_' prefix and the file extension from the base name
+    prefix = 'training_metadata_'
+    filename = os.path.basename(file_path)
+    if not filename.startswith(prefix):
+        return "Unknown"
+    sample_type, _extension = os.path.splitext(filename[len(prefix):])
+    return sample_type
 
 #function that creates training and valid split and trains each model
 def train_model(train_fp, opt_type, hidden_dim, num_hidden_layers, dropout_rate, learning_rate, beta_1=None, beta_2=None, weight_decay=None, momentum=None, model_fp=None, large=True, use_cova=False):