Commit
Merge pull request #2 from ivichadriana/main
adding test file for workflow to run properly
ivichadriana authored Aug 12, 2024
2 parents 2e445e6 + 96a92e9 commit ebff1bc
Showing 10 changed files with 1,727 additions and 33 deletions.
1,490 changes: 1,490 additions & 0 deletions docs/notebooks/hyperparameter_grid_search.ipynb

Large diffs are not rendered by default.

5 changes: 4 additions & 1 deletion pyproject.toml
@@ -20,8 +20,11 @@ urls.Source = "https://github.com/ivichadriana/single_translator_VAE"
urls.Home-page = "https://github.com/ivichadriana/single_translator_VAE"
dependencies = [
"anndata",
# for debug logging (referenced from the issue template)
"session-info",
"scvi",
"scvi-tools",
"scanpy",
"openTSNE"
]

[project.optional-dependencies]
1 change: 1 addition & 0 deletions src/single_translator_VAE/pl/__init__.py
@@ -1,5 +1,6 @@
from .plot import (
deconvolution_results,
deconvolution_tsne,
estimated_vs_real_proportions,
loss_plots,
pca_of_augmentation,
80 changes: 77 additions & 3 deletions src/single_translator_VAE/pl/plot.py
@@ -5,10 +5,79 @@
import seaborn as sns
import torch
from matplotlib.patches import Patch
from openTSNE import TSNEEmbedding, affinity, initialization
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


def deconvolution_tsne(normalized_pseudo_df, ref_df):
"""
Generate t-SNE plots of reference data frames projected into the t-SNE space of normalized pseudobulks.

This function visualizes the alignment of reference cell type expression profiles with pseudobulk data
using t-Distributed Stochastic Neighbor Embedding (t-SNE). The function projects both the normalized
pseudobulk data and the reference data into a shared t-SNE space and plots the results for comparison.

Parameters
----------
normalized_pseudo_df : pandas.DataFrame
DataFrame containing the normalized pseudobulk expression data. Each row represents a sample,
and each column represents a gene.
ref_df : pandas.DataFrame
DataFrame containing the reference expression data. Each row represents a gene, and each column
represents a cell type or sample.

Returns
-------
None
The function generates and displays a t-SNE plot comparing the reference and pseudobulk data.
The plot is displayed using matplotlib.

Notes
-----
- The t-SNE embedding is optimized on the pseudobulk data; the reference data is then projected into it.
- The PerplexityBasedNN method is used to compute the affinities, and PCA is used for initial embedding.
- The function uses a high perplexity value of 60 to capture the global structure of the data.
- The reference data is transformed into the pseudobulk t-SNE space for visualization.
"""
# t-SNE plot of the reference data projected into the normalized-pseudobulk t-SNE space
# figure
plt.figure(figsize=(12, 5))
# input data: pseudobulks as the fitted (train) set, transposed reference as the projected (test) set
x_train = normalized_pseudo_df.values
x_test = ref_df.T.values

# Compute the affinities between data points
affinities_train = affinity.PerplexityBasedNN(
x_train,
perplexity=60,
metric="euclidean",
n_jobs=8,
random_state=42,
verbose=True,
)
# initialize embedding coordinates with PCA
init_train = initialization.pca(x_train, random_state=42)
embedding_train = TSNEEmbedding(
init_train,
affinities_train,
negative_gradient_method="fft",
n_jobs=8,
verbose=True,
)
# optimize embedding
embedding_train = embedding_train.optimize(n_iter=500)
# project both datasets into the training embedding
tsne_train = embedding_train.transform(x_train)
tsne_test = embedding_train.transform(x_test)
plt.scatter(tsne_train[:, 0], tsne_train[:, 1], label="Pseudobulks tSNE")
plt.scatter(tsne_test[:, 0], tsne_test[:, 1], label="Reference tSNE")
plt.title("Reference in Pseudobulks TSNE")
plt.xlabel("tSNE 1")
plt.ylabel("tSNE 2")
plt.legend()
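
A minimal usage sketch for this new helper, assuming the DataFrame orientations described in the docstring (pseudobulks: samples x genes; reference: genes x cell types); the input file names are hypothetical, and plt.show() is added because the function only builds the figure:

import pandas as pd
import matplotlib.pyplot as plt
from single_translator_VAE.pl import deconvolution_tsne

# hypothetical files; any samples-x-genes pseudobulk table and genes-x-(cell types) reference will do
normalized_pseudo_df = pd.read_csv("pseudobulks_normalized.csv", index_col=0)
ref_df = pd.read_csv("reference_profiles.csv", index_col=0)
deconvolution_tsne(normalized_pseudo_df, ref_df)  # fits t-SNE on the pseudobulks, projects the reference into it
plt.show()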


def loss_plots(
loss_to_plot: Optional[str] = "summed",
training_losses: np.ndarray = None,
@@ -17,7 +86,6 @@ def loss_plots(
all_val_losses: Optional[dict] = None,
):
"""
Plot the training loss over epochs.
Parameters
@@ -539,22 +607,28 @@ def deconvolution_results(evaluation_results):
rmse_values = [evaluation_results[ref]["RMSE"] for ref in ref_names]
correlation_values = [evaluation_results[ref]["Correlation"] for ref in ref_names]

# Define custom colors for each reference
colors = ["red", "blue", "pink"]
palette = dict(zip(ref_names, colors))

# Plot RMSE
plt.figure(figsize=(10, 5))
sns.barplot(x=ref_names, y=rmse_values, palette="viridis")
sns.barplot(x=ref_names, y=rmse_values, palette=palette, hue=ref_names, dodge=False, alpha=0.6)
plt.title("RMSE for Each Reference")
plt.ylabel("RMSE")
plt.xlabel("Reference")
plt.ylim(0, max(rmse_values) * 1.1)
plt.legend([], [], frameon=False) # Hides the legend
plt.show()

# Plot Pearson's Correlation
plt.figure(figsize=(10, 5))
sns.barplot(x=ref_names, y=correlation_values, palette="plasma")
sns.barplot(x=ref_names, y=correlation_values, palette=palette, hue=ref_names, dodge=False, alpha=0.6)
plt.title("Pearson's Correlation for Each Reference")
plt.ylabel("Correlation")
plt.xlabel("Reference")
plt.ylim(0, 1)
plt.legend([], [], frameon=False) # Hides the legend
plt.show()
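
For context, a minimal sketch of how deconvolution_results might be called; the structure of evaluation_results is inferred from the lines above (a dict keyed by reference name, each holding "RMSE" and "Correlation"), and the names and numbers below are made up:

from single_translator_VAE.pl import deconvolution_results

# illustrative values only; real entries come from evaluating each reference against known proportions
evaluation_results = {
    "reference_A": {"RMSE": 0.12, "Correlation": 0.91},
    "reference_B": {"RMSE": 0.18, "Correlation": 0.84},
    "reference_C": {"RMSE": 0.25, "Correlation": 0.70},
}
deconvolution_results(evaluation_results)  # draws the two bar charts defined above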


4 changes: 2 additions & 2 deletions src/single_translator_VAE/pp/single_cell_preprocessing.py
@@ -307,8 +307,8 @@ def match_cell_types(adata1: AnnData, adata2: AnnData, cells_to_keep: np.array):
AnnData with same cell types.
"""
# match cells:
adata1 = adata1[adata1.obs["cell_types"].isin(cells_to_keep)]
adata2 = adata2[adata2.obs["cell_types"].isin(cells_to_keep)]
adata1 = adata1[adata1.obs["cell_types"].isin(cells_to_keep)].copy()
adata2 = adata2[adata2.obs["cell_types"].isin(cells_to_keep)].copy()

return adata1, adata2

1 change: 1 addition & 0 deletions src/single_translator_VAE/tl/__init__.py
@@ -1,5 +1,6 @@
from .single_cell_tools import (
augment_data,
calc_nnls,
calculate_rmse,
calculate_rmse_per_celltype,
evaluate_deconvolution_references,
54 changes: 40 additions & 14 deletions src/single_translator_VAE/tl/single_cell_tools.py
@@ -6,10 +6,12 @@
import pandas as pd
import ray as ray
import scanpy as sc
import sklearn as sk
import torch
import torch.nn
from scipy.optimize import nnls
from scipy.stats import poisson
from sklearn.preprocessing import MinMaxScaler

import single_translator_VAE as sv

@@ -51,53 +53,76 @@ def prep_files_for_deconvolution(bulks, reference, path, file_name, deconvolutio
"""
genes = reference.var.index
cell_types = reference.obs.cell_types.unique()
reference = reference.X.toarray()

method_path = os.path.join(path, deconvolution_method, "input_data")
if not os.path.exists(method_path):
os.makedirs(method_path)

file_paths = {}
files = {}
match deconvolution_method:
case "cibersortx":
reference_data = np.vstack([np.append("gene_ids", cell_types), np.column_stack([genes, reference])])
ref_file_path = os.path.join(method_path, f"{file_name}_reference.txt")
np.savetxt(ref_file_path, reference_data, fmt="%s", delimiter="\t")
file_paths["reference"] = ref_file_path
files["reference"] = ref_file_path

if bulks is not None:
bulks_data = np.vstack(
[np.append("gene_ids", np.arange(bulks.shape[1])), np.column_stack([genes, bulks])]
)
bulks_file_path = os.path.join(method_path, f"{file_name}_bulks.txt")
np.savetxt(bulks_file_path, bulks_data, fmt="%s", delimiter="\t")
file_paths["bulks"] = bulks_file_path
files["bulks"] = bulks_file_path

case "nnls":
reference_data = np.hstack([np.array([["gene_ids"] + list(genes)]).T, np.vstack([cell_types, reference]).T])
# Making cell type reference, then scaling
ref_raw = pd.DataFrame(index=genes, columns=cell_types)
for cell_type in cell_types:
cell_df = reference[reference.obs["cell_types"].isin([cell_type])].X.toarray()
cell_sample = sk.utils.resample(cell_df, n_samples=10000, replace=True)
ref_raw[cell_type] = cell_sample.sum(axis=0)

# clipping to the 95th percentile before scaling
clip_upper = np.quantile(ref_raw.values, 0.95)
ref_raw_val = np.clip(ref_raw.values, 0, clip_upper)

# and scaling values to [0, 1] for NNLS
scaler = MinMaxScaler()
scaler.fit(ref_raw_val)
ref_raw_val = scaler.transform(ref_raw_val)
reference_data = pd.DataFrame(ref_raw_val, index=genes, columns=cell_types)

ref_file_path = os.path.join(method_path, f"{file_name}_reference.txt")
np.savetxt(ref_file_path, reference_data, fmt="%s", delimiter="\t")
file_paths["reference"] = ref_file_path
files["reference"] = reference_data

if bulks is not None:
bulks_data = np.hstack(
[np.array([["gene_ids"] + list(genes)]).T, np.vstack([np.arange(bulks.shape[1]), bulks]).T]
)
# clipping to the 95th percentile before scaling
clip_upper = np.quantile(bulks.values, 0.95)
pseudo_df = np.clip(bulks.values, 0, clip_upper)
# and normalize to values between 0 and 1
scaler = MinMaxScaler()
scaler.fit(pseudo_df)
normalized_pseudo_df = scaler.transform(pseudo_df)
bulks_data = pd.DataFrame(normalized_pseudo_df, columns=genes)

bulks_file_path = os.path.join(method_path, f"{file_name}_bulks.txt")
np.savetxt(bulks_file_path, bulks_data, fmt="%s", delimiter="\t")
file_paths["bulks"] = bulks_file_path
files["bulks"] = bulks_data

case "bayesprism":
reference_df = pd.DataFrame(reference, index=genes, columns=cell_types)
ref_file_path = os.path.join(method_path, f"{file_name}_reference.csv")
reference_df.to_csv(ref_file_path, index_label=False)
file_paths["reference"] = ref_file_path
files["reference"] = ref_file_path

if bulks is not None:
bulks_df = pd.DataFrame(bulks, index=genes, columns=np.arange(bulks.shape[1]))
bulks_file_path = os.path.join(method_path, f"{file_name}_bulks.csv")
bulks_df.to_csv(bulks_file_path, index_label=False)
file_paths["bulks"] = bulks_file_path
files["bulks"] = bulks_file_path

return file_paths
return files
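
A brief sketch of how the scaled NNLS inputs returned above might be consumed, assuming bulks were provided so that files["bulks"] exists (genes x cell-types reference, samples x genes bulks, as constructed in the "nnls" branch); the loop and variable names are illustrative:

import numpy as np
from scipy.optimize import nnls

ref = files["reference"].values      # genes x cell types, scaled to [0, 1]
bulk_mat = files["bulks"].values     # samples x genes, scaled to [0, 1]
proportions = np.zeros((bulk_mat.shape[0], ref.shape[1]))
for i, bulk in enumerate(bulk_mat):
    coef, _ = nnls(ref, bulk)        # non-negative least squares per pseudobulk
    proportions[i] = coef / coef.sum() if coef.sum() > 0 else coef  # normalize to cell-type proportions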


def make_pseudobulks(adata, number_of_bulks, num_cells, prop_type, noise):
@@ -133,6 +158,7 @@ def make_pseudobulks(adata, number_of_bulks, num_cells, prop_type, noise):
raise ValueError("prop_type must be either 'random' or 'real'")

cell_counts = (prop_vector * num_cells).astype(int)

while np.any(cell_counts == 0):
prop_vector = np.random.dirichlet(np.ones(len(cell_types)))
cell_counts = (prop_vector * num_cells).astype(int)
@@ -446,7 +472,7 @@ def calc_nnls(all_refs, prop_df, pseudo_df):

# Extracting reference matrix and verifying its integrity
ref = ref_df.values
if ref.shape[0] != prop_df.shape[1]:
if ref.shape[1] != prop_df.shape[1]:
raise ValueError(f"Reference '{exp}' and prop_df have inconsistent dimensions.")

# Calculate predicted values and residuals for each row
29 changes: 26 additions & 3 deletions src/single_translator_VAE/vae/vae_model.py
@@ -86,6 +86,7 @@ def __init__(
encode_batch: Tunable[bool] = False,
transform_batch: torch.Tensor | None = None,
batch_representation: str = "one-hot",
class_weights: bool = True,
# kl_annealing_start: Tunable[float] = 10.0,
# kl_annealing_steepness: Tunable[float] = 0.5,
**model_kwargs,
@@ -112,11 +113,15 @@
self.encode_batch = encode_batch
self.transform_batch = transform_batch
self.batch_representation = batch_representation
self.class_weights = class_weights

library_log_means, library_log_vars = _init_library_size(self.adata_manager, self.summary_stats["n_batch"])

# self.summary_stats provides information about anndata dimensions and other tensor info

if self.class_weights:
self.class_weights_tensor = self.calculate_class_weighting()

self.module = VAEModule(
n_input=self.summary_stats["n_vars"],
n_hidden=self.n_hidden,
@@ -135,6 +140,8 @@
encode_batch=self.encode_batch,
transform_batch=self.transform_batch,
batch_representation=self.batch_representation,
class_weights=self.class_weights,
class_weights_tensor=self.class_weights_tensor if self.class_weights else None,
**model_kwargs,
)
self._model_summary_string = (
@@ -150,6 +157,22 @@
tl["index"], tl["label"] = range(0, len(self.adata.obs)), self.adata.obs.labels_key.values
self.train_labels = tl.copy()

def calculate_class_weighting(self):
"""
Calculates class weights based on the inverse frequency of each class in the dataset.

Returns
-------
torch.Tensor
A tensor containing the weights for each class.
"""
labels = self.adata.obs["labels_key"]
unique_labels, counts = np.unique(labels, return_counts=True)
total_samples = len(labels)
class_weights = total_samples / (len(unique_labels) * counts)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float32)
return class_weights_tensor
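
As a quick check of the formula above with made-up counts: for 90 cells carrying one label and 10 carrying another, total_samples = 100 and len(unique_labels) = 2, giving weights 100 / (2 * 90) ≈ 0.56 and 100 / (2 * 10) = 5.0, so the rarer class is up-weighted roughly ninefold.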

@classmethod
@setup_anndata_dsp.dedent
def setup_anndata(
@@ -272,8 +295,8 @@ def train(
max_epochs: int = 500,
use_gpu: bool = False,
validation_size: float = 0.2,
patience: int = 15,
min_delta: float = 0.1,
patience: int = 18,
min_delta: float = 0.35,
):
"""
Train the model with validation support and a progress bar, with early stopping.
@@ -317,7 +340,7 @@
optimizer = Adam(self.module.parameters(), lr=self.lr, weight_decay=self.weight_decay)

# Initialize learning rate scheduler
scheduler = ReduceLROnPlateau(optimizer, "min", patience=10, factor=0.5, min_lr=1e-5)
scheduler = ReduceLROnPlateau(optimizer, "min", patience=20, factor=0.85, min_lr=1e-5)

# Progress bar setup
progress_bar = tqdm(range(max_epochs), desc="Epochs", leave=True)
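
For orientation, a self-contained sketch of the early-stopping rule that the new patience=18 / min_delta=0.35 defaults imply, assuming the loop compares each validation loss against the best seen so far (the actual loop is not shown in this hunk, and the loss trace below is invented):

patience, min_delta = 18, 0.35
val_losses = [10.0 - 0.4 * i for i in range(10)] + [6.0] * 30  # improves, then plateaus
best, stale = float("inf"), 0
for epoch, val in enumerate(val_losses):
    if best - val > min_delta:   # only improvements larger than min_delta reset the counter
        best, stale = val, 0
    else:
        stale += 1
    if stale >= patience:        # stop once the validation loss has been flat for `patience` epochs
        print(f"would stop early at epoch {epoch}")
        break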
