Commit
Merge pull request #2 from ivichadriana/main
adding test file for workflow to run properly
ivichadriana authored Aug 12, 2024
2 parents 2e445e6 + 96a92e9 commit ebff1bc
Showing 10 changed files with 1,727 additions and 33 deletions.
1,490 changes: 1,490 additions & 0 deletions docs/notebooks/hyperparameter_grid_search.ipynb

Large diffs are not rendered by default.

5 changes: 4 additions & 1 deletion pyproject.toml
@@ -20,8 +20,11 @@ urls.Source = "https://github.com/ivichadriana/single_translator_VAE"
urls.Home-page = "https://github.com/ivichadriana/single_translator_VAE"
dependencies = [
"anndata",
# for debug logging (referenced from the issue template)
"session-info",
"scvi",
"scvi-tools",
"scanpy",
"openTSNE"
]

[project.optional-dependencies]
1 change: 1 addition & 0 deletions src/single_translator_VAE/pl/__init__.py
@@ -1,5 +1,6 @@
from .plot import (
deconvolution_results,
deconvolution_tsne,
estimated_vs_real_proportions,
loss_plots,
pca_of_augmentation,
80 changes: 77 additions & 3 deletions src/single_translator_VAE/pl/plot.py
@@ -5,10 +5,79 @@
import seaborn as sns
import torch
from matplotlib.patches import Patch
from openTSNE import TSNEEmbedding, affinity, initialization
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


def deconvolution_tsne(normalized_pseudo_df, ref_df):
"""
Generate t-SNE plots of reference data frames projected into the t-SNE space of normalized pseudobulks.

This function visualizes the alignment of reference cell type expression profiles with pseudobulk data
using t-Distributed Stochastic Neighbor Embedding (t-SNE). The function projects both the normalized
pseudobulk data and the reference data into a shared t-SNE space and plots the results for comparison.

Parameters
----------
normalized_pseudo_df : pandas.DataFrame
DataFrame containing the normalized pseudobulk expression data. Each row represents a sample,
and each column represents a gene.
ref_df : pandas.DataFrame
DataFrame containing the reference expression data. Each row represents a gene, and each column
represents a cell type or sample.

Returns
-------
None
The function generates and displays a t-SNE plot comparing the reference and pseudobulk data.
The plot is displayed using matplotlib.

Notes
-----
- The t-SNE embedding is optimized on the pseudobulk data; the reference data is then projected into it.
- The PerplexityBasedNN method is used to compute the affinities, and PCA is used for initial embedding.
- The function uses a high perplexity value of 60 to capture the global structure of the data.
- The reference data is transformed into the pseudobulk t-SNE space for visualization.
"""
# t-SNE plot of the reference data projected into the normalized-pseudobulk t-SNE space
# figure
plt.figure(figsize=(12, 5))
# input data: pseudobulks as the fitted (train) set, transposed reference as the projected (test) set
x_train = normalized_pseudo_df.values
x_test = ref_df.T.values

# Compute the affinities between data points
affinities_train = affinity.PerplexityBasedNN(
x_train,
perplexity=60,
metric="euclidean",
n_jobs=8,
random_state=42,
verbose=True,
)
# initialize embedding coordinates with PCA
init_train = initialization.pca(x_train, random_state=42)
embedding_train = TSNEEmbedding(
init_train,
affinities_train,
negative_gradient_method="fft",
n_jobs=8,
verbose=True,
)
# optimize embedding
embedding_train = embedding_train.optimize(n_iter=500)
# project both datasets into the training embedding
tsne_train = embedding_train.transform(x_train)
tsne_test = embedding_train.transform(x_test)
plt.scatter(tsne_train[:, 0], tsne_train[:, 1], label="Pseudobulks tSNE")
plt.scatter(tsne_test[:, 0], tsne_test[:, 1], label="Reference tSNE")
plt.title("Reference in Pseudobulks TSNE")
plt.xlabel("tSNE 1")
plt.ylabel("tSNE 2")
plt.legend()
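
A minimal usage sketch for this new helper, assuming the DataFrame orientations described in the docstring (pseudobulks: samples x genes; reference: genes x cell types); the input file names are hypothetical, and plt.show() is added because the function only builds the figure:

import pandas as pd
import matplotlib.pyplot as plt
from single_translator_VAE.pl import deconvolution_tsne

# hypothetical files; any samples-x-genes pseudobulk table and genes-x-(cell types) reference will do
normalized_pseudo_df = pd.read_csv("pseudobulks_normalized.csv", index_col=0)
ref_df = pd.read_csv("reference_profiles.csv", index_col=0)
deconvolution_tsne(normalized_pseudo_df, ref_df)  # fits t-SNE on the pseudobulks, projects the reference into it
plt.show()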


def loss_plots(
loss_to_plot: Optional[str] = "summed",
training_losses: np.ndarray = None,
@@ -17,7 +86,6 @@ def loss_plots(
all_val_losses: Optional[dict] = None,
):
"""
Plot the training loss over epochs.
Parameters
@@ -539,22 +607,28 @@ def deconvolution_results(evaluation_results):
rmse_values = [evaluation_results[ref]["RMSE"] for ref in ref_names]
correlation_values = [evaluation_results[ref]["Correlation"] for ref in ref_names]

# Define custom colors for each reference
colors = ["red", "blue", "pink"]
palette = dict(zip(ref_names, colors))

# Plot RMSE
plt.figure(figsize=(10, 5))
sns.barplot(x=ref_names, y=rmse_values, palette="viridis")
sns.barplot(x=ref_names, y=rmse_values, palette=palette, hue=ref_names, dodge=False, alpha=0.6)
plt.title("RMSE for Each Reference")
plt.ylabel("RMSE")
plt.xlabel("Reference")
plt.ylim(0, max(rmse_values) * 1.1)
plt.legend([], [], frameon=False) # Hides the legend
plt.show()

# Plot Pearson's Correlation
plt.figure(figsize=(10, 5))
sns.barplot(x=ref_names, y=correlation_values, palette="plasma")
sns.barplot(x=ref_names, y=correlation_values, palette=palette, hue=ref_names, dodge=False, alpha=0.6)
plt.title("Pearson's Correlation for Each Reference")
plt.ylabel("Correlation")
plt.xlabel("Reference")
plt.ylim(0, 1)
plt.legend([], [], frameon=False) # Hides the legend
plt.show()
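
For context, a minimal sketch of how deconvolution_results might be called; the structure of evaluation_results is inferred from the lines above (a dict keyed by reference name, each holding "RMSE" and "Correlation"), and the names and numbers below are made up:

from single_translator_VAE.pl import deconvolution_results

# illustrative values only; real entries come from evaluating each reference against known proportions
evaluation_results = {
    "reference_A": {"RMSE": 0.12, "Correlation": 0.91},
    "reference_B": {"RMSE": 0.18, "Correlation": 0.84},
    "reference_C": {"RMSE": 0.25, "Correlation": 0.70},
}
deconvolution_results(evaluation_results)  # draws the two bar charts defined above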


4 changes: 2 additions & 2 deletions src/single_translator_VAE/pp/single_cell_preprocessing.py
@@ -307,8 +307,8 @@ def match_cell_types(adata1: AnnData, adata2: AnnData, cells_to_keep: np.array):
AnnData with same cell types.
"""
# match cells:
adata1 = adata1[adata1.obs["cell_types"].isin(cells_to_keep)]
adata2 = adata2[adata2.obs["cell_types"].isin(cells_to_keep)]
adata1 = adata1[adata1.obs["cell_types"].isin(cells_to_keep)].copy()
adata2 = adata2[adata2.obs["cell_types"].isin(cells_to_keep)].copy()

return adata1, adata2

1 change: 1 addition & 0 deletions src/single_translator_VAE/tl/__init__.py
@@ -1,5 +1,6 @@
from .single_cell_tools import (
augment_data,
calc_nnls,
calculate_rmse,
calculate_rmse_per_celltype,
evaluate_deconvolution_references,
54 changes: 40 additions & 14 deletions src/single_translator_VAE/tl/single_cell_tools.py
@@ -6,10 +6,12 @@
import pandas as pd
import ray as ray
import scanpy as sc
import sklearn as sk
import torch
import torch.nn
from scipy.optimize import nnls
from scipy.stats import poisson
from sklearn.preprocessing import MinMaxScaler

import single_translator_VAE as sv

@@ -51,53 +53,76 @@ def prep_files_for_deconvolution(bulks, reference, path, file_name, deconvolutio
"""
genes = reference.var.index
cell_types = reference.obs.cell_types.unique()
reference = reference.X.toarray()

method_path = os.path.join(path, deconvolution_method, "input_data")
if not os.path.exists(method_path):
os.makedirs(method_path)

file_paths = {}
files = {}
match deconvolution_method:
case "cibersortx":
reference_data = np.vstack([np.append("gene_ids", cell_types), np.column_stack([genes, reference])])
ref_file_path = os.path.join(method_path, f"{file_name}_reference.txt")
np.savetxt(ref_file_path, reference_data, fmt="%s", delimiter="\t")
file_paths["reference"] = ref_file_path
files["reference"] = ref_file_path

if bulks is not None:
bulks_data = np.vstack(
[np.append("gene_ids", np.arange(bulks.shape[1])), np.column_stack([genes, bulks])]
)
bulks_file_path = os.path.join(method_path, f"{file_name}_bulks.txt")
np.savetxt(bulks_file_path, bulks_data, fmt="%s", delimiter="\t")
file_paths["bulks"] = bulks_file_path
files["bulks"] = bulks_file_path

case "nnls":
reference_data = np.hstack([np.array([["gene_ids"] + list(genes)]).T, np.vstack([cell_types, reference]).T])
# Making cell type reference, then scaling
ref_raw = pd.DataFrame(index=genes, columns=cell_types)
for cell_type in cell_types:
cell_df = reference[reference.obs["cell_types"].isin([cell_type])].X.toarray()
cell_sample = sk.utils.resample(cell_df, n_samples=10000, replace=True)
ref_raw[cell_type] = cell_sample.sum(axis=0)

# clipping to the 95th percentile before scaling
clip_upper = np.quantile(ref_raw.values, 0.95)
ref_raw_val = np.clip(ref_raw.values, 0, clip_upper)

# and scaling values to [0, 1] for NNLS
scaler = MinMaxScaler()
scaler.fit(ref_raw_val)
ref_raw_val = scaler.transform(ref_raw_val)
reference_data = pd.DataFrame(ref_raw_val, index=genes, columns=cell_types)

ref_file_path = os.path.join(method_path, f"{file_name}_reference.txt")
np.savetxt(ref_file_path, reference_data, fmt="%s", delimiter="\t")
file_paths["reference"] = ref_file_path
files["reference"] = reference_data

if bulks is not None:
bulks_data = np.hstack(
[np.array([["gene_ids"] + list(genes)]).T, np.vstack([np.arange(bulks.shape[1]), bulks]).T]
)
# clipping to the 95th percentile before scaling
clip_upper = np.quantile(bulks.values, 0.95)
pseudo_df = np.clip(bulks.values, 0, clip_upper)
# and normalize to values between 0 and 1
scaler = MinMaxScaler()
scaler.fit(pseudo_df)
normalized_pseudo_df = scaler.transform(pseudo_df)
bulks_data = pd.DataFrame(normalized_pseudo_df, columns=genes)

bulks_file_path = os.path.join(method_path, f"{file_name}_bulks.txt")
np.savetxt(bulks_file_path, bulks_data, fmt="%s", delimiter="\t")
file_paths["bulks"] = bulks_file_path
files["bulks"] = bulks_data

case "bayesprism":
reference_df = pd.DataFrame(reference, index=genes, columns=cell_types)
ref_file_path = os.path.join(method_path, f"{file_name}_reference.csv")
reference_df.to_csv(ref_file_path, index_label=False)
file_paths["reference"] = ref_file_path
files["reference"] = ref_file_path

if bulks is not None:
bulks_df = pd.DataFrame(bulks, index=genes, columns=np.arange(bulks.shape[1]))
bulks_file_path = os.path.join(method_path, f"{file_name}_bulks.csv")
bulks_df.to_csv(bulks_file_path, index_label=False)
file_paths["bulks"] = bulks_file_path
files["bulks"] = bulks_file_path

return file_paths
return files
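
A brief sketch of how the scaled NNLS inputs returned above might be consumed, assuming bulks were provided so that files["bulks"] exists (genes x cell-types reference, samples x genes bulks, as constructed in the "nnls" branch); the loop and variable names are illustrative:

import numpy as np
from scipy.optimize import nnls

ref = files["reference"].values      # genes x cell types, scaled to [0, 1]
bulk_mat = files["bulks"].values     # samples x genes, scaled to [0, 1]
proportions = np.zeros((bulk_mat.shape[0], ref.shape[1]))
for i, bulk in enumerate(bulk_mat):
    coef, _ = nnls(ref, bulk)        # non-negative least squares per pseudobulk
    proportions[i] = coef / coef.sum() if coef.sum() > 0 else coef  # normalize to cell-type proportions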


def make_pseudobulks(adata, number_of_bulks, num_cells, prop_type, noise):
@@ -133,6 +158,7 @@ def make_pseudobulks(adata, number_of_bulks, num_cells, prop_type, noise):
raise ValueError("prop_type must be either 'random' or 'real'")

cell_counts = (prop_vector * num_cells).astype(int)

while np.any(cell_counts == 0):
prop_vector = np.random.dirichlet(np.ones(len(cell_types)))
cell_counts = (prop_vector * num_cells).astype(int)
@@ -446,7 +472,7 @@ def calc_nnls(all_refs, prop_df, pseudo_df):

# Extracting reference matrix and verifying its integrity
ref = ref_df.values
if ref.shape[0] != prop_df.shape[1]:
if ref.shape[1] != prop_df.shape[1]:
raise ValueError(f"Reference '{exp}' and prop_df have inconsistent dimensions.")

# Calculate predicted values and residuals for each row
29 changes: 26 additions & 3 deletions src/single_translator_VAE/vae/vae_model.py
@@ -86,6 +86,7 @@ def __init__(
encode_batch: Tunable[bool] = False,
transform_batch: torch.Tensor | None = None,
batch_representation: str = "one-hot",
class_weights: bool = True,
# kl_annealing_start: Tunable[float] = 10.0,
# kl_annealing_steepness: Tunable[float] = 0.5,
**model_kwargs,
@@ -112,11 +113,15 @@
self.encode_batch = encode_batch
self.transform_batch = transform_batch
self.batch_representation = batch_representation
self.class_weights = class_weights

library_log_means, library_log_vars = _init_library_size(self.adata_manager, self.summary_stats["n_batch"])

# self.summary_stats provides information about anndata dimensions and other tensor info

if self.class_weights:
self.class_weights_tensor = self.calculate_class_weighting()

self.module = VAEModule(
n_input=self.summary_stats["n_vars"],
n_hidden=self.n_hidden,
@@ -135,6 +140,8 @@
encode_batch=self.encode_batch,
transform_batch=self.transform_batch,
batch_representation=self.batch_representation,
class_weights=self.class_weights,
class_weights_tensor=self.class_weights_tensor if self.class_weights else None,
**model_kwargs,
)
self._model_summary_string = (
@@ -150,6 +157,22 @@
tl["index"], tl["label"] = range(0, len(self.adata.obs)), self.adata.obs.labels_key.values
self.train_labels = tl.copy()

def calculate_class_weighting(self):
"""
Calculates class weights based on the inverse frequency of each class in the dataset.

Returns
-------
torch.Tensor
A tensor containing the weights for each class.
"""
labels = self.adata.obs["labels_key"]
unique_labels, counts = np.unique(labels, return_counts=True)
total_samples = len(labels)
class_weights = total_samples / (len(unique_labels) * counts)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float32)
return class_weights_tensor
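
As a quick check of the formula above with made-up counts: for 90 cells carrying one label and 10 carrying another, total_samples = 100 and len(unique_labels) = 2, giving weights 100 / (2 * 90) ≈ 0.56 and 100 / (2 * 10) = 5.0, so the rarer class is up-weighted roughly ninefold.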

@classmethod
@setup_anndata_dsp.dedent
def setup_anndata(
@@ -272,8 +295,8 @@ def train(
max_epochs: int = 500,
use_gpu: bool = False,
validation_size: float = 0.2,
patience: int = 15,
min_delta: float = 0.1,
patience: int = 18,
min_delta: float = 0.35,
):
"""
Train the model with validation support and a progress bar, with early stopping.
@@ -317,7 +340,7 @@
optimizer = Adam(self.module.parameters(), lr=self.lr, weight_decay=self.weight_decay)

# Initialize learning rate scheduler
scheduler = ReduceLROnPlateau(optimizer, "min", patience=10, factor=0.5, min_lr=1e-5)
scheduler = ReduceLROnPlateau(optimizer, "min", patience=20, factor=0.85, min_lr=1e-5)

# Progress bar setup
progress_bar = tqdm(range(max_epochs), desc="Epochs", leave=True)
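
For orientation, a self-contained sketch of the early-stopping rule that the new patience=18 / min_delta=0.35 defaults imply, assuming the loop compares each validation loss against the best seen so far (the actual loop is not shown in this hunk, and the loss trace below is invented):

patience, min_delta = 18, 0.35
val_losses = [10.0 - 0.4 * i for i in range(10)] + [6.0] * 30  # improves, then plateaus
best, stale = float("inf"), 0
for epoch, val in enumerate(val_losses):
    if best - val > min_delta:   # only improvements larger than min_delta reset the counter
        best, stale = val, 0
    else:
        stale += 1
    if stale >= patience:        # stop once the validation loss has been flat for `patience` epochs
        print(f"would stop early at epoch {epoch}")
        break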
