From b38cdf25c58767b409ae5956b4dd4c0cf641c6c5 Mon Sep 17 00:00:00 2001 From: Jack Leary Date: Wed, 18 Oct 2023 12:08:03 -0400 Subject: [PATCH] improved gene embedding and smoothed counts functions --- R/embedGenes.R | 55 ++++++++++++++++++++++++++++------------ R/smoothedCountsMatrix.R | 5 ++++ 2 files changed, 44 insertions(+), 16 deletions(-) diff --git a/R/embedGenes.R b/R/embedGenes.R index 56e0752..bae37a9 100644 --- a/R/embedGenes.R +++ b/R/embedGenes.R @@ -1,4 +1,4 @@ -#' Generate a table of fitted values and celltype metadata for genes of interest. +#' Generate PCA & UMAP embeddings of fitted gene dynamics. #' #' @name embedGenes #' @author Jack Leary @@ -9,8 +9,9 @@ #' @description Embed genes in dimension-reduced space given a smoothed counts matrix. #' @param smoothed.counts The output from \code{\link{smoothedCountsMatrix}}. Defaults to NULL. #' @param genes A character vector of genes to embed. If not specified, all genes in \code{smoothed.counts} are used. Defaults to NULL. +#' @param pca.init A boolean specifying whether or not the embedded PCs should be used as initialization for clustering and UMAP. The default is to cluster / embed the raw dynamics i.e., defaults to FALSE. #' @param pc.embed (Optional) How many PCs should be used to cluster the genes and run UMAP? Defaults to 30. -#' @param pcs.return (Optional) How many principal components should be included in the output? Defaults to 2. +#' @param pc.return (Optional) How many principal components should be included in the output? Defaults to 2. #' @param cluster.genes (Optional) Should genes be clustered in PCA space using the Leiden algorithm? Defaults to TRUE. #' @param gene.meta.data (Optional) A data.frame of metadata values for each gene (HVG status, Ensembl ID, gene biotype, etc.) that will be included in the result table. Defaults to NULL. #' @param k.param (Optional) The value of nearest-neighbors used in creating the SNN graph prior to clustering & in running UMAP. Defaults to 20. @@ -27,8 +28,9 @@ embedGenes <- function(smoothed.counts = NULL, genes = NULL, + pca.init = FALSE, pc.embed = 30, - pcs.return = 2, + pc.return = 2, cluster.genes = TRUE, gene.meta.data = NULL, k.param = 20, @@ -44,20 +46,41 @@ embedGenes <- function(smoothed.counts = NULL, n = pc.embed, center = TRUE, scale. = TRUE) - smoothed_counts_umap <- uwot::umap(smoothed_counts_pca$x, - n_components = 2, - metric = "cosine", - n_neighbors = k.param, - init = "spectral") + if (pca.init) { + smoothed_counts_umap <- uwot::umap(smoothed_counts_pca$x, + n_components = 2, + metric = "cosine", + n_neighbors = k.param, + init = "spectral", + nn_method = "annoy") + } else { + smoothed_counts_umap <- uwot::umap(smoothed.counts, + n_components = 2, + metric = "cosine", + n_neighbors = k.param, + init = "spectral", + nn_method = "annoy") + } # clustering w/ silhouette score parameter tuning if (cluster.genes) { - smoothed_counts_snn <- bluster::makeSNNGraph(smoothed_counts_pca$x, - k = k.param, - type = "jaccard", - BNPARAM = BiocNeighbors::AnnoyParam(distance = "Cosine")) + if (pca.init) { + smoothed_counts_snn <- bluster::makeSNNGraph(smoothed_counts_pca$x, + k = k.param, + type = "jaccard", + BNPARAM = BiocNeighbors::AnnoyParam(distance = "Cosine")) + } else { + smoothed_counts_snn <- bluster::makeSNNGraph(smoothed.counts, + k = k.param, + type = "jaccard", + BNPARAM = BiocNeighbors::AnnoyParam(distance = "Cosine")) + } if (is.null(resolution.param)) { - dist_matrix <- stats::as.dist(1 - coop::tcosine(x = smoothed_counts_pca$x)) - clust_runs <- purrr::map(c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7), \(r) { + if (pca.init) { + dist_matrix <- stats::as.dist(1 - coop::tcosine(x = smoothed_counts_pca$x)) + } else { + dist_matrix <- stats::as.dist(1 - coop::tcosine(x = smoothed.counts)) + } + clust_runs <- purrr::map(seq(0.1, 0.7, by = 0.1), \(r) { smoothed_counts_clust <- igraph::cluster_leiden(smoothed_counts_snn, objective_function = "modularity", resolution_parameter = r) @@ -83,8 +106,8 @@ embedGenes <- function(smoothed.counts = NULL, cluster_vec <- NA_integer_ } # prepare results - pca_df <- as.data.frame(smoothed_counts_pca$x[, seq(pcs.return)]) - colnames(pca_df) <- paste0("pc", seq(pcs.return)) + pca_df <- as.data.frame(smoothed_counts_pca$x[, seq(pc.return)]) + colnames(pca_df) <- paste0("pc", seq(pc.return)) gene_df <- data.frame(gene = genes, leiden = cluster_vec, umap1 = smoothed_counts_umap[, 1], diff --git a/R/smoothedCountsMatrix.R b/R/smoothedCountsMatrix.R index bdcbd85..d8c6b71 100644 --- a/R/smoothedCountsMatrix.R +++ b/R/smoothedCountsMatrix.R @@ -12,6 +12,7 @@ #' @param pt A data.frame of pseudotime values for each cell. Defaults to NULL. #' @param size.factor.offset (Optional) An offset to be used to rescale the fitted values. Can be generated easily with \code{\link{createCellOffset}}. No need to provide if the GEE backend was used. Defaults to NULL. #' @param genes (Optional) A character vector of genes with which to subset the results. Defaults to NULL. +#' @param log1p.norm A boolean specifying whether the smoothed counts should be log1p-transformed after depth normalization. Defaults to FALSE. #' @param parallel.exec Should \code{furrr} be used to speed up execution? Defaults to TRUE. #' @param n.cores If parallel execution is desired, how many cores should be utilized? Defaults to 2. #' @return A list of matrices of smoothed counts, with each element of the list being a single pseudotime lineage. @@ -31,6 +32,7 @@ smoothedCountsMatrix <- function(test.dyn.res = NULL, size.factor.offset = NULL, pt = NULL, genes = NULL, + log1p.norm = FALSE, parallel.exec = TRUE, n.cores = 2) { # check inputs @@ -65,6 +67,9 @@ smoothedCountsMatrix <- function(test.dyn.res = NULL, }) %>% purrr::reduce(cbind) colnames(fitted_vals_mat) <- names(fitted_vals_list) + if (log1p.norm) { + fitted_vals_mat <- log1p(fitted_vals_mat) + } return(fitted_vals_mat) }) names(lineage_mat_list) <- paste0("Lineage_", lineages)