From b38cdf25c58767b409ae5956b4dd4c0cf641c6c5 Mon Sep 17 00:00:00 2001
From: Jack Leary <j.leary@ufl.edu>
Date: Wed, 18 Oct 2023 12:08:03 -0400
Subject: [PATCH] improved gene embedding and smoothed counts functions

---
 R/embedGenes.R           | 55 ++++++++++++++++++++++++++++------------
 R/smoothedCountsMatrix.R |  5 ++++
 2 files changed, 44 insertions(+), 16 deletions(-)

diff --git a/R/embedGenes.R b/R/embedGenes.R
index 56e0752..bae37a9 100644
--- a/R/embedGenes.R
+++ b/R/embedGenes.R
@@ -1,4 +1,4 @@
-#' Generate a table of fitted values and celltype metadata for genes of interest.
+#' Generate PCA & UMAP embeddings of fitted gene dynamics.
 #'
 #' @name embedGenes
 #' @author Jack Leary
@@ -9,8 +9,9 @@
 #' @description Embed genes in dimension-reduced space given a smoothed counts matrix.
 #' @param smoothed.counts The output from \code{\link{smoothedCountsMatrix}}. Defaults to NULL.
 #' @param genes A character vector of genes to embed. If not specified, all genes in \code{smoothed.counts} are used. Defaults to NULL.
+#' @param pca.init A boolean specifying whether or not the embedded PCs should be used as initialization for clustering and UMAP. The default is to cluster / embed the raw dynamics i.e., defaults to FALSE.
 #' @param pc.embed (Optional) How many PCs should be used to cluster the genes and run UMAP? Defaults to 30.
-#' @param pcs.return (Optional) How many principal components should be included in the output? Defaults to 2.
+#' @param pc.return (Optional) How many principal components should be included in the output? Defaults to 2.
 #' @param cluster.genes (Optional) Should genes be clustered in PCA space using the Leiden algorithm? Defaults to TRUE.
 #' @param gene.meta.data (Optional) A data.frame of metadata values for each gene (HVG status, Ensembl ID, gene biotype, etc.) that will be included in the result table. Defaults to NULL.
 #' @param k.param (Optional) The value of nearest-neighbors used in creating the SNN graph prior to clustering & in running UMAP. Defaults to 20.
@@ -27,8 +28,9 @@
 
 embedGenes <- function(smoothed.counts = NULL,
                        genes = NULL,
+                       pca.init = FALSE,
                        pc.embed = 30,
-                       pcs.return = 2,
+                       pc.return = 2,
                        cluster.genes = TRUE,
                        gene.meta.data = NULL,
                        k.param = 20,
@@ -44,20 +46,41 @@ embedGenes <- function(smoothed.counts = NULL,
                                              n = pc.embed,
                                              center = TRUE,
                                              scale. = TRUE)
-  smoothed_counts_umap <- uwot::umap(smoothed_counts_pca$x,
-                                     n_components = 2,
-                                     metric = "cosine",
-                                     n_neighbors = k.param,
-                                     init = "spectral")
+  if (pca.init) {
+    smoothed_counts_umap <- uwot::umap(smoothed_counts_pca$x,
+                                       n_components = 2,
+                                       metric = "cosine",
+                                       n_neighbors = k.param,
+                                       init = "spectral",
+                                       nn_method = "annoy")
+  } else {
+    smoothed_counts_umap <- uwot::umap(smoothed.counts,
+                                       n_components = 2,
+                                       metric = "cosine",
+                                       n_neighbors = k.param,
+                                       init = "spectral",
+                                       nn_method = "annoy")
+  }
   # clustering w/ silhouette score parameter tuning
   if (cluster.genes) {
-    smoothed_counts_snn <- bluster::makeSNNGraph(smoothed_counts_pca$x,
-                                                 k = k.param,
-                                                 type = "jaccard",
-                                                 BNPARAM = BiocNeighbors::AnnoyParam(distance = "Cosine"))
+    if (pca.init) {
+      smoothed_counts_snn <- bluster::makeSNNGraph(smoothed_counts_pca$x,
+                                                   k = k.param,
+                                                   type = "jaccard",
+                                                   BNPARAM = BiocNeighbors::AnnoyParam(distance = "Cosine"))
+    } else {
+      smoothed_counts_snn <- bluster::makeSNNGraph(smoothed.counts,
+                                                   k = k.param,
+                                                   type = "jaccard",
+                                                   BNPARAM = BiocNeighbors::AnnoyParam(distance = "Cosine"))
+    }
     if (is.null(resolution.param)) {
-      dist_matrix <- stats::as.dist(1 - coop::tcosine(x = smoothed_counts_pca$x))
-      clust_runs <- purrr::map(c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7), \(r) {
+      if (pca.init) {
+        dist_matrix <- stats::as.dist(1 - coop::tcosine(x = smoothed_counts_pca$x))
+      } else {
+        dist_matrix <- stats::as.dist(1 - coop::tcosine(x = smoothed.counts))
+      }
+      clust_runs <- purrr::map(seq(0.1, 0.7, by = 0.1), \(r) {
         smoothed_counts_clust <- igraph::cluster_leiden(smoothed_counts_snn,
                                                         objective_function = "modularity",
                                                         resolution_parameter = r)
@@ -83,8 +106,8 @@ embedGenes <- function(smoothed.counts = NULL,
     cluster_vec <- NA_integer_
   }
   # prepare results
-  pca_df <- as.data.frame(smoothed_counts_pca$x[, seq(pcs.return)])
-  colnames(pca_df) <- paste0("pc", seq(pcs.return))
+  pca_df <- as.data.frame(smoothed_counts_pca$x[, seq(pc.return)])
+  colnames(pca_df) <- paste0("pc", seq(pc.return))
   gene_df <- data.frame(gene = genes,
                         leiden = cluster_vec,
                         umap1 = smoothed_counts_umap[, 1],
diff --git a/R/smoothedCountsMatrix.R b/R/smoothedCountsMatrix.R
index bdcbd85..d8c6b71 100644
--- a/R/smoothedCountsMatrix.R
+++ b/R/smoothedCountsMatrix.R
@@ -12,6 +12,7 @@
 #' @param pt A data.frame of pseudotime values for each cell. Defaults to NULL.
 #' @param size.factor.offset (Optional) An offset to be used to rescale the fitted values. Can be generated easily with \code{\link{createCellOffset}}. No need to provide if the GEE backend was used. Defaults to NULL.
 #' @param genes (Optional) A character vector of genes with which to subset the results. Defaults to NULL.
+#' @param log1p.norm A boolean specifying whether the smoothed counts should be log1p-transformed after depth normalization. Defaults to FALSE.
 #' @param parallel.exec Should \code{furrr} be used to speed up execution? Defaults to TRUE.
 #' @param n.cores If parallel execution is desired, how many cores should be utilized? Defaults to 2.
 #' @return A list of matrices of smoothed counts, with each element of the list being a single pseudotime lineage.
@@ -31,6 +32,7 @@ smoothedCountsMatrix <- function(test.dyn.res = NULL,
                                  size.factor.offset = NULL,
                                  pt = NULL,
                                  genes = NULL,
+                                 log1p.norm = FALSE,
                                  parallel.exec = TRUE,
                                  n.cores = 2) {
   # check inputs
@@ -65,6 +67,9 @@ smoothedCountsMatrix <- function(test.dyn.res = NULL,
                        }) %>%
                        purrr::reduce(cbind)
     colnames(fitted_vals_mat) <- names(fitted_vals_list)
+    if (log1p.norm) {
+      fitted_vals_mat <- log1p(fitted_vals_mat)
+    }
     return(fitted_vals_mat)
   })
   names(lineage_mat_list) <- paste0("Lineage_", lineages)