Merge pull request #127 from jr-leary7/dev

updated tests, docs, and added gene embedding fn -- closes #126
jr-leary7 · Sep 25, 2023 · 5248ffe · 5248ffe
2 parents 63c3c18 + 3e83ffa
commit 5248ffe
Show file tree

Hide file tree

Showing 8 changed files with 206 additions and 65 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -43,6 +43,8 @@ URL: https://github.com/jr-leary7/scLANE
 BugReports: https://github.com/jr-leary7/scLANE/issues
 Suggests: 
     covr, 
+    coop,
+    uwot, 
     ggh4x, 
     knitr,
     irlba, 

diff --git a/NAMESPACE b/NAMESPACE
@@ -2,6 +2,7 @@
 
 export(clusterGenes)
 export(createCellOffset)
+export(embedGenes)
 export(enrichDynamicGenes)
 export(extractBreakpoints)
 export(fitGLMM)
@@ -95,11 +96,11 @@ importFrom(purrr,reduce)
 importFrom(scales,label_comma)
 importFrom(scales,label_number)
 importFrom(splines,bs)
+importFrom(stats,as.dist)
 importFrom(stats,as.formula)
 importFrom(stats,coef)
 importFrom(stats,cutree)
 importFrom(stats,deviance)
-importFrom(stats,dist)
 importFrom(stats,fitted)
 importFrom(stats,fitted.values)
 importFrom(stats,hclust)

diff --git a/NEWS.md b/NEWS.md
@@ -1,14 +1,21 @@
-# scLANE 0.7.2 
+# `scLANE` 0.7.3
+
+* Added a function named `embedGenes()` that takes a smoothed counts matrix as input & returns PCA & UMAP embeddings along with a graph-based clustering. 
+* Updated the `clusterGenes()` function to be much more efficient as well as changing the distance metric used to be cosine dissimilarity. 
+* Added `theme_scLANE()` for output plots. 
+* Enhanced documentation. 
+
+# `scLANE` 0.7.2 
 
 * Added a function named `sortGenesHeatmap()` that aids in the creation of expression cascade heatmaps by sorting genes according to where in pseudotime their peak expression is. 
 * Changed the parameter `approx.knot` in the `testDynamic()` function to use (stochasticity-controlled) subsampling instead of `seq()` to reduce candidate knot space. 
-* Added `summarizeModels()` to sum up slopes across pseudotime intervals
+* Added `summarizeModels()` to sum up slopes across pseudotime intervals. 
 
-# scLANE 0.7.1
+# `scLANE` 0.7.1
 
 * Changed input format of all functions to allow counts matrices formatted as `SingleCellExperiment` or `Seurat` objects, sparse matrices, or dense matrices.
 * Updated visualization functions to reflect changes made in `ggplot2` v3.4 (mostly changing the `size` parameter in line-based geoms to be `linewidth` instead). 
 
-# scLANE 0.6.3
+# `scLANE` 0.6.3
 
 * Added a `NEWS.md` file to track changes to the package.
diff --git a/R/clusterGenes.R b/R/clusterGenes.R
@@ -2,10 +2,10 @@
 #'
 #' @name clusterGenes
 #' @author Jack Leary
-#' @description This function takes as input the output from \code{\link{testDynamic}} and clusters the fitted values from the model for each gene using one of several user-chosen algorithms. An approximately optimal clustering is determined by iterating over reasonable hyperparameter values & choosing the value with the highest mean silhouette score.
+#' @description This function takes as input the output from \code{\link{testDynamic}} and clusters the fitted values from the model for each gene using one of several user-chosen algorithms. An approximately optimal clustering is determined by iterating over reasonable hyperparameter values & choosing the value with the highest mean silhouette score based on the cosine distance.
 #' @import magrittr
 #' @importFrom purrr map discard map2 reduce
-#' @importFrom stats setNames hclust cutree kmeans dist
+#' @importFrom stats setNames hclust cutree kmeans as.dist
 #' @param test.dyn.res The list returned by \code{\link{testDynamic}} - no extra processing required. Defaults to NULL.
 #' @param pt A data.frame containing the pseudotime or latent time estimates for each cell. Defaults to NULL.
 #' @param size.factor.offset (Optional) An offset to be used to rescale the fitted values. Can be generated easily with \code{\link{createCellOffset}}. No need to provide if the GEE backend was used. Defaults to NULL.
@@ -19,6 +19,7 @@
 #' }
 #' @return A data.frame of with three columns: \code{Gene}, \code{Lineage}, and \code{Cluster}.
 #' @seealso \code{\link{testDynamic}}
+#' @seealso \code{\link{embedGenes}}
 #' @seealso \code{\link{plotClusteredGenes}}
 #' @export
 #' @examples
@@ -74,102 +75,80 @@ clusterGenes <- function(test.dyn.res = NULL,
                                              n = n.PC,
                                              center = TRUE,
                                              scale. = TRUE)
+      dist_matrix <- stats::as.dist(1 - coop::tcosine(x = fitted_vals_pca$x))
+    } else {
+      dist_matrix <- stats::as.dist(1 - coop::tcosine(x = fitted_vals_mat))
     }
     # hierarchical clustering routine w/ Ward's linkage
     if (clust.algo  == "hclust") {
-      if (use.pca) {
-        hclust_tree <- stats::hclust(stats::dist(fitted_vals_pca$x), method = "ward.D2")
-      } else {
-        hclust_tree <- stats::hclust(stats::dist(fitted_vals_mat), method = "ward.D2")
-      }
+      hclust_tree <- stats::hclust(dist_matrix, method = "ward.D2")
       k_vals <- c(2:10)
       sil_vals <- vector("numeric", 9L)
+      clust_list <- vector("list", 9L)
       for (k in seq_along(k_vals)) {
         clust_res <- stats::cutree(hclust_tree, k = k_vals[k])
-        if (use.pca) {
-          sil_res <- cluster::silhouette(clust_res, stats::dist(fitted_vals_pca$x))
-        } else {
-          sil_res <- cluster::silhouette(clust_res, stats::dist(fitted_vals_mat))
-        }
-        sil_vals[k] <- mean(sil_res[, 3])  # silhouette widths stored in third column
+        sil_res <- cluster::silhouette(clust_res, dist_matrix)
+        sil_vals[k] <- mean(sil_res[, 3])
+        clust_list[[k]] <- clust_res
       }
-      k_to_use <- k_vals[which.max(sil_vals)]
-      clust_res <- stats::cutree(hclust_tree, k = k_to_use)
       gene_clusters <- data.frame(Gene = rownames(fitted_vals_mat),
                                   Lineage = lineages[l],
-                                  Cluster = clust_res)
+                                  Cluster = clust_list[[which.max(sil_vals)]])
     # k-means clustering routine w/ Hartigan-Wong algorithm
     } else if (clust.algo == "kmeans") {
       k_vals <- c(2:10)
       sil_vals <- vector("numeric", 9L)
+      clust_list <- vector("list", 9L)
       for (k in seq_along(k_vals)) {
         if (use.pca) {
           clust_res <- stats::kmeans(fitted_vals_pca$x,
                                      centers = k_vals[k],
                                      nstart = 5,
                                      algorithm = "Hartigan-Wong")
-          sil_res <- cluster::silhouette(clust_res$cluster, stats::dist(fitted_vals_pca$x))
         } else {
           clust_res <- stats::kmeans(fitted_vals_mat,
                                      centers = k_vals[k],
                                      nstart = 5,
                                      algorithm = "Hartigan-Wong")
-          sil_res <- cluster::silhouette(clust_res$cluster, stats::dist(fitted_vals_mat))
         }
-        sil_vals[k] <- mean(sil_res[, 3])  # silhouette widths stored in third column
-      }
-      k_to_use <- k_vals[which.max(sil_vals)]
-      if (use.pca) {
-        clust_res <- stats::kmeans(fitted_vals_pca$x,
-                                   centers = k_to_use,
-                                   nstart = 10,
-                                   algorithm = "Hartigan-Wong")
-      } else {
-        clust_res <- stats::kmeans(fitted_vals_mat,
-                                   centers = k_to_use,
-                                   nstart = 10,
-                                   algorithm = "Hartigan-Wong")
+        clust_list[[k]] <- clust_res
+        sil_res <- cluster::silhouette(clust_res$cluster, dist_matrix)
+        sil_vals[k] <- mean(sil_res[, 3])
       }
       gene_clusters <- data.frame(Gene = rownames(fitted_vals_mat),
                                   Lineage = lineages[l],
-                                  Cluster = clust_res$cluster)
+                                  Cluster = clust_list[[which.max(sil_vals)]]$cluster)
     # Leiden clustering routine
     } else if (clust.algo == "leiden") {
       if (use.pca) {
         fitted_vals_graph <- bluster::makeSNNGraph(x = fitted_vals_pca$x,
-                                                   k = 30,
+                                                   k = 20,
                                                    type = "jaccard",
-                                                   BNPARAM = BiocNeighbors::AnnoyParam(distance = "Euclidean"))
+                                                   BNPARAM = BiocNeighbors::AnnoyParam(distance = "Cosine"))
       } else {
         fitted_vals_graph <- bluster::makeSNNGraph(x = fitted_vals_mat,
-                                                   k = 30,
+                                                   k = 20,
                                                    type = "jaccard",
-                                                   BNPARAM = BiocNeighbors::AnnoyParam(distance = "Euclidean"))
+                                                   BNPARAM = BiocNeighbors::AnnoyParam(distance = "Cosine"))
       }
       res_vals <- seq(0.1, 1, by = 0.1)
       sil_vals <- vector("numeric", 10L)
+      clust_list <- vector("list", 10L)
       for (r in seq_along(res_vals)) {
         clust_res <- igraph::cluster_leiden(graph = fitted_vals_graph,
                                             objective_function = "modularity",
                                             resolution_parameter = res_vals[r])
         if (clust_res$nb_clusters == 1) {
           sil_vals[r] <- 0
         } else {
-          if (use.pca) {
-            sil_res <- cluster::silhouette(clust_res$membership, stats::dist(fitted_vals_pca$x))
-          } else {
-            sil_res <- cluster::silhouette(clust_res$membership, stats::dist(fitted_vals_mat))
-          }
+          sil_res <- cluster::silhouette(clust_res$membership, dist_matrix)
           sil_vals[r] <- mean(sil_res[, 3])
         }
+        clust_list[[r]] <- clust_res$membership
       }
-      res_to_use <- res_vals[which.max(sil_vals)]
-      clust_res <- igraph::cluster_leiden(graph = fitted_vals_graph,
-                                          objective_function = "modularity",
-                                          resolution_parameter = res_to_use)
       gene_clusters <- data.frame(Gene = rownames(fitted_vals_mat),
                                   Lineage = lineages[l],
-                                  Cluster = clust_res$membership)
+                                  Cluster = clust_list[[which.max(sil_vals)]])
     }
     gene_cluster_list[[l]] <- gene_clusters
   }

diff --git a/R/embedGenes.R b/R/embedGenes.R
@@ -0,0 +1,78 @@
+#' Generate a table of fitted values and celltype metadata for genes of interest.
+#'
+#' @name embedGenes
+#' @author Jack Leary
+#' @import magrittr
+#' @importFrom purrr map map_dbl
+#' @importFrom dplyr bind_cols
+#' @importFrom stats as.dist
+#' @description Embed genes in dimension-reduced space given a smoothed counts matrix.
+#' @param smoothed.counts The output from \code{\link{smoothedCountsMatrix}}. Defaults to NULL.
+#' @param genes A character vector of genes to embed. If not specified, all genes in \code{test.dyn.res} are used. Defaults to NULL.
+#' @param pc.embed (Optional) How many PCs should be used to cluster the genes and run UMAP? Defaults to 30.
+#' @param pcs.return (Optional) How many principal components should be included in the output? Defaults to 2.
+#' @param cluster.genes (Optional) Should genes be clustered in PCA space using the Leiden algorithm? Defaults to TRUE.
+#' @param gene.meta.data (Optional) A data.frame of metadata values for each gene (HVG status, Ensembl ID, gene biotype, etc.) that will be included in the result table. Defaults to NULL.
+#' @param k.param (Optional) The value of nearest-neighbors used in creating the SNN graph prior to clustering & in running UMAP. Defaults to 20.
+#' @param random.seed (Optional) The random seed used to control stochasticity in the clustering algorithm. Defaults to 312.
+#' @return A data.frame containing embedding coordinates, cluster IDs, and metadata for each gene.
+#' @export
+#' @examples
+#' \dontrun{
+#' embedGenes(smoothed_counts$Lineage_A,
+#'            pcs.return = 3,
+#'            cluster.genes = TRUE)
+#' }
+
+embedGenes <- function(smoothed.counts = NULL,
+                       genes = NULL,
+                       pc.embed = 30,
+                       pcs.return = 2,
+                       cluster.genes = TRUE,
+                       gene.meta.data = NULL,
+                       k.param = 20,
+                       random.seed = 312) {
+  # check inputs
+  if (is.null(smoothed.counts)) { stop("You forgot to provide a smoothed counts matrix to embedGenes().") }
+  # embeddings
+  set.seed(random.seed)
+  smoothed_counts_pca <- irlba::prcomp_irlba(t(smoothed.counts),
+                                             pc.embed = pc.embed,
+                                             center = TRUE,
+                                             scale. = TRUE)
+  smoothed_counts_umap <- uwot::umap(smoothed_counts_pca$x,
+                                     n_components = 2,
+                                     metric = "cosine",
+                                     n_neighbors = k.param,
+                                     init = "spectral")
+  # clustering w/ silhouette score parameter tuning
+  smoothed_counts_snn <- bluster::makeSNNGraph(smoothed_counts_pca$x,
+                                               k = k.param,
+                                               type = "jaccard",
+                                               BNPARAM = BiocNeighbors::AnnoyParam(distance = "Cosine"))
+  dist_matrix <- stats::as.dist(1 - coop::tcosine(x = smoothed_counts_pca$x))
+  clust_runs <- purrr::map(c(0.1, 0.2, 0.3, 0.4, 0.5), \(r) {
+    smoothed_counts_clust <- igraph::cluster_leiden(smoothed_counts_snn,
+                                                    objective_function = "modularity",
+                                                    resolution_parameter = r)
+    if (smoothed_counts_clust$nb_clusters == 1) {
+      sil_val <- 0
+    } else {
+      sil_val <- mean(cluster::silhouette(as.integer(smoothed_counts_clust$membership - 1L), dist_matrix)[, 3])
+    }
+    clust_res <- list(clusters = as.factor(smoothed_counts_clust$membership - 1L),
+                      resolution = r,
+                      silhouette = sil_val)
+    return(clust_res)
+  })
+  best_clustering <- which.max(purrr::map_dbl(clust_runs, \(x) x$silhouette))
+  # prepare results
+  pca_df <- as.data.frame(smoothed_counts_pca$x[, seq(pcs.return)])
+  colnames(pca_df) <- paste0("pc", seq(pcs.return))
+  gene_clust_df <- data.frame(gene = colnames(smoothed.counts),
+                              leiden = clust_runs[[best_clustering]]$clusters,
+                              umap1 = smoothed_counts_umap[, 1],
+                              umap2 = smoothed_counts_umap[, 2])
+  gene_clust_df <- dplyr::bind_cols(gene_clust_df, pca_df)
+  return(gene_clust_df)
+}
diff --git a/man/clusterGenes.Rd b/man/clusterGenes.Rd
diff --git a/man/embedGenes.Rd b/man/embedGenes.Rd