From 3e83ffa1a6a608dcc8937222212347f7555ce2fe Mon Sep 17 00:00:00 2001 From: Jack Leary Date: Mon, 25 Sep 2023 13:20:54 -0400 Subject: [PATCH] updated tests, docs, and added gene embedding fn -- closes #126 --- DESCRIPTION | 2 + NAMESPACE | 3 +- NEWS.md | 15 +++++-- R/clusterGenes.R | 71 ++++++++++++-------------------- R/embedGenes.R | 78 ++++++++++++++++++++++++++++++++++++ man/clusterGenes.Rd | 4 +- man/embedGenes.Rd | 50 +++++++++++++++++++++++ tests/testthat/test_scLANE.R | 48 ++++++++++++++++------ 8 files changed, 206 insertions(+), 65 deletions(-) create mode 100644 R/embedGenes.R create mode 100644 man/embedGenes.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 8356220..fd37722 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -43,6 +43,8 @@ URL: https://github.com/jr-leary7/scLANE BugReports: https://github.com/jr-leary7/scLANE/issues Suggests: covr, + coop, + uwot, ggh4x, knitr, irlba, diff --git a/NAMESPACE b/NAMESPACE index 980ab85..dbf4c7e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,6 +2,7 @@ export(clusterGenes) export(createCellOffset) +export(embedGenes) export(enrichDynamicGenes) export(extractBreakpoints) export(fitGLMM) @@ -95,11 +96,11 @@ importFrom(purrr,reduce) importFrom(scales,label_comma) importFrom(scales,label_number) importFrom(splines,bs) +importFrom(stats,as.dist) importFrom(stats,as.formula) importFrom(stats,coef) importFrom(stats,cutree) importFrom(stats,deviance) -importFrom(stats,dist) importFrom(stats,fitted) importFrom(stats,fitted.values) importFrom(stats,hclust) diff --git a/NEWS.md b/NEWS.md index 066e0bc..4d9e81b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,14 +1,21 @@ -# scLANE 0.7.2 +# `scLANE` 0.7.3 + +* Added a function named `embedGenes()` that takes a smoothed counts matrix as input & returns PCA & UMAP embeddings along with a graph-based clustering. +* Updated the `clusterGenes()` function to be much more efficient as well as changing the distance metric used to be cosine dissimilarity. +* Added `theme_scLANE()` for output plots. +* Enhanced documentation. + +# `scLANE` 0.7.2 * Added a function named `sortGenesHeatmap()` that aids in the creation of expression cascade heatmaps by sorting genes according to where in pseudotime their peak expression is. * Changed the parameter `approx.knot` in the `testDynamic()` function to use (stochasticity-controlled) subsampling instead of `seq()` to reduce candidate knot space. -* Added `summarizeModels()` to sum up slopes across pseudotime intervals +* Added `summarizeModels()` to sum up slopes across pseudotime intervals. -# scLANE 0.7.1 +# `scLANE` 0.7.1 * Changed input format of all functions to allow counts matrices formatted as `SingleCellExperiment` or `Seurat` objects, sparse matrices, or dense matrices. * Updated visualization functions to reflect changes made in `ggplot2` v3.4 (mostly changing the `size` parameter in line-based geoms to be `linewidth` instead). -# scLANE 0.6.3 +# `scLANE` 0.6.3 * Added a `NEWS.md` file to track changes to the package. diff --git a/R/clusterGenes.R b/R/clusterGenes.R index 30a15e2..ff81b59 100644 --- a/R/clusterGenes.R +++ b/R/clusterGenes.R @@ -2,10 +2,10 @@ #' #' @name clusterGenes #' @author Jack Leary -#' @description This function takes as input the output from \code{\link{testDynamic}} and clusters the fitted values from the model for each gene using one of several user-chosen algorithms. An approximately optimal clustering is determined by iterating over reasonable hyperparameter values & choosing the value with the highest mean silhouette score. +#' @description This function takes as input the output from \code{\link{testDynamic}} and clusters the fitted values from the model for each gene using one of several user-chosen algorithms. An approximately optimal clustering is determined by iterating over reasonable hyperparameter values & choosing the value with the highest mean silhouette score based on the cosine distance. #' @import magrittr #' @importFrom purrr map discard map2 reduce -#' @importFrom stats setNames hclust cutree kmeans dist +#' @importFrom stats setNames hclust cutree kmeans as.dist #' @param test.dyn.res The list returned by \code{\link{testDynamic}} - no extra processing required. Defaults to NULL. #' @param pt A data.frame containing the pseudotime or latent time estimates for each cell. Defaults to NULL. #' @param size.factor.offset (Optional) An offset to be used to rescale the fitted values. Can be generated easily with \code{\link{createCellOffset}}. No need to provide if the GEE backend was used. Defaults to NULL. @@ -19,6 +19,7 @@ #' } #' @return A data.frame of with three columns: \code{Gene}, \code{Lineage}, and \code{Cluster}. #' @seealso \code{\link{testDynamic}} +#' @seealso \code{\link{embedGenes}} #' @seealso \code{\link{plotClusteredGenes}} #' @export #' @examples @@ -74,80 +75,65 @@ clusterGenes <- function(test.dyn.res = NULL, n = n.PC, center = TRUE, scale. = TRUE) + dist_matrix <- stats::as.dist(1 - coop::tcosine(x = fitted_vals_pca$x)) + } else { + dist_matrix <- stats::as.dist(1 - coop::tcosine(x = fitted_vals_mat)) } # hierarchical clustering routine w/ Ward's linkage if (clust.algo == "hclust") { - if (use.pca) { - hclust_tree <- stats::hclust(stats::dist(fitted_vals_pca$x), method = "ward.D2") - } else { - hclust_tree <- stats::hclust(stats::dist(fitted_vals_mat), method = "ward.D2") - } + hclust_tree <- stats::hclust(dist_matrix, method = "ward.D2") k_vals <- c(2:10) sil_vals <- vector("numeric", 9L) + clust_list <- vector("list", 9L) for (k in seq_along(k_vals)) { clust_res <- stats::cutree(hclust_tree, k = k_vals[k]) - if (use.pca) { - sil_res <- cluster::silhouette(clust_res, stats::dist(fitted_vals_pca$x)) - } else { - sil_res <- cluster::silhouette(clust_res, stats::dist(fitted_vals_mat)) - } - sil_vals[k] <- mean(sil_res[, 3]) # silhouette widths stored in third column + sil_res <- cluster::silhouette(clust_res, dist_matrix) + sil_vals[k] <- mean(sil_res[, 3]) + clust_list[[k]] <- clust_res } - k_to_use <- k_vals[which.max(sil_vals)] - clust_res <- stats::cutree(hclust_tree, k = k_to_use) gene_clusters <- data.frame(Gene = rownames(fitted_vals_mat), Lineage = lineages[l], - Cluster = clust_res) + Cluster = clust_list[[which.max(sil_vals)]]) # k-means clustering routine w/ Hartigan-Wong algorithm } else if (clust.algo == "kmeans") { k_vals <- c(2:10) sil_vals <- vector("numeric", 9L) + clust_list <- vector("list", 9L) for (k in seq_along(k_vals)) { if (use.pca) { clust_res <- stats::kmeans(fitted_vals_pca$x, centers = k_vals[k], nstart = 5, algorithm = "Hartigan-Wong") - sil_res <- cluster::silhouette(clust_res$cluster, stats::dist(fitted_vals_pca$x)) } else { clust_res <- stats::kmeans(fitted_vals_mat, centers = k_vals[k], nstart = 5, algorithm = "Hartigan-Wong") - sil_res <- cluster::silhouette(clust_res$cluster, stats::dist(fitted_vals_mat)) } - sil_vals[k] <- mean(sil_res[, 3]) # silhouette widths stored in third column - } - k_to_use <- k_vals[which.max(sil_vals)] - if (use.pca) { - clust_res <- stats::kmeans(fitted_vals_pca$x, - centers = k_to_use, - nstart = 10, - algorithm = "Hartigan-Wong") - } else { - clust_res <- stats::kmeans(fitted_vals_mat, - centers = k_to_use, - nstart = 10, - algorithm = "Hartigan-Wong") + clust_list[[k]] <- clust_res + sil_res <- cluster::silhouette(clust_res$cluster, dist_matrix) + sil_vals[k] <- mean(sil_res[, 3]) } gene_clusters <- data.frame(Gene = rownames(fitted_vals_mat), Lineage = lineages[l], - Cluster = clust_res$cluster) + Cluster = clust_list[[which.max(sil_vals)]]$cluster) # Leiden clustering routine } else if (clust.algo == "leiden") { if (use.pca) { fitted_vals_graph <- bluster::makeSNNGraph(x = fitted_vals_pca$x, - k = 30, + k = 20, type = "jaccard", - BNPARAM = BiocNeighbors::AnnoyParam(distance = "Euclidean")) + BNPARAM = BiocNeighbors::AnnoyParam(distance = "Cosine")) } else { fitted_vals_graph <- bluster::makeSNNGraph(x = fitted_vals_mat, - k = 30, + k = 20, type = "jaccard", - BNPARAM = BiocNeighbors::AnnoyParam(distance = "Euclidean")) + BNPARAM = BiocNeighbors::AnnoyParam(distance = "Cosine")) } res_vals <- seq(0.1, 1, by = 0.1) sil_vals <- vector("numeric", 10L) + clust_list <- vector("list", 10L) for (r in seq_along(res_vals)) { clust_res <- igraph::cluster_leiden(graph = fitted_vals_graph, objective_function = "modularity", @@ -155,21 +141,14 @@ clusterGenes <- function(test.dyn.res = NULL, if (clust_res$nb_clusters == 1) { sil_vals[r] <- 0 } else { - if (use.pca) { - sil_res <- cluster::silhouette(clust_res$membership, stats::dist(fitted_vals_pca$x)) - } else { - sil_res <- cluster::silhouette(clust_res$membership, stats::dist(fitted_vals_mat)) - } + sil_res <- cluster::silhouette(clust_res$membership, dist_matrix) sil_vals[r] <- mean(sil_res[, 3]) } + clust_list[[r]] <- clust_res$membership } - res_to_use <- res_vals[which.max(sil_vals)] - clust_res <- igraph::cluster_leiden(graph = fitted_vals_graph, - objective_function = "modularity", - resolution_parameter = res_to_use) gene_clusters <- data.frame(Gene = rownames(fitted_vals_mat), Lineage = lineages[l], - Cluster = clust_res$membership) + Cluster = clust_list[[which.max(sil_vals)]]) } gene_cluster_list[[l]] <- gene_clusters } diff --git a/R/embedGenes.R b/R/embedGenes.R new file mode 100644 index 0000000..f41b307 --- /dev/null +++ b/R/embedGenes.R @@ -0,0 +1,78 @@ +#' Generate a table of fitted values and celltype metadata for genes of interest. +#' +#' @name embedGenes +#' @author Jack Leary +#' @import magrittr +#' @importFrom purrr map map_dbl +#' @importFrom dplyr bind_cols +#' @importFrom stats as.dist +#' @description Embed genes in dimension-reduced space given a smoothed counts matrix. +#' @param smoothed.counts The output from \code{\link{smoothedCountsMatrix}}. Defaults to NULL. +#' @param genes A character vector of genes to embed. If not specified, all genes in \code{test.dyn.res} are used. Defaults to NULL. +#' @param pc.embed (Optional) How many PCs should be used to cluster the genes and run UMAP? Defaults to 30. +#' @param pcs.return (Optional) How many principal components should be included in the output? Defaults to 2. +#' @param cluster.genes (Optional) Should genes be clustered in PCA space using the Leiden algorithm? Defaults to TRUE. +#' @param gene.meta.data (Optional) A data.frame of metadata values for each gene (HVG status, Ensembl ID, gene biotype, etc.) that will be included in the result table. Defaults to NULL. +#' @param k.param (Optional) The value of nearest-neighbors used in creating the SNN graph prior to clustering & in running UMAP. Defaults to 20. +#' @param random.seed (Optional) The random seed used to control stochasticity in the clustering algorithm. Defaults to 312. +#' @return A data.frame containing embedding coordinates, cluster IDs, and metadata for each gene. +#' @export +#' @examples +#' \dontrun{ +#' embedGenes(smoothed_counts$Lineage_A, +#' pcs.return = 3, +#' cluster.genes = TRUE) +#' } + +embedGenes <- function(smoothed.counts = NULL, + genes = NULL, + pc.embed = 30, + pcs.return = 2, + cluster.genes = TRUE, + gene.meta.data = NULL, + k.param = 20, + random.seed = 312) { + # check inputs + if (is.null(smoothed.counts)) { stop("You forgot to provide a smoothed counts matrix to embedGenes().") } + # embeddings + set.seed(random.seed) + smoothed_counts_pca <- irlba::prcomp_irlba(t(smoothed.counts), + pc.embed = pc.embed, + center = TRUE, + scale. = TRUE) + smoothed_counts_umap <- uwot::umap(smoothed_counts_pca$x, + n_components = 2, + metric = "cosine", + n_neighbors = k.param, + init = "spectral") + # clustering w/ silhouette score parameter tuning + smoothed_counts_snn <- bluster::makeSNNGraph(smoothed_counts_pca$x, + k = k.param, + type = "jaccard", + BNPARAM = BiocNeighbors::AnnoyParam(distance = "Cosine")) + dist_matrix <- stats::as.dist(1 - coop::tcosine(x = smoothed_counts_pca$x)) + clust_runs <- purrr::map(c(0.1, 0.2, 0.3, 0.4, 0.5), \(r) { + smoothed_counts_clust <- igraph::cluster_leiden(smoothed_counts_snn, + objective_function = "modularity", + resolution_parameter = r) + if (smoothed_counts_clust$nb_clusters == 1) { + sil_val <- 0 + } else { + sil_val <- mean(cluster::silhouette(as.integer(smoothed_counts_clust$membership - 1L), dist_matrix)[, 3]) + } + clust_res <- list(clusters = as.factor(smoothed_counts_clust$membership - 1L), + resolution = r, + silhouette = sil_val) + return(clust_res) + }) + best_clustering <- which.max(purrr::map_dbl(clust_runs, \(x) x$silhouette)) + # prepare results + pca_df <- as.data.frame(smoothed_counts_pca$x[, seq(pcs.return)]) + colnames(pca_df) <- paste0("pc", seq(pcs.return)) + gene_clust_df <- data.frame(gene = colnames(smoothed.counts), + leiden = clust_runs[[best_clustering]]$clusters, + umap1 = smoothed_counts_umap[, 1], + umap2 = smoothed_counts_umap[, 2]) + gene_clust_df <- dplyr::bind_cols(gene_clust_df, pca_df) + return(gene_clust_df) +} diff --git a/man/clusterGenes.Rd b/man/clusterGenes.Rd index 0637e33..135c081 100644 --- a/man/clusterGenes.Rd +++ b/man/clusterGenes.Rd @@ -33,7 +33,7 @@ clusterGenes( A data.frame of with three columns: \code{Gene}, \code{Lineage}, and \code{Cluster}. } \description{ -This function takes as input the output from \code{\link{testDynamic}} and clusters the fitted values from the model for each gene using one of several user-chosen algorithms. An approximately optimal clustering is determined by iterating over reasonable hyperparameter values & choosing the value with the highest mean silhouette score. +This function takes as input the output from \code{\link{testDynamic}} and clusters the fitted values from the model for each gene using one of several user-chosen algorithms. An approximately optimal clustering is determined by iterating over reasonable hyperparameter values & choosing the value with the highest mean silhouette score based on the cosine distance. } \details{ \itemize{ @@ -58,6 +58,8 @@ clusterGenes(gene_stats, \seealso{ \code{\link{testDynamic}} +\code{\link{embedGenes}} + \code{\link{plotClusteredGenes}} } \author{ diff --git a/man/embedGenes.Rd b/man/embedGenes.Rd new file mode 100644 index 0000000..9743bf6 --- /dev/null +++ b/man/embedGenes.Rd @@ -0,0 +1,50 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/embedGenes.R +\name{embedGenes} +\alias{embedGenes} +\title{Generate a table of fitted values and celltype metadata for genes of interest.} +\usage{ +embedGenes( + smoothed.counts = NULL, + genes = NULL, + pc.embed = 30, + pcs.return = 2, + cluster.genes = TRUE, + gene.meta.data = NULL, + k.param = 20, + random.seed = 312 +) +} +\arguments{ +\item{smoothed.counts}{The output from \code{\link{smoothedCountsMatrix}}. Defaults to NULL.} + +\item{genes}{A character vector of genes to embed. If not specified, all genes in \code{test.dyn.res} are used. Defaults to NULL.} + +\item{pc.embed}{(Optional) How many PCs should be used to cluster the genes and run UMAP? Defaults to 30.} + +\item{pcs.return}{(Optional) How many principal components should be included in the output? Defaults to 2.} + +\item{cluster.genes}{(Optional) Should genes be clustered in PCA space using the Leiden algorithm? Defaults to TRUE.} + +\item{gene.meta.data}{(Optional) A data.frame of metadata values for each gene (HVG status, Ensembl ID, gene biotype, etc.) that will be included in the result table. Defaults to NULL.} + +\item{k.param}{(Optional) The value of nearest-neighbors used in creating the SNN graph prior to clustering & in running UMAP. Defaults to 20.} + +\item{random.seed}{(Optional) The random seed used to control stochasticity in the clustering algorithm. Defaults to 312.} +} +\value{ +A data.frame containing embedding coordinates, cluster IDs, and metadata for each gene. +} +\description{ +Embed genes in dimension-reduced space given a smoothed counts matrix. +} +\examples{ +\dontrun{ +embedGenes(smoothed_counts$Lineage_A, + pcs.return = 3, + cluster.genes = TRUE) +} +} +\author{ +Jack Leary +} diff --git a/tests/testthat/test_scLANE.R b/tests/testthat/test_scLANE.R index 24ab3db..1b57648 100644 --- a/tests/testthat/test_scLANE.R +++ b/tests/testthat/test_scLANE.R @@ -1,8 +1,8 @@ # load data & prepare for testing load(system.file("testdata/sim_test_data.RData", package = "scLANE")) cell_offset <- createCellOffset(sim_data) -genes_to_test <- c(rownames(sim_data)[SummarizedExperiment::rowData(sim_data)$geneStatus_overall == "Dynamic"][1:5], - rownames(sim_data)[SummarizedExperiment::rowData(sim_data)$geneStatus_overall == "NotDynamic"][1:5]) +genes_to_test <- c(rownames(sim_data)[SummarizedExperiment::rowData(sim_data)$geneStatus_overall == "Dynamic"][1:10], + rownames(sim_data)[SummarizedExperiment::rowData(sim_data)$geneStatus_overall == "NotDynamic"][1:10]) counts_test <- t(as.matrix(SingleCellExperiment::counts(sim_data)[genes_to_test, ])) pt_test <- data.frame(PT = sim_data$cell_time_normed) @@ -41,7 +41,7 @@ withr::with_output_sink(tempfile(), { cor.structure = "ar1", id.vec = sim_data$subject, n.cores = 2, - track.time = TRUE) + track.time = FALSE) glmm_gene_stats <- testDynamic(sim_data, pt = pt_test, genes = genes_to_test, @@ -149,12 +149,20 @@ withr::with_output_sink(tempfile(), { id.vec = sim_data$subject) # downstream analysis set.seed(312) - gene_clusters <- clusterGenes(test.dyn.res = glm_gene_stats, - pt = pt_test, - size.factor.offset = cell_offset, - clust.algo = "leiden") + gene_clusters_leiden <- clusterGenes(test.dyn.res = glm_gene_stats, + pt = pt_test, + size.factor.offset = cell_offset, + clust.algo = "leiden") + gene_clusters_kmeans <- clusterGenes(test.dyn.res = glm_gene_stats, + pt = pt_test, + size.factor.offset = cell_offset, + clust.algo = "kmeans") + gene_clusters_hclust <- clusterGenes(test.dyn.res = glm_gene_stats, + pt = pt_test, + size.factor.offset = cell_offset, + clust.algo = "hclust") gene_clust_table <- plotClusteredGenes(glm_gene_stats, - gene.clusters = gene_clusters, + gene.clusters = gene_clusters_leiden, size.factor.offset = cell_offset, pt = pt_test, n.cores = 2) @@ -163,6 +171,11 @@ withr::with_output_sink(tempfile(), { size.factor.offset = cell_offset, parallel.exec = TRUE, n.cores = 2) + gene_embedding <- embedGenes(smoothed.counts = smoothed_counts$Lineage_A, + pc.embed = 5, + pcs.return = 2, + k.param = 5, + random.seed = 312) sorted_genes <- sortGenesHeatmap(heatmap.mat = smoothed_counts$Lineage_A, pt.vec = pt_test$PT) fitted_values_table <- getFittedValues(test.dyn.res = glm_gene_stats, @@ -201,9 +214,9 @@ test_that("testDynamic() output", { expect_s3_class(glm_gene_stats, "scLANE") expect_s3_class(gee_gene_stats, "scLANE") expect_s3_class(glmm_gene_stats, "scLANE") - expect_length(glm_gene_stats, 10) - expect_length(gee_gene_stats, 10) - expect_length(glmm_gene_stats, 10) + expect_length(glm_gene_stats, 20) + expect_length(gee_gene_stats, 20) + expect_length(glmm_gene_stats, 20) expect_s3_class(glm_gene_stats$ABCF1$Lineage_A$MARGE_Summary, "data.frame") expect_s3_class(gee_gene_stats$ABCF1$Lineage_A$MARGE_Summary, "data.frame") expect_s3_class(glmm_gene_stats$ABCF1$Lineage_A$MARGE_Summary, "data.frame") @@ -293,8 +306,12 @@ test_that("plotModels() output", { }) test_that("clusterGenes() output", { - expect_s3_class(gene_clusters, "data.frame") - expect_equal(ncol(gene_clusters), 3) + expect_s3_class(gene_clusters_leiden, "data.frame") + expect_s3_class(gene_clusters_kmeans, "data.frame") + expect_s3_class(gene_clusters_hclust, "data.frame") + expect_equal(ncol(gene_clusters_leiden), 3) + expect_equal(ncol(gene_clusters_kmeans), 3) + expect_equal(ncol(gene_clusters_hclust), 3) }) test_that("plotClusteredGenes() output", { @@ -308,6 +325,11 @@ test_that("smoothedCountsMatrix() output", { expect_type(smoothed_counts$Lineage_A, "double") }) +test_that("embedGenes() output", { + expect_s3_class(gene_embedding, "data.frame") + expect_equal(ncol(gene_embedding), 6) +}) + test_that("sortGenesHeatmap() output", { expect_type(sorted_genes, "character") expect_length(sorted_genes, ncol(smoothed_counts$Lineage_A))