diff --git a/.Rbuildignore b/.Rbuildignore index 91114bf..3912071 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -1,2 +1,3 @@ ^.*\.Rproj$ ^\.Rproj\.user$ +^\.github$ diff --git a/.github/workflows/render-README.yaml b/.github/workflows/render-README.yaml new file mode 100644 index 0000000..7d373b9 --- /dev/null +++ b/.github/workflows/render-README.yaml @@ -0,0 +1,36 @@ +# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples +# Additional help taken from https://fromthebottomoftheheap.net/2020/04/30/rendering-your-readme-with-github-actions/ + +on: + push: + branches: [main, master] + paths: README.Rmd + +name: render-README + +jobs: + render: + runs-on: ubuntu-latest + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + steps: + - uses: actions/checkout@v3 + - uses: r-lib/actions/setup-r@v2 + - uses: r-lib/actions/setup-r-dependencies@v2 + - uses: r-lib/actions/setup-pandoc@v2 + + - name: install CRAN packages + run: Rscript -e 'install.packages(c("rmarkdown","ggplot2", "dplyr", "purrr", "remotes"))' + - name: install BioConductor packages + run: Rscript -e 'install.packages("BiocManager"); BiocManager::install(c("SingleCellExperiment", "scater", "scran", "scaffold"))' + - name: install GitHub packages + run: Rscript -e 'remotes::install_github("jr-leary7/scLANE")' + + - name: render README + run: Rscript -e 'rmarkdown::render("README.Rmd", output_format = "md_document")' + + - name: commit rendered README + run: | + git add README.md man/figures/README-* + git commit -m "Recompiled README.Rmd" || echo "No changes to commit" + git push origin main || echo "No changes to commit" diff --git a/DESCRIPTION b/DESCRIPTION index 97f5d3e..62b4cde 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -9,7 +9,7 @@ Description: This package uses truncated power basis spline models to build flex Downstream analysis functionalities include model comparison, dynamic gene clustering, smoothed counts generation, gene set enrichment testing, & visualization. License: MIT + file LICENSE Encoding: UTF-8 -LazyData: true +LazyData: false RoxygenNote: 7.2.1 Depends: glm2, @@ -70,6 +70,7 @@ LinkingTo: biocViews: RNASeq, Software, + Clustering, TimeCourse, Sequencing, Regression, @@ -77,4 +78,5 @@ biocViews: Visualization, GeneExpression, Transcriptomics, + GeneSetEnrichment, DifferentialExpression diff --git a/R/GetResultsDE.R b/R/GetResultsDE.R index c23265b..ab0e3ed 100644 --- a/R/GetResultsDE.R +++ b/R/GetResultsDE.R @@ -5,7 +5,7 @@ #' @description This function turns the nested list differential expression results of \code{\link{testDynamic}} and turns them into a tidy \code{data.frame}. #' @import magrittr #' @importFrom purrr map_dfr -#' @importFrom dplyr arrange mutate across if_else with_groups relocate +#' @importFrom dplyr arrange desc mutate across if_else with_groups relocate #' @importFrom tidyselect everything #' @importFrom stats p.adjust p.adjust.methods #' @param test.dyn.res The nested list returned by \code{\link{testDynamic}}. Defaults to NULL. @@ -38,7 +38,7 @@ getResultsDE <- function(test.dyn.res = NULL, dplyr::mutate(dplyr::across(tidyselect::everything(), \(z) unname(unlist(z)))) }) }) %>% - dplyr::arrange(P_Val, Test_Stat) %>% + dplyr::arrange(P_Val, dplyr::desc(Test_Stat)) %>% dplyr::mutate(P_Val_Adj = stats::p.adjust(P_Val, method = p.adj.method), Gene_Dynamic_Lineage = dplyr::if_else(P_Val_Adj < fdr.cutoff, 1, 0, missing = 0)) %>% dplyr::with_groups(Gene, diff --git a/R/ModelLRT.R b/R/ModelLRT.R index 8ad21a9..dbf5aa9 100644 --- a/R/ModelLRT.R +++ b/R/ModelLRT.R @@ -29,45 +29,43 @@ modelLRT <- function(mod.1 = NULL, P_Val = NA, Model_Type = NA, Notes = "No test performed due to model failure.") - return(res) - } - - mod.1 <- mod.1$final_mod - - if (is.null(mod.1) || is.null(mod.0)) { stop("You must provide two models to compare.") } - LRT_notes <- NA_character_ - # compute LRT - if (is.glmm) { - if (!(inherits(mod.1, "glmmTMB") && inherits(mod.0, "glmmTMB"))) { stop("Models must be of class glmmTMB.") } - mod.1_ll <- -mod.1$fit$objective - mod.0_ll <- -mod.0$fit$objective - lrt_stat <- 2 * (mod.1_ll - mod.0_ll) - if (is.na(logLik(mod.1))) { - if (!mod.1$sdr$pdHess) { - LRT_notes <- "Non-positive definite Hessian in alternate GLMM, probably because of shallow log-likelihood." - } else { - LRT_notes <- "Log-likelihood for alternate GLMM is very shallow / flat, exercise caution." + } else { + mod.1 <- mod.1$final_mod + if (is.null(mod.1) || is.null(mod.0)) { stop("You must provide two models to compare.") } + LRT_notes <- NA_character_ + # compute LRT + if (is.glmm) { + if (!(inherits(mod.1, "glmmTMB") && inherits(mod.0, "glmmTMB"))) { stop("Models must be of class glmmTMB.") } + mod.1_ll <- -mod.1$fit$objective + mod.0_ll <- -mod.0$fit$objective + lrt_stat <- 2 * (mod.1_ll - mod.0_ll) + if (is.na(logLik(mod.1))) { + if (!mod.1$sdr$pdHess) { + LRT_notes <- "Non-positive definite Hessian in alternate GLMM, probably because of shallow log-likelihood." + } else { + LRT_notes <- "Log-likelihood for alternate GLMM is very shallow / flat, exercise caution." + } } + } else { + if (!(inherits(mod.1, "glm") || inherits(mod.1, "lm"))) { stop("Models must be of class lm or glm.") } + mod.1_ll <- stats::logLik(mod.1) + mod.0_ll <- stats::logLik(mod.0) + lrt_stat <- as.numeric(2 * (mod.1_ll - mod.0_ll)) } - } else { - if (!(inherits(mod.1, "glm") || inherits(mod.1, "lm"))) { stop("Models must be of class lm or glm.") } - mod.1_ll <- stats::logLik(mod.1) - mod.0_ll <- stats::logLik(mod.0) - lrt_stat <- as.numeric(2 * (mod.1_ll - mod.0_ll)) - } - dgr_free <- attributes(stats::logLik(mod.1))$df - attributes(stats::logLik(mod.0))$df - if (dgr_free == 0) { - dgr_free <- 1 + dgr_free <- attributes(stats::logLik(mod.1))$df - attributes(stats::logLik(mod.0))$df + if (dgr_free == 0) { + dgr_free <- 1 + } + p_val <- stats::pchisq(lrt_stat, + dgr_free, + lower = FALSE) + res <- list(Alt_Mod_LL = as.numeric(mod.1_ll), + Null_Mod_LL = as.numeric(mod.0_ll), + LRT_Stat = lrt_stat, + DF = dgr_free, + P_Val = p_val, + Model_Type = ifelse(is.glmm, "GLMM", "GLM"), + Notes = LRT_notes) } - p_val <- stats::pchisq(lrt_stat, - dgr_free, - lower = FALSE) - res <- list(Alt_Mod_LL = as.numeric(mod.1_ll), - Null_Mod_LL = as.numeric(mod.0_ll), - LRT_Stat = lrt_stat, - DF = dgr_free, - P_Val = p_val, - Model_Type = ifelse(is.glmm, "GLMM", "GLM"), - Notes = LRT_notes) return(res) } diff --git a/R/clusterGenes.R b/R/clusterGenes.R index ff81b59..368ed58 100644 --- a/R/clusterGenes.R +++ b/R/clusterGenes.R @@ -50,7 +50,7 @@ clusterGenes <- function(test.dyn.res = NULL, if (!clust.algo %in% c("hclust", "kmeans", "leiden")) { stop("clust.algo must be one of 'hclust', 'kmeans', or 'leiden'.") } if ((use.pca & is.null(n.PC)) || (use.pca & n.PC <= 0)) { stop("n.PC must be a non-zero integer when clustering on principal components.") } if (is.null(lineages)) { - lineages <- LETTERS[1:length(test.dyn.res[[1]])] + lineages <- LETTERS[seq(length(test.dyn.res[[1]]))] } gene_cluster_list <- vector("list", length = length(lineages)) for (l in seq_along(lineages)) { @@ -82,7 +82,7 @@ clusterGenes <- function(test.dyn.res = NULL, # hierarchical clustering routine w/ Ward's linkage if (clust.algo == "hclust") { hclust_tree <- stats::hclust(dist_matrix, method = "ward.D2") - k_vals <- c(2:10) + k_vals <- seq(2, 10) sil_vals <- vector("numeric", 9L) clust_list <- vector("list", 9L) for (k in seq_along(k_vals)) { @@ -96,7 +96,7 @@ clusterGenes <- function(test.dyn.res = NULL, Cluster = clust_list[[which.max(sil_vals)]]) # k-means clustering routine w/ Hartigan-Wong algorithm } else if (clust.algo == "kmeans") { - k_vals <- c(2:10) + k_vals <- seq(2, 10) sil_vals <- vector("numeric", 9L) clust_list <- vector("list", 9L) for (k in seq_along(k_vals)) { diff --git a/R/embedGenes.R b/R/embedGenes.R index b027a44..56e0752 100644 --- a/R/embedGenes.R +++ b/R/embedGenes.R @@ -14,6 +14,7 @@ #' @param cluster.genes (Optional) Should genes be clustered in PCA space using the Leiden algorithm? Defaults to TRUE. #' @param gene.meta.data (Optional) A data.frame of metadata values for each gene (HVG status, Ensembl ID, gene biotype, etc.) that will be included in the result table. Defaults to NULL. #' @param k.param (Optional) The value of nearest-neighbors used in creating the SNN graph prior to clustering & in running UMAP. Defaults to 20. +#' @param resolution.param (Optional) The value of the resolution parameter for the Leiden algorithm. If unspecified, silhouette scoring is used to select an optimal value. Defaults to NULL. #' @param random.seed (Optional) The random seed used to control stochasticity in the clustering algorithm. Defaults to 312. #' @return A data.frame containing embedding coordinates, cluster IDs, and metadata for each gene. #' @export @@ -31,6 +32,7 @@ embedGenes <- function(smoothed.counts = NULL, cluster.genes = TRUE, gene.meta.data = NULL, k.param = 20, + resolution.param = NULL, random.seed = 312) { # check inputs if (is.null(smoothed.counts)) { stop("You forgot to provide a smoothed counts matrix to embedGenes().") } @@ -53,23 +55,30 @@ embedGenes <- function(smoothed.counts = NULL, k = k.param, type = "jaccard", BNPARAM = BiocNeighbors::AnnoyParam(distance = "Cosine")) - dist_matrix <- stats::as.dist(1 - coop::tcosine(x = smoothed_counts_pca$x)) - clust_runs <- purrr::map(c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7), \(r) { + if (is.null(resolution.param)) { + dist_matrix <- stats::as.dist(1 - coop::tcosine(x = smoothed_counts_pca$x)) + clust_runs <- purrr::map(c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7), \(r) { + smoothed_counts_clust <- igraph::cluster_leiden(smoothed_counts_snn, + objective_function = "modularity", + resolution_parameter = r) + if (smoothed_counts_clust$nb_clusters == 1) { + sil_val <- 0 + } else { + sil_val <- mean(cluster::silhouette(as.integer(smoothed_counts_clust$membership - 1L), dist_matrix)[, 3]) + } + clust_res <- list(clusters = as.factor(smoothed_counts_clust$membership - 1L), + resolution = r, + silhouette = sil_val) + return(clust_res) + }) + best_clustering <- which.max(purrr::map_dbl(clust_runs, \(x) x$silhouette)) + cluster_vec <- clust_runs[[best_clustering]]$clusters + } else { smoothed_counts_clust <- igraph::cluster_leiden(smoothed_counts_snn, objective_function = "modularity", - resolution_parameter = r) - if (smoothed_counts_clust$nb_clusters == 1) { - sil_val <- 0 - } else { - sil_val <- mean(cluster::silhouette(as.integer(smoothed_counts_clust$membership - 1L), dist_matrix)[, 3]) - } - clust_res <- list(clusters = as.factor(smoothed_counts_clust$membership - 1L), - resolution = r, - silhouette = sil_val) - return(clust_res) - }) - best_clustering <- which.max(purrr::map_dbl(clust_runs, \(x) x$silhouette)) - cluster_vec <- clust_runs[[best_clustering]]$clusters + resolution_parameter = resolution.param) + cluster_vec <- as.factor(smoothed_counts_clust$membership - 1L) + } } else { cluster_vec <- NA_integer_ } diff --git a/R/pull_sumy.R b/R/pull_sumy.R index dba3ed1..ef7db14 100644 --- a/R/pull_sumy.R +++ b/R/pull_sumy.R @@ -1,129 +1,140 @@ pull.null.sumy <- function(mod.obj, is.gee, is.glmm) { - - if (inherits(mod.obj, "try-error")) {return(list(null_sumy_df=NA, null_pred_df=NA, null_ll=NA_real_, null_dev=NA_real_))} - - if (is.gee) { - null_sumy_df <- try({ - null_gee_summary <- summary(mod.obj) - data.frame(term = null_gee_summary$coefnames, - estimate = unname(null_gee_summary$beta), - `std.error` = unname(null_gee_summary$se.robust), - statistic = unname(null_gee_summary$wald.test), - `p.value` = unname(null_gee_summary$p)) - }, silent = TRUE) - null_pred_df <- try({ - robust_vcov_mat <- as.matrix(mod.obj$var) - data.frame(null_link_fit = predict(mod.obj), - null_link_se = sqrt(apply((tcrossprod(mod.obj$X, robust_vcov_mat)) * mod.obj$X, 1, sum))) - }, silent = TRUE) - } else if (is.glmm) { - null_sumy_df <- try({ - null_glmm_summary <- as.data.frame(broom.mixed::tidy(mod.obj, effects = "fixed")) - data.frame(term = null_glmm_summary$term, - estimate = null_glmm_summary$estimate[1], - `std.error` = null_glmm_summary$std.error[1], - statistic = null_glmm_summary$statistic[1], - `p.value` = null_glmm_summary$p.value[1]) - }, silent = TRUE) - null_pred_df <- try({ - data.frame(predict(mod.obj, type = "link", se.fit = TRUE)[1:2]) %>% - dplyr::rename(null_link_fit = fit, null_link_se = se.fit) - }, silent = TRUE) - } else { - null_sumy_df <- try({ - as.data.frame(broom.mixed::tidy(mod.obj)) # saves a few bytes by converting from tibble - }, silent = TRUE) - null_pred_df <- try({ - data.frame(stats::predict(mod.obj, type = "link", se.fit = TRUE)[1:2]) %>% - dplyr::rename(null_link_fit = fit, null_link_se = se.fit) - }, silent = TRUE) + # check inputs + if (inherits(mod.obj, "try-error")) { + res <- list(null_sumy_df = NA, + null_pred_df = NA, + null_ll = NA_real_, + null_dev = NA_real_) + } else { + # pull model summary + if (is.gee) { + null_sumy_df <- try({ + null_gee_summary <- summary(mod.obj) + data.frame(term = null_gee_summary$coefnames, + estimate = unname(null_gee_summary$beta), + `std.error` = unname(null_gee_summary$se.robust), + statistic = unname(null_gee_summary$wald.test), + `p.value` = unname(null_gee_summary$p)) + }, silent = TRUE) + null_pred_df <- try({ + robust_vcov_mat <- as.matrix(mod.obj$var) + data.frame(null_link_fit = predict(mod.obj), + null_link_se = sqrt(apply((tcrossprod(mod.obj$X, robust_vcov_mat)) * mod.obj$X, 1, sum))) + }, silent = TRUE) + } else if (is.glmm) { + null_sumy_df <- try({ + null_glmm_summary <- as.data.frame(broom.mixed::tidy(mod.obj, effects = "fixed")) + data.frame(term = null_glmm_summary$term, + estimate = null_glmm_summary$estimate[1], + `std.error` = null_glmm_summary$std.error[1], + statistic = null_glmm_summary$statistic[1], + `p.value` = null_glmm_summary$p.value[1]) + }, silent = TRUE) + null_pred_df <- try({ + data.frame(predict(mod.obj, type = "link", se.fit = TRUE)[1:2]) %>% + dplyr::rename(null_link_fit = fit, null_link_se = se.fit) + }, silent = TRUE) + } else { + null_sumy_df <- try({ + as.data.frame(broom.mixed::tidy(mod.obj)) # saves a few bytes by converting from tibble + }, silent = TRUE) + null_pred_df <- try({ + data.frame(stats::predict(mod.obj, type = "link", se.fit = TRUE)[1:2]) %>% + dplyr::rename(null_link_fit = fit, null_link_se = se.fit) + }, silent = TRUE) + } + + null_ll <- ifelse(is.gee, NA_real_, as.numeric(stats::logLik(mod.obj))) + null_dev <- ifelse((is.gee || is.glmm), NA_real_, stats::deviance(mod.obj)) + + res <- list(null_sumy_df = null_sumy_df, + null_pred_df = null_pred_df, + null_ll = null_ll, + null_dev = null_dev) } - - null_ll <- ifelse(is.gee, NA_real_, as.numeric(stats::logLik(mod.obj))) - null_dev <- ifelse((is.gee || is.glmm), NA_real_, stats::deviance(mod.obj)) - - return(list(null_sumy_df=null_sumy_df, null_pred_df=null_pred_df, null_ll=null_ll, null_dev=null_dev)) - + return(res) } -pull.marge.sumy <- function(mod.obj, is.gee, is.glmm) { - +pull.marge.sumy <- function(mod.obj, + is.gee, + is.glmm) { + # check inputs if (inherits(mod.obj, "try-error")) { - - return(list(marge_pred_df=NA, marge_sumy_df=NA, - ll_marge=NA_real_, marge_fit_notes=NA_character_)) - } - - - if (is.gee) { - robust_vcov_mat <- as.matrix(mod.obj$final_mod$var) - marge_pred_df <- try({ - data.frame(marge_link_fit = predict(mod.obj$final_mod), - marge_link_se = sqrt(apply((tcrossprod(mod.obj$final_mod$X, robust_vcov_mat)) * mod.obj$final_mod$X, 1, sum))) - }, silent = TRUE) - marge_sumy_df <- try({ - marge_gee_summary <- summary(mod.obj$final_mod) - data.frame(term = marge_gee_summary$coefnames, - estimate = unname(marge_gee_summary$beta), - `std.error` = unname(marge_gee_summary$se.robust), - statistic = unname(marge_gee_summary$wald.test), - `p.value` = unname(marge_gee_summary$p)) - }, silent = TRUE) - - } else if (is.glmm) { - marge_pred_df <- try({ - data.frame(predict(mod.obj$final_mod, type = "link", se.fit = TRUE)[1:2]) %>% - dplyr::rename(marge_link_fit = fit, marge_link_se = se.fit) - }, silent = TRUE) - marge_sumy_df <- try({ - marge_glmm_summary <- broom.mixed::tidy(mod.obj$final_mod, effects = "fixed") - data.frame(term = mod.obj$marge_coef_names, # use marge-style hinge function names instead of X1, X2, ..., XP (makes interpretability easier) - estimate = marge_glmm_summary$estimate, - `std.error` = marge_glmm_summary$std.error, - statistic = marge_glmm_summary$statistic, - `p.value` = marge_glmm_summary$p.value) - }, silent = TRUE) - + res <- list(marge_pred_df = NA, + marge_sumy_df = NA, + ll_marge = NA_real_, + marge_fit_notes = NA_character_) } else { - marge_pred_df <- try({ - data.frame(stats::predict(mod.obj$final_mod, type = "link", se.fit = TRUE)[1:2]) %>% - dplyr::rename(marge_link_fit = fit, marge_link_se = se.fit) - }, silent = TRUE) - marge_sumy_df <- try({ - as.data.frame(broom.mixed::tidy(mod.obj$final_mod)) %>% - lapply(unname) %>% - as.data.frame() - }, silent = TRUE) + # pull model summary + if (is.gee) { + robust_vcov_mat <- as.matrix(mod.obj$final_mod$var) + marge_pred_df <- try({ + data.frame(marge_link_fit = predict(mod.obj$final_mod), + marge_link_se = sqrt(apply((tcrossprod(mod.obj$final_mod$X, robust_vcov_mat)) * mod.obj$final_mod$X, 1, sum))) + }, silent = TRUE) + marge_sumy_df <- try({ + marge_gee_summary <- summary(mod.obj$final_mod) + data.frame(term = marge_gee_summary$coefnames, + estimate = unname(marge_gee_summary$beta), + `std.error` = unname(marge_gee_summary$se.robust), + statistic = unname(marge_gee_summary$wald.test), + `p.value` = unname(marge_gee_summary$p)) + }, silent = TRUE) + + } else if (is.glmm) { + marge_pred_df <- try({ + data.frame(predict(mod.obj$final_mod, type = "link", se.fit = TRUE)[1:2]) %>% + dplyr::rename(marge_link_fit = fit, marge_link_se = se.fit) + }, silent = TRUE) + marge_sumy_df <- try({ + marge_glmm_summary <- broom.mixed::tidy(mod.obj$final_mod, effects = "fixed") + data.frame(term = mod.obj$marge_coef_names, # use marge-style hinge function names instead of X1, X2, ..., XP (makes interpretability easier) + estimate = marge_glmm_summary$estimate, + `std.error` = marge_glmm_summary$std.error, + statistic = marge_glmm_summary$statistic, + `p.value` = marge_glmm_summary$p.value) + }, silent = TRUE) + + } else { + marge_pred_df <- try({ + data.frame(stats::predict(mod.obj$final_mod, type = "link", se.fit = TRUE)[1:2]) %>% + dplyr::rename(marge_link_fit = fit, marge_link_se = se.fit) + }, silent = TRUE) + marge_sumy_df <- try({ + as.data.frame(broom.mixed::tidy(mod.obj$final_mod)) %>% + lapply(unname) %>% + as.data.frame() + }, silent = TRUE) + } + + # get log-likelihood for GLMM or GLM cases + if (is.glmm) { + ll_marge <- -mod.obj$final_mod$fit$objective + } else if (!(is.gee || is.glmm)) { + ll_marge <- as.numeric(stats::logLik(mod.obj$final_mod)) + } else { + ll_marge <- NA_real_ + } + # check positive-definiteness of Hessian for GLMM -- might have an effect on LRT stat / accompanying p-value + if (is.glmm) { + if (!mod.obj$final_mod$sdr$pdHess) { + marge_fit_notes <- "Non-positive definite Hessian in GLMM, probably due to shallow log-likelihood. Be careful!" + } else { + marge_fit_notes <- NA_character_ + } + } else { + marge_fit_notes <- NA_character_ + } + marge_dev <- ifelse((is.gee || is.glmm), NA_real_, stats::deviance(mod.obj$final_mod)) + res <- list(marge_pred_df = marge_pred_df, + marge_sumy_df = marge_sumy_df, + ll_marge = ll_marge, + marge_dev = marge_dev, + marge_fit_notes = marge_fit_notes) } - - # get log-likelihood for GLMM or GLM cases - if (is.glmm) { - ll_marge <- -mod.obj$final_mod$fit$objective - } else if (!(is.gee || is.glmm)) { - ll_marge <- as.numeric(stats::logLik(mod.obj$final_mod)) - } else { - ll_marge <- NA_real_ - } - # check positive-definiteness of Hessian for GLMM -- might have an effect on LRT stat / accompanying p-value - if (is.glmm) { - if (!mod.obj$final_mod$sdr$pdHess) { - marge_fit_notes <- "Non-positive definite Hessian in GLMM, probably due to shallow log-likelihood. Be careful!" - } else { - marge_fit_notes <- NA_character_ - } - } else { - marge_fit_notes <- NA_character_ - } - - -marge_dev <- ifelse((is.gee || is.glmm), NA_real_, stats::deviance(mod.obj$final_mod)) - - - return(list(marge_pred_df=marge_pred_df, marge_sumy_df=marge_sumy_df, - ll_marge=ll_marge, marge_dev=marge_dev, marge_fit_notes=marge_fit_notes)) - -} \ No newline at end of file + return(res) +} diff --git a/R/testDynamic.R b/R/testDynamic.R index 1111db5..4a8b762 100644 --- a/R/testDynamic.R +++ b/R/testDynamic.R @@ -140,11 +140,10 @@ testDynamic <- function(expr.mat = NULL, if (parallel.exec) { cl <- parallel::makeCluster(n.cores) doParallel::registerDoParallel(cl) - parallel::clusterSetRNGStream(cl, iseed = random.seed) } else { cl <- foreach::registerDoSEQ() - set.seed(random.seed) } + parallel::clusterSetRNGStream(cl, iseed = random.seed) # convert dense counts matrix to file-backed matrix expr.mat <- bigstatsr::as_FBM(expr.mat, @@ -152,7 +151,7 @@ testDynamic <- function(expr.mat = NULL, is_read_only = TRUE) # build list of objects to prevent from being sent to parallel workers - necessary_vars <- c("expr.mat", "genes", "pt", "n.potential.basis.fns", "approx.knot", "is.glmm", "print_nums", + necessary_vars <- c("expr.mat", "genes", "pt", "n.potential.basis.fns", "approx.knot", "is.glmm", "n_lineages", "id.vec", "cor.structure", "is.gee", "glmm.adaptive", "size.factor.offset") if (any(ls(envir = .GlobalEnv) %in% necessary_vars)) { no_export <- c(ls(envir = .GlobalEnv)[-which(ls(envir = .GlobalEnv) %in% necessary_vars)], @@ -343,14 +342,10 @@ testDynamic <- function(expr.mat = NULL, # end parallelization & clean up each worker node withr::with_output_sink(tempfile(), { - if (parallel.exec) { - parallel::clusterEvalQ(cl, expr = { - gc(verbose = FALSE, full = TRUE) - }) - parallel::stopCluster(cl) - } - rm(cl) - gc(verbose = FALSE, full = TRUE) + parallel::clusterEvalQ(cl, expr = { + gc(verbose = FALSE, full = TRUE) + }) + parallel::stopCluster(cl) }) # clean up errors w/ proper formatting names(test_stats) <- genes @@ -389,16 +384,17 @@ testDynamic <- function(expr.mat = NULL, total_time <- end_time - start_time total_time_units <- attributes(total_time)$units total_time_numeric <- as.numeric(total_time) - print(paste0("testDynamic evaluated ", - length(genes), - " genes across ", - n_lineages, - " ", - ifelse(n_lineages == 1, "lineage ", "lineages "), - "in ", - round(total_time_numeric, 3), - " ", - total_time_units)) + time_message <- paste0("testDynamic evaluated ", + length(genes), + " genes across ", + n_lineages, + " ", + ifelse(n_lineages == 1, "lineage ", "lineages "), + "in ", + round(total_time_numeric, 3), + " ", + total_time_units) + message(time_message) } class(test_stats) <- "scLANE" return(test_stats) diff --git a/R/testSlope.R b/R/testSlope.R index 1a0f811..0cb5d32 100644 --- a/R/testSlope.R +++ b/R/testSlope.R @@ -5,7 +5,7 @@ #' @description This function tests whether each gene's estimated \eqn{\beta} for pseudotime differs significantly from 0 over each empirically estimated sets of knots / pseudotime interval using a Wald test. #' @import magrittr #' @importFrom purrr map_dfr -#' @importFrom dplyr arrange mutate if_else with_groups +#' @importFrom dplyr arrange desc mutate if_else with_groups #' @importFrom stats p.adjust #' @param test.dyn.res The list returned by \code{\link{testDynamic}} - no extra processing required. Defaults to NULL. #' @param p.adj.method The method used to adjust the \emph{p}-values for each slope. Defaults to "holm". @@ -29,7 +29,7 @@ testSlope <- function(test.dyn.res = NULL, if (is.null(test.dyn.res)) { stop("You forgot to provide results from testDynamic() to testSlope().") } # create table of results slope_df <- purrr::map_dfr(test.dyn.res, \(x) purrr::map_dfr(x, \(y) data.frame(y["MARGE_Slope_Data"][[1]]))) %>% - dplyr::arrange(P_Val) %>% + dplyr::arrange(P_Val, dplyr::desc(Test_Stat)) %>% dplyr::mutate(P_Val_Adj = stats::p.adjust(P_Val, method = p.adj.method)) %>% dplyr::arrange(Gene, Breakpoint) %>% dplyr::mutate(Gene_Dynamic_Lineage_Slope = dplyr::if_else(P_Val_Adj < fdr.cutoff, 1, 0, missing = 0)) %>% diff --git a/R/theme_scLANE.R b/R/theme_scLANE.R index 5642b34..9908a11 100644 --- a/R/theme_scLANE.R +++ b/R/theme_scLANE.R @@ -7,6 +7,7 @@ #' @param base.size The base font size. Defaults to 12. #' @param base.lwd The base linewidth. Defaults to 0.75. #' @param base.family The font family to be used throughout. Defaults to "sans". +#' @param umap (Optional) If set to TRUE, removes axis text and ticks for a cleaner look. Defaults to FALSE. #' @return A \code{ggplot2} theme. #' @export #' @examples @@ -20,11 +21,18 @@ theme_scLANE <- function(base.size = 12, base.lwd = 0.75, - base.family = "sans") { - ggplot2::theme_classic(base_size = base.size, - base_family = base.family, - base_line_size = base.lwd, - base_rect_size = base.lwd) + - ggplot2::theme(strip.clip = "off", - strip.background = ggplot2::element_rect(linewidth = base.lwd)) + base.family = "sans", + umap = FALSE) { + scLANE_theme <- ggplot2::theme_classic(base_size = base.size, + base_family = base.family, + base_line_size = base.lwd, + base_rect_size = base.lwd) + + ggplot2::theme(strip.clip = "off", + strip.background = ggplot2::element_rect(linewidth = base.lwd)) + if (umap) { + scLANE_theme <- scLANE_theme + + ggplot2::theme(axis.ticks = ggplot2::element_blank(), + axis.text = ggplot2::element_blank()) + } + return(scLANE_theme) } diff --git a/README.Rmd b/README.Rmd index b4a4d0a..e8bdbb5 100644 --- a/README.Rmd +++ b/README.Rmd @@ -40,7 +40,7 @@ You can install the most recent version of `scLANE` with: remotes::install_github("jr-leary7/scLANE") ``` -## Model Structure +## Model structure The `scLANE` package enables users to accurately determine differential expression of genes over pseudotime or latent time, and to characterize gene's dynamics using interpretable model coefficients. `scLANE` builds upon the [`marge` modeling framework](https://github.com/JakubStats/marge), allowing users to characterize their trajectory's effects on mRNA expression using negative binomial [GLMs](https://en.wikipedia.org/wiki/Generalized_linear_model), [GEEs](https://en.wikipedia.org/wiki/Generalized_estimating_equation), or [GLMMs](https://en.wikipedia.org/wiki/Generalized_linear_mixed_model), depending on the experimental design & biological questions of interest. This modeling framework is an extension of the well-known [Multivariate Adapative Regression Splines (MARS)](https://en.wikipedia.org/wiki/Multivariate_adaptive_regression_spline) method, which uses truncated power basis splines to build nonlinear models using a [generalized cross validation (GCV) criterion](https://doi.org/10.48550/arXiv.1706.02495). @@ -50,7 +50,7 @@ A quickstart guide on how to use `scLANE` with simulated data continues below, a Our method relies on a relatively simple test in order to define whether a given gene is differentially expressed (or "dynamic") over the provided trajectory. While the exact structure of the test differs by model backend, the concept is the same: the spline-based NB GLM / GEE / GLMM is treated as the alternate model, and a null model is fit using the corresponding model backend. If the GLM backend is used, then the null model is simply an intercept-only NB GLM; the GEE backend fits an intercept-only model with the same working correlation structure as the alternate model, and if the GLMM backend is used then the null model is an intercept-only model with random intercepts for each subject. The alternate hypothesis is thus that at least one of the estimated coefficients is significantly different from zero. We predict a given gene to be dynamic if the adjusted *p*-value of the test is less than an *a priori* threshold; the default threshold is 0.01, and the default adjustment method is [the Holm correction](https://en.wikipedia.org/wiki/Holm–Bonferroni_method). -## Input Data +## Input data `scLANE` has a few basic inputs with standardized formats, though different modeling frameworks require different inputs; for example, the GEE & GLMM backends require a vector of subject IDs corresponding to each cell. We'll start by simulating some scRNA-seq counts data with a subset of genes being dynamic over a trajectory using the [`scaffold` R package](https://github.com/rhondabacher/scaffold). As part of our simulation studies I wrote the function sourced below to generate multi-subject scRNA-seq datasets with trajectories partially conserved between subjects. @@ -100,7 +100,7 @@ plotPCA(sim_data, colour_by = "subject") + theme_scLANE() ``` -## Trajectory DE Testing +## Trajectory DE testing Since we have multi-subject data, we can use any of the three model backends to run our DE testing. We'll start with the simplest model, the GLM, then work our way through the other options in order of increasing complexity. We first prepare our inputs - a dataframe containing our cell ordering, a set of genes to build models for, and a vector of per-cell size factors to be used as offsets during estimation. In reality, it's usually unnecessary to fit a model for every single gene in a dataset, as trajectories are usually estimated using a subset of the entire set of genes (usually a few thousand most highly variable genes). For the purpose of demonstration, we'll select 50 genes each from the dynamic and non-dynamic populations. **Note**: in this case we're working with a single pseudotime lineage, though in real datasets several lineages often exist; in order to fit models for a subset of lineages simply remove the corresponding columns from the cell ordering dataframe passed as input to `testDynamic()`. @@ -112,7 +112,7 @@ order_df <- data.frame(X = sim_data$cell_time_normed) cell_offset <- createCellOffset(sim_data) ``` -### GLM Backend +### GLM framework Running `testDynamic()` provides us with a nested list containing model output & DE test results for each gene over each pseudotime / latent time lineage. In this case, since we have a true cell ordering we only have one lineage. Parallel processing is turned on by default, and we use 2 cores here to speed up runtime. @@ -149,7 +149,7 @@ caret::confusionMatrix(factor(scLANE_res_glm$Gene_Dynamic_Overall, levels = c(0, positive = "1") ``` -### GEE Backend +### GEE framework The function call is essentially the same when using the GLM backend, with the exception of needing to provide a sorted vector of subject IDs & a desired correlation structure, the default being [the AR1 structure](https://en.wikipedia.org/wiki/Autoregressive_model). We also need to flip the `is.gee` flag in order to indicate that we'd like to fit estimating equations models (instead of mixed models). Since fitting GEEs is a fair bit more computationally complex than fitting GLMs, DE testing with the GEE backend takes a bit longer. Using more cores and / or running the tests on an HPC cluster speeds things up considerably. Here we specify an [AR1 correlation structure](https://rdrr.io/cran/nlme/man/corAR1.html), which is the default for the GEE backend. @@ -187,7 +187,7 @@ caret::confusionMatrix(factor(scLANE_res_gee$Gene_Dynamic_Overall, levels = c(0, positive = "1") ``` -### GLMM Backend +### GLMM framework We re-run the DE tests a final time using the GLMM backend. This is the most complex model architecture we support, and is the trickiest to interpret. We recommend using it when you're most interested in how a trajectory differs between subjects e.g., if the subjects belong to groups like Treatment & Control, and you expect the Treatment group to experience a different progression through the biological process. Executing the function with the GLMM backend differs only in that we switch the `is.glmm` flag to `TRUE` and no longer need to specify a working correlation structure. **Note**: the GLMM backend is still under development, as we are working on further reducing runtime and increasing the odds of the underlying optimization process converging successfully. As such, updates will be frequent and functionality / results may shift slightly. @@ -226,9 +226,9 @@ caret::confusionMatrix(factor(scLANE_res_glmm$Gene_Dynamic_Overall, levels = c(0 positive = "1") ``` -## Downstream Analysis & Visualization +## Downstream analysis & visualization -### Model Comparison +### Model comparison We can use the `plotModels()` to visually compare different types of modeling backends. It takes as input the results from `testDynamic()`, as well as a few specifications for which models & lineages should be plotted. While more complex visualizations can be created from our model output, this function gives us a good first glance at which models fit the underlying trend the best. Here we show the output generated using the GLM backend, split by model type. The intercept-only model shows the null hypothesis against which the scLANE model is compared using the likelihood ratio test and the GLM displays the inadequacy of monotonic modeling architectures for nonlinear dynamics. A GAM shows essentially the same trend as the `scLANE` model, though the fitted trend from `scLANE` is more interpretable & has a narrower confidence interval. @@ -260,7 +260,7 @@ plotModels(scLANE_models_glmm, plot.scLANE = TRUE) ``` -### Gene Clustering +### Gene clustering After generating a suitable set of models, we can cluster the genes in a semi-supervised fashion using `clusterGenes()`. This function uses the Leiden algorithm (the default), hierarchical clustering, or *k*-means and selects the best set of clustering hyperparameters using the silhouette score. If desired PCA can be run prior to clustering, but the default is to cluster on the fitted values themselves. We then use the results as input to `plotClusteredGenes()`, which generates a table of fitted values per-gene, per-lineage over pseudotime along with the accompanying cluster labels. @@ -280,7 +280,7 @@ slice_sample(gene_clust_table, n = 5) %>% col.names = c("Gene", "Lineage", "Cell", "Fitted (link)", "Fitted (response)", "Pseudotime", "Cluster")) ``` -The results can then be plotted as desired using `ggplot2` or another visualization package. Upon visual inspection, the genes seem to cluster based on whether they are more dynamic or more static over pseudotime. +The results can then be plotted as desired using `ggplot2` or another visualization package. ```{r plot-clust} ggplot(gene_clust_table, aes(x = PT, y = FITTED, color = CLUSTER, group = GENE)) + @@ -294,7 +294,7 @@ ggplot(gene_clust_table, aes(x = PT, y = FITTED, color = CLUSTER, group = GENE)) theme_scLANE() ``` -## Gene-level Embeddings +### Gene embeddings After extracting a matrix of fitted dynamics using `smoothedCountsMatrix()`, we can embed the genes in PCA & UMAP space in order to visualize clusters of similarly-behaving genes. @@ -310,10 +310,10 @@ ggplot(gene_embedding, aes(x = umap1, y = umap2, color = leiden)) + labs(x = "UMAP 1", y = "UMAP 2", color = "Leiden") + - theme_scLANE() + theme_scLANE(umap = TRUE) ``` -## Knot distributions +## Knot distribution Lastly, we can pull the locations in pseudotime of all the knots fitted by `scLANE`. Visualizing this distribution gives us some idea of where transcriptional switches are occurring in the set of genes classified as dynamic. @@ -334,13 +334,13 @@ ggplot(knot_dist, aes(x = knot)) + theme_scLANE() ``` -# Conclusions & Best Practices +# Conclusions & best practices In general, starting with the GLM backend is probably your best bet unless you have a strong prior belief that expression trends will differ significantly between subjects. If that is the case, you should use the GEE backend if you're interested in population-level estimates, but are worried about wrongly predicting differential expression when differences in expression are actually caused by inter-subject variation. If you're interested in generating subject-specific estimates then the GLMM backend should be used; take care when interpreting the fixed vs. random effects though, and consult a biostatistician if necessary. If you have a large dataset (10,000+ cells), you should start with the GLM backend, since standard error estimates don't differ much between modeling methods given high enough *n*. In addition, running the tests on an HPC cluster with 4+ CPUs and 64+ GB of RAM will help your computations to complete swiftly. Datasets with smaller numbers of cells or fewer genes of interest may be easily analyzed in an R session on a local machine. -# Contact Information +# Contact information This package is developed & maintained by Jack Leary. Feel free to reach out by [opening an issue](https://github.com/jr-leary7/scLANE/issues) or by email (j.leary@ufl.edu) if more detailed assistance is needed. diff --git a/README.md b/README.md index c9dc864..9abb6ac 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,15 @@ - [`scLANE`](#sclane) - [Installation](#installation) - - [Model Structure](#model-structure) + - [Model structure](#model-structure) - [Usage](#usage) - - [Input Data](#input-data) - - [Trajectory DE Testing](#trajectory-de-testing) - - [Downstream Analysis & - Visualization](#downstream-analysis--visualization) - - [Gene-level Embeddings](#gene-level-embeddings) - - [Knot distributions](#knot-distributions) -- [Conclusions & Best Practices](#conclusions--best-practices) -- [Contact Information](#contact-information) + - [Input data](#input-data) + - [Trajectory DE testing](#trajectory-de-testing) + - [Downstream analysis & + visualization](#downstream-analysis--visualization) + - [Knot distribution](#knot-distribution) +- [Conclusions & best practices](#conclusions--best-practices) +- [Contact information](#contact-information) - [References](#references) @@ -37,7 +36,7 @@ You can install the most recent version of `scLANE` with: remotes::install_github("jr-leary7/scLANE") ``` -## Model Structure +## Model structure The `scLANE` package enables users to accurately determine differential expression of genes over pseudotime or latent time, and to characterize @@ -82,7 +81,7 @@ gene to be dynamic if the adjusted *p*-value of the test is less than an adjustment method is [the Holm correction](https://en.wikipedia.org/wiki/Holm–Bonferroni_method). -## Input Data +## Input data `scLANE` has a few basic inputs with standardized formats, though different modeling frameworks require different inputs; for example, the @@ -156,7 +155,7 @@ plotPCA(sim_data, colour_by = "subject") + -## Trajectory DE Testing +## Trajectory DE testing Since we have multi-subject data, we can use any of the three model backends to run our DE testing. We’ll start with the simplest model, the @@ -182,7 +181,7 @@ order_df <- data.frame(X = sim_data$cell_time_normed) cell_offset <- createCellOffset(sim_data) ``` -### GLM Backend +### GLM framework Running `testDynamic()` provides us with a nested list containing model output & DE test results for each gene over each pseudotime / latent @@ -197,7 +196,6 @@ scLANE_models_glm <- testDynamic(sim_data, size.factor.offset = cell_offset, n.cores = 4, track.time = TRUE) -#> [1] "testDynamic evaluated 100 genes across 1 lineage in 20.105 secs" ``` After the function finishes running, we use `getResultsDE()` to generate @@ -222,7 +220,7 @@ select(scLANE_res_glm, Gene, Lineage, Test_Stat, P_Val, P_Val_Adj, Gene_Dynamic_ | WAPAL | A | 361.360 | 0 | 0 | 1 | | JARID2 | A | 353.612 | 0 | 0 | 1 | | ERGIC3 | A | 294.186 | 0 | 0 | 1 | -| PFDN2 | A | 259.121 | 0 | 0 | 1 | +| PFDN2 | A | 258.867 | 0 | 0 | 1 | After creating a reference table of the ground truth status of each gene - `1` denotes a dynamic gene and `0` a non-dynamic one - and adding @@ -268,7 +266,7 @@ caret::confusionMatrix(factor(scLANE_res_glm$Gene_Dynamic_Overall, levels = c(0, #> ``` -### GEE Backend +### GEE framework The function call is essentially the same when using the GLM backend, with the exception of needing to provide a sorted vector of subject IDs @@ -293,7 +291,6 @@ scLANE_models_gee <- testDynamic(sim_data, cor.structure = "ar1", n.cores = 4, track.time = TRUE) -#> [1] "testDynamic evaluated 100 genes across 1 lineage in 2.115 mins" ``` We again generate the table of DE test results. The variance of the @@ -310,13 +307,13 @@ select(scLANE_res_gee, Gene, Lineage, Test_Stat, P_Val, P_Val_Adj, Gene_Dynamic_ col.names = c("Gene", "Lineage", "Wald stat.", "P-value", "Adj. p-value", "Predicted dynamic status")) ``` -| Gene | Lineage | Wald stat. | P-value | Adj. p-value | Predicted dynamic status | -|:-------|:--------|-----------:|--------:|-------------:|-------------------------:| -| TIMP1 | A | 74.959 | 0 | 0 | 1 | -| FLOT2 | A | 77.081 | 0 | 0 | 1 | -| CBX6 | A | 90.526 | 0 | 0 | 1 | -| TSPAN1 | A | 109.564 | 0 | 0 | 1 | -| GGNBP2 | A | 117.029 | 0 | 0 | 1 | +| Gene | Lineage | Wald stat. | P-value | Adj. p-value | Predicted dynamic status | +|:-------|:--------|------------:|--------:|-------------:|-------------------------:| +| CKAP4 | A | 4172613.197 | 0 | 0 | 1 | +| DGUOK | A | 64351.893 | 0 | 0 | 1 | +| EMC3 | A | 22164.408 | 0 | 0 | 1 | +| ERGIC3 | A | 16466.403 | 0 | 0 | 1 | +| RAB1B | A | 5678.834 | 0 | 0 | 1 | We create the same confusion matrix as before. Empirically speaking, when the underlying dynamics don’t differ much between subjects, GEEs @@ -335,32 +332,32 @@ caret::confusionMatrix(factor(scLANE_res_gee$Gene_Dynamic_Overall, levels = c(0, #> #> Reference #> Prediction 0 1 -#> 0 47 16 -#> 1 3 34 +#> 0 47 15 +#> 1 3 35 #> -#> Accuracy : 0.81 -#> 95% CI : (0.7193, 0.8816) +#> Accuracy : 0.82 +#> 95% CI : (0.7305, 0.8897) #> No Information Rate : 0.5 -#> P-Value [Acc > NIR] : 1.351e-10 +#> P-Value [Acc > NIR] : 3.074e-11 #> -#> Kappa : 0.62 +#> Kappa : 0.64 #> -#> Mcnemar's Test P-Value : 0.005905 +#> Mcnemar's Test P-Value : 0.009522 #> -#> Sensitivity : 0.6800 +#> Sensitivity : 0.7000 #> Specificity : 0.9400 -#> Pos Pred Value : 0.9189 -#> Neg Pred Value : 0.7460 +#> Pos Pred Value : 0.9211 +#> Neg Pred Value : 0.7581 #> Prevalence : 0.5000 -#> Detection Rate : 0.3400 -#> Detection Prevalence : 0.3700 -#> Balanced Accuracy : 0.8100 +#> Detection Rate : 0.3500 +#> Detection Prevalence : 0.3800 +#> Balanced Accuracy : 0.8200 #> #> 'Positive' Class : 1 #> ``` -### GLMM Backend +### GLMM framework We re-run the DE tests a final time using the GLMM backend. This is the most complex model architecture we support, and is the trickiest to @@ -387,7 +384,6 @@ scLANE_models_glmm <- testDynamic(sim_data, id.vec = sim_data$subject, n.cores = 4, track.time = TRUE) -#> [1] "testDynamic evaluated 100 genes across 1 lineage in 2.009 mins" ``` Like the GLM backend, the GLMM backend uses a likelihood ratio test to @@ -408,7 +404,7 @@ select(scLANE_res_glmm, Gene, Lineage, Test_Stat, P_Val, P_Val_Adj, Gene_Dynamic | Gene | Lineage | LRT stat. | P-value | Adj. p-value | Predicted dynamic status | |:-------|:--------|----------:|--------:|-------------:|-------------------------:| -| WAPAL | A | 366.148 | 0 | 0 | 1 | +| WAPAL | A | 366.154 | 0 | 0 | 1 | | JARID2 | A | 334.714 | 0 | 0 | 1 | | FLOT2 | A | 314.979 | 0 | 0 | 1 | | ISOC2 | A | 223.028 | 0 | 0 | 1 | @@ -459,9 +455,9 @@ caret::confusionMatrix(factor(scLANE_res_glmm$Gene_Dynamic_Overall, levels = c(0 #> ``` -## Downstream Analysis & Visualization +## Downstream analysis & visualization -### Model Comparison +### Model comparison We can use the `plotModels()` to visually compare different types of modeling backends. It takes as input the results from `testDynamic()`, @@ -509,7 +505,7 @@ plotModels(scLANE_models_glmm, -### Gene Clustering +### Gene clustering After generating a suitable set of models, we can cluster the genes in a semi-supervised fashion using `clusterGenes()`. This function uses the @@ -537,17 +533,16 @@ slice_sample(gene_clust_table, n = 5) %>% col.names = c("Gene", "Lineage", "Cell", "Fitted (link)", "Fitted (response)", "Pseudotime", "Cluster")) ``` -| Gene | Lineage | Cell | Fitted (link) | Fitted (response) | Pseudotime | Cluster | -|:------|:--------|:-----|--------------:|------------------:|-----------:|:--------| -| CPA3 | A | 915 | -2.317 | 0.117 | 0.288 | 1 | -| FLOT2 | A | 722 | -0.518 | 0.587 | 0.805 | 1 | -| MPG | A | 603 | -2.988 | 0.053 | 0.507 | 1 | -| RPL9 | A | 156 | 0.336 | 6.309 | 0.390 | 2 | -| GRINA | A | 1162 | 0.199 | 1.554 | 0.905 | 2 | +| Gene | Lineage | Cell | Fitted (link) | Fitted (response) | Pseudotime | Cluster | +|:--------|:--------|:-----|--------------:|------------------:|-----------:|:--------| +| RPL23 | A | 905 | 2.991 | 18.571 | 0.262 | 1 | +| ARHGEF9 | A | 974 | -1.098 | 0.353 | 0.435 | 1 | +| BAD | A | 8 | 0.411 | 1.054 | 0.020 | 1 | +| ARHGEF9 | A | 417 | -2.104 | 0.413 | 0.043 | 1 | +| SF3B5 | A | 535 | 0.498 | 2.513 | 0.338 | 1 | The results can then be plotted as desired using `ggplot2` or another -visualization package. Upon visual inspection, the genes seem to cluster -based on whether they are more dynamic or more static over pseudotime. +visualization package. ``` r ggplot(gene_clust_table, aes(x = PT, y = FITTED, color = CLUSTER, group = GENE)) + @@ -563,7 +558,7 @@ ggplot(gene_clust_table, aes(x = PT, y = FITTED, color = CLUSTER, group = GENE)) -## Gene-level Embeddings +### Gene embeddings After extracting a matrix of fitted dynamics using `smoothedCountsMatrix()`, we can embed the genes in PCA & UMAP space in @@ -581,12 +576,12 @@ ggplot(gene_embedding, aes(x = umap1, y = umap2, color = leiden)) + labs(x = "UMAP 1", y = "UMAP 2", color = "Leiden") + - theme_scLANE() + theme_scLANE(umap = TRUE) ``` -## Knot distributions +## Knot distribution Lastly, we can pull the locations in pseudotime of all the knots fitted by `scLANE`. Visualizing this distribution gives us some idea of where @@ -612,7 +607,7 @@ ggplot(knot_dist, aes(x = knot)) + -# Conclusions & Best Practices +# Conclusions & best practices In general, starting with the GLM backend is probably your best bet unless you have a strong prior belief that expression trends will differ @@ -633,7 +628,7 @@ computations to complete swiftly. Datasets with smaller numbers of cells or fewer genes of interest may be easily analyzed in an R session on a local machine. -# Contact Information +# Contact information This package is developed & maintained by Jack Leary. Feel free to reach out by [opening an issue](https://github.com/jr-leary7/scLANE/issues) or diff --git a/man/embedGenes.Rd b/man/embedGenes.Rd index 757cb28..4371623 100644 --- a/man/embedGenes.Rd +++ b/man/embedGenes.Rd @@ -12,6 +12,7 @@ embedGenes( cluster.genes = TRUE, gene.meta.data = NULL, k.param = 20, + resolution.param = NULL, random.seed = 312 ) } @@ -30,6 +31,8 @@ embedGenes( \item{k.param}{(Optional) The value of nearest-neighbors used in creating the SNN graph prior to clustering & in running UMAP. Defaults to 20.} +\item{resolution.param}{(Optional) The value of the resolution parameter for the Leiden algorithm. If unspecified, silhouette scoring is used to select an optimal value. Defaults to NULL.} + \item{random.seed}{(Optional) The random seed used to control stochasticity in the clustering algorithm. Defaults to 312.} } \value{ diff --git a/man/figures/README-plot-clust-1.png b/man/figures/README-plot-clust-1.png index 723072a..4eae6f2 100644 Binary files a/man/figures/README-plot-clust-1.png and b/man/figures/README-plot-clust-1.png differ diff --git a/man/figures/README-plot-models-glmm-1.png b/man/figures/README-plot-models-glmm-1.png index 434ccf9..8f572f2 100644 Binary files a/man/figures/README-plot-models-glmm-1.png and b/man/figures/README-plot-models-glmm-1.png differ diff --git a/man/figures/README-unnamed-chunk-1-1.png b/man/figures/README-unnamed-chunk-1-1.png index 3a15e1f..643e748 100644 Binary files a/man/figures/README-unnamed-chunk-1-1.png and b/man/figures/README-unnamed-chunk-1-1.png differ diff --git a/man/figures/README-unnamed-chunk-2-1.png b/man/figures/README-unnamed-chunk-2-1.png index ef24270..6add6f2 100644 Binary files a/man/figures/README-unnamed-chunk-2-1.png and b/man/figures/README-unnamed-chunk-2-1.png differ diff --git a/man/theme_scLANE.Rd b/man/theme_scLANE.Rd index 796067f..9e4b53e 100644 --- a/man/theme_scLANE.Rd +++ b/man/theme_scLANE.Rd @@ -4,7 +4,12 @@ \alias{theme_scLANE} \title{A \code{ggplot2} theme for \code{scLANE}.} \usage{ -theme_scLANE(base.size = 12, base.lwd = 0.75, base.family = "sans") +theme_scLANE( + base.size = 12, + base.lwd = 0.75, + base.family = "sans", + umap = FALSE +) } \arguments{ \item{base.size}{The base font size. Defaults to 12.} @@ -12,6 +17,8 @@ theme_scLANE(base.size = 12, base.lwd = 0.75, base.family = "sans") \item{base.lwd}{The base linewidth. Defaults to 0.75.} \item{base.family}{The font family to be used throughout. Defaults to "sans".} + +\item{umap}{(Optional) If set to TRUE, removes axis text and ticks for a cleaner look. Defaults to FALSE.} } \value{ A \code{ggplot2} theme.