diff --git a/.Rbuildignore b/.Rbuildignore index 3912071..b46f55f 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -1,3 +1,5 @@ ^.*\.Rproj$ ^\.Rproj\.user$ ^\.github$ +^doc$ +^Meta$ diff --git a/.gitignore b/.gitignore index 970da18..48fd75c 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,5 @@ .Rbuildignore inst/doc codecov.yml +/doc/ +/Meta/ diff --git a/DESCRIPTION b/DESCRIPTION index 62b4cde..aec8dd7 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: scLANE Type: Package Title: Model gene expression dynamics with spline-based NB GLMs, GEEs, & GLMMs -Version: 0.7.4 +Version: 0.7.5 Authors@R: c(person(given = "Jack", family = "Leary", email = "j.leary@ufl.edu", role = c("aut", "cre"), comment = c(ORCID = "0009-0004-8821-3269")), person(given = "Rhonda", family = "Bacher", email = "rbacher@ufl.edu", role = c("ctb", "fnd"), comment = c(ORCID = "0000-0001-5787-476X"))) Description: This package uses truncated power basis spline models to build flexible, interpretable models of single cell gene expression over pseudotime or latent time. @@ -55,6 +55,7 @@ Suggests: bluster, cluster, rmarkdown, + BiocStyle, slingshot, gprofiler2, BiocGenerics, diff --git a/NAMESPACE b/NAMESPACE index 0981b7d..894e167 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -122,6 +122,7 @@ importFrom(stats,predict) importFrom(stats,qnorm) importFrom(stats,quantile) importFrom(stats,setNames) +importFrom(stats,vcov) importFrom(tidyr,pivot_longer) importFrom(tidyselect,everything) importFrom(utils,tail) diff --git a/NEWS.md b/NEWS.md index 2bf0437..8bc93eb 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,14 @@ -# `scLANE` 0.7.3 +# `scLANE` v0.7.5 + +* Preparing for Bioconductor submission, i.e., reformatting code, adding documentation, etc. +* Added the convolution function `npConvolve()` to be used for, e.g., heatmap smoothing. + +# `scLANE` v0.7.4 + +* Added the `getKnotDist()` function to pull the set of empirically-identified knots for a user-provided gene set. +* Minor enhancements & documentation improvements. + +# `scLANE` v0.7.3 * Added a function named `embedGenes()` that takes a smoothed counts matrix as input & returns PCA & UMAP embeddings along with a graph-based clustering. * Updated the `clusterGenes()` function to be much more efficient as well as changing the distance metric used to be cosine dissimilarity. @@ -6,17 +16,17 @@ * Enhanced documentation. * Increased test coverage. -# `scLANE` 0.7.2 +# `scLANE` v0.7.2 * Added a function named `sortGenesHeatmap()` that aids in the creation of expression cascade heatmaps by sorting genes according to where in pseudotime their peak expression is. * Changed the parameter `approx.knot` in the `testDynamic()` function to use (stochasticity-controlled) subsampling instead of `seq()` to reduce candidate knot space. * Added `summarizeModels()` to sum up slopes across pseudotime intervals. -# `scLANE` 0.7.1 +# `scLANE` v0.7.1 * Changed input format of all functions to allow counts matrices formatted as `SingleCellExperiment` or `Seurat` objects, sparse matrices, or dense matrices. * Updated visualization functions to reflect changes made in `ggplot2` v3.4 (mostly changing the `size` parameter in line-based geoms to be `linewidth` instead). -# `scLANE` 0.6.3 +# `scLANE` v0.6.3 * Added a `NEWS.md` file to track changes to the package.
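The `npConvolve()` changelog entry above is easiest to picture with a small example. The sketch below uses only base R to show the kind of kernel smoothing the entry refers to (a centered moving average over pseudotime-ordered expression values); it deliberately does not call `npConvolve()` itself, whose exact argument names should be checked against its help page.

```r
# Illustrative sketch of convolution-based smoothing (base R only); the packaged
# helper scLANE::npConvolve() targets this kind of task, but its exact arguments
# are not assumed here.
set.seed(312)
expr <- rnbinom(100, mu = exp(seq(0, 2, length.out = 100)), size = 2)  # counts ordered by pseudotime
kernel <- rep(1 / 5, 5)                                                # uniform 5-cell kernel
smoothed <- stats::filter(expr, filter = kernel, sides = 2)            # centered moving average
head(cbind(raw = expr, smoothed = as.numeric(smoothed)), n = 7)        # edge cells are NA without padding
```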
diff --git a/R/GetResultsDE.R b/R/GetResultsDE.R index b9c8c78..5b0ac01 100644 --- a/R/GetResultsDE.R +++ b/R/GetResultsDE.R @@ -34,7 +34,7 @@ getResultsDE <- function(test.dyn.res = NULL, function(x) { purrr::map_dfr(x, function(y) { - as.data.frame(rbind(y[1:12])) %>% + as.data.frame(rbind(y[seq(12)])) %>% dplyr::mutate(dplyr::across(tidyselect::everything(), \(z) unname(unlist(z)))) }) }) %>% diff --git a/R/embedGenes.R b/R/embedGenes.R index bae37a9..7bb7eed 100644 --- a/R/embedGenes.R +++ b/R/embedGenes.R @@ -41,7 +41,6 @@ embedGenes <- function(smoothed.counts = NULL, genes <- colnames(smoothed.counts) smoothed.counts <- t(smoothed.counts) # embeddings - set.seed(random.seed) smoothed_counts_pca <- irlba::prcomp_irlba(smoothed.counts, n = pc.embed, center = TRUE, @@ -52,14 +51,16 @@ embedGenes <- function(smoothed.counts = NULL, metric = "cosine", n_neighbors = k.param, init = "spectral", - nn_method = "annoy") + nn_method = "annoy", + seed = random.seed) } else { smoothed_counts_umap <- uwot::umap(smoothed.counts, n_components = 2, metric = "cosine", n_neighbors = k.param, init = "spectral", - nn_method = "annoy") + nn_method = "annoy", + seed = random.seed) } # clustering w/ silhouette score parameter tuning if (cluster.genes) { diff --git a/R/getFittedValues.R b/R/getFittedValues.R index fa8f636..77375bd 100644 --- a/R/getFittedValues.R +++ b/R/getFittedValues.R @@ -116,16 +116,16 @@ getFittedValues <- function(test.dyn.res = NULL, }) %>% purrr::reduce(rbind) if (!is.null(id.vec)) { - mod_df$id <- id.vec[!is.na(x)] + mod_df <- dplyr::mutate(mod_df, subj_id = id.vec[!is.na(x)]) } else { - mod_df$id <- NA_character_ + mod_df <- dplyr::mutate(mod_df, subj_id = NA_character_) } return(mod_df) }) final_df <- purrr::reduce(mod_df_list, rbind) %>% - dplyr::relocate(cell, id, lineage) - if (all(is.na(final_df$id))) { - final_df <- dplyr::select(final_df, -id) + dplyr::relocate(cell, subj_id, lineage) + if (is.null(id.vec)) { + final_df <- dplyr::select(final_df, -subj_id) } if (!is.null(filter.lineage)) { final_df <- dplyr::filter(final_df, !lineage %in% filter.lineage) diff --git a/R/marge2.R b/R/marge2.R index 773c348..73dfedc 100644 --- a/R/marge2.R +++ b/R/marge2.R @@ -411,7 +411,7 @@ marge2 <- function(X_pred = NULL, if (utils::tail(max(score_knot_one_int_mat, na.rm = TRUE), n = 1) > utils::tail(max(score_knot_one_add_mat, na.rm = TRUE), n = 1)) { int <- TRUE score_knot <- score_knot_one_int_mat - temp <- utils::tail(which(utils::tail(max(round(score_knot, 6), na.rm = T), n = 1) == round(score_knot, 6), arr.ind = TRUE), n = 1) + temp <- utils::tail(which(utils::tail(max(round(score_knot, 6), na.rm = TRUE), n = 1) == round(score_knot, 6), arr.ind = TRUE), n = 1) min_knot1 <- temp[1] best.var <- temp[2] } else { @@ -784,7 +784,7 @@ marge2 <- function(X_pred = NULL, wic_mat_2[1, (ncol_B + 1)] <- full.wic wic1_2 <- backward_sel_WIC(Y = Y, B_new = B_new) wic_mat_2[2, 2:(length(wic1_2) + 1)] <- wic1_2 - WIC_2 <- sum(apply(wic_mat_2[1:2, ], 1, min, na.rm = TRUE)) + 2 * ncol(B_new) + WIC_2 <- sum(apply(wic_mat_2[seq(2), ], 1, min, na.rm = TRUE)) + 2 * ncol(B_new) WIC_vec_2 <- c(WIC_vec_2, WIC_2) variable.lowest_2 <- as.numeric(which(wic1_2 == min(wic1_2, na.rm = TRUE))[1]) @@ -795,14 +795,14 @@ marge2 <- function(X_pred = NULL, wic1_2 <- backward_sel_WIC(Y = Y, B_new = B_new_2) if (i != (ncol_B - 1)) { wic_mat_2[(i + 1), colnames(B_new_2)[-1]] <- wic1_2 - WIC_2 <- sum(apply(wic_mat_2[1:(i + 1), ], 1, min, na.rm = TRUE)) + 2 * ncol(B_new_2) + WIC_2 <- sum(apply(wic_mat_2[seq(i + 1), ], 1, 
min, na.rm = TRUE)) + 2 * ncol(B_new_2) WIC_vec_2 <- c(WIC_vec_2, WIC_2) variable.lowest_2 <- as.numeric(which(wic1_2 == min(wic1_2, na.rm = TRUE))[1]) var.low.vec_2 <- c(var.low.vec_2, colnames(B_new_2)[variable.lowest_2 + 1]) B_new_2 <- as.matrix(B_new_2[, -(variable.lowest_2 + 1)]) } else { wic_mat_2[(i + 1), colnames(B_new_2)[-1]] <- wic1_2 - WIC_2 <- sum(apply(wic_mat_2[1:(ncol_B), ], 1, min, na.rm = TRUE)) + 2 * ncol(B_new_2) + WIC_2 <- sum(apply(wic_mat_2[seq(ncol_B), ], 1, min, na.rm = TRUE)) + 2 * ncol(B_new_2) WIC_vec_2 <- c(WIC_vec_2, WIC_2) B_new_2 <- as.matrix(B_new_2[, -(variable.lowest_2)]) colnames(B_new_2) <- "Intercept" diff --git a/R/max_span.R b/R/max_span.R index 742b3b0..39dacb9 100644 --- a/R/max_span.R +++ b/R/max_span.R @@ -17,7 +17,7 @@ max_span <- function(X_red = NULL, N <- length(unique(X_red)) x <- sort(unique(X_red)) maxspan <- round((3 - log2(alpha / q))) - x_new <- x[-c(1:maxspan, floor(N - maxspan + 1):N)] + x_new <- x[-c(seq(maxspan), floor(N - maxspan + 1):N)] if (length(x_new) == 0) { x_new <- x } diff --git a/R/plotModels.R b/R/plotModels.R index 7b234ff..4e01a1f 100644 --- a/R/plotModels.R +++ b/R/plotModels.R @@ -154,7 +154,7 @@ plotModels <- function(test.dyn.res = NULL, se = TRUE, REML = FALSE) } - glm_preds <- data.frame(predict(glm_mod, type = "link", se.fit = TRUE)[1:2]) + glm_preds <- data.frame(predict(glm_mod, type = "link", se.fit = TRUE)[seq(2)]) } else { glm_formula <- "COUNT ~ PT" if (!is.null(size.factor.offset)) { @@ -168,7 +168,7 @@ plotModels <- function(test.dyn.res = NULL, method = "glm.fit2", link = log, init.theta = 1) - glm_preds <- data.frame(stats::predict(glm_mod, type = "link", se.fit = TRUE)[1:2]) + glm_preds <- data.frame(stats::predict(glm_mod, type = "link", se.fit = TRUE)[seq(2)]) } x <- dplyr::mutate(x, RESP_GLM = glm_preds$fit, @@ -202,7 +202,7 @@ plotModels <- function(test.dyn.res = NULL, pt = x$PT) } } - gam_preds <- data.frame(predict(gam_mod, type = "link", se.fit = TRUE)[1:2]) + gam_preds <- data.frame(predict(gam_mod, type = "link", se.fit = TRUE)[seq(2)]) x <- dplyr::mutate(x, RESP_GAM = gam_preds$fit, SE_GAM = gam_preds$se.fit) diff --git a/R/pull_sumy.R b/R/pull_sumy.R index ef7db14..f1a8706 100644 --- a/R/pull_sumy.R +++ b/R/pull_sumy.R @@ -31,7 +31,7 @@ pull.null.sumy <- function(mod.obj, is.gee, is.glmm) { `p.value` = null_glmm_summary$p.value[1]) }, silent = TRUE) null_pred_df <- try({ - data.frame(predict(mod.obj, type = "link", se.fit = TRUE)[1:2]) %>% + data.frame(predict(mod.obj, type = "link", se.fit = TRUE)[seq(2)]) %>% dplyr::rename(null_link_fit = fit, null_link_se = se.fit) }, silent = TRUE) } else { @@ -39,7 +39,7 @@ pull.null.sumy <- function(mod.obj, is.gee, is.glmm) { as.data.frame(broom.mixed::tidy(mod.obj)) # saves a few bytes by converting from tibble }, silent = TRUE) null_pred_df <- try({ - data.frame(stats::predict(mod.obj, type = "link", se.fit = TRUE)[1:2]) %>% + data.frame(stats::predict(mod.obj, type = "link", se.fit = TRUE)[seq(2)]) %>% dplyr::rename(null_link_fit = fit, null_link_se = se.fit) }, silent = TRUE) } @@ -87,7 +87,7 @@ pull.marge.sumy <- function(mod.obj, } else if (is.glmm) { marge_pred_df <- try({ - data.frame(predict(mod.obj$final_mod, type = "link", se.fit = TRUE)[1:2]) %>% + data.frame(predict(mod.obj$final_mod, type = "link", se.fit = TRUE)[seq(2)]) %>% dplyr::rename(marge_link_fit = fit, marge_link_se = se.fit) }, silent = TRUE) marge_sumy_df <- try({ @@ -101,7 +101,7 @@ pull.marge.sumy <- function(mod.obj, } else { marge_pred_df <- try({ - 
data.frame(stats::predict(mod.obj$final_mod, type = "link", se.fit = TRUE)[1:2]) %>% + data.frame(stats::predict(mod.obj$final_mod, type = "link", se.fit = TRUE)[seq(2)]) %>% dplyr::rename(marge_link_fit = fit, marge_link_se = se.fit) }, silent = TRUE) marge_sumy_df <- try({ diff --git a/R/score_fun_gee.R b/R/score_fun_gee.R index 3492cf9..75cab9b 100644 --- a/R/score_fun_gee.R +++ b/R/score_fun_gee.R @@ -54,13 +54,13 @@ score_fun_gee <- function(Y = NULL, J21 <- Sigma21 <- matrix(0, nrow = p, ncol = p1) for (i in seq(N)) { - k <- sum(n_vec[1:i]) + k <- sum(n_vec[seq(i)]) VS.est_i <- VS.est_list[[i]] AWA.est_i <- AWA.est_list[[i]] J2_i <- J2_list[[i]] Sigma2_i <- Sigma2_list[[i]] - D.est_i <- eigenMapMatMult(A = diag((mu.est[(sum(n_vec1[1:i]) + 1):k]), nrow = n_vec[i], ncol = n_vec[i]), - B = XA[(sum(n_vec1[1:i]) + 1):k, ], + D.est_i <- eigenMapMatMult(A = diag((mu.est[(sum(n_vec1[seq(i)]) + 1):k]), nrow = n_vec[i], ncol = n_vec[i]), + B = XA[(sum(n_vec1[seq(i)]) + 1):k, ], n_cores = 1) D_est_i_transpose <- t(D.est_i) J21 <- J21 + eigenMapMatMult(A = D_est_i_transpose, diff --git a/R/smoothedCountsMatrix.R b/R/smoothedCountsMatrix.R index d8c6b71..620006c 100644 --- a/R/smoothedCountsMatrix.R +++ b/R/smoothedCountsMatrix.R @@ -49,7 +49,7 @@ smoothedCountsMatrix <- function(test.dyn.res = NULL, } else { genes <- names(test.dyn.res) } - lineages <- LETTERS[1:length(test.dyn.res[[1]])] + lineages <- LETTERS[seq(length(test.dyn.res[[1]]))] colnames(pt) <- paste0("Lineage_", lineages) lineage_mat_list <- purrr::map(lineages, \(x) { lineage_name <- paste0("Lineage_", x) diff --git a/R/sortGenesHeatmap.R b/R/sortGenesHeatmap.R index 0a54b42..1565280 100644 --- a/R/sortGenesHeatmap.R +++ b/R/sortGenesHeatmap.R @@ -31,18 +31,18 @@ sortGenesHeatmap <- function(heatmap.mat = NULL, pt.vec = NULL) { # identify point at which peak expression occurs for each gene across pseudotime gene_peak_order <- purrr::map(seq_len(ncol(heatmap.mat)), \(x) { data.frame(gene = colnames(heatmap.mat)[x], - pt = pt.vec, + pseudotime = pt.vec, mRNA = heatmap.mat[, x]) %>% dplyr::filter(mRNA == max(mRNA)) %>% dplyr::distinct() %>% dplyr::select(gene, - pt, + pseudotime, mRNA) }) %>% purrr::reduce(rbind) %>% dplyr::with_groups(gene, dplyr::summarise, - mu = mean(pt)) %>% + mu = mean(pseudotime)) %>% dplyr::arrange(mu) %>% dplyr::pull(gene) return(gene_peak_order) diff --git a/R/stat_out_score_gee_null.R b/R/stat_out_score_gee_null.R index 9ff6b82..4336b30 100644 --- a/R/stat_out_score_gee_null.R +++ b/R/stat_out_score_gee_null.R @@ -46,19 +46,19 @@ stat_out_score_gee_null <- function(Y = NULL, J11 <- Sigma11 <- matrix(0, ncol(B_null), ncol(B_null)) for (i in seq(N)) { - k <- sum(n_vec[1:i]) + k <- sum(n_vec[seq(i)]) # set up working correlation matrix structure if (cor.structure == "independence") { R_alpha <- diag(1, n_vec[i], n_vec[i]) } else if (cor.structure == "exchangeable") { R_alpha <- matrix(alpha_est, n_vec[i], n_vec[i]) + diag(c(1 - alpha_est), n_vec[i], n_vec[i]) } else if (cor.structure == "ar1") { - R_alpha <- alpha_est^outer(1:n_vec[i], 1:n_vec[i], \(x, y) abs(x - y)) + R_alpha <- alpha_est^outer(seq(n_vec[i]), seq(n_vec[i]), \(x, y) abs(x - y)) } else { stop("Currently unsupported correlation structure.") } # compute iteration values for each statistic - temp_seq_n <- (sum(n_vec1[1:i]) + 1):k + temp_seq_n <- (sum(n_vec1[seq(i)]) + 1):k mu_est_sub <- mu_est[temp_seq_n] diag_sqrt_V_est <- diag(sqrt(V_est[temp_seq_n]), nrow = n_vec[i], diff --git a/R/summarizeModel.R b/R/summarizeModel.R index 5282b21..75c368b 
100644 --- a/R/summarizeModel.R +++ b/R/summarizeModel.R @@ -4,6 +4,7 @@ #' @author Jack Leary #' @author Rhonda Bacher #' @import magrittr +#' @importFrom stats vcov #' @description This function summarizes the model for each gene and allows for quantiative interpretation of fitted gene dynamics. #' @param marge.model The fitted model output from \code{\link{marge2}} (this function is internally called by \code{\link{testDynamic}}). Defaults to NULL. #' @param pt The predictor matrix of pseudotime. Defaults to NULL. @@ -27,7 +28,7 @@ summarizeModel <- function(marge.model = NULL, pt = NULL) { if (marge.model$model_type == "GEE") { coef_vcov <- as.matrix(marge.model$final_mod$var) } else { - coef_vcov <- vcov(marge.model$final_mod) + coef_vcov <- stats::vcov(marge.model$final_mod) } coef_df <- data.frame(coef_name = names(coef(marge.model$final_mod)), coef_value = unname(coef(marge.model$final_mod)), diff --git a/R/testDynamic.R b/R/testDynamic.R index 0d3a24f..7256815 100644 --- a/R/testDynamic.R +++ b/R/testDynamic.R @@ -29,7 +29,7 @@ #' @param n.cores (Optional) If running in parallel, how many cores should be used? Defaults to 2. #' @param approx.knot (Optional) Should the knot space be reduced in order to improve computation time? Defaults to TRUE. #' @param glmm.adaptive (Optional) Should the basis functions for the GLMM be chosen adaptively? If not, uses 4 evenly spaced knots. Defaults to FALSE. -#' @param track.time (Optional) A boolean indicating whether the amount of time the function takes to run should be tracked and printed to the console. Useful for debugging. Defaults to FALSE. +#' @param track.time (Optional) A boolean indicating whether the amount of time the function takes to run should be tracked and printed to the console. Defaults to TRUE. #' @param random.seed (Optional) The random seed used to initialize RNG streams in parallel. Defaults to 312. #' @details #' \itemize{ @@ -87,7 +87,7 @@ testDynamic <- function(expr.mat = NULL, parallel.exec = TRUE, n.cores = 2, approx.knot = TRUE, - track.time = FALSE, + track.time = TRUE, random.seed = 312) { # check inputs if (is.null(expr.mat) || is.null(pt)) { stop("You forgot some inputs to testDynamic().") } @@ -123,7 +123,7 @@ testDynamic <- function(expr.mat = NULL, # set pseudotime lineage column names automatically to prevent user error (uses e.g., "Lineage_A", "Lineage_B", etc.) 
n_lineages <- ncol(pt) - colnames(pt) <- paste0("Lineage_", LETTERS[1:n_lineages]) + colnames(pt) <- paste0("Lineage_", LETTERS[seq(n_lineages)]) # ensure subject ID vector meets criteria for GEE / GLMM fitting if ((is.gee || is.glmm) && is.null(id.vec)) { stop("You must provide a vector of IDs if you're using GEE / GLMM backends.") } @@ -333,11 +333,11 @@ testDynamic <- function(expr.mat = NULL, is.glmm = is.glmm) } # add test stats to result list - lineage_list[[j]]$Test_Stat = ifelse(is.gee, test_res$Wald_Stat, test_res$LRT_Stat) - lineage_list[[j]]$Test_Stat_Note = test_res$Notes - lineage_list[[j]]$P_Val = test_res$P_Val + lineage_list[[j]]$Test_Stat <- ifelse(is.gee, test_res$Wald_Stat, test_res$LRT_Stat) + lineage_list[[j]]$Test_Stat_Note <- test_res$Notes + lineage_list[[j]]$P_Val <- test_res$P_Val } - names(lineage_list) <- paste0("Lineage_", LETTERS[1:n_lineages]) + names(lineage_list) <- paste0("Lineage_", LETTERS[seq(n_lineages)]) lineage_list } @@ -374,7 +374,7 @@ testDynamic <- function(expr.mat = NULL, MARGE_Slope_Data = NULL, Gene_Dynamics = NULL) }) - names(reformatted_results) <- paste0("Lineage_", LETTERS[1:n_lineages]) + names(reformatted_results) <- paste0("Lineage_", LETTERS[seq(n_lineages)]) return(reformatted_results) } else { return(x) diff --git a/README.Rmd b/README.Rmd index 5cb67d2..cd3ac1d 100644 --- a/README.Rmd +++ b/README.Rmd @@ -12,8 +12,7 @@ always_allow_html: true ```{r setup, include = FALSE} -knitr::opts_chunk$set(message = FALSE, - warning = FALSE, +knitr::opts_chunk$set(warning = FALSE, collapse = TRUE, comment = "#>", fig.path = "man/figures/README-", @@ -99,15 +98,14 @@ cell_offset <- createCellOffset(sim_data) ### GLM framework -Running `testDynamic()` provides us with a nested list containing model output & DE test results for each gene over each pseudotime / latent time lineage. In this case, since we have a true cell ordering we only have one lineage. Parallel processing is turned on by default, and we use 2 cores here to speed up runtime. +Running `testDynamic()` provides us with a nested list containing model output & DE test results for each gene over each pseudotime / latent time lineage. In this case, since we have a true cell ordering we only have one lineage. Parallel processing is turned on by default, and we use 4 cores here to speed up runtime. -```{r glm-models, results='hold'} +```{r glm-models} scLANE_models_glm <- testDynamic(sim_data, pt = order_df, genes = gene_sample, size.factor.offset = cell_offset, - n.cores = 4, - track.time = TRUE) + n.cores = 4) ``` After the function finishes running, we use `getResultsDE()` to generate a sorted table of DE test results, with one row for each gene & lineage. The GLM backend uses a simple likelihood ratio test to compare the null & alternate models, with the test statistic assumed to be [asymptotically Chi-squared distributed](https://en.wikipedia.org/wiki/Likelihood-ratio_test). @@ -138,7 +136,7 @@ caret::confusionMatrix(factor(scLANE_res_glm$Gene_Dynamic_Overall, levels = c(0, The function call is essentially the same when using the GLM backend, with the exception of needing to provide a sorted vector of subject IDs & a desired correlation structure, the default being [the AR1 structure](https://en.wikipedia.org/wiki/Autoregressive_model). We also need to flip the `is.gee` flag in order to indicate that we'd like to fit estimating equations models (instead of mixed models). 
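Because the GEE backend expects cells to be grouped by subject, a short preprocessing sketch may help here; it assumes the README's simulated `SingleCellExperiment` with subject labels in `sim_data$subject` and normalized pseudotime in `sim_data$cell_time_normed` (those column names are assumptions taken from the example data, not requirements of the package).

```r
# Sketch: order cells by subject ID, then rebuild the pseudotime data frame and
# size-factor offset so that all inputs to testDynamic() share the same ordering.
subj_order <- order(sim_data$subject)
sim_data_sorted <- sim_data[, subj_order]
order_df_sorted <- data.frame(PT = sim_data_sorted$cell_time_normed)
cell_offset_sorted <- createCellOffset(sim_data_sorted)
# the sorted objects would then be passed to testDynamic() in place of the originals
```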
Since fitting GEEs is a fair bit more computationally complex than fitting GLMs, DE testing with the GEE backend takes a bit longer. Using more cores and / or running the tests on an HPC cluster speeds things up considerably. Here we specify an [AR1 correlation structure](https://rdrr.io/cran/nlme/man/corAR1.html), which is the default for the GEE backend. -```{r gee-models, results='hold'} +```{r gee-models} scLANE_models_gee <- testDynamic(sim_data, pt = order_df, genes = gene_sample, @@ -146,8 +144,7 @@ scLANE_models_gee <- testDynamic(sim_data, is.gee = TRUE, id.vec = sim_data$subject, cor.structure = "ar1", - n.cores = 4, - track.time = TRUE) + n.cores = 4) ``` We again generate the table of DE test results. The variance of the estimated coefficients is determined using [the sandwich estimator](https://online.stat.psu.edu/stat504/lesson/12/12.3), and a Wald test is used to compare the null & alternate models. @@ -185,8 +182,7 @@ scLANE_models_glmm <- testDynamic(sim_data, is.glmm = TRUE, glmm.adaptive = TRUE, id.vec = sim_data$subject, - n.cores = 4, - track.time = TRUE) + n.cores = 4) ``` Like the GLM backend, the GLMM backend uses a likelihood ratio test to compare the null & alternate models. We fit the two nested models using maximum likelihood estimation instead of [REML](https://en.wikipedia.org/wiki/Restricted_maximum_likelihood) in order to perform this test; the null model in this case is a negative binomial GLMM with a random intercept for each subject. diff --git a/man/testDynamic.Rd b/man/testDynamic.Rd index 68a6e86..e4d043e 100644 --- a/man/testDynamic.Rd +++ b/man/testDynamic.Rd @@ -18,7 +18,7 @@ testDynamic( parallel.exec = TRUE, n.cores = 2, approx.knot = TRUE, - track.time = FALSE, + track.time = TRUE, random.seed = 312 ) } @@ -49,7 +49,7 @@ testDynamic( \item{approx.knot}{(Optional) Should the knot space be reduced in order to improve computation time? Defaults to TRUE.} -\item{track.time}{(Optional) A boolean indicating whether the amount of time the function takes to run should be tracked and printed to the console. Useful for debugging. Defaults to FALSE.} +\item{track.time}{(Optional) A boolean indicating whether the amount of time the function takes to run should be tracked and printed to the console. Defaults to TRUE.} \item{random.seed}{(Optional) The random seed used to initialize RNG streams in parallel. Defaults to 312.} } diff --git a/tests/testthat/test_scLANE.R b/tests/testthat/test_scLANE.R index 6741a4e..d29a535 100644 --- a/tests/testthat/test_scLANE.R +++ b/tests/testthat/test_scLANE.R @@ -95,7 +95,8 @@ withr::with_output_sink(tempfile(), { method = "glm.fit2", y = FALSE, model = FALSE, - init.theta = 1) + init.theta = 1, + link = log) glm_lrt <- modelLRT(mod.1 = marge_mod_offset, mod.0 = null_mod_offset) # run GEE model -- no offset marge_mod_GEE <- marge2(X_pred = pt_test, diff --git a/vignettes/scLANE.Rmd b/vignettes/scLANE.Rmd new file mode 100644 index 0000000..c74f4f9 --- /dev/null +++ b/vignettes/scLANE.Rmd @@ -0,0 +1,254 @@ +--- +title: "Interpretable Trajectory DE Testing" +author: + - name: Jack R. 
Leary + email: j.leary@ufl.edu + affiliation: University of Florida Department of Biostatistics, Gainesville, FL +package: scLANE +output: + BiocStyle::html_document: + toc: true + toc_float: true + toc_depth: 2 + fig_width: 9 + fig_height: 6 + dev: png +vignette: > + %\VignetteIndexEntry{Interpretable Trajectory DE Testing} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE, + warning = FALSE, + error = FALSE, + fig.align = "center") +``` + +# Introduction + +Single cell RNA-seq technologies allow scientists to profile developmental processes at the cellular level. Trajectory inference comprises a broad set of methods including pseudotime estimation, RNA velocity, and graph abstraction. These methods usually seek to identify some sort of pseudotemporal ordering of cells based on their transcriptomic similarity, with the assumption that it's possible to reconstruct the biological process being studied based on gene expression. This ordering is then used to perform trajectory differential expression (TDE) testing, wherein pseudotime is treated as a covariate and gene expression is the response. Genes with statistically significant associations with pseudotime are then investigated to determine their biological effect on the process being studied. + +In order to properly characterize transcriptional dynamics across trajectories, models must be able to handle nonlinearity. This is traditionally done using generalized additive models (GAMs), though interpretation of that type of model is tricky and often subjective. The `scLANE` method proposes a negative-binomial nonlinear model that is piecewise linear across empirically chosen pseudotime intervals; within each interval the model is interpretable as a classical generalized linear model (GLM). This package comprises an efficient implementation of that model for both single- and multi-subject datasets, along with a suite of downstream analysis utilities. + +# Libraries + +First we'll need to load some packages & resolve a few function conflicts. + +```{r, results='hide'} +library(scran) +library(dplyr) +library(scater) +library(scLANE) +library(ggplot2) +library(ComplexHeatmap) +select <- dplyr::select +filter <- dplyr::filter +``` + +# Data + +We'll start by reading in some simulated scRNA-seq data with a simple trajectory structure that is homogeneous across several subjects. + +```{r} +sim_data <- readRDS(url("https://zenodo.org/record/8433077/files/scLANE_sim_data.Rds")) +``` + +The ground-truth pseudotime ordering is well-represented in PCA space: + +```{r, fig.cap="PCA embedding showing ground-truth pseudotime"} +plotPCA(sim_data, colour_by = "cell_time_normed") + + theme_scLANE(umap = TRUE) +``` + +We don't observe any clustering by subject ID, indicating that the trajectory is consistent across subjects. + +```{r, fig.cap="PCA embedding showing subject ID"} +plotPCA(sim_data, colour_by = "subject") + + theme_scLANE(umap = TRUE) +``` + +# `scLANE` testing + +The necessary inputs for `scLANE` are as follows: a data frame containing $\ge 1$ pseudotime lineages, a sequencing depth-based offset, and a matrix of unnormalized gene expression counts (this can be a `SingleCellExperiment` or `Seurat` object, or an actual dense or sparse counts matrix). In addition, we can optionally provide a subset of genes to test; here we identify the top 1,000 most highly variable genes (HVGs) and test only those.
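Before setting up those inputs, it may help to see the piecewise-linear model from the Introduction in miniature. The sketch below is not `scLANE` code: it simulates negative binomial counts with a single change point, builds one truncated power basis (hinge) term by hand at an arbitrarily chosen knot of 0.5, and fits the model with `MASS::glm.nb()`. On the log link scale the fit is linear on either side of the knot, which is what makes the coefficients directly interpretable.

```r
# Illustrative sketch only (not scLANE internals); the knot at 0.5 is hand-picked.
set.seed(312)
t_pseudo <- seq(0, 1, length.out = 200)                # pseudotime ordering
mu_true <- exp(1 + 2 * pmax(0, t_pseudo - 0.5))        # expression ramps up after the knot
counts <- rnbinom(length(t_pseudo), mu = mu_true, size = 2)

hinge <- pmax(0, t_pseudo - 0.5)                       # degree-1 truncated power basis term
nb_fit <- MASS::glm.nb(counts ~ hinge)                 # NB GLM: flat before the knot, linear after
coef(nb_fit)                                           # intercept + post-knot slope on the link scale
```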
+ +```{r} +order_df <- data.frame(PT = sim_data$cell_time_normed) +cell_offset <- createCellOffset(sim_data) +top1k_hvgs <- getTopHVGs(modelGeneVar(sim_data), n = 1e3) +``` + +The default modeling framework is based on GLMs, which we use here. Parallel processing is turned on by default in order to speed things up. + +```{r} +scLANE_models <- testDynamic(sim_data, + pt = order_df, + genes = top1k_hvgs, + size.factor.offset = cell_offset, + n.cores = 4) +``` + +After model fitting has completed, we generate a tidy table of DE test results and statistics using the `getResultsDE()` function. Each gene is given a binary dynamic vs. static classification (denoted as 1 or 0, respectively) over a given lineage / the trajectory as a whole if its adjusted *p*-value is less than a specified threshold; the default is 0.01. + +```{r} +scLANE_de_res <- getResultsDE(scLANE_models) +select(scLANE_de_res, Gene, Test_Stat, P_Val, P_Val_Adj, Gene_Dynamic_Overall) %>% + slice_sample(n = 7) %>% + knitr::kable(digits = 3, + caption = "Sample of scLANE TDE statistics", + col.names = c("Gene", "LRT Stat.", "P-value", "Adj. p-value", "Pred. status")) +``` + +# Downstream analysis + +The package offers a variety of downstream analysis functionalities, several of which we'll explore here. + +## Model comparison + +The `plotModels()` function allows us to compare different modeling frameworks. Here we visualize the most significant gene and compare our model with a GLM and a GAM. The monotonic GLM fails to capture the inherent nonlinearity of the gene's dynamics over pseudotime, and while the GAM does capture that trend, it lacks the interpretability of `scLANE`. + +```{r, fig.cap="Modeling framework comparison"} +plotModels(scLANE_models, + gene = scLANE_de_res$Gene[1], + pt = order_df, + expr.mat = sim_data, + size.factor.offset = cell_offset, + plot.glm = TRUE, + plot.gam = TRUE) +``` + +Each gene's output contains a slot named `Gene_Dynamics` that summarizes the coefficients across each pseudotime interval. + +```{r} +knitr::kable(scLANE_models$HIST1H4C$Lineage_A$Gene_Dynamics, + caption = "Summarized coefficients from scLANE", + digits = 3) +``` + + +## Gene dynamics plots + +Using the `getFittedValues()` function we can generate a table of per-gene, per-cell expression estimates on a variety of scales (raw, depth-normalized, and log1p-normalized). Here we visualize the fitted dynamics for the top four most significantly TDE genes. + +```{r, fig.cap="Dynamics of the top 4 most TDE genes"} +getFittedValues(scLANE_models, + genes = scLANE_de_res$Gene[1:4], + pt = order_df, + expr.mat = sim_data, + size.factor.offset = cell_offset, + cell.meta.data = data.frame(cluster = sim_data$label)) %>% + ggplot(aes(x = pt, y = rna_log1p)) + + facet_wrap(~gene, ncol = 2) + + geom_point(aes(color = cluster), + size = 2, + alpha = 0.75, + stroke = 0) + + geom_ribbon(aes(ymin = scLANE_ci_ll_log1p, ymax = scLANE_ci_ul_log1p), + linewidth = 0, + fill = "grey70", + alpha = 0.9) + + geom_line(aes(y = scLANE_pred_log1p), + color = "black", + linewidth = 0.75) + + scale_x_continuous(labels = scales::label_number(accuracy = 0.01)) + + labs(x = "Pseudotime", + y = "Normalized Expression", + color = "Leiden") + + theme_scLANE() + + theme(strip.text.x = element_text(face = "italic")) + + guides(color = guide_legend(override.aes = list(alpha = 1, size = 2, stroke = 1))) +``` +## Heatmaps + +To generate a heatmap of expression cascades, we first pull a matrix of gene dynamics for all genes that are significantly TDE.
+ +```{r} +dyn_genes <- filter(scLANE_de_res, Gene_Dynamic_Overall == 1) %>% + pull(Gene) +smoothed_counts <- smoothedCountsMatrix(scLANE_models, + size.factor.offset = cell_offset, + pt = order_df, + genes = dyn_genes, + log1p.norm = TRUE) +``` + +Next, we set up column annotations for the cells, and order the genes by where their peak expression occurs during pseudotime with the `sortGenesHeatmap()` function. + +```{r} +col_anno_df <- data.frame(cell_name = colnames(sim_data), + leiden = as.factor(sim_data$label), + subject = as.factor(sim_data$subject), + pseudotime = sim_data$cell_time_normed) %>% + arrange(pseudotime) +gene_order <- sortGenesHeatmap(smoothed_counts$Lineage_A, pt.vec = sim_data$cell_time_normed) +heatmap_mat <- t(scale(smoothed_counts$Lineage_A)) +colnames(heatmap_mat) <- colnames(sim_data) +heatmap_mat <- heatmap_mat[, col_anno_df$cell_name] +heatmap_mat <- heatmap_mat[gene_order, ] +col_anno <- HeatmapAnnotation(Leiden = col_anno_df$leiden, + Subject = col_anno_df$subject, + Pseudotime = col_anno_df$pseudotime, + show_legend = TRUE, + show_annotation_name = FALSE, + gap = unit(1, "mm"), + border = TRUE) +``` + +Now we can finally plot the heatmap: + +```{r, fig.cap="Expression cascade heatmap"} +Heatmap(matrix = heatmap_mat, + name = "mRNA", + col = circlize::colorRamp2(colors = viridis::inferno(50), + breaks = seq(min(heatmap_mat), max(heatmap_mat), length.out = 50)), + cluster_columns = FALSE, + width = 9, + height = 6, + column_title = "Dynamic genes across pseudotime", + cluster_rows = FALSE, + top_annotation = col_anno, + border = TRUE, + show_column_names = FALSE, + show_row_names = FALSE, + use_raster = TRUE, + raster_by_magick = TRUE, + raster_quality = 5) +``` + +## Gene embeddings + +With `embedGenes()` we can compute a gene-level clustering along with PCA & UMAP embeddings. This allows us to examine how groups of genes behave, and annotate those groups based on the genes' biological functions. + +```{r, fig.cap="Embedding & clustering of gene dynamics"} +gene_embedding <- embedGenes(expm1(smoothed_counts$Lineage_A)) +ggplot(gene_embedding, aes(x = umap1, y = umap2, color = leiden)) + + geom_point(alpha = 0.75, + size = 2, + stroke = 0) + + labs(x = "UMAP 1", + y = "UMAP 2", + color = "Gene Cluster") + + theme_scLANE(umap = TRUE) + + guides(color = guide_legend(override.aes = list(size = 2, alpha = 1, stroke = 1))) +``` + +## Trajectory enrichment + +Lastly, we can perform pathway analysis on the set of dynamic genes using the `enrichDynamicGenes()` function. This is built off of the `r CRANpkg("gprofiler2")` package, which supports a wide variety of species and pathway databases. + +```{r} +dyn_gene_enrichment <- enrichDynamicGenes(scLANE_de_res, species = "hsapiens") +filter(dyn_gene_enrichment$result, source == "GO:BP") %>% + select(term_id, term_name, p_value, source) %>% + slice_head(n = 7) %>% + knitr::kable(digits = 3, + caption = "Trajectory pathway enrichment statistics", + col.names = c("Term ID", "Term name", "Adj. p-value", "Source")) +``` + +# Session info + +```{r} +sessionInfo() +```