diff --git a/.github/workflows/render-README.yaml b/.github/workflows/render-README.yaml index baed670..b7a250d 100644 --- a/.github/workflows/render-README.yaml +++ b/.github/workflows/render-README.yaml @@ -22,9 +22,9 @@ jobs: - uses: r-lib/actions/setup-pandoc@v2 - name: install CRAN packages - run: Rscript -e 'install.packages(c("rmarkdown","ggplot2", "dplyr", "purrr", "remotes", "devtools", "BiocManager"))' + run: Rscript -e 'install.packages(c("rmarkdown","ggplot2", "dplyr", "purrr", "remotes", "devtools", "BiocManager", "Seurat"), dependencies = TRUE)' - name: install BioConductor packages - run: Rscript -e 'BiocManager::install(c("SingleCellExperiment", "scater", "scran"))' + run: Rscript -e 'BiocManager::install(c("SingleCellExperiment", "scater", "scran", "scuttle", "bluster"))' - name: install GitHub packages run: Rscript -e 'remotes::install_github("jr-leary7/scLANE")' - name: render README diff --git a/DESCRIPTION b/DESCRIPTION index 6cab99c..5529133 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -44,17 +44,22 @@ URL: https://github.com/jr-leary7/scLANE BugReports: https://github.com/jr-leary7/scLANE/issues Suggests: covr, + grid, coop, uwot, ggh4x, knitr, + UCell, irlba, rlang, igraph, + gtable, + ggpubr, Seurat, bluster, cluster, rmarkdown, + gridExtra, BiocStyle, slingshot, gprofiler2, diff --git a/NAMESPACE b/NAMESPACE index 63d2bc8..e56a2ed 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -6,6 +6,7 @@ export(embedGenes) export(enrichDynamicGenes) export(extractBreakpoints) export(fitGLMM) +export(geneProgramScoring) export(getFittedValues) export(getKnotDist) export(getResultsDE) @@ -14,10 +15,12 @@ export(modelLRT) export(nbGAM) export(npConvolve) export(plotClusteredGenes) +export(plotModelCoefs) export(plotModels) export(smoothedCountsMatrix) export(sortGenesHeatmap) export(stripGLM) +export(summarizeModel) export(testDynamic) export(testSlope) export(theme_scLANE) @@ -29,6 +32,7 @@ importFrom(MASS,ginv) importFrom(MASS,glm.nb) importFrom(MASS,negative.binomial) importFrom(MASS,theta.mm) +importFrom(Matrix,Matrix) importFrom(Matrix,colSums) importFrom(Matrix,t) importFrom(Rcpp,sourceCpp) @@ -46,6 +50,8 @@ importFrom(dplyr,filter) importFrom(dplyr,group_by) importFrom(dplyr,if_else) importFrom(dplyr,inner_join) +importFrom(dplyr,lag) +importFrom(dplyr,lead) importFrom(dplyr,mutate) importFrom(dplyr,ntile) importFrom(dplyr,pull) @@ -77,10 +83,12 @@ importFrom(ggplot2,facet_wrap) importFrom(ggplot2,geom_line) importFrom(ggplot2,geom_point) importFrom(ggplot2,geom_ribbon) +importFrom(ggplot2,geom_vline) importFrom(ggplot2,ggplot) importFrom(ggplot2,guide_legend) importFrom(ggplot2,guides) importFrom(ggplot2,labs) +importFrom(ggplot2,scale_x_continuous) importFrom(ggplot2,scale_y_continuous) importFrom(ggplot2,theme) importFrom(ggplot2,theme_classic) diff --git a/NEWS.md b/NEWS.md index 6761dde..e658ebb 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,11 +1,13 @@ # `scLANE` v0.7.7 -* Added DOI badge to README -* Further compression of included datasets. +* Added DOI badge to README. +* Better compression of included datasets. +* Added `geneProgramScoring()` for module scoring of dynamic gene clusters. +* Added `plotModelCoefs()` to annotate gene dynamics plots with a table of model coefficients. # `scLANE` v0.7.6 -* Added Zenodo tracking. +* Added [Zenodo tracking](https://doi.org/10.5281/zenodo.10030621). * Added simulated dataset to `data/`. # `scLANE` v0.7.5 @@ -21,7 +23,7 @@ # `scLANE` v0.7.3 * Added a function named `embedGenes()` that takes a smoothed counts matrix as input & returns PCA & UMAP embeddings along with a graph-based clustering. -* Updated the `clusterGenes()` function to be much more efficient as well as changing the distance metric used to be cosine dissimilarity. +* Updated the `clusterGenes()` function to be much more efficient as well as changing the distance metric used to be cosine distance. * Added `theme_scLANE()` for output plots. * Enhanced documentation. * Increased test coverage. diff --git a/R/geneProgramScoring.R b/R/geneProgramScoring.R new file mode 100644 index 0000000..49a6102 --- /dev/null +++ b/R/geneProgramScoring.R @@ -0,0 +1,79 @@ +#' Add per-cell module scores for gene programs. +#' +#' @name geneProgramScoring +#' @author Jack Leary +#' @import magrittr +#' @importFrom Matrix Matrix +#' @importFrom ggplot2 ggplot aes geom_point geom_vline geom_ribbon geom_line scale_x_continuous labs +#' @importFrom scales label_number +#' @description This function uses \code{\link[UCell]{ScoreSignatures_UCell}} to create a per-cell module score for each of the provided gene clusters. If the +#' @param expr.mat Either a \code{SingleCellExperiment} or \code{Seurat} object from which counts can be extracted, or a matrix of integer-valued counts with genes as rows & cells as columns. Defaults to NULL. +#' @param genes A character vector of gene IDs. Defaults to NULL. +#' @param gene.clusters A factor containing the cluster assignment of each gene in \code{genes}. Defaults to NULL. +#' @param program.labels (Optional) A character vector specifying a label for each gene cluster. Defaults to NULL. +#' @param n.cores (Optional) The number of cores used under the hood in \code{\link[UCell]{ScoreSignatures_UCell}}. Defaults to 2. +#' @return Either a \code{Seurat} or \code{SingleCellExperiment} object if \code{expr.mat} is in either form, or a data.frame containing per-cell program scores if \code{expr.mat} is a matrix. +#' @export +#' @examples +#' \dontrun{ +#' geneProgramScoring(seu_obj, +#' genes = gene_embed$gene, +#' gene.clusters = gene_embed$leiden, +#' program.labels = c("cell cycle", "organogenesis")) +#' } + +geneProgramScoring <- function(expr.mat = NULL, + genes = NULL, + gene.clusters = NULL, + program.labels = NULL, + n.cores = 2) { + # check inputs + if (is.null(expr.mat) || is.null(genes) || is.null(gene.clusters)) { stop("Arguments to geneProgramScoring() are missing.") } + if (!is.factor(gene.clusters)) { + gene.clusters <- as.factor(gene.clusters) + } + # set program labels + cluster.labels <- unique(gene.clusters) + if (is.null(program.labels)) { + program.labels <- paste0("cluster_", cluster.labels) + } else { + program.labels <- gsub(" ", "_", program.labels) + } + # set up query matrix + if (inherits(expr.mat, "SingleCellExperiment")) { + counts_matrix <- BiocGenerics::counts(expr.mat) + } else if (inherits(expr.mat, "Seurat")) { + counts_matrix <- Seurat::GetAssayData(expr.mat, + slot = "counts", + assay = Seurat::DefaultAssay(expr.mat)) + } else if (inherits(expr.mat, "matrix") || inherits(expr.mat, "array")) { + counts_matrix <- Matrix::Matrix(expr.mat, sparse = TRUE) + } + # set up feature list + program_list <- split(genes, gene.clusters) + names(program_list) <- program.labels + # run UCell + program_scores <- UCell::ScoreSignatures_UCell(counts_matrix, + features = program_list, + ncores = n.cores) + # reformat program scores depending on input format + if (inherits(expr.mat, "matrix") || inherits(expr.mat, "array") || inherits(expr.mat, "dgCMatrix")) { + colnames(program_scores) <- program.labels + } else { + for (g in seq(ncol(program_scores))) { + if (inherits(expr.mat, "SingleCellExperiment")) { + SummarizedExperiment::colData(expr.mat)[, program.labels[g]] <- program_scores[, g] + } else if (inherits(expr.mat, "Seurat")) { + expr.mat <- Seurat::AddMetaData(expr.mat, + metadata = program_scores[, g], + program.labels[g]) + } + } + } + # return results + if (inherits(expr.mat, "matrix") || inherits(expr.mat, "array") || inherits(expr.mat, "dgCMatrix")) { + return(program_scores) + } else { + return(expr.mat) + } +} diff --git a/R/plotModelCoefs.R b/R/plotModelCoefs.R new file mode 100644 index 0000000..052f347 --- /dev/null +++ b/R/plotModelCoefs.R @@ -0,0 +1,137 @@ +#' Plot gene dynamics with estimated coefficients. +#' +#' @name plotModelCoefs +#' @author Jack Leary +#' @import magrittr +#' @importFrom dplyr select mutate lag lead +#' @importFrom tidyr pivot_longer +#' @importFrom ggplot2 ggplot aes geom_point geom_vline geom_ribbon geom_line scale_x_continuous labs +#' @importFrom scales label_number +#' @description Generate a plot of gene dynamics over a single pseudotime lineage, along with a table of coefficients across pseudotime intervals. +#' @param test.dyn.res The output from \code{\link{testDynamic}}. Defaults to NULL. +#' @param gene A character specifying which gene's dynamics should be plotted. Defaults to NULL. +#' @param pt A data.frame of pseudotime values for each cell. Defaults to NULL. +#' @param expr.mat Either a \code{SingleCellExperiment} or \code{Seurat} object from which counts can be extracted, or a matrix of integer-valued counts with genes as rows & cells as columns. Defaults to NULL. +#' @param size.factor.offset (Optional) An offset to be used to rescale the fitted values. Can be generated easily with \code{\link{createCellOffset}}. No need to provide if the GEE backend was used. Defaults to NULL. +#' @param log1p.norm (Optional) Should log1p-normalized versions of expression & model predictions be returned as well? Defaults to TRUE. +#' @param lineage A character vector specifying which lineage should be plotted. Should be letters, i.e. lineage "A" or "B". Defaults to "A". +#' @return A \code{ggplot2} object displaying a gene dynamics plot & a table of coefficients across pseudotime intervals. +#' @export +#' @examples +#' \dontrun{ +#' plotModelCoefs(gene_stats, +#' gene = "BRCA2", +#' pt = pt_df, +#' expr.mat = seu_obj, +#' size.factor.offset = cell_offset) +#' } + +plotModelCoefs <- function(test.dyn.res = NULL, + gene = NULL, + pt = NULL, + expr.mat = NULL, + size.factor.offset = NULL, + lineage = "A", + log1p.norm = TRUE) { + # check inputs + if (is.null(test.dyn.res) || is.null(gene) || is.null(pt) || is.null(expr.mat)) { stop("Arguments to plotModelCoefs() are missing.") } + # pull fitted values + all_lineages <- gsub("Lineage_", "", names(test.dyn.res[[1]])) + if (length(all_lineages) == 1) { + gfv_filter <- NULL + } else { + gfv_filter <- all_lineages[all_lineages != lineage] + } + fitted_vals <- getFittedValues(test.dyn.res, + genes = gene, + pt = pt, + expr.mat = expr.mat, + size.factor.offset = size.factor.offset, + log1p.norm = log1p.norm, + filter.lineage = gfv_filter) + if (log1p.norm) { + fitted_vals <- dplyr::select(fitted_vals, + cell, + lineage, + pt, + gene, + rna = rna_log1p, + scLANE_pred = scLANE_pred_log1p, + scLANE_ci_ll = scLANE_ci_ll_log1p, + scLANE_ci_ul = scLANE_ci_ul_log1p) + } else { + fitted_vals <- dplyr::select(fitted_vals, + cell, + lineage, + pt, + gene, + rna, + scLANE_pred, + scLANE_ci_ll, + scLANE_ci_ul) + + } + # generate dynamics plot + dyn_plot <- ggplot2::ggplot(fitted_vals, ggplot2::aes(x = pt, y = rna)) + + ggplot2::geom_point(size = 1.5, + alpha = 0.6, + stroke = 0, + color = "grey30") + + ggplot2::geom_vline(data = data.frame(gene = gene, knot = unique(test.dyn.res[[gene]][[paste0("Lineage_", lineage)]]$MARGE_Slope_Data$Breakpoint)), + mapping = ggplot2::aes(xintercept = knot), + linetype = "dashed", + color = "black", + linewidth = 0.75) + + ggplot2::geom_ribbon(ggplot2::aes(ymin = scLANE_ci_ll, ymax = scLANE_ci_ul), + linewidth = 0, + fill = "darkgreen", + alpha = 0.35) + + ggplot2::geom_line(ggplot2::aes(y = scLANE_pred), + color = "darkgreen", + linewidth = 0.75) + + ggplot2::scale_x_continuous(labels = scales::label_number(accuracy = 0.01)) + + ggplot2::labs(x = "Pseudotime", + y = ifelse(log1p.norm, "Normalized Expression", "Expression")) + + theme_scLANE() + # generate coefficient summary + min_pt <- min(pt[, which(LETTERS == lineage)], na.rm = TRUE) + max_pt <- max(pt[, which(LETTERS == lineage)], na.rm = TRUE) + coef_sumy <- dplyr::select(test.dyn.res[[gene]][[paste0("Lineage_", lineage)]]$Gene_Dynamics, + -dplyr::starts_with("Trend")) %>% + tidyr::pivot_longer(dplyr::starts_with("Slope"), + names_to = "Segment", + values_to = "Coef") %>% + dplyr::mutate(Breakpoint_Lag = dplyr::lag(Breakpoint), + Breakpoint_Lead = dplyr::lead(Breakpoint), + Interval = NA_character_, + .before = 4) %>% + dplyr::mutate(Breakpoint_Lag = dplyr::if_else(is.na(Breakpoint_Lag), min_pt, Breakpoint_Lag), + Breakpoint_Lead = dplyr::if_else(is.na(Breakpoint_Lead), max_pt, Breakpoint_Lead), + Interval = paste0("(", round(Breakpoint_Lag, 3), ", ", round(Breakpoint_Lead, 3), ")")) %>% + dplyr::select(Interval, Coef) %>% + dplyr::mutate(Coef = round(Coef, 3)) + # convert coefficient summary to grob + coef_sumy_grob <- gridExtra::tableGrob(coef_sumy, + cols = c("Interval", "Coefficient"), + theme = gridExtra::ttheme_minimal(base_size = 11, + core = list(fg_params = list(hjust = 0, x = 0.05)), + colhead = list(fg_params = list(hjust = 0, x = 0.05))), + rows = NULL) %>% + gtable::gtable_add_grob(grobs = grid::rectGrob(gp = grid::gpar(fill = NA, lwd = 3)), + t = 1, + b = nrow(.), + l = 1, + r = ncol(.)) %>% + gtable::gtable_add_grob(grobs = grid::rectGrob(gp = grid::gpar(fill = NA, lwd = 3)), + t = 1, + l = 1, + r = ncol(.)) + + # combine objects + dyn_plot_anno <- ggpubr::ggarrange(dyn_plot, + coef_sumy_grob, + ncol = 2, + nrow = 1, + widths = c(2, 1)) + return(dyn_plot_anno) +} diff --git a/R/summarizeModel.R b/R/summarizeModel.R index 75c368b..744e9e9 100644 --- a/R/summarizeModel.R +++ b/R/summarizeModel.R @@ -10,6 +10,7 @@ #' @param pt The predictor matrix of pseudotime. Defaults to NULL. #' @return A data.frame of the model coefficients, cutpoint intervals, and formatted equations. #' @seealso \code{\link{marge2}} +#' @export #' @examples #' \dontrun{ #' summarizeModel(marge.model = marge_mod, pt = pt_df) diff --git a/README.Rmd b/README.Rmd index 303a330..975a0a7 100644 --- a/README.Rmd +++ b/README.Rmd @@ -89,8 +89,9 @@ plotPCA(sim_data, colour_by = "subject") + Since we have multi-subject data, we can use any of the three model modes to run our DE testing. We'll start with the simplest model, the GLM, then work our way through the other options in order of increasing complexity. We first prepare our inputs - a dataframe containing our cell ordering, a set of genes to build models for, and a vector of per-cell size factors to be used as offsets during estimation. In reality, it's usually unnecessary to fit a model for every single gene in a dataset, as trajectories are usually estimated using a subset of the entire set of genes (usually a few thousand most highly variable genes). For the purpose of demonstration, we'll select 50 genes each from the dynamic and non-dynamic populations. -> [!NOTE] -> In this case we're working with a single pseudotime lineage, though in real datasets several lineages often exist; in order to fit models for a subset of lineages simply remove the corresponding columns from the cell ordering dataframe passed as input to `testDynamic()`. +{% note %} +**Note** In this case we're working with a single pseudotime lineage, though in real datasets several lineages often exist; in order to fit models for a subset of lineages simply remove the corresponding columns from the cell ordering dataframe passed as input to `testDynamic()`. +{% endnote %} ```{r prep-data} set.seed(312) @@ -165,8 +166,9 @@ scLANE_models_glmm <- testDynamic(sim_data, n.cores = 4) ``` -> [!NOTE] -> The GLMM mode is still under development, as we are working on further reducing runtime and increasing the odds of the underlying optimization process converging successfully. As such, updates will be frequent and functionality / results may shift slightly. +{% note %} +**Note** The GLMM mode is still under development, as we are working on further reducing runtime and increasing the odds of the underlying optimization process converging successfully. As such, updates will be frequent and functionality / results may shift slightly. +{% endnote %} Like the GLM mode, the GLMM mode uses a likelihood ratio test to compare the null & alternate models. We fit the two nested models using maximum likelihood estimation instead of [REML](https://en.wikipedia.org/wiki/Restricted_maximum_likelihood) in order to perform this test; the null model in this case is a negative binomial GLMM with a random intercept for each subject. diff --git a/README.md b/README.md index 69b2d81..986017e 100644 --- a/README.md +++ b/README.md @@ -142,7 +142,7 @@ the entire set of genes (usually a few thousand most highly variable genes). For the purpose of demonstration, we’ll select 50 genes each from the dynamic and non-dynamic populations. -> [!NOTE] In this case we’re working with a single pseudotime lineage, +> \[!NOTE\] In this case we’re working with a single pseudotime lineage, > though in real datasets several lineages often exist; in order to fit > models for a subset of lineages simply remove the corresponding > columns from the cell ordering dataframe passed as input to @@ -170,7 +170,7 @@ scLANE_models_glm <- testDynamic(sim_data, genes = gene_sample, size.factor.offset = cell_offset, n.cores = 4) -#> scLANE testing completed for 100 genes across 1 lineage in 21.213 secs +#> scLANE testing completed for 100 genes across 1 lineage in 24.807 secs ``` After the function finishes running, we use `getResultsDE()` to generate @@ -189,13 +189,13 @@ select(scLANE_res_glm, Gene, Lineage, Test_Stat, P_Val, P_Val_Adj, Gene_Dynamic_ col.names = c("Gene", "Lineage", "LRT stat.", "P-value", "Adj. p-value", "Predicted dynamic status")) ``` -| Gene | Lineage | LRT stat. | P-value | Adj. p-value | Predicted dynamic status | -|:--------|:--------|----------:|--------:|-------------:|-------------------------:| -| MFSD2B | A | 209.755 | 0.000 | 0.000 | 1 | -| ARHGEF9 | A | 6.428 | 0.040 | 0.442 | 0 | -| NOP14 | A | 7.727 | 0.005 | 0.114 | 0 | -| TMCO3 | A | 167.582 | 0.000 | 0.000 | 1 | -| SMG1 | A | 5.016 | 0.081 | 0.442 | 0 | +| Gene | Lineage | LRT stat. | P-value | Adj. p-value | Predicted dynamic status | +|:-------|:--------|----------:|--------:|-------------:|-------------------------:| +| MFSD2B | A | 209.755 | 0.000 | 0.000 | 1 | +| FOXD3 | A | 4.276 | 0.039 | 0.451 | 0 | +| NOP14 | A | 7.712 | 0.005 | 0.115 | 0 | +| TMCO3 | A | 167.759 | 0.000 | 0.000 | 1 | +| DLC1 | A | 2.803 | 0.094 | 0.451 | 0 | ### GEE mode @@ -218,7 +218,7 @@ scLANE_models_gee <- testDynamic(sim_data, id.vec = sim_data$subject, cor.structure = "ar1", n.cores = 4) -#> scLANE testing completed for 100 genes across 1 lineage in 2.253 mins +#> scLANE testing completed for 100 genes across 1 lineage in 2.343 mins ``` We again generate the table of DE test results. The variance of the @@ -237,11 +237,11 @@ select(scLANE_res_gee, Gene, Lineage, Test_Stat, P_Val, P_Val_Adj, Gene_Dynamic_ | Gene | Lineage | Wald stat. | P-value | Adj. p-value | Predicted dynamic status | |:---------|:--------|-----------:|--------:|-------------:|-------------------------:| -| CKAP4 | A | 19582.769 | 0 | 0 | 1 | -| TRAPPC1 | A | 28.553 | 0 | 0 | 1 | +| BAD | A | 301009.051 | 0 | 0.000 | 1 | +| SPCS3 | A | 18.352 | 0 | 0.002 | 1 | | GOLGA8EP | A | NA | NA | NA | 0 | -| PCF11 | A | 2672.280 | 0 | 0 | 1 | -| MPG | A | 924.419 | 0 | 0 | 1 | +| WAPAL | A | 3168.110 | 0 | 0.000 | 1 | +| MPG | A | 924.419 | 0 | 0.000 | 1 | ### GLMM mode @@ -265,10 +265,10 @@ scLANE_models_glmm <- testDynamic(sim_data, glmm.adaptive = TRUE, id.vec = sim_data$subject, n.cores = 4) -#> scLANE testing completed for 100 genes across 1 lineage in 2.39 mins +#> scLANE testing completed for 100 genes across 1 lineage in 2.37 mins ``` -> [!NOTE] The GLMM mode is still under development, as we are working +> \[!NOTE\] The GLMM mode is still under development, as we are working > on further reducing runtime and increasing the odds of the underlying > optimization process converging successfully. As such, updates will be > frequent and functionality / results may shift slightly. diff --git a/man/figures/README-plot-models-gee-1.png b/man/figures/README-plot-models-gee-1.png index da9b60e..27383e0 100644 Binary files a/man/figures/README-plot-models-gee-1.png and b/man/figures/README-plot-models-gee-1.png differ diff --git a/man/figures/README-unnamed-chunk-2-1.png b/man/figures/README-unnamed-chunk-2-1.png index 5ece4ca..f70fdb3 100644 Binary files a/man/figures/README-unnamed-chunk-2-1.png and b/man/figures/README-unnamed-chunk-2-1.png differ diff --git a/man/geneProgramScoring.Rd b/man/geneProgramScoring.Rd new file mode 100644 index 0000000..0dc2711 --- /dev/null +++ b/man/geneProgramScoring.Rd @@ -0,0 +1,42 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/geneProgramScoring.R +\name{geneProgramScoring} +\alias{geneProgramScoring} +\title{Add per-cell module scores for gene programs.} +\usage{ +geneProgramScoring( + expr.mat = NULL, + genes = NULL, + gene.clusters = NULL, + program.labels = NULL, + n.cores = 2 +) +} +\arguments{ +\item{expr.mat}{Either a \code{SingleCellExperiment} or \code{Seurat} object from which counts can be extracted, or a matrix of integer-valued counts with genes as rows & cells as columns. Defaults to NULL.} + +\item{genes}{A character vector of gene IDs. Defaults to NULL.} + +\item{gene.clusters}{A factor containing the cluster assignment of each gene in \code{genes}. Defaults to NULL.} + +\item{program.labels}{(Optional) A character vector specifying a label for each gene cluster. Defaults to NULL.} + +\item{n.cores}{(Optional) The number of cores used under the hood in \code{\link[UCell]{ScoreSignatures_UCell}}. Defaults to 2.} +} +\value{ +Either a \code{Seurat} or \code{SingleCellExperiment} object if \code{expr.mat} is in either form, or a data.frame containing per-cell program scores if \code{expr.mat} is a matrix. +} +\description{ +This function uses \code{\link[UCell]{ScoreSignatures_UCell}} to create a per-cell module score for each of the provided gene clusters. If the +} +\examples{ +\dontrun{ +geneProgramScoring(seu_obj, + genes = gene_embed$gene, + gene.clusters = gene_embed$leiden, + program.labels = c("cell cycle", "organogenesis")) +} +} +\author{ +Jack Leary +} diff --git a/man/plotModelCoefs.Rd b/man/plotModelCoefs.Rd new file mode 100644 index 0000000..87302ac --- /dev/null +++ b/man/plotModelCoefs.Rd @@ -0,0 +1,49 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plotModelCoefs.R +\name{plotModelCoefs} +\alias{plotModelCoefs} +\title{Plot gene dynamics with estimated coefficients.} +\usage{ +plotModelCoefs( + test.dyn.res = NULL, + gene = NULL, + pt = NULL, + expr.mat = NULL, + size.factor.offset = NULL, + lineage = "A", + log1p.norm = TRUE +) +} +\arguments{ +\item{test.dyn.res}{The output from \code{\link{testDynamic}}. Defaults to NULL.} + +\item{gene}{A character specifying which gene's dynamics should be plotted. Defaults to NULL.} + +\item{pt}{A data.frame of pseudotime values for each cell. Defaults to NULL.} + +\item{expr.mat}{Either a \code{SingleCellExperiment} or \code{Seurat} object from which counts can be extracted, or a matrix of integer-valued counts with genes as rows & cells as columns. Defaults to NULL.} + +\item{size.factor.offset}{(Optional) An offset to be used to rescale the fitted values. Can be generated easily with \code{\link{createCellOffset}}. No need to provide if the GEE backend was used. Defaults to NULL.} + +\item{lineage}{A character vector specifying which lineage should be plotted. Should be letters, i.e. lineage "A" or "B". Defaults to "A".} + +\item{log1p.norm}{(Optional) Should log1p-normalized versions of expression & model predictions be returned as well? Defaults to TRUE.} +} +\value{ +A \code{ggplot2} object displaying a gene dynamics plot & a table of coefficients across pseudotime intervals. +} +\description{ +Generate a plot of gene dynamics over a single pseudotime lineage, along with a table of coefficients across pseudotime intervals. +} +\examples{ +\dontrun{ +plotModelCoefs(gene_stats, + gene = "BRCA2", + pt = pt_df, + expr.mat = seu_obj, + size.factor.offset = cell_offset) +} +} +\author{ +Jack Leary +} diff --git a/tests/testthat/test_scLANE.R b/tests/testthat/test_scLANE.R index 6512005..311b8cb 100644 --- a/tests/testthat/test_scLANE.R +++ b/tests/testthat/test_scLANE.R @@ -165,6 +165,13 @@ withr::with_output_sink(tempfile(), { plot.null = TRUE, plot.glm = TRUE, plot.gam = TRUE) + coef_plot_glm <- plotModelCoefs(glm_gene_stats, + gene = glm_test_results$Gene[1], + pt = pt_test, + expr.mat = sim_data, + size.factor.offset = cell_offset, + lineage = "A", + log1p.norm = TRUE) # gene clustering set.seed(312) gene_clusters_leiden <- clusterGenes(test.dyn.res = glm_gene_stats, @@ -211,6 +218,13 @@ withr::with_output_sink(tempfile(), { pc.return = 2, k.param = 5, random.seed = 312) + # gene program scoring + sim_data <- geneProgramScoring(sim_data, + genes = gene_embedding$gene, + gene.clusters = gene_embedding$leiden) + sim_data_seu <- geneProgramScoring(sim_data_seu, + genes = gene_embedding$gene, + gene.clusters = gene_embedding$leiden) # enrichment analysis gsea_res <- enrichDynamicGenes(glm_test_results, species = "hsapiens") # coefficients @@ -343,6 +357,11 @@ test_that("plotModels() output", { expect_equal(ncol(plot_glmm$data), 12) }) +test_that("plotModelCoefs() output", { + expect_s3_class(coef_plot_glm, "ggplot") + expect_s3_class(coef_plot_glm, "ggarrange") +}) + test_that("clusterGenes() output", { expect_s3_class(gene_clusters_leiden, "data.frame") expect_s3_class(gene_clusters_kmeans, "data.frame") @@ -370,6 +389,13 @@ test_that("embedGenes() output", { expect_equal(ncol(gene_embedding_pca), 6) }) +test_that("geneProgramScoring() output", { + expect_equal(colnames(SummarizedExperiment::colData(sim_data))[7], "cluster_0") + expect_equal(colnames(SummarizedExperiment::colData(sim_data))[8], "cluster_1") + expect_equal(colnames(sim_data_seu@meta.data)[10], "cluster_0") + expect_equal(colnames(sim_data_seu@meta.data)[11], "cluster_1") +}) + test_that("sortGenesHeatmap() output", { expect_type(sorted_genes, "character") expect_length(sorted_genes, ncol(smoothed_counts$Lineage_A)) diff --git a/vignettes/scLANE.Rmd b/vignettes/scLANE.Rmd index 2104fa3..c963f27 100644 --- a/vignettes/scLANE.Rmd +++ b/vignettes/scLANE.Rmd @@ -72,7 +72,7 @@ plotPCA(sim_data, colour_by = "subject") + # `scLANE` testing -The necessary inputs for `scLANE` are as follows: a dataframe containing $\ge 1$ pseudotime lineages, a sequencing depth-based offset, and a matrix of unnormalized gene expression counts (this can be a `SingleCellExperiment` or `Seurat` object, or an actual dense or sparse counts matrix). In addition, we can optionally provide a subset of genes to test - here we identify the top 1,000 most highly-variable genes (HVGs), and test only those. +The necessary inputs for `scLANE` are as follows: a dataframe containing $\ge 1$ pseudotime lineages, a sequencing depth-based offset, and a matrix of gene expression counts (this can be a `SingleCellExperiment` or `Seurat` object, or an actual dense or sparse counts matrix). In addition, we can optionally provide a subset of genes to test - here we identify the top 1,000 most highly-variable genes (HVGs), and test only those. ```{r} order_df <- data.frame(PT = sim_data$cell_time_normed) @@ -95,7 +95,7 @@ After model fitting has completed, we generate a tidy table of DE test results a ```{r, } scLANE_de_res <- getResultsDE(scLANE_models) select(scLANE_de_res, Gene, Test_Stat, P_Val, P_Val_Adj, Gene_Dynamic_Overall) %>% - slice_sample(n = 7) %>% + slice_sample(n = 5) %>% knitr::kable(digits = 3, caption = "Sample of scLANE TDE statistics", col.names = c("Gene", "LRT Stat.", "P-value", "Adj. p-value", "Pred. status")) @@ -122,7 +122,7 @@ plotModels(scLANE_models, Each gene's output contains a slot named `Gene_Dynamics` that summarizes the coefficients across each pseudotime interval. ```{r} -knitr::kable(scLANE_models$HIST1H4C$Lineage_A$Gene_Dynamics, +knitr::kable(scLANE_models[[scLANE_de_res$Gene[1]]]$Lineage_A$Gene_Dynamics, caption = "Summarized coefficients from scLANE", digits = 3) ``` @@ -234,6 +234,24 @@ ggplot(gene_embedding, aes(x = umap1, y = umap2, color = leiden)) + guides(color = guide_legend(override.aes = list(size = 2, alpha = 1, stroke = 1))) ``` +## Gene program scoring + +We assume that each cluster of similarly-behaving genes represents a gene program i.e., a set of genes that work together to perform a shared task. Each program is defined by the set of genes unique to it, and module scoring allows us to assign a per-cell numeric score for each set of genes. The `geneProgramScoring()` function performs this task using the `r Biocpkg("UCell")` package under the hood. + +```{r} +sim_data <- geneProgramScoring(sim_data, + genes = gene_embedding$gene, + gene.clusters = gene_embedding$leiden) + +``` + +Plotting the program scores for gene cluster 1 shows that cells at the end of the trajectory have overall high expression of genes in that cluster. + +```{r, fig.cap="Module scores for gene cluster 0"} +plotPCA(sim_data, colour_by = "cluster_1") + + theme_scLANE(umap = TRUE) +``` + ## Trajectory enrichment Lastly, we can perform pathway analysis on the set of dynamic genes using the `enrichDynamicGenes()` function. This is built off of the `r CRANpkg("gprofiler2")` package, which supports a wide variety of species and pathway databases. @@ -242,7 +260,7 @@ Lastly, we can perform pathway analysis on the set of dynamic genes using the `e dyn_gene_enrichment <- enrichDynamicGenes(scLANE_de_res, species = "hsapiens") filter(dyn_gene_enrichment$result, source == "GO:BP") %>% select(term_id, term_name, p_value, source) %>% - slice_head(n = 7) %>% + slice_head(n = 5) %>% knitr::kable(digits = 3, caption = "Trajectory pathway enrichment statistics", col.names = c("Term ID", "Term name", "Adj. p-value", "Source"))