diff --git a/NEWS.md b/NEWS.md index 4d9e81b..2bf0437 100644 --- a/NEWS.md +++ b/NEWS.md @@ -4,6 +4,7 @@ * Updated the `clusterGenes()` function to be much more efficient as well as changing the distance metric used to be cosine dissimilarity. * Added `theme_scLANE()` for output plots. * Enhanced documentation. +* Increased test coverage. # `scLANE` 0.7.2 diff --git a/R/backward_sel.R b/R/backward_sel.R deleted file mode 100644 index 6bedd04..0000000 --- a/R/backward_sel.R +++ /dev/null @@ -1,37 +0,0 @@ -#' Backward selection function for MARS that uses the generalized cross validation criterion (GCV). -#' -#' @name backward_sel -#' @importFrom stats lm.fit fitted -#' @param Y : the response variable. -#' @param B_new : the model matrix. -#' @param pen : the set/fixed penalty used for the GCV. The default is 2. -#' @param GCV.null : GCV value for the intercept model. The default is 0.001. -#' @return \code{backward_sel} returns the GCV from the fitted model. -#' @author Jakub Stoklosa and David I. Warton -#' @references Friedman, J. (1991). Multivariate adaptive regression splines. \emph{The Annals of Statistics}, \strong{19}, 1--67. -#' @references Milborrow, S. (2017a). Notes on the \code{earth} package. Package vignette. Available at: \url{http://127.0.0.1:31355/library/earth/doc/earth-notes.pdf}. -#' @references Milborrow, S. (2017b). \code{earth}: Multivariate Adaptive Regression Splines. R package version 4.4.7. Available at \url{http://CRAN.R-project.org/package = earth.} -#' @references Stoklosa, J. and Warton, D.I. (2018). A generalized estimating equation approach to multivariate adaptive regression splines. \emph{Journal of Computational and Graphical Statistics}, \strong{27}, 245--253. -#' @importFrom stats lm.fit fitted -#' @seealso \code{\link{backward_sel_WIC}} - -backward_sel <- function(Y = NULL, - B_new = NULL, - pen = 2, - GCV.null = 0.001) { - # check inputs - if (is.null(Y) || is.null(B_new)) { stop("Some inputs are missing from backward_sel().") } - if (GCV.null == 0) { stop("GCV.null in backward_sel() cannot be set to 0.") } - N <- length(Y) - n_pred <- ncol(B_new) - 1 - GCV1 <- rep(NA, n_pred) - for (j in seq(n_pred)) { - B_new1 <- as.matrix(B_new[, -(j + 1)]) - n_pred1 <- ncol(B_new1) - mod1 <- stats::lm.fit(B_new1, Y) - RSS_back <- sum((Y - stats::fitted(mod1))^2) - p <- n_pred1 + pen * (n_pred1 - 1) / 2 # This matches the earth() package, SAS and Friedman (1991) penalty. - GCV1[j] <- 1 - (RSS_back / (N * (1 - p / N)^2)) / GCV.null - } - return(GCV1) -} diff --git a/R/backward_sel_WIC.R b/R/backward_sel_WIC.R index b10426e..26bc18d 100644 --- a/R/backward_sel_WIC.R +++ b/R/backward_sel_WIC.R @@ -1,25 +1,31 @@ #' Backward selection function for MARGE - uses the Wald information criterion (WIC). #' #' @name backward_sel_WIC -#' @param Y : the response variable. Defaults to NULL. -#' @param B_new : the model matrix. Defaults to NULL. +#' @author Jakub Stoklosa +#' @author David I. Warton +#' @author Jack Leary +#' @importFrom gamlss gamlss +#' @importFrom geeM geem +#' @importFrom MASS negative.binomial +#' @importFrom withr with_output_sink +#' @param Y The response variable. Defaults to NULL. +#' @param B_new The model matrix. Defaults to NULL. #' @param is.gee Is the model a GEE? Defaults to FALSE. #' @param id.vec A vector of observation IDs that is necessary for fitting a GEE model. Defaults to NULL. #' @param cor.structure The specified working correlation structure of the GEE model. Must be one of "independence", "ar1", or "exchangeable". Defaults to NULL. #' @return \code{backward_sel_WIC} returns the Wald statistic from the fitted model (the penalty is applied later on). -#' @author Jakub Stoklosa and David I. Warton #' @references Stoklosa, J. Gibb, H. Warton, D.I. Fast forward selection for Generalized Estimating Equations With a Large Number of Predictor Variables. \emph{Biometrics}, \strong{70}, 110--120. #' @references Stoklosa, J. and Warton, D.I. (2018). A generalized estimating equation approach to multivariate adaptive regression splines. \emph{Journal of Computational and Graphical Statistics}, \strong{27}, 245--253. -#' @importFrom gamlss gamlss -#' @importFrom geeM geem -#' @importFrom MASS negative.binomial -#' @seealso \code{\link{backward_sel}} -backward_sel_WIC <- function(Y = NULL, B_new = NULL, is.gee = FALSE, id.vec = NULL, cor.structure = NULL) { +backward_sel_WIC <- function(Y = NULL, + B_new = NULL, + is.gee = FALSE, + id.vec = NULL, + cor.structure = NULL) { # check inputs if (is.null(Y) || is.null(B_new)) { stop("Some inputs are missing from backward_sel_WIC().") } - if (is.gee & is.null(id.vec)) { stop("GEEs require a vector of observation IDs in backward_sel_WIC().") } - if (is.gee & is.null(cor.structure)) { stop("GEEs require a working correlation structure in backward_sel_WIC().") } + if (is.gee && is.null(id.vec)) { stop("GEEs require a vector of observation IDs in backward_sel_WIC().") } + if (is.gee && is.null(cor.structure)) { stop("GEEs require a working correlation structure in backward_sel_WIC().") } cor.structure <- tolower(cor.structure) if (is.gee) { fit <- geeM::geem(Y ~ B_new - 1, @@ -32,10 +38,9 @@ backward_sel_WIC <- function(Y = NULL, B_new = NULL, is.gee = FALSE, id.vec = NU fit <- gamlss::gamlss(Y ~ B_new - 1, family = "NBI", trace = FALSE) - sink(tempfile()) - fit_sum <- summary(fit) - sink() - fit_sum_mat <- as.matrix(fit_sum) + withr::with_output_sink(tempfile(), { + fit_sum_mat <- as.matrix(summary(fit)) + }) wald_stat <- ((fit_sum_mat[, 3])[-c(1, nrow(fit_sum_mat))])^2 } return(wald_stat) diff --git a/R/embedGenes.R b/R/embedGenes.R index f41b307..b027a44 100644 --- a/R/embedGenes.R +++ b/R/embedGenes.R @@ -8,7 +8,7 @@ #' @importFrom stats as.dist #' @description Embed genes in dimension-reduced space given a smoothed counts matrix. #' @param smoothed.counts The output from \code{\link{smoothedCountsMatrix}}. Defaults to NULL. -#' @param genes A character vector of genes to embed. If not specified, all genes in \code{test.dyn.res} are used. Defaults to NULL. +#' @param genes A character vector of genes to embed. If not specified, all genes in \code{smoothed.counts} are used. Defaults to NULL. #' @param pc.embed (Optional) How many PCs should be used to cluster the genes and run UMAP? Defaults to 30. #' @param pcs.return (Optional) How many principal components should be included in the output? Defaults to 2. #' @param cluster.genes (Optional) Should genes be clustered in PCA space using the Leiden algorithm? Defaults to TRUE. @@ -34,10 +34,12 @@ embedGenes <- function(smoothed.counts = NULL, random.seed = 312) { # check inputs if (is.null(smoothed.counts)) { stop("You forgot to provide a smoothed counts matrix to embedGenes().") } + genes <- colnames(smoothed.counts) + smoothed.counts <- t(smoothed.counts) # embeddings set.seed(random.seed) - smoothed_counts_pca <- irlba::prcomp_irlba(t(smoothed.counts), - pc.embed = pc.embed, + smoothed_counts_pca <- irlba::prcomp_irlba(smoothed.counts, + n = pc.embed, center = TRUE, scale. = TRUE) smoothed_counts_umap <- uwot::umap(smoothed_counts_pca$x, @@ -46,33 +48,38 @@ embedGenes <- function(smoothed.counts = NULL, n_neighbors = k.param, init = "spectral") # clustering w/ silhouette score parameter tuning - smoothed_counts_snn <- bluster::makeSNNGraph(smoothed_counts_pca$x, - k = k.param, - type = "jaccard", - BNPARAM = BiocNeighbors::AnnoyParam(distance = "Cosine")) - dist_matrix <- stats::as.dist(1 - coop::tcosine(x = smoothed_counts_pca$x)) - clust_runs <- purrr::map(c(0.1, 0.2, 0.3, 0.4, 0.5), \(r) { - smoothed_counts_clust <- igraph::cluster_leiden(smoothed_counts_snn, - objective_function = "modularity", - resolution_parameter = r) - if (smoothed_counts_clust$nb_clusters == 1) { - sil_val <- 0 - } else { - sil_val <- mean(cluster::silhouette(as.integer(smoothed_counts_clust$membership - 1L), dist_matrix)[, 3]) - } - clust_res <- list(clusters = as.factor(smoothed_counts_clust$membership - 1L), - resolution = r, - silhouette = sil_val) - return(clust_res) - }) - best_clustering <- which.max(purrr::map_dbl(clust_runs, \(x) x$silhouette)) + if (cluster.genes) { + smoothed_counts_snn <- bluster::makeSNNGraph(smoothed_counts_pca$x, + k = k.param, + type = "jaccard", + BNPARAM = BiocNeighbors::AnnoyParam(distance = "Cosine")) + dist_matrix <- stats::as.dist(1 - coop::tcosine(x = smoothed_counts_pca$x)) + clust_runs <- purrr::map(c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7), \(r) { + smoothed_counts_clust <- igraph::cluster_leiden(smoothed_counts_snn, + objective_function = "modularity", + resolution_parameter = r) + if (smoothed_counts_clust$nb_clusters == 1) { + sil_val <- 0 + } else { + sil_val <- mean(cluster::silhouette(as.integer(smoothed_counts_clust$membership - 1L), dist_matrix)[, 3]) + } + clust_res <- list(clusters = as.factor(smoothed_counts_clust$membership - 1L), + resolution = r, + silhouette = sil_val) + return(clust_res) + }) + best_clustering <- which.max(purrr::map_dbl(clust_runs, \(x) x$silhouette)) + cluster_vec <- clust_runs[[best_clustering]]$clusters + } else { + cluster_vec <- NA_integer_ + } # prepare results pca_df <- as.data.frame(smoothed_counts_pca$x[, seq(pcs.return)]) colnames(pca_df) <- paste0("pc", seq(pcs.return)) - gene_clust_df <- data.frame(gene = colnames(smoothed.counts), - leiden = clust_runs[[best_clustering]]$clusters, - umap1 = smoothed_counts_umap[, 1], - umap2 = smoothed_counts_umap[, 2]) - gene_clust_df <- dplyr::bind_cols(gene_clust_df, pca_df) - return(gene_clust_df) + gene_df <- data.frame(gene = genes, + leiden = cluster_vec, + umap1 = smoothed_counts_umap[, 1], + umap2 = smoothed_counts_umap[, 2]) + gene_df <- dplyr::bind_cols(gene_df, pca_df) + return(gene_df) } diff --git a/R/stripGLM.R b/R/stripGLM.R index 7389d8c..e85d156 100644 --- a/R/stripGLM.R +++ b/R/stripGLM.R @@ -17,8 +17,6 @@ stripGLM <- function(glm.obj = NULL) { if (inherits(glm.obj, "try-error")) {return(glm.obj)} if (is.null(glm.obj)) { stop("You forgot to supply inputs to stripGLM().") } if (!inherits(glm.obj, "glm")) { stop("Input to stripGLM() must be of class glm.") } - - # strip out unnecessary glm pieces glm.obj$effects <- c() diff --git a/man/backward_sel.Rd b/man/backward_sel.Rd deleted file mode 100644 index 88f17e4..0000000 --- a/man/backward_sel.Rd +++ /dev/null @@ -1,38 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/backward_sel.R -\name{backward_sel} -\alias{backward_sel} -\title{Backward selection function for MARS that uses the generalized cross validation criterion (GCV).} -\usage{ -backward_sel(Y = NULL, B_new = NULL, pen = 2, GCV.null = 0.001) -} -\arguments{ -\item{Y}{: the response variable.} - -\item{B_new}{: the model matrix.} - -\item{pen}{: the set/fixed penalty used for the GCV. The default is 2.} - -\item{GCV.null}{: GCV value for the intercept model. The default is 0.001.} -} -\value{ -\code{backward_sel} returns the GCV from the fitted model. -} -\description{ -Backward selection function for MARS that uses the generalized cross validation criterion (GCV). -} -\references{ -Friedman, J. (1991). Multivariate adaptive regression splines. \emph{The Annals of Statistics}, \strong{19}, 1--67. - -Milborrow, S. (2017a). Notes on the \code{earth} package. Package vignette. Available at: \url{http://127.0.0.1:31355/library/earth/doc/earth-notes.pdf}. - -Milborrow, S. (2017b). \code{earth}: Multivariate Adaptive Regression Splines. R package version 4.4.7. Available at \url{http://CRAN.R-project.org/package = earth.} - -Stoklosa, J. and Warton, D.I. (2018). A generalized estimating equation approach to multivariate adaptive regression splines. \emph{Journal of Computational and Graphical Statistics}, \strong{27}, 245--253. -} -\seealso{ -\code{\link{backward_sel_WIC}} -} -\author{ -Jakub Stoklosa and David I. Warton -} diff --git a/man/backward_sel_WIC.Rd b/man/backward_sel_WIC.Rd index cc7f07d..4d45e6a 100644 --- a/man/backward_sel_WIC.Rd +++ b/man/backward_sel_WIC.Rd @@ -13,9 +13,9 @@ backward_sel_WIC( ) } \arguments{ -\item{Y}{: the response variable. Defaults to NULL.} +\item{Y}{The response variable. Defaults to NULL.} -\item{B_new}{: the model matrix. Defaults to NULL.} +\item{B_new}{The model matrix. Defaults to NULL.} \item{is.gee}{Is the model a GEE? Defaults to FALSE.} @@ -34,9 +34,10 @@ Stoklosa, J. Gibb, H. Warton, D.I. Fast forward selection for Generalized Estima Stoklosa, J. and Warton, D.I. (2018). A generalized estimating equation approach to multivariate adaptive regression splines. \emph{Journal of Computational and Graphical Statistics}, \strong{27}, 245--253. } -\seealso{ -\code{\link{backward_sel}} -} \author{ -Jakub Stoklosa and David I. Warton +Jakub Stoklosa + +David I. Warton + +Jack Leary } diff --git a/man/embedGenes.Rd b/man/embedGenes.Rd index 9743bf6..757cb28 100644 --- a/man/embedGenes.Rd +++ b/man/embedGenes.Rd @@ -18,7 +18,7 @@ embedGenes( \arguments{ \item{smoothed.counts}{The output from \code{\link{smoothedCountsMatrix}}. Defaults to NULL.} -\item{genes}{A character vector of genes to embed. If not specified, all genes in \code{test.dyn.res} are used. Defaults to NULL.} +\item{genes}{A character vector of genes to embed. If not specified, all genes in \code{smoothed.counts} are used. Defaults to NULL.} \item{pc.embed}{(Optional) How many PCs should be used to cluster the genes and run UMAP? Defaults to 30.} diff --git a/tests/testthat/test_scLANE.R b/tests/testthat/test_scLANE.R index 1b57648..3f1ac41 100644 --- a/tests/testthat/test_scLANE.R +++ b/tests/testthat/test_scLANE.R @@ -1,6 +1,8 @@ # load data & prepare for testing load(system.file("testdata/sim_test_data.RData", package = "scLANE")) +sim_data_seu <- Seurat::as.Seurat(sim_data) cell_offset <- createCellOffset(sim_data) +cell_offset_seu <- createCellOffset(sim_data_seu) genes_to_test <- c(rownames(sim_data)[SummarizedExperiment::rowData(sim_data)$geneStatus_overall == "Dynamic"][1:10], rownames(sim_data)[SummarizedExperiment::rowData(sim_data)$geneStatus_overall == "NotDynamic"][1:10]) counts_test <- t(as.matrix(SingleCellExperiment::counts(sim_data)[genes_to_test, ])) @@ -80,6 +82,7 @@ withr::with_output_sink(tempfile(), { return.basis = TRUE, return.GCV = TRUE, return.WIC = TRUE) + marge_mod_stripped <- stripGLM(marge_mod$final_mod) # run GLM model -- with offset marge_mod_offset <- marge2(X_pred = pt_test, Y = counts_test[, 3], @@ -206,8 +209,11 @@ test_that("internal marge functions", { test_that("createCellOffset() output", { expect_type(cell_offset, "double") + expect_type(cell_offset_seu, "double") expect_length(cell_offset, 300) + expect_length(cell_offset_seu, 300) expect_false(any(is.na(cell_offset))) + expect_false(any(is.na(cell_offset_seu))) }) test_that("testDynamic() output", { @@ -266,6 +272,7 @@ test_that("marge2() output -- GLM backend", { expect_s3_class(marge_mod_offset, "marge") expect_s3_class(marge_mod$final_mod, "negbin") expect_s3_class(marge_mod_offset$final_mod, "negbin") + expect_s3_class(marge_mod_stripped, "negbin") expect_equal(marge_mod$model_type, "GLM") expect_equal(marge_mod_offset$model_type, "GLM") expect_true(marge_mod$final_mod$converged)