prepping for BioConductor submission -- addresses #96

jr-leary7 · Oct 19, 2023 · deed8d9 · deed8d9
1 parent 3889195
commit deed8d9
Show file tree

Hide file tree

Showing 22 changed files with 328 additions and 59 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -1,3 +1,5 @@
 ^.*\.Rproj$
 ^\.Rproj\.user$
 ^\.github$
+^doc$
+^Meta$
diff --git a/.gitignore b/.gitignore
@@ -5,3 +5,5 @@
 .Rbuildignore
 inst/doc
 codecov.yml
+/doc/
+/Meta/
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: scLANE
 Type: Package
 Title: Model gene expression dynamics with spline-based NB GLMs, GEEs, & GLMMs
-Version: 0.7.4
+Version: 0.7.5
 Authors@R: c(person(given = "Jack", family = "Leary", email = "j.leary@ufl.edu", role = c("aut", "cre"), comment = c(ORCID = "0009-0004-8821-3269")), 
              person(given = "Rhonda", family = "Bacher", email = "rbacher@ufl.edu", role = c("ctb", "fnd"), comment = c(ORCID = "0000-0001-5787-476X")))
 Description: This package uses truncated power basis spline models to build flexible, interpretable models of single cell gene expression over pseudotime or latent time. 
@@ -55,6 +55,7 @@ Suggests:
     bluster, 
     cluster, 
     rmarkdown,
+    BiocStyle, 
     slingshot, 
     gprofiler2,
     BiocGenerics, 

diff --git a/NAMESPACE b/NAMESPACE
@@ -122,6 +122,7 @@ importFrom(stats,predict)
 importFrom(stats,qnorm)
 importFrom(stats,quantile)
 importFrom(stats,setNames)
+importFrom(stats,vcov)
 importFrom(tidyr,pivot_longer)
 importFrom(tidyselect,everything)
 importFrom(utils,tail)

diff --git a/NEWS.md b/NEWS.md
@@ -1,22 +1,32 @@
-# `scLANE` 0.7.3
+# `scLANE` v0.7.5
+
+* Preparing for BioConductor submission i.e., reformatting code, adding documentation, etc.
+* Added convolution function `npConvolve()` to be used for e.g., heatmap smoothing.
+
+# `scLANE` v0.7.4
+
+* Added the `getKnotDist()` function to pull the set of empirically-identified knots for a user-provided gene set.
+* Minor enhancements & documentation improvements. 
+
+# `scLANE` v0.7.3
 
 * Added a function named `embedGenes()` that takes a smoothed counts matrix as input & returns PCA & UMAP embeddings along with a graph-based clustering. 
 * Updated the `clusterGenes()` function to be much more efficient as well as changing the distance metric used to be cosine dissimilarity. 
 * Added `theme_scLANE()` for output plots. 
 * Enhanced documentation. 
 * Increased test coverage.
 
-# `scLANE` 0.7.2 
+# `scLANE` v0.7.2 
 
 * Added a function named `sortGenesHeatmap()` that aids in the creation of expression cascade heatmaps by sorting genes according to where in pseudotime their peak expression is. 
 * Changed the parameter `approx.knot` in the `testDynamic()` function to use (stochasticity-controlled) subsampling instead of `seq()` to reduce candidate knot space. 
 * Added `summarizeModels()` to sum up slopes across pseudotime intervals. 
 
-# `scLANE` 0.7.1
+# `scLANE` v0.7.1
 
 * Changed input format of all functions to allow counts matrices formatted as `SingleCellExperiment` or `Seurat` objects, sparse matrices, or dense matrices.
 * Updated visualization functions to reflect changes made in `ggplot2` v3.4 (mostly changing the `size` parameter in line-based geoms to be `linewidth` instead). 
 
-# `scLANE` 0.6.3
+# `scLANE` v0.6.3
 
 * Added a `NEWS.md` file to track changes to the package.
diff --git a/R/GetResultsDE.R b/R/GetResultsDE.R
@@ -34,7 +34,7 @@ getResultsDE <- function(test.dyn.res = NULL,
                               function(x) {
                                 purrr::map_dfr(x,
                                                function(y) {
-                                                 as.data.frame(rbind(y[1:12])) %>%
+                                                 as.data.frame(rbind(y[seq(12)])) %>%
                                                    dplyr::mutate(dplyr::across(tidyselect::everything(), \(z) unname(unlist(z))))
                                                })
                               }) %>%

diff --git a/R/embedGenes.R b/R/embedGenes.R
@@ -41,7 +41,6 @@ embedGenes <- function(smoothed.counts = NULL,
   genes <- colnames(smoothed.counts)
   smoothed.counts <- t(smoothed.counts)
   # embeddings
-  set.seed(random.seed)
   smoothed_counts_pca <- irlba::prcomp_irlba(smoothed.counts,
                                              n = pc.embed,
                                              center = TRUE,
@@ -52,14 +51,16 @@ embedGenes <- function(smoothed.counts = NULL,
                                        metric = "cosine",
                                        n_neighbors = k.param,
                                        init = "spectral",
-                                       nn_method = "annoy")
+                                       nn_method = "annoy",
+                                       seed = random.seed)
   } else {
     smoothed_counts_umap <- uwot::umap(smoothed.counts,
                                        n_components = 2,
                                        metric = "cosine",
                                        n_neighbors = k.param,
                                        init = "spectral",
-                                       nn_method = "annoy")
+                                       nn_method = "annoy",
+                                       seed = random.seed)
   }
   # clustering w/ silhouette score parameter tuning
   if (cluster.genes) {

diff --git a/R/getFittedValues.R b/R/getFittedValues.R
@@ -116,16 +116,16 @@ getFittedValues <- function(test.dyn.res = NULL,
                          }) %>%
               purrr::reduce(rbind)
     if (!is.null(id.vec)) {
-      mod_df$id <- id.vec[!is.na(x)]
+      mod_df <- dplyr::mutate(mod_df, subj_id = id.vec[!is.na(x)])
     } else {
-      mod_df$id <- NA_character_
+      mod_df <- dplyr::mutate(mod_df, subj_id = NA_character_)
     }
     return(mod_df)
   })
   final_df <- purrr::reduce(mod_df_list, rbind) %>%
-              dplyr::relocate(cell, id, lineage)
-  if (all(is.na(final_df$id))) {
-    final_df <- dplyr::select(final_df, -id)
+              dplyr::relocate(cell, subj_id, lineage)
+  if (is.null(id.vec)) {
+    final_df <- dplyr::select(final_df, -subj_id)
   }
   if (!is.null(filter.lineage)) {
     final_df <- dplyr::filter(final_df, !lineage %in% filter.lineage)

diff --git a/R/marge2.R b/R/marge2.R
@@ -411,7 +411,7 @@ marge2 <- function(X_pred = NULL,
             if (utils::tail(max(score_knot_one_int_mat, na.rm = TRUE), n = 1) > utils::tail(max(score_knot_one_add_mat, na.rm = TRUE), n = 1)) {
               int <- TRUE
               score_knot <- score_knot_one_int_mat
-              temp <- utils::tail(which(utils::tail(max(round(score_knot, 6), na.rm = T), n = 1) == round(score_knot, 6), arr.ind = TRUE), n = 1)
+              temp <- utils::tail(which(utils::tail(max(round(score_knot, 6), na.rm = TRUE), n = 1) == round(score_knot, 6), arr.ind = TRUE), n = 1)
               min_knot1 <- temp[1]
               best.var <- temp[2]
             } else {
@@ -784,7 +784,7 @@ marge2 <- function(X_pred = NULL,
   wic_mat_2[1, (ncol_B + 1)] <- full.wic
   wic1_2 <- backward_sel_WIC(Y = Y, B_new = B_new)
   wic_mat_2[2, 2:(length(wic1_2) + 1)] <- wic1_2
-  WIC_2 <- sum(apply(wic_mat_2[1:2, ], 1, min, na.rm = TRUE)) + 2 * ncol(B_new)
+  WIC_2 <- sum(apply(wic_mat_2[seq(2), ], 1, min, na.rm = TRUE)) + 2 * ncol(B_new)
   WIC_vec_2 <- c(WIC_vec_2, WIC_2)
 
   variable.lowest_2 <- as.numeric(which(wic1_2 == min(wic1_2, na.rm = TRUE))[1])
@@ -795,14 +795,14 @@ marge2 <- function(X_pred = NULL,
     wic1_2 <- backward_sel_WIC(Y = Y, B_new = B_new_2)
     if (i != (ncol_B - 1)) {
       wic_mat_2[(i + 1), colnames(B_new_2)[-1]] <- wic1_2
-      WIC_2 <- sum(apply(wic_mat_2[1:(i + 1), ], 1, min, na.rm = TRUE)) + 2 * ncol(B_new_2)
+      WIC_2 <- sum(apply(wic_mat_2[seq(i + 1), ], 1, min, na.rm = TRUE)) + 2 * ncol(B_new_2)
       WIC_vec_2 <- c(WIC_vec_2, WIC_2)
       variable.lowest_2 <- as.numeric(which(wic1_2 == min(wic1_2, na.rm = TRUE))[1])
       var.low.vec_2 <- c(var.low.vec_2, colnames(B_new_2)[variable.lowest_2 + 1])
       B_new_2 <- as.matrix(B_new_2[, -(variable.lowest_2 + 1)])
     } else {
       wic_mat_2[(i + 1), colnames(B_new_2)[-1]] <- wic1_2
-      WIC_2 <- sum(apply(wic_mat_2[1:(ncol_B), ], 1, min, na.rm = TRUE)) + 2 * ncol(B_new_2)
+      WIC_2 <- sum(apply(wic_mat_2[seq(ncol_B), ], 1, min, na.rm = TRUE)) + 2 * ncol(B_new_2)
       WIC_vec_2 <- c(WIC_vec_2, WIC_2)
       B_new_2 <- as.matrix(B_new_2[, -(variable.lowest_2)])
       colnames(B_new_2) <- "Intercept"

diff --git a/R/max_span.R b/R/max_span.R
@@ -17,7 +17,7 @@ max_span <- function(X_red = NULL,
   N <- length(unique(X_red))
   x <- sort(unique(X_red))
   maxspan <- round((3 - log2(alpha / q)))
-  x_new <- x[-c(1:maxspan, floor(N - maxspan + 1):N)]
+  x_new <- x[-c(seq(maxspan), floor(N - maxspan + 1):N)]
   if (length(x_new) == 0) {
     x_new <- x
   }

diff --git a/R/plotModels.R b/R/plotModels.R
@@ -154,7 +154,7 @@ plotModels <- function(test.dyn.res = NULL,
                                                         se = TRUE,
                                                         REML = FALSE)
                           }
-                          glm_preds <- data.frame(predict(glm_mod, type = "link", se.fit = TRUE)[1:2])
+                          glm_preds <- data.frame(predict(glm_mod, type = "link", se.fit = TRUE)[seq(2)])
                         } else {
                           glm_formula <- "COUNT ~ PT"
                           if (!is.null(size.factor.offset)) {
@@ -168,7 +168,7 @@ plotModels <- function(test.dyn.res = NULL,
                                                   method = "glm.fit2",
                                                   link = log,
                                                   init.theta = 1)
-                          glm_preds <- data.frame(stats::predict(glm_mod, type = "link", se.fit = TRUE)[1:2])
+                          glm_preds <- data.frame(stats::predict(glm_mod, type = "link", se.fit = TRUE)[seq(2)])
                         }
                         x <- dplyr::mutate(x,
                                            RESP_GLM = glm_preds$fit,
@@ -202,7 +202,7 @@ plotModels <- function(test.dyn.res = NULL,
                                              pt = x$PT)
                           }
                         }
-                        gam_preds <- data.frame(predict(gam_mod, type = "link", se.fit = TRUE)[1:2])
+                        gam_preds <- data.frame(predict(gam_mod, type = "link", se.fit = TRUE)[seq(2)])
                         x <- dplyr::mutate(x,
                                            RESP_GAM = gam_preds$fit,
                                            SE_GAM = gam_preds$se.fit)

diff --git a/R/pull_sumy.R b/R/pull_sumy.R
@@ -31,15 +31,15 @@ pull.null.sumy <- function(mod.obj, is.gee, is.glmm) {
                    `p.value` = null_glmm_summary$p.value[1])
       }, silent = TRUE)
       null_pred_df <- try({
-        data.frame(predict(mod.obj, type = "link", se.fit = TRUE)[1:2]) %>%
+        data.frame(predict(mod.obj, type = "link", se.fit = TRUE)[seq(2)]) %>%
           dplyr::rename(null_link_fit = fit, null_link_se = se.fit)
       }, silent = TRUE)
     }  else {
       null_sumy_df <- try({
         as.data.frame(broom.mixed::tidy(mod.obj)) # saves a few bytes by converting from tibble
       }, silent = TRUE)
       null_pred_df <- try({
-        data.frame(stats::predict(mod.obj, type = "link", se.fit = TRUE)[1:2]) %>%
+        data.frame(stats::predict(mod.obj, type = "link", se.fit = TRUE)[seq(2)]) %>%
           dplyr::rename(null_link_fit = fit, null_link_se = se.fit)
       }, silent = TRUE)
     }
@@ -87,7 +87,7 @@ pull.marge.sumy <- function(mod.obj,
 
     } else if (is.glmm) {
       marge_pred_df <- try({
-        data.frame(predict(mod.obj$final_mod, type = "link", se.fit = TRUE)[1:2]) %>%
+        data.frame(predict(mod.obj$final_mod, type = "link", se.fit = TRUE)[seq(2)]) %>%
           dplyr::rename(marge_link_fit = fit, marge_link_se = se.fit)
       }, silent = TRUE)
       marge_sumy_df <- try({
@@ -101,7 +101,7 @@ pull.marge.sumy <- function(mod.obj,
 
     } else {
       marge_pred_df <- try({
-        data.frame(stats::predict(mod.obj$final_mod, type = "link", se.fit = TRUE)[1:2]) %>%
+        data.frame(stats::predict(mod.obj$final_mod, type = "link", se.fit = TRUE)[seq(2)]) %>%
           dplyr::rename(marge_link_fit = fit, marge_link_se = se.fit)
       }, silent = TRUE)
       marge_sumy_df <- try({

diff --git a/R/score_fun_gee.R b/R/score_fun_gee.R
@@ -54,13 +54,13 @@ score_fun_gee <- function(Y = NULL,
     J21 <- Sigma21 <- matrix(0, nrow = p, ncol = p1)
 
     for (i in seq(N)) {
-      k <- sum(n_vec[1:i])
+      k <- sum(n_vec[seq(i)])
       VS.est_i <- VS.est_list[[i]]
       AWA.est_i <- AWA.est_list[[i]]
       J2_i <- J2_list[[i]]
       Sigma2_i <- Sigma2_list[[i]]
-      D.est_i <- eigenMapMatMult(A = diag((mu.est[(sum(n_vec1[1:i]) + 1):k]), nrow = n_vec[i], ncol = n_vec[i]),
-                                 B = XA[(sum(n_vec1[1:i]) + 1):k, ],
+      D.est_i <- eigenMapMatMult(A = diag((mu.est[(sum(n_vec1[seq(i)]) + 1):k]), nrow = n_vec[i], ncol = n_vec[i]),
+                                 B = XA[(sum(n_vec1[seq(i)]) + 1):k, ],
                                  n_cores = 1)
       D_est_i_transpose <- t(D.est_i)
       J21 <- J21 + eigenMapMatMult(A = D_est_i_transpose,

diff --git a/R/smoothedCountsMatrix.R b/R/smoothedCountsMatrix.R
@@ -49,7 +49,7 @@ smoothedCountsMatrix <- function(test.dyn.res = NULL,
   } else {
     genes <- names(test.dyn.res)
   }
-  lineages <- LETTERS[1:length(test.dyn.res[[1]])]
+  lineages <- LETTERS[seq(length(test.dyn.res[[1]]))]
   colnames(pt) <- paste0("Lineage_", lineages)
   lineage_mat_list <- purrr::map(lineages, \(x) {
     lineage_name <- paste0("Lineage_", x)

diff --git a/R/sortGenesHeatmap.R b/R/sortGenesHeatmap.R
@@ -31,18 +31,18 @@ sortGenesHeatmap <- function(heatmap.mat = NULL, pt.vec = NULL) {
   # identify point at which peak expression occurs for each gene across pseudotime
   gene_peak_order <- purrr::map(seq_len(ncol(heatmap.mat)), \(x) {
     data.frame(gene = colnames(heatmap.mat)[x],
-               pt = pt.vec,
+               pseudotime = pt.vec,
                mRNA = heatmap.mat[, x]) %>%
       dplyr::filter(mRNA == max(mRNA)) %>%
       dplyr::distinct() %>%
       dplyr::select(gene,
-                    pt,
+                    pseudotime,
                     mRNA)
   }) %>%
     purrr::reduce(rbind) %>%
     dplyr::with_groups(gene,
                        dplyr::summarise,
-                       mu = mean(pt)) %>%
+                       mu = mean(pseudotime)) %>%
     dplyr::arrange(mu) %>%
     dplyr::pull(gene)
   return(gene_peak_order)

diff --git a/R/stat_out_score_gee_null.R b/R/stat_out_score_gee_null.R
@@ -46,19 +46,19 @@ stat_out_score_gee_null <- function(Y = NULL,
   J11 <- Sigma11 <- matrix(0, ncol(B_null), ncol(B_null))
 
   for (i in seq(N)) {
-    k <- sum(n_vec[1:i])
+    k <- sum(n_vec[seq(i)])
     # set up working correlation matrix structure
     if (cor.structure == "independence") {
       R_alpha <- diag(1, n_vec[i], n_vec[i])
     } else if (cor.structure == "exchangeable") {
       R_alpha <- matrix(alpha_est, n_vec[i], n_vec[i]) + diag(c(1 - alpha_est), n_vec[i], n_vec[i])
     } else if (cor.structure == "ar1") {
-      R_alpha <- alpha_est^outer(1:n_vec[i], 1:n_vec[i], \(x, y) abs(x - y))
+      R_alpha <- alpha_est^outer(seq(n_vec[i]), seq(n_vec[i]), \(x, y) abs(x - y))
     } else {
       stop("Currently unsupported correlation structure.")
     }
     # compute iteration values for each statistic
-    temp_seq_n <- (sum(n_vec1[1:i]) + 1):k
+    temp_seq_n <- (sum(n_vec1[seq(i)]) + 1):k
     mu_est_sub <- mu_est[temp_seq_n]
     diag_sqrt_V_est <- diag(sqrt(V_est[temp_seq_n]),
                             nrow = n_vec[i],

diff --git a/R/summarizeModel.R b/R/summarizeModel.R
@@ -4,6 +4,7 @@
 #' @author Jack Leary
 #' @author Rhonda Bacher
 #' @import magrittr
+#' @importFrom stats vcov
 #' @description This function summarizes the model for each gene and allows for quantiative interpretation of fitted gene dynamics.
 #' @param marge.model The fitted model output from \code{\link{marge2}} (this function is internally called by \code{\link{testDynamic}}). Defaults to NULL.
 #' @param pt The predictor matrix of pseudotime. Defaults to NULL.
@@ -27,7 +28,7 @@ summarizeModel <- function(marge.model = NULL, pt = NULL) {
     if (marge.model$model_type == "GEE") {
       coef_vcov <- as.matrix(marge.model$final_mod$var)
     } else {
-      coef_vcov <- vcov(marge.model$final_mod)
+      coef_vcov <- stats::vcov(marge.model$final_mod)
     }
     coef_df <- data.frame(coef_name = names(coef(marge.model$final_mod)),
                           coef_value = unname(coef(marge.model$final_mod)),

diff --git a/R/testDynamic.R b/R/testDynamic.R
@@ -29,7 +29,7 @@
 #' @param n.cores (Optional) If running in parallel, how many cores should be used? Defaults to 2.
 #' @param approx.knot (Optional) Should the knot space be reduced in order to improve computation time? Defaults to TRUE.
 #' @param glmm.adaptive (Optional) Should the basis functions for the GLMM be chosen adaptively? If not, uses 4 evenly spaced knots. Defaults to FALSE.
-#' @param track.time (Optional) A boolean indicating whether the amount of time the function takes to run should be tracked and printed to the console. Useful for debugging. Defaults to FALSE.
+#' @param track.time (Optional) A boolean indicating whether the amount of time the function takes to run should be tracked and printed to the console. Defaults to TRUE.
 #' @param random.seed (Optional) The random seed used to initialize RNG streams in parallel. Defaults to 312.
 #' @details
 #' \itemize{
@@ -87,7 +87,7 @@ testDynamic <- function(expr.mat = NULL,
                         parallel.exec = TRUE,
                         n.cores = 2,
                         approx.knot = TRUE,
-                        track.time = FALSE,
+                        track.time = TRUE,
                         random.seed = 312) {
   # check inputs
   if (is.null(expr.mat) || is.null(pt)) { stop("You forgot some inputs to testDynamic().") }
@@ -123,7 +123,7 @@ testDynamic <- function(expr.mat = NULL,
 
   # set pseudotime lineage column names automatically to prevent user error (uses e.g., "Lineage_A", "Lineage_B", etc.)
   n_lineages <- ncol(pt)
-  colnames(pt) <- paste0("Lineage_", LETTERS[1:n_lineages])
+  colnames(pt) <- paste0("Lineage_", LETTERS[seq(n_lineages)])
 
   # ensure subject ID vector meets criteria for GEE / GLMM fitting
   if ((is.gee || is.glmm) && is.null(id.vec)) { stop("You must provide a vector of IDs if you're using GEE / GLMM backends.") }
@@ -333,11 +333,11 @@ testDynamic <- function(expr.mat = NULL,
                             is.glmm = is.glmm)
      }
      # add test stats to result list
-     lineage_list[[j]]$Test_Stat = ifelse(is.gee, test_res$Wald_Stat, test_res$LRT_Stat)
-     lineage_list[[j]]$Test_Stat_Note = test_res$Notes
-     lineage_list[[j]]$P_Val = test_res$P_Val
+     lineage_list[[j]]$Test_Stat <- ifelse(is.gee, test_res$Wald_Stat, test_res$LRT_Stat)
+     lineage_list[[j]]$Test_Stat_Note <- test_res$Notes
+     lineage_list[[j]]$P_Val <- test_res$P_Val
     }
-    names(lineage_list) <- paste0("Lineage_", LETTERS[1:n_lineages])
+    names(lineage_list) <- paste0("Lineage_", LETTERS[seq(n_lineages)])
     lineage_list
   }
 
@@ -374,7 +374,7 @@ testDynamic <- function(expr.mat = NULL,
              MARGE_Slope_Data = NULL,
              Gene_Dynamics = NULL)
       })
-      names(reformatted_results) <- paste0("Lineage_", LETTERS[1:n_lineages])
+      names(reformatted_results) <- paste0("Lineage_", LETTERS[seq(n_lineages)])
       return(reformatted_results)
     } else {
       return(x)