Merge pull request #138 from jr-leary7/dev

Dev
jr-leary7 · Oct 11, 2023 · 31f1002 · 31f1002
2 parents 226b31b + 62f8b0f
commit 31f1002
Show file tree

Hide file tree

Showing 19 changed files with 345 additions and 279 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -1,2 +1,3 @@
 ^.*\.Rproj$
 ^\.Rproj\.user$
+^\.github$
diff --git a/.github/workflows/render-README.yaml b/.github/workflows/render-README.yaml
@@ -0,0 +1,36 @@
+# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
+# Additional help taken from https://fromthebottomoftheheap.net/2020/04/30/rendering-your-readme-with-github-actions/
+
+on:
+  push:
+    branches: [main, master]
+    paths: README.Rmd
+
+name: render-README
+
+jobs:
+  render:
+    runs-on: ubuntu-latest
+    env:
+      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
+    steps:
+      - uses: actions/checkout@v3
+      - uses: r-lib/actions/setup-r@v2
+      - uses: r-lib/actions/setup-r-dependencies@v2
+      - uses: r-lib/actions/setup-pandoc@v2
+
+      - name: install CRAN packages
+        run: Rscript -e 'install.packages(c("rmarkdown","ggplot2", "dplyr", "purrr", "remotes"))'
+      - name: install BioConductor packages
+        run: Rscript -e 'install.packages("BiocManager"); BiocManager::install(c("SingleCellExperiment", "scater", "scran", "scaffold"))'
+      - name: install GitHub packages
+        run: Rscript -e 'remotes::install_github("jr-leary7/scLANE")'
+
+      - name: render README
+        run: Rscript -e 'rmarkdown::render("README.Rmd", output_format = "md_document")'
+
+      - name: commit rendered README
+        run: |
+          git add README.md man/figures/README-*
+          git commit -m "Recompiled README.Rmd" || echo "No changes to commit"
+          git push origin main || echo "No changes to commit"
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -9,7 +9,7 @@ Description: This package uses truncated power basis spline models to build flex
              Downstream analysis functionalities include model comparison, dynamic gene clustering, smoothed counts generation, gene set enrichment testing, & visualization. 
 License: MIT + file LICENSE
 Encoding: UTF-8
-LazyData: true
+LazyData: false
 RoxygenNote: 7.2.1
 Depends: 
   glm2, 
@@ -70,11 +70,13 @@ LinkingTo:
 biocViews:
     RNASeq,
     Software,
+    Clustering, 
     TimeCourse,
     Sequencing,
     Regression,
     SingleCell, 
     Visualization, 
     GeneExpression,
     Transcriptomics,
+    GeneSetEnrichment,
     DifferentialExpression
diff --git a/R/GetResultsDE.R b/R/GetResultsDE.R
@@ -5,7 +5,7 @@
 #' @description This function turns the nested list differential expression results of \code{\link{testDynamic}} and turns them into a tidy \code{data.frame}.
 #' @import magrittr
 #' @importFrom purrr map_dfr
-#' @importFrom dplyr arrange mutate across if_else with_groups relocate
+#' @importFrom dplyr arrange desc mutate across if_else with_groups relocate
 #' @importFrom tidyselect everything
 #' @importFrom stats p.adjust p.adjust.methods
 #' @param test.dyn.res The nested list returned by \code{\link{testDynamic}}. Defaults to NULL.
@@ -38,7 +38,7 @@ getResultsDE <- function(test.dyn.res = NULL,
                                                    dplyr::mutate(dplyr::across(tidyselect::everything(), \(z) unname(unlist(z))))
                                                })
                               }) %>%
-               dplyr::arrange(P_Val, Test_Stat) %>%
+               dplyr::arrange(P_Val, dplyr::desc(Test_Stat)) %>%
                dplyr::mutate(P_Val_Adj = stats::p.adjust(P_Val, method = p.adj.method),
                              Gene_Dynamic_Lineage = dplyr::if_else(P_Val_Adj < fdr.cutoff, 1, 0, missing = 0)) %>%
                dplyr::with_groups(Gene,

diff --git a/R/ModelLRT.R b/R/ModelLRT.R
@@ -29,45 +29,43 @@ modelLRT <- function(mod.1 = NULL,
                 P_Val = NA,
                 Model_Type = NA,
                 Notes = "No test performed due to model failure.")
-    return(res)
-  }
-
-  mod.1 <- mod.1$final_mod
-
-  if (is.null(mod.1) || is.null(mod.0)) { stop("You must provide two models to compare.") }
-  LRT_notes <- NA_character_
-  # compute LRT
-  if (is.glmm) {
-    if (!(inherits(mod.1, "glmmTMB") && inherits(mod.0, "glmmTMB"))) { stop("Models must be of class glmmTMB.") }
-    mod.1_ll <- -mod.1$fit$objective
-    mod.0_ll <- -mod.0$fit$objective
-    lrt_stat <- 2 * (mod.1_ll - mod.0_ll)
-    if (is.na(logLik(mod.1))) {
-      if (!mod.1$sdr$pdHess) {
-        LRT_notes <- "Non-positive definite Hessian in alternate GLMM, probably because of shallow log-likelihood."
-      } else {
-        LRT_notes <- "Log-likelihood for alternate GLMM is very shallow / flat, exercise caution."
+  } else {
+    mod.1 <- mod.1$final_mod
+    if (is.null(mod.1) || is.null(mod.0)) { stop("You must provide two models to compare.") }
+    LRT_notes <- NA_character_
+    # compute LRT
+    if (is.glmm) {
+      if (!(inherits(mod.1, "glmmTMB") && inherits(mod.0, "glmmTMB"))) { stop("Models must be of class glmmTMB.") }
+      mod.1_ll <- -mod.1$fit$objective
+      mod.0_ll <- -mod.0$fit$objective
+      lrt_stat <- 2 * (mod.1_ll - mod.0_ll)
+      if (is.na(logLik(mod.1))) {
+        if (!mod.1$sdr$pdHess) {
+          LRT_notes <- "Non-positive definite Hessian in alternate GLMM, probably because of shallow log-likelihood."
+        } else {
+          LRT_notes <- "Log-likelihood for alternate GLMM is very shallow / flat, exercise caution."
+        }
       }
+    } else {
+      if (!(inherits(mod.1, "glm") || inherits(mod.1, "lm"))) { stop("Models must be of class lm or glm.") }
+      mod.1_ll <- stats::logLik(mod.1)
+      mod.0_ll <- stats::logLik(mod.0)
+      lrt_stat <- as.numeric(2 * (mod.1_ll - mod.0_ll))
     }
-  } else {
-    if (!(inherits(mod.1, "glm") || inherits(mod.1, "lm"))) { stop("Models must be of class lm or glm.") }
-    mod.1_ll <- stats::logLik(mod.1)
-    mod.0_ll <- stats::logLik(mod.0)
-    lrt_stat <- as.numeric(2 * (mod.1_ll - mod.0_ll))
-  }
-  dgr_free <- attributes(stats::logLik(mod.1))$df - attributes(stats::logLik(mod.0))$df
-  if (dgr_free == 0) {
-    dgr_free <- 1
+    dgr_free <- attributes(stats::logLik(mod.1))$df - attributes(stats::logLik(mod.0))$df
+    if (dgr_free == 0) {
+      dgr_free <- 1
+    }
+    p_val <- stats::pchisq(lrt_stat,
+                           dgr_free,
+                           lower = FALSE)
+    res <- list(Alt_Mod_LL = as.numeric(mod.1_ll),
+                Null_Mod_LL = as.numeric(mod.0_ll),
+                LRT_Stat = lrt_stat,
+                DF = dgr_free,
+                P_Val = p_val,
+                Model_Type = ifelse(is.glmm, "GLMM", "GLM"),
+                Notes = LRT_notes)
   }
-  p_val <- stats::pchisq(lrt_stat,
-                         dgr_free,
-                         lower = FALSE)
-  res <- list(Alt_Mod_LL = as.numeric(mod.1_ll),
-              Null_Mod_LL = as.numeric(mod.0_ll),
-              LRT_Stat = lrt_stat,
-              DF = dgr_free,
-              P_Val = p_val,
-              Model_Type = ifelse(is.glmm, "GLMM", "GLM"),
-              Notes = LRT_notes)
   return(res)
 }
diff --git a/R/clusterGenes.R b/R/clusterGenes.R
@@ -50,7 +50,7 @@ clusterGenes <- function(test.dyn.res = NULL,
   if (!clust.algo %in% c("hclust", "kmeans", "leiden")) { stop("clust.algo must be one of 'hclust', 'kmeans', or 'leiden'.") }
   if ((use.pca & is.null(n.PC)) || (use.pca & n.PC <= 0)) { stop("n.PC must be a non-zero integer when clustering on principal components.") }
   if (is.null(lineages)) {
-    lineages <- LETTERS[1:length(test.dyn.res[[1]])]
+    lineages <- LETTERS[seq(length(test.dyn.res[[1]]))]
   }
   gene_cluster_list <- vector("list", length = length(lineages))
   for (l in seq_along(lineages)) {
@@ -82,7 +82,7 @@ clusterGenes <- function(test.dyn.res = NULL,
     # hierarchical clustering routine w/ Ward's linkage
     if (clust.algo  == "hclust") {
       hclust_tree <- stats::hclust(dist_matrix, method = "ward.D2")
-      k_vals <- c(2:10)
+      k_vals <- seq(2, 10)
       sil_vals <- vector("numeric", 9L)
       clust_list <- vector("list", 9L)
       for (k in seq_along(k_vals)) {
@@ -96,7 +96,7 @@ clusterGenes <- function(test.dyn.res = NULL,
                                   Cluster = clust_list[[which.max(sil_vals)]])
     # k-means clustering routine w/ Hartigan-Wong algorithm
     } else if (clust.algo == "kmeans") {
-      k_vals <- c(2:10)
+      k_vals <- seq(2, 10)
       sil_vals <- vector("numeric", 9L)
       clust_list <- vector("list", 9L)
       for (k in seq_along(k_vals)) {

diff --git a/R/embedGenes.R b/R/embedGenes.R
@@ -14,6 +14,7 @@
 #' @param cluster.genes (Optional) Should genes be clustered in PCA space using the Leiden algorithm? Defaults to TRUE.
 #' @param gene.meta.data (Optional) A data.frame of metadata values for each gene (HVG status, Ensembl ID, gene biotype, etc.) that will be included in the result table. Defaults to NULL.
 #' @param k.param (Optional) The value of nearest-neighbors used in creating the SNN graph prior to clustering & in running UMAP. Defaults to 20.
+#' @param resolution.param (Optional) The value of the resolution parameter for the Leiden algorithm. If unspecified, silhouette scoring is used to select an optimal value. Defaults to NULL.
 #' @param random.seed (Optional) The random seed used to control stochasticity in the clustering algorithm. Defaults to 312.
 #' @return A data.frame containing embedding coordinates, cluster IDs, and metadata for each gene.
 #' @export
@@ -31,6 +32,7 @@ embedGenes <- function(smoothed.counts = NULL,
                        cluster.genes = TRUE,
                        gene.meta.data = NULL,
                        k.param = 20,
+                       resolution.param = NULL,
                        random.seed = 312) {
   # check inputs
   if (is.null(smoothed.counts)) { stop("You forgot to provide a smoothed counts matrix to embedGenes().") }
@@ -53,23 +55,30 @@ embedGenes <- function(smoothed.counts = NULL,
                                                  k = k.param,
                                                  type = "jaccard",
                                                  BNPARAM = BiocNeighbors::AnnoyParam(distance = "Cosine"))
-    dist_matrix <- stats::as.dist(1 - coop::tcosine(x = smoothed_counts_pca$x))
-    clust_runs <- purrr::map(c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7), \(r) {
+    if (is.null(resolution.param)) {
+      dist_matrix <- stats::as.dist(1 - coop::tcosine(x = smoothed_counts_pca$x))
+      clust_runs <- purrr::map(c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7), \(r) {
+        smoothed_counts_clust <- igraph::cluster_leiden(smoothed_counts_snn,
+                                                        objective_function = "modularity",
+                                                        resolution_parameter = r)
+        if (smoothed_counts_clust$nb_clusters == 1) {
+          sil_val <- 0
+        } else {
+          sil_val <- mean(cluster::silhouette(as.integer(smoothed_counts_clust$membership - 1L), dist_matrix)[, 3])
+        }
+        clust_res <- list(clusters = as.factor(smoothed_counts_clust$membership - 1L),
+                          resolution = r,
+                          silhouette = sil_val)
+        return(clust_res)
+      })
+      best_clustering <- which.max(purrr::map_dbl(clust_runs, \(x) x$silhouette))
+      cluster_vec <- clust_runs[[best_clustering]]$clusters
+    } else {
       smoothed_counts_clust <- igraph::cluster_leiden(smoothed_counts_snn,
                                                       objective_function = "modularity",
-                                                      resolution_parameter = r)
-      if (smoothed_counts_clust$nb_clusters == 1) {
-        sil_val <- 0
-      } else {
-        sil_val <- mean(cluster::silhouette(as.integer(smoothed_counts_clust$membership - 1L), dist_matrix)[, 3])
-      }
-      clust_res <- list(clusters = as.factor(smoothed_counts_clust$membership - 1L),
-                        resolution = r,
-                        silhouette = sil_val)
-      return(clust_res)
-    })
-    best_clustering <- which.max(purrr::map_dbl(clust_runs, \(x) x$silhouette))
-    cluster_vec <- clust_runs[[best_clustering]]$clusters
+                                                      resolution_parameter = resolution.param)
+      cluster_vec <- as.factor(smoothed_counts_clust$membership - 1L)
+    }
   } else {
     cluster_vec <- NA_integer_
   }