Heavy update on the 1st tutorial and DEG funcs

mvfki · Jul 10, 2024 · 70ec07c · 70ec07c
1 parent 88362e0
commit 70ec07c
Show file tree

Hide file tree

Showing 24 changed files with 1,294 additions and 321 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: rliger
-Version: 2.0.1.9001
-Date: 2024-06-24
+Version: 2.0.1.9002
+Date: 2024-07-10
 Type: Package
 Title: Linked Inference of Genomic Experimental Relationships
 Description: Uses an extension of nonnegative matrix factorization to identify shared and dataset-specific factors. See Welch J, Kozareva V, et al (2019) <doi:10.1016/j.cell.2019.05.006>, and Liu J, Gao C, Sodicoff J, et al (2020) <doi:10.1038/s41596-020-0391-8> for more details.

diff --git a/NAMESPACE b/NAMESPACE
@@ -150,6 +150,7 @@ export(plotDimRed)
 export(plotEnhancedVolcano)
 export(plotFactorDimRed)
 export(plotFactorHeatmap)
+export(plotGODot)
 export(plotGeneDetectedViolin)
 export(plotGeneDimRed)
 export(plotGeneHeatmap)
@@ -158,6 +159,7 @@ export(plotGeneLoadings)
 export(plotGeneViolin)
 export(plotGroupClusterDimRed)
 export(plotMarkerHeatmap)
+export(plotPairwiseDEGHeatmap)
 export(plotPeakDimRed)
 export(plotProportion)
 export(plotProportionBar)

diff --git a/NEWS.md b/NEWS.md
@@ -13,16 +13,19 @@
   - Pseudo-bulk should be easy because we are just aggregating cells.
   - Wilcoxon might be a bit harder because ranks are calculated per gene but the H5 sparse data is column majored. Might need to find a fast on-disk transposition method, which would also enhance RcppPlanc performance when running ANLS on H5 data.
 
-## rliger 2.0.1.9001
+## rliger 2.0.1.9002
 
-- Added `ligerToH5AD()` allowing reticulate/Python free export of liger object to H5AD format
+- Added `ligerToH5AD()` allowing reticulate/Python free export of liger object to H5AD format. This might not be releasable due to the need of calling non-exported functions from *hdf5r* library.
 - Changed `runMarkerDEG()` and `runPairwiseDEG()` default method from `"wilcoxon"` to `"pseudoBulk"`
-- Fixed `runMarkerDEG(method = "pseudobulk")` bug in creating pseudo-bulk for "the rest of cells" , and optimized error signaling.
+- Fixed `runMarkerDEG(method = "pseudobulk")` bug in assigning pseudo-replicates, and optimized error/warning signaling.
 - Added `plotProportionBox()` for visualizing compositional analysis
 - Added `plotBarcodeRank()` for basic QC visualization
+- Added `plotPairwiseDEGHeatmap()` for visualizing pairwise DEG results
+- Added `plotGODot()` for visualizing GO enrichment results
 - Added `calcNMI()` for evaluating clustering results against ground truth
 - Fixed bug in `calcAlignment()`, `subsetMemLigerDataset()`, `cellMeta()`
 - Optimized `plotVolcano()` text annotation positioning
+- Optimized DE test memory usage scalability for both pseudo-bulk method and wilcoxon test
 
 ## rliger 2.0.1
 

diff --git a/R/DEG_marker.R b/R/DEG_marker.R
diff --git a/R/GSEA.R b/R/GSEA.R
@@ -157,12 +157,22 @@ runGSEA <- function(
 #' See \code{gprofiler2::gost()}. for detailed explanation.
 #' @export
 #' @examples
-#' res <- runMarkerDEG(pbmcPlot)
+#' defaultCluster(pbmc) <- pbmcPlot$leiden_cluster
+#' # Test the DEG between "stim" and "ctrl", within each cluster
+#' result <- runPairwiseDEG(
+#'     pbmc,
+#'     groupTest = "stim",
+#'     groupCtrl = "ctrl",
+#'     variable1 = "dataset",
+#'     splitBy = "defaultCluster",
+#'     nPsdRep = 3,
+#'     minCellPerRep = 3
+#' )
 #' # Setting `significant = FALSE` because it's hard for a gene list obtained
 #' # from small test dataset to represent real-life biology.
 #' \donttest{
 #' if (requireNamespace("gprofiler2", quietly = TRUE)) {
-#'     go <- runGOEnrich(res, group = 0, significant = FALSE)
+#'     go <- runGOEnrich(result, group = "0.stim", significant = FALSE)
 #' }
 #' }
 runGOEnrich <- function(
@@ -229,3 +239,117 @@ runGOEnrich <- function(
     return(resultList)
 }
 
+
+#' Visualize GO enrichment test result in dot plot
+#' @param result Returned list object from \code{\link{runGOEnrich}}.
+#' @param group Character vector of group names, must be available in
+#' \code{names(result)}. Default \code{NULL} make plots for all groups.
+#' @param pvalThresh Numeric scalar, cutoff for p-value where smaller values are
+#' considered as significant. Default \code{0.05}.
+#' @param n Number of top terms to be shown, ranked by p-value. Default
+#' \code{20}.
+#' @param termIDMatch Regular expression pattern to match the term ID. Default
+#' \code{"^GO"} for only using GO terms from returned results.
+#' @param colorPalette,colorDirection Viridis palette options. Default
+#' \code{"E"} and \code{1}.
+#' @param xlab,ylab Axis title for x and y axis. Default
+#' \code{"-log10(P-value)"} and \code{"Term name"}, respectively.
+#' @inheritDotParams .ggplotLigerTheme title subtitle legendColorTitle legendSizeTitle showLegend legendPosition baseSize titleSize subtitleSize xTextSize xTitleSize yTextSize yTitleSize legendTextSize legendTitleSize plotly
+#' @return A ggplot object if only one group or a list of ggplot objects.
+#' @export
+#' @examples
+#' defaultCluster(pbmc) <- pbmcPlot$leiden_cluster
+#' # Test the DEG between "stim" and "ctrl", within each cluster
+#' result <- runPairwiseDEG(
+#'     pbmc,
+#'     groupTest = "stim",
+#'     groupCtrl = "ctrl",
+#'     variable1 = "dataset",
+#'     splitBy = "defaultCluster"
+#' )
+#' # Setting `significant = FALSE` because it's hard for a gene list obtained
+#' # from small test dataset to represent real-life biology.
+#' \donttest{
+#' if (requireNamespace("gprofiler2", quietly = TRUE)) {
+#'     go <- runGOEnrich(result, group = "0.stim", significant = FALSE)
+#'     # The toy example won't have significant result.
+#'     plotGODot(go)
+#' }
+#' }
+plotGODot <- function(
+        result,
+        group = NULL,
+        pvalThresh = 0.05,
+        n = 20,
+        termIDMatch = "^GO",
+        colorPalette = "E",
+        colorDirection = 1,
+        xlab = '-log10(P-value)',
+        ylab = 'Term name',
+        ...
+) {
+    group <- group %||% names(result)
+    if (any(!group %in% names(result))) {
+        cli::cli_abort(
+            c(x = "Specified group{?s} not available in {.var result}: {.val {group[!group %in% names(result)]}}",
+              i = "Available one{?s} {?is/are}: {.val {names(result)}}")
+        )
+    }
+
+    plotList <- list()
+    for (i in seq_along(group)) {
+        gname <- group[i]
+        resdf <- result[[gname]]$result
+        if (is.null(resdf) || nrow(resdf) == 0) {
+            cli::cli_alert_warning(
+                "No significant result returned for group {.val {gname}}."
+            )
+            next
+        }
+        resdf %<>% dplyr::filter(
+            grepl(termIDMatch, .data[['term_id']]),
+            .data[['p_value']] <= pvalThresh
+        )
+        if (nrow(resdf) == 0) {
+            cli::cli_alert_warning(
+                "No enough matching terms ({.field termIDMatch}) nor significant terms (p-value <= {.val {pvalThresh}}) for group {.val {gname}}."
+            )
+            next
+        }
+        g <- resdf %>%
+            dplyr::select(
+                .data[['term_name']],
+                .data[['p_value']],
+                .data[['intersection_size']]
+            ) %>%
+            dplyr::arrange(.data[['p_value']]) %>%
+            dplyr::slice_head(n = n) %>%
+            dplyr::mutate(
+                term_name = factor(
+                    .data[['term_name']],
+                    levels = rev(.data[['term_name']])
+                )
+            ) %>%
+            ggplot2::ggplot(ggplot2::aes(
+                x = -log10(.data[['p_value']]),
+                y = .data[['term_name']],
+                size = .data[['intersection_size']],
+                color = -log10(.data[['p_value']])
+            )) +
+            ggplot2::geom_point()
+        plotList[[gname]] <- .ggplotLigerTheme(
+            plot = g,
+            colorPalette = colorPalette,
+            colorDirection = colorDirection,
+            xlab = xlab,
+            ylab = ylab,
+            panelBorder = TRUE,
+            ...
+        )
+    }
+    if (length(plotList) == 1) {
+        return(plotList[[1]])
+    } else {
+        return(plotList)
+    }
+}
diff --git a/R/RcppExports.R b/R/RcppExports.R
@@ -77,6 +77,10 @@ sample_cpp <- function(x, size) {
     .Call(`_rliger_sample_cpp`, x, size)
 }
 
+updatePseudoBulkRcpp <- function(psdBulk, sparseRaw, featureIdx, repIdx) {
+    invisible(.Call(`_rliger_updatePseudoBulkRcpp`, psdBulk, sparseRaw, featureIdx, repIdx))
+}
+
 #' Fast calculation of feature count matrix
 #'
 #' @param bedmat A feature count list generated by bedmap

diff --git a/R/clustering.R b/R/clustering.R
@@ -130,9 +130,14 @@ runCluster <- function(
         cli::cli_process_done(
             msg_done = "{method} clustering on {type} cell factor loadings ... Found {nlevels(clusts)} cluster{?s}."
         )
-    object@uns$defaultCluster <- object@uns$defaultCluster %||% clusterName
-    if (isTRUE(verbose))
-        cli::cli_alert_info("{.field cellMeta} variable {.val {clusterName}} is now set as default.")
+    if (is.null(object@uns$defaultCluster)) {
+        # If no default set yet
+        object@uns$defaultCluster <- clusterName
+        if (isTRUE(verbose)) {
+            cli::cli_alert_info("{.field cellMeta} variable {.val {clusterName}} is now set as default.")
+        }
+    }
+
     return(object)
 }
 

diff --git a/R/heatmap.R b/R/heatmap.R
@@ -404,7 +404,11 @@ plotFactorHeatmap <- function(
                 } else {
                     # Automatic generate with ggplot2 strategy,
                     # with level awareness
-                    annCol[[var]] <- scales::hue_pal()(length(levels(df[[var]])))
+                    if (nlevels(df[[var]]) > length(scPalette)) {
+                        annCol[[var]] <- scales::hue_pal()(nlevels(df[[var]]))
+                    } else {
+                        annCol[[var]] <- scPalette[1:nlevels(df[[var]])]
+                    }
                     names(annCol[[var]]) <- levels(df[[var]])
                     df[[var]] <- droplevels(df[[var]])
                 }

diff --git a/R/ligerToH5AD.R b/R/ligerToH5AD.R
@@ -1,6 +1,12 @@
 #' Write liger object to H5AD files
 #' @param object A \linkS4class{liger} object
 #' @param filename A character string, the path to the H5AD file to be written
+#' @param useSlot Character scalar, which type of data is going to be stored
+#' to \code{adata.X}. Default \code{"scaleData"}, choose from
+#' \code{"scaleData"}, \code{"normData"}, or \code{"rawData"}.
+#' @param saveRaw Logical, whether to save rawData to \code{adata.raw.X}.
+#' Default \code{TRUE} when \code{useSlot} is not \code{"rawData"}, otherwise
+#' \code{FALSE}.
 #' @param overwrite Logical, whether to overwrite the file if it exists.
 #' @param verbose Logical. Whether to show information of the progress. Default
 #' \code{getOption("ligerVerbose")} which is \code{TRUE} if users have not set.
@@ -11,8 +17,8 @@
 ligerToH5AD <- function(
         object,
         filename,
-        # X = c("scaleData", "normData", "rawData"),
-        # merge = TRUE,
+        useSlot = c("scaleData", "normData", "rawData"),
+        saveRaw = useSlot != "rawData",
         overwrite = FALSE,
         verbose = getOption("ligerVerbose", TRUE)
 ) {
@@ -23,7 +29,8 @@ ligerToH5AD <- function(
             cli::cli_abort("H5AD file exists at {.file {normalizePath(filename)}}")
         }
     }
-
+    useSlotCheckOrder <- c("scaleData", "normData", "rawData")
+    useSlot <- match.arg(useSlot)
     rownames <- "_index"
     dfile <- hdf5r::H5File$new(
         filename = filename,
@@ -32,25 +39,32 @@ ligerToH5AD <- function(
 
     # Work on expression matrices
     Xslot <- NULL
-    rawSlot <- NULL
-    if (all(!sapply(scaleData(object), is.null))) {
-        Xslot <- "scaleData"
-        Xmat <- mergeSparseAll(scaleData(object))
-    }
-    else if (all(!sapply(normData(object), is.null))) {
-        Xslot <- "normData"
-        Xmat <- mergeSparseAll(normData(object))
-    }
-    else if (all(!sapply(rawData(object), is.null))) {
-        Xslot <- "rawData"
-        Xmat <- mergeSparseAll(rawData(object))
+    if (all(!sapply(getMatrix(object, useSlot), is.null))) {
+        # If the specified slot is all available, use it
+        Xslot <- useSlot
+        Xmat <- mergeSparseAll(getMatrix(object, useSlot))
+    } else {
+        cli::cli_alert_warning(
+            "Requested {.field {useSlot}} is not available, trying other slots."
+        )
+        # Otherwise, check by priority order
+        for (i in useSlotCheckOrder) {
+            if (all(!sapply(getMatrix(object, i), is.null))) {
+                # If the specified slot is all available, use it
+                Xslot <- i
+                Xmat <- mergeSparseAll(getMatrix(object, i))
+                break
+            }
+        }
     }
-    else {
+    if (is.null(Xslot)) {
         cli::cli_abort("No data available to be added to {.field X}")
     }
-    if (all(!sapply(rawData(object), is.null)) &&
-        Xslot != "rawData") {
+
+    if (isTRUE(saveRaw) && Xslot != "rawData") {
         rawSlot <- "rawData"
+    } else {
+        rawSlot <- NULL
     }
 
     if (isTRUE(verbose)) {
@@ -68,7 +82,7 @@ ligerToH5AD <- function(
     if (isTRUE(verbose)) cli::cli_process_done(cliID)
 
     # Add raw
-    if (!is.null(rawSlot)) {
+    if (!is.null(rawSlot) && isTRUE(saveRaw)) {
         if (isTRUE(verbose)) {
             cliID <- cli::cli_process_start(sprintf("Adding %s to raw", rawSlot))
         }

diff --git a/R/mergeObject.R b/R/mergeObject.R
@@ -227,3 +227,4 @@ mergeH5 <- function(file.list,
     }
     h5_merged$close_all()
 }
+