Added centroidAlign(); modified package to match new methods

mvfki · Oct 1, 2024 · 1e85688 · 1e85688
1 parent 9e5685e
commit 1e85688
Show file tree

Hide file tree

Showing 79 changed files with 1,438 additions and 652 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -21,6 +21,6 @@ LICENSE
 ^devdata
 ^tests/testthat/*\.h5$
 ^vignettes/articles$
+cran-comments.md
 ^doc$
 ^Meta$
-cran-comments.md
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: rliger
-Version: 2.0.1.9004
-Date: 2024-09-27
+Version: 2.0.99
+Date: 2024-10-01
 Type: Package
 Title: Linked Inference of Genomic Experimental Relationships
 Description: Uses an extension of nonnegative matrix factorization to identify shared and dataset-specific factors. See Welch J, Kozareva V, et al (2019) <doi:10.1016/j.cell.2019.05.006>, and Liu J, Gao C, Sodicoff J, et al (2020) <doi:10.1038/s41596-020-0391-8> for more details.
@@ -34,7 +34,7 @@ URL: https://welch-lab.github.io/liger/
 License: GPL-3
 biocViews:
 LazyData: true
-RoxygenNote: 7.3.1
+RoxygenNote: 7.3.2
 VignetteBuilder: knitr
 Encoding: UTF-8
 Additional_repositories: https://welch-lab.r-universe.dev, https://blaserlab.r-universe.dev
@@ -90,3 +90,4 @@ Suggests:
     SingleCellExperiment,
     SummarizedExperiment,
     testthat
+Roxygen: list(markdown = TRUE)
diff --git a/NAMESPACE b/NAMESPACE
@@ -8,6 +8,8 @@ S3method("[[",liger)
 S3method("[[<-",liger)
 S3method("names<-",liger)
 S3method(.DollarNames,liger)
+S3method(alignFactors,Seurat)
+S3method(alignFactors,liger)
 S3method(as.liger,Seurat)
 S3method(as.liger,SingleCellExperiment)
 S3method(as.liger,dgCMatrix)
@@ -18,6 +20,8 @@ S3method(as.ligerDataset,ligerDataset)
 S3method(as.ligerDataset,matrix)
 S3method(c,liger)
 S3method(cbind,ligerDataset)
+S3method(centroidAlign,Seurat)
+S3method(centroidAlign,liger)
 S3method(closeAllH5,liger)
 S3method(closeAllH5,ligerDataset)
 S3method(fortify,liger)
@@ -73,6 +77,7 @@ export("scaleData<-")
 export("scaleUnsharedData<-")
 export("varFeatures<-")
 export("varUnsharedFeatures<-")
+export(alignFactors)
 export(as.liger)
 export(as.ligerDataset)
 export(calcARI)
@@ -82,6 +87,7 @@ export(calcDatasetSpecificity)
 export(calcNMI)
 export(calcPurity)
 export(cellMeta)
+export(centroidAlign)
 export(closeAllH5)
 export(commandDiff)
 export(commands)
@@ -276,6 +282,7 @@ importFrom(Matrix,summary)
 importFrom(Matrix,t)
 importFrom(Rcpp,evalCpp)
 importFrom(ggplot2,fortify)
+importFrom(lifecycle,deprecated)
 importFrom(magrittr,"%<>%")
 importFrom(magrittr,"%>%")
 importFrom(methods,new)

diff --git a/NEWS.md b/NEWS.md
@@ -13,21 +13,22 @@
   - Pseudo-bulk should be easy because we are just aggregating cells.
   - Wilcoxon might be a bit harder because ranks are calculated per gene but the H5 sparse data is stored in column-major order. Might need to find a fast on-disk transposition method, which would also enhance RcppPlanc performance when running ANLS on H5 data.
 
-## rliger 2.0.1.9004
+## rliger 2.0.99
 
-- Added `ligerToH5AD()` allowing reticulate/Python free export of liger object to H5AD format. This might not be releasable due to the need of calling non-exported functions from *hdf5r* library.
-- Added organism support in `runGeneralQC()` and refined hemoglobin gene matching regex pattern.
-- Changed `runMarkerDEG()` and `runPairwiseDEG()` default method from `"wilcoxon"` to `"pseudoBulk"`
-- Fixed `runMarkerDEG(method = "pseudobulk")` bug in assigning pseudo-replicates, and optimized error/warning signaling.
-- Optimized DE test memory usage scalability for both pseudo-bulk method and wilcoxon test
+- Added `centroidAlign()` for new cell factor loading alignment method
 - Added `plotProportionBox()` for visualizing compositional analysis
 - Added `plotBarcodeRank()` for basic QC visualization
 - Added `plotPairwiseDEGHeatmap()` for visualizing pairwise DEG results
 - Added `plotGODot()` for visualizing GO enrichment results
+- Added `calcNMI()` for evaluating clustering results against ground truth
+- Added `ligerToH5AD()` allowing reticulate/Python free export of liger object to H5AD format. This might not be releasable due to the need of calling non-exported functions from *hdf5r* library.
+- Added organism support in `runGeneralQC()` and refined hemoglobin gene matching regex pattern.
+- Optimized DE test memory usage scalability for both pseudo-bulk method and wilcoxon test
 - Optimized `plotProportionPie()` by adding argument `circleColors`
 - Optimized `plotVolcano()` text annotation positioning
 - Optimized visualization function additional argument documentation
-- Added `calcNMI()` for evaluating clustering results against ground truth
+- Changed `runMarkerDEG()` and `runPairwiseDEG()` default method from `"wilcoxon"` to `"pseudoBulk"`
+- Fixed `runMarkerDEG(method = "pseudobulk")` bug in assigning pseudo-replicates, and optimized error/warning signaling.
 - Fixed bug in `calcAlignment()`, `subsetMemLigerDataset()`, `cellMeta()`
 - Fixed bug in old version updating functions
 

diff --git a/R/ATAC.R b/R/ATAC.R
@@ -2,11 +2,11 @@
 #' after integration
 #' @description
 #' This function is designed for creating peak data for a dataset with only gene
-#' expression. This function uses quantile normalized cell factor loading to
-#' find nearest neighbors between cells from the queried dataset (without peak)
-#' and cells from reference dataset (with peak). And then impute the peak for
-#' the former basing on the weight. Therefore, the reference dataset selected
-#' must be of "atac" modality setting.
+#' expression. This function uses aligned cell factor loading to find nearest
+#' neighbors between cells from the queried dataset (without peak) and cells
+#' from reference dataset (with peak). It then imputes the peak for the former
+#' based on the weight. Therefore, the reference dataset selected must be of
+#' "atac" modality setting.
 #' @param object \linkS4class{liger} object with aligned factor loading computed
 #' in advance.
 #' @param nNeighbors The maximum number of nearest neighbors to search. Default
@@ -39,7 +39,7 @@
 #' bmmc <- scaleNotCenter(bmmc)
 #' if (requireNamespace("RcppPlanc", quietly = TRUE)) {
 #'     bmmc <- runINMF(bmmc, k = 20)
-#'     bmmc <- quantileNorm(bmmc)
+#'     bmmc <- alignFactors(bmmc)
 #'     bmmc <- normalizePeak(bmmc)
 #'     bmmc <- imputeKNN(bmmc, reference = "atac", queries = "rna")
 #' }
@@ -60,7 +60,7 @@ imputeKNN <- function(
     if (is.null(getMatrix(object, "H.norm")))
         cli::cli_abort(
             "Aligned factor loading has to be available for imputation.
-            Please run {.fn quantileNorm} in advance.")
+            Please run {.fn alignFactors} in advance.")
     reference <- .checkArgLen(reference, n = 1)
     reference <- .checkUseDatasets(object, reference)#, modal = "atac")
     queries <- .checkUseDatasets(object, queries)
@@ -177,7 +177,7 @@ imputeKNN <- function(
 #'     bmmc <- selectGenes(bmmc)
 #'     bmmc <- scaleNotCenter(bmmc)
 #'     bmmc <- runINMF(bmmc, miniBatchSize = 100)
-#'     bmmc <- quantileNorm(bmmc)
+#'     bmmc <- alignFactors(bmmc)
 #'     bmmc <- normalizePeak(bmmc)
 #'     bmmc <- imputeKNN(bmmc, reference = "atac", queries = "rna")
 #'     corr <- linkGenesAndPeaks(
@@ -370,7 +370,7 @@ linkGenesAndPeaks <- function(
 #'     requireNamespace("IRanges", quietly = TRUE) &&
 #'     requireNamespace("psych", quietly = TRUE)) {
 #'     bmmc <- runINMF(bmmc)
-#'     bmmc <- quantileNorm(bmmc)
+#'     bmmc <- alignFactors(bmmc)
 #'     bmmc <- normalizePeak(bmmc)
 #'     bmmc <- imputeKNN(bmmc, reference = "atac", queries = "rna")
 #'     corr <- linkGenesAndPeaks(
@@ -480,7 +480,7 @@ exportInteractTrack <- function(
     invisible(NULL)
 }
 
-#' [Deprecated] Export predicted gene-pair interaction
+#' `r lifecycle::badge("deprecated")` Export predicted gene-pair interaction
 #' @description Export the predicted gene-pair interactions calculated by
 #' upstream function \code{\link{linkGenesAndPeaks}} into an Interact Track file
 #' which is compatible with \href{https://genome.ucsc.edu/cgi-bin/hgCustom}{UCSC

diff --git a/R/DEG_marker.R b/R/DEG_marker.R
@@ -130,10 +130,10 @@
 #' \code{1}.
 #' @param verbose Logical. Whether to show information of the progress. Default
 #' \code{getOption("ligerVerbose")} or \code{TRUE} if users have not set.
-#' @return A data.frame with DEG information with the following field:
-#' \enumerate{
-#'  \item{feature - Gene names}
-#'  \item{group - Test group name. Multiple tests might be present for each
+#' @return A data.frame with DEG information with all or some of the
+#' following fields:
+#'  \item{feature}{Gene names}
+#'  \item{group}{Test group name. Multiple tests might be present for each
 #'    function call. This is the main variable to distinguish the tests. For a
 #'    pairwise test, a row with a certain group name represents the test result
 #'    between this group and the other control group; When split by a
@@ -144,21 +144,20 @@
 #'    all other cells. When running split marker detection, the group name would
 #'    be in "split.group" format, meaning the stats is by comparing the group in
 #'    the split level against all other cells in the same split level.}
-#'  \item{logFC - Log fold change}
-#'  \item{pval - P-value}
-#'  \item{padj - Adjusted p-value}
-#'  \item{avgExpr - Mean expression in the test group indicated by the "group"
+#'  \item{logFC}{Log fold change}
+#'  \item{pval}{P-value}
+#'  \item{padj}{Adjusted p-value}
+#'  \item{avgExpr}{Mean expression in the test group indicated by the "group"
 #'    field. Only available for wilcoxon tests.}
-#'  \item{statistic - Wilcoxon rank-sum test statistic. Only available for
+#'  \item{statistic}{Wilcoxon rank-sum test statistic. Only available for
 #'    wilcoxon tests.}
-#'  \item{auc - Area under the ROC curve. Only available for wilcoxon tests.}
-#'  \item{pct_in - Percentage of cells in the test group, indicated by the
+#'  \item{auc}{Area under the ROC curve. Only available for wilcoxon tests.}
+#'  \item{pct_in}{Percentage of cells in the test group, indicated by the
 #'    "group" field, that express the feature. Only available for wilcoxon
 #'    tests.}
-#'  \item{pct_out - Percentage of cells in the control group or other cells, as
+#'  \item{pct_out}{Percentage of cells in the control group or other cells, as
 #'    explained for the "group" field, that express the feature. Only available
 #'    for wilcoxon tests.}
-#' }
 #' @rdname liger-DEG
 #' @export
 #' @examples

diff --git a/R/GSEA.R b/R/GSEA.R
@@ -258,6 +258,7 @@ runGOEnrich <- function(
 #' @return A ggplot object if only one group or a list of ggplot objects.
 #' @export
 #' @examples
+#' \donttest{
 #' defaultCluster(pbmc) <- pbmcPlot$leiden_cluster
 #' # Test the DEG between "stim" and "ctrl", within each cluster
 #' result <- runPairwiseDEG(
@@ -269,7 +270,6 @@ runGOEnrich <- function(
 #' )
 #' # Setting `significant = FALSE` because it's hard for a gene list obtained
 #' # from small test dataset to represent real-life biology.
-#' \donttest{
 #' if (requireNamespace("gprofiler2", quietly = TRUE)) {
 #'     go <- runGOEnrich(result, group = "0.stim", significant = FALSE)
 #'     # The toy example won't have significant result.
@@ -317,11 +317,11 @@ plotGODot <- function(
             next
         }
         g <- resdf %>%
-            dplyr::select(
-                .data[['term_name']],
-                .data[['p_value']],
-                .data[['intersection_size']]
-            ) %>%
+            dplyr::select(dplyr::all_of(c(
+                'term_name',
+                'p_value',
+                'intersection_size'
+            ))) %>%
             dplyr::arrange(.data[['p_value']]) %>%
             dplyr::slice_head(n = n) %>%
             dplyr::mutate(

diff --git a/R/RcppExports.R b/R/RcppExports.R
@@ -5,30 +5,18 @@ RunModularityClusteringCpp <- function(SNN, modularityFunction, resolution, algo
     .Call(`_rliger_RunModularityClusteringCpp`, SNN, modularityFunction, resolution, algorithm, nRandomStarts, nIterations, randomSeed, printOutput, edgefilename)
 }
 
-normalize_dense_cpp <- function(x, MARGIN = 2L, L = 1L) {
-    .Call(`_rliger_normalize_dense_cpp`, x, MARGIN, L)
+moe_correct_ridge_cpp <- function(Z_orig, R, lambda, Phi, B, N) {
+    .Call(`_rliger_moe_correct_ridge_cpp`, Z_orig, R, lambda, Phi, B, N)
 }
 
-scale_dense_cpp <- function(x, MARGIN = 2L, center = TRUE, scale = TRUE) {
-    .Call(`_rliger_scale_dense_cpp`, x, MARGIN, center, scale)
+normalize_byCol_dense_rcpp <- function(x) {
+    .Call(`_rliger_normalize_byCol_dense_rcpp`, x)
 }
 
 colNormalize_dense_cpp <- function(x, L) {
     .Call(`_rliger_colNormalize_dense_cpp`, x, L)
 }
 
-colAggregateMedian_dense_cpp <- function(x, group, n) {
-    .Call(`_rliger_colAggregateMedian_dense_cpp`, x, group, n)
-}
-
-harmony_moe_correct_ridge_cpp <- function(Z_orig, R, lambda, Phi, B, N) {
-    .Call(`_rliger_harmony_moe_correct_ridge_cpp`, Z_orig, R, lambda, Phi, B, N)
-}
-
-normalize_byCol_dense_rcpp <- function(x) {
-    .Call(`_rliger_normalize_byCol_dense_rcpp`, x)
-}
-
 scaleNotCenter_byRow_rcpp <- function(x) {
     .Call(`_rliger_scaleNotCenter_byRow_rcpp`, x)
 }
@@ -73,6 +61,10 @@ colAggregateSums_sparse <- function(x, group, ngroups) {
     .Call(`_rliger_colAggregateSums_sparse`, x, group, ngroups)
 }
 
+colAggregateMedian_dense_cpp <- function(x, group, n) {
+    .Call(`_rliger_colAggregateMedian_dense_cpp`, x, group, n)
+}
+
 sample_cpp <- function(x, size) {
     .Call(`_rliger_sample_cpp`, x, size)
 }

diff --git a/R/cINMF.R b/R/cINMF.R
@@ -1,6 +1,7 @@
 #' Perform consensus iNMF on scaled datasets
 #' @description
-#' \bold{NOT STABLE} - This is an experimental function and is subject to change.
+#' `r lifecycle::badge("experimental")` This is an experimental function and is
+#' subject to change.
 #'
 #' Performs consensus integrative non-negative matrix factorization (c-iNMF)
 #' to return factorized \eqn{H}, \eqn{W}, and \eqn{V} matrices. In order to

diff --git a/R/classConversion.R b/R/classConversion.R
@@ -256,14 +256,15 @@ as.ligerDataset.SingleCellExperiment <- function(
 #' default cluster labeling to set the Idents. Default \code{FALSE}.
 #' @param merge Logical, whether to merge layers of different datasets into one.
 #' Not recommended. Default \code{FALSE}.
-#' @param by.dataset [Deprecated]. Use \code{identByDataset} instead.
-#' @param nms [Defunct] Will be ignored because new object structure does not
-#' have related problem.
-#' @param renormalize [Defunct] Will be ignored because since Seurat V5, layers
-#' of data can exist at the same time and it is better to left it for users to
-#' do it by themselves.
-#' @param use.liger.genes [Defunct] Will be ignored and will always set LIGER
-#' variable features to the place.
+#' @param by.dataset `r lifecycle::badge("superseded")` Use
+#' \code{identByDataset} instead.
+#' @param nms `r lifecycle::badge("defunct")` Will be ignored because new object
+#' structure does not have related problem.
+#' @param renormalize `r lifecycle::badge("defunct")` Will be ignored because
+#' since Seurat V5, layers of data can exist at the same time and it is better
+#' to leave it for users to do it by themselves.
+#' @param use.liger.genes `r lifecycle::badge("defunct")` Will be ignored and
+#' will always set LIGER variable features to the place.
 #' @export
 #' @rdname ligerToSeurat
 #' @return Always returns Seurat object(s) of the latest version. By default a

diff --git a/R/classes.R b/R/classes.R
@@ -177,7 +177,7 @@ setValidity("ligerDataset", .valid.ligerDataset)
 #' @slot W iNMF output matrix of shared gene loadings for each factor. See
 #' \code{\link{runIntegration}}.
 #' @slot H.norm Matrix of aligned factor loading for each cell. See
-#' \code{\link{quantileNorm}} and \code{\link{runIntegration}}.
+#' \code{\link{alignFactors}} and \code{\link{runIntegration}}.
 #' @slot commands List of \linkS4class{ligerCommand} objects. Record of
 #' analysis. Use \code{commands} to retrieve information. See detailed section
 #' accordingly.

diff --git a/R/clustering.R b/R/clustering.R
@@ -1,14 +1,13 @@
 #' SNN Graph Based Community Detection
 #' @description
-#' After quantile normalization, users can additionally run the Leiden or
+#' After aligning cell factor loadings, users can additionally run the Leiden or
 #' Louvain algorithm for community detection, which is widely used in
 #' single-cell analysis and excels at merging small clusters into broad cell
 #' classes.
 #'
-#' While using quantile normalized factor loadings (result from
-#' \code{\link{quantileNorm}}) is recommended, this function looks for
-#' unnormalized factor loadings (result from \code{\link{runIntegration}}) when
-#' the former is not available.
+#' While using aligned factor loadings (result from \code{\link{alignFactors}})
+#' is recommended, this function looks for unaligned factor loadings (raw result
+#' from \code{\link{runIntegration}}) when the former is not available.
 #' @param object A \linkS4class{liger} object. Should have valid factorization
 #' result available.
 #' @param nNeighbors Integer, the maximum number of nearest neighbors to
@@ -80,7 +79,7 @@ runCluster <- function(object,
     Hsearch <- searchH(object, useRaw)
     H <- Hsearch$H
     useRaw <- Hsearch$useRaw
-    type <- ifelse(useRaw, "unnormalized", "quantile normalized")
+    type <- ifelse(useRaw, "unaligned", "aligned")
 
     if (!is.null(useDims))
         H <- H[, useDims, drop = FALSE]
@@ -148,7 +147,7 @@ runCluster <- function(object,
     return(object)
 }
 
-#' [Deprecated] Louvain algorithm for community detection
+#' `r lifecycle::badge("superseded")` Louvain algorithm for community detection
 #' @description
 #' After quantile normalization, users can additionally run the Louvain
 #' algorithm for community detection, which is widely used in single-cell
@@ -362,7 +361,8 @@ mapCellMeta <- function(object, from, newTo = NULL, ...) {
 #' calculation. Default \code{NULL} uses all datasets.
 #' @param verbose Logical. Whether to show information of the progress. Default
 #' \code{getOption("ligerVerbose")} or \code{TRUE} if users have not set.
-#' @param classes.compare [Deprecated/Renamed]. Use \code{trueCluster} instead.
+#' @param classes.compare `r lifecycle::badge("superseded")` Use
+#' \code{trueCluster} instead.
 #' @return A numeric scalar, the purity of the clustering result indicated by
 #' \code{useCluster} compared to \code{trueCluster}.
 #' @export
@@ -451,7 +451,8 @@ calcPurity <- function(object,
 #' calculation. Default \code{NULL} uses all datasets.
 #' @param verbose Logical. Whether to show information of the progress. Default
 #' \code{getOption("ligerVerbose")} or \code{TRUE} if users have not set.
-#' @param classes.compare [Deprecated/Renamed]. Use \code{trueCluster} instead.
+#' @param classes.compare `r lifecycle::badge("superseded")` Use
+#' \code{trueCluster} instead.
 #' @return A numeric scalar, the ARI of the clustering result indicated by
 #' \code{useCluster} compared to \code{trueCluster}.
 #' @export