bug fixes; minor improvement; liger obj doc update

mvfki · Nov 9, 2023 · 3fdc9e8 · 3fdc9e8
1 parent 154a3a4
commit 3fdc9e8
Show file tree

Hide file tree

Showing 7 changed files with 162 additions and 37 deletions.
diff --git a/R/downsample.R b/R/downsample.R
@@ -1,35 +1,49 @@
 #' Downsample datasets
 #' @description This function mainly aims at downsampling datasets to a size
-#' suitable for plotting.
-#' @details Users can balance the sample size of categories of interests with
+#' suitable for plotting or expensive in-memory calculation.
+#'
+#' Users can balance the sample size of categories of interests with
 #' \code{balance}. Multi-variable specification to \code{balance} is supported,
 #' so that at most \code{maxCells} cells will be sampled from each combination
-#' of categories from the variables. Note that \code{"dataset"} will
-#' automatically be added as one variable when balancing the downsampling.
-#' However, if users want to balance the downsampling solely basing on dataset
-#' origin, users have to explicitly set \code{balance = "dataset"}.
+#' of categories from the variables. For example, when two datasets are
+#' presented and three clusters labeled across them, there would then be at most
+#' \eqn{2 \times 3 \times maxCells} cells being selected. Note that
+#' \code{"dataset"} will automatically be added as one variable when balancing
+#' the downsampling. However, if users want to balance the downsampling solely
+#' based on dataset origin, users have to explicitly set \code{balance =
+#' "dataset"}.
 #' @param object \linkS4class{liger} object
 #' @param balance Character vector of categorical variable names in
 #' \code{cellMeta} slot, to subsample \code{maxCells} cells from each
 #' combination of all specified variables. Default \code{NULL} samples
 #' \code{maxCells} cells from the whole object.
 #' @param maxCells Max number of cells to sample from the grouping based on
 #' \code{balance}.
-#' @param useDatasets Index selection of datasets to consider. Default
+#' @param useDatasets Index selection of datasets to include. Default
 #' \code{NULL} for using all datasets.
 #' @param seed Random seed for reproducibility. Default \code{1}.
+#' @param returnIndex Logical, whether to only return the numeric index that can
+#' subset the original object instead of a subset object. Default \code{FALSE}.
 #' @param ... Arguments passed to \code{\link{subsetLiger}}, where
 #' \code{cellIdx} is occupied by internal implementation.
-#' @return Subset of \linkS4class{liger} \code{object}.
+#' @return By default, a subset of \linkS4class{liger} \code{object}.
+#' Alternatively when \code{returnIndex = TRUE}, a numeric vector to be used
+#' with the original object.
 #' @export
 #' @examples
+#' # Subsetting an object
 #' pbmc <- downsample(pbmc)
+#' # Creating a subsetting index
+#' sampleIdx <- downsample(pbmcPlot, balance = "leiden_cluster",
+#'                         maxCells = 10, returnIndex = TRUE)
+#' plotClusterDimRed(pbmcPlot, cellIdx = sampleIdx)
 downsample <- function(
     object,
     balance = NULL,
     maxCells = 1000,
     useDatasets = NULL,
     seed = 1,
+    returnIndex = FALSE,
     ...
 ) {
     set.seed(seed)
@@ -68,7 +82,8 @@ downsample <- function(
         }
         selected <- sort(selected)
     }
-    subsetLiger(object = object, cellIdx = selected, ...)
+    if (isTRUE(returnIndex)) return(selected)
+    else return(subsetLiger(object = object, cellIdx = selected, ...))
 }
 
 #' [Deprecated] See \code{\link{downsample}}

diff --git a/R/import.R b/R/import.R
@@ -15,7 +15,8 @@
 #' memory.
 #' @param addPrefix Logical. Whether to add "<dataset name>_" as a prefix of
 #' cell identifiers (e.g. barcodes) to avoid duplicates in multiple libraries (
-#' common with 10X data). Default \code{TRUE}
+#' common with 10X data). Default \code{"auto"} detects if matrix columns
+#' already have the exact prefix or not. Logical value forces the action.
 #' @param formatType Select preset of H5 file structure. Current available
 #' options are \code{"10X"} and \code{"AnnData"}. Can be either a single
 #' specification for all datasets or a character vector that match with each
@@ -38,7 +39,7 @@ createLiger <- function(
         modal = NULL,
         cellMeta = NULL,
         removeMissing = TRUE,
-        addPrefix = TRUE,
+        addPrefix = "auto",
         formatType = "10X",
         dataName = NULL,
         indicesName = NULL,
@@ -93,14 +94,19 @@ createLiger <- function(
         } else {
             datasets[[dname]] <- as.ligerDataset(data, modal = modal[i])
         }
-        barcodesOrig <- c(barcodesOrig, colnames(datasets[[dname]]))
-        if (isTRUE(addPrefix)) {
-            cellID <- paste0(dname, "_", colnames(datasets[[dname]]))
-            colnames(datasets[[dname]]) <- cellID
+        colnameOrig <- colnames(datasets[[dname]])
+        prefix <- paste0(dname, "_")
+        .addPrefix <- FALSE
+        if (addPrefix == "auto") {
+            # If all colnames start with the wanted prefix, don't add it again
+            .addPrefix <- !all(startsWith(colnameOrig, prefix))
+        }
+        barcodesOrig <- c(barcodesOrig, colnameOrig)
+        if (.addPrefix) {
+            colnames(datasets[[dname]]) <- paste0(prefix, colnameOrig)
         }
     }
 
-    #barcodesOrig <- unlist(lapply(datasets, colnames), use.names = FALSE)
     datasets <- lapply(datasets, function(ld) {
         colnames(ld) <- make.unique(colnames(ld))
         return(ld)

diff --git a/R/preprocess.R b/R/preprocess.R
@@ -76,12 +76,7 @@ runGeneralQC <- function(
 
     # Start calculation on each dataset
     newResultNames <- c("nUMI", "nGene", names(featureSubsets))
-    # Not using S4 cellMeta() method below because no need to do so
-    for (nrn in newResultNames) {
-        if (!nrn %in% colnames(cellMeta(object))) {
-            object[[nrn]] <- NA
-        }
-    }
+
     for (d in useDatasets) {
         ld <- dataset(object, d)
         if (isTRUE(verbose)) .log('calculating QC for dataset "', d, '"')

diff --git a/man/createLiger.Rd b/man/createLiger.Rd
diff --git a/man/dot-ggplotLigerTheme.Rd b/man/dot-ggplotLigerTheme.Rd
diff --git a/man/downsample.Rd b/man/downsample.Rd
diff --git a/vignettes/articles/liger_object.Rmd b/vignettes/articles/liger_object.Rmd
@@ -44,6 +44,49 @@ pbmc <- pbmc %>%
     runUMAP()
 ```
 
+## Access a dataset
+
+As introduced above, the dataset-specific information is contained in a [ligerDataset](../reference/ligerDataset-class.html) object. 
+
+- To get the names of all datasets
+
+```{R}
+names(pbmc)
+```
+
+- To get the number of all datasets
+
+```{R}
+length(pbmc)
+```
+
+- To access the [ligerDataset](../reference/ligerDataset-class.html) object for a specific dataset
+
+```{R}
+ctrlLD <- dataset(pbmc, dataset = "ctrl")
+# Alternatively, using numeric index
+ctrlLD <- dataset(pbmc, 1)
+```
+
+In other LIGER functions where the argument `useDatasets` is allowed, users can always use the exact character name(s) or the numeric index to specify the datasets to be involved in the analysis. Moreover, a logical index vector is also allowed and could ease the usage in some cases.
+
+```{R, eval = FALSE}
+# Not run, just for example, assuming we've got the clustering for such an object
+names(ligerObj)
+## [1] female-1  female-2  male-3  male-4  female-5 ......
+femaleIdx <- startsWith(names(ligerObj), "fe")
+runMarkerDEG(ligerObj, conditionBy = "dataset", splitBy = "leiden_cluster", 
+             useDatasets = femaleIdx)
+```
+
+In the example above, the `runMarkerDEG()` function is parameterized to detect dataset-specific markers within each cluster, and only within the female samples. For example, cells from condition "female-1 and cluster 1" will be tested against cells belonging to condition "cluster 1 and all other female datasets".
+
+- To access multiple datasets, returned in a list
+
+```{R}
+ldList <- datasets(pbmc)
+```
+
 ## Access feature matrices
 
 We have three main generics for accessing feature matrices, namely `rawData()`, `normData()` and `scaleData()`. For scaled unshared features, used for UINMF, we also have `scaleUnsharedData()`. The logistics of the accessor to all these feature matrices are the same, so we only present the case for raw counts.
@@ -80,7 +123,7 @@ ctrlRaw <- rawData(pbmc, "ctrl")
 rawData(pbmc, "ctrl") <- ctrlRaw
 ```
 
-In the new version, strict validity checks have been put upon modification in object content. Replacement with unmatching feature names or barcodes will be rejected. In the case where there is a need to replace the dataset with a different set of barcodes or features, we suggest recreate a new [ligerDataset](../reference/ligerDataset-class.html) object with the new raw counts (or other feature matrix), and then replace the whole dataset with it.
+In the new version, strict validity checks have been put upon modification in object content. Replacement with unmatching feature names or barcodes will be rejected. In the case where there is a need to replace the dataset with a different set of barcodes or features, we suggest recreating a new [ligerDataset](../reference/ligerDataset-class.html) object with the new raw counts (or other feature matrix), and then replace the whole dataset with it.
 
 ```{R}
 ctrlRaw <- rawData(pbmc, "ctrl")
@@ -174,10 +217,62 @@ lapply(HList, dim)
 
 ## Subsetting the data
 
-TODO
+- A [liger](../reference/liger-class.html) object can be subset by both cells and genes. 
+
+For cell level subsetting, any indexing method among barcode names, numeric or logical index can do the job. Cells are indexed by `rownames(cellMeta(object))`, which is a concatenation of the barcodes from each dataset, and datasets are ordered as `names(object)` shows.
+
+```{R}
+pbmcSmall <- pbmc[, 1:100]
+pbmcCluster1 <- pbmc[, pbmc$leiden_cluster == 1]
+```
+
+For gene level subsetting, we only allow using gene names, because it is assumed that different datasets can have different sets of genes. Also, only genes shared by all datasets can be used.
+
+```{R}
+pbmcVarOnly <- pbmc[varFeatures(pbmc),]
+ctrlUnsharedGenes <- c("P2RY1", "GFI1B", "HDGFRP2", "TUBGCP6", "CELA1")
+# Not run, will raise error
+# pbmc[ctrlUnsharedGenes,]
+```
+
+- A [ligerDataset object](../reference/ligerDataset-class.html) can be subset by both cells and genes. 
+
+Cell level subsetting works in exactly the same way as for a [liger object](../reference/liger-class.html).
+
+```{R}
+ctrlLD <- dataset(pbmc, "ctrl")
+ctrlLDSmall <- ctrlLD[, 1:100]
+```
+
+Gene level subsetting on a [ligerDataset object](../reference/ligerDataset-class.html) can be achieved with any type of index.
+
+```{R}
+ctrlLDsmall <- ctrlLD[1:100, ]
+ctrlLDsmall <- ctrlLD[1:100, 1:100]
+```
+
+Note that `scaleData(ctrlLD)` and `scaleUnsharedData(ctrlLD)` come with only the variable genes identified upstream. Subsetting genes on a [ligerDataset object](../reference/ligerDataset-class.html) is based on its raw input data. Therefore, only the part of the user specification that is available in the scaled data is carried into the subset of the scaled data. 
 
 ## Check the records of run commands
 
-TODO
+We implemented an analysis tracking feature in order to keep a record of what functions are called and what parameters are used.
+
+- To show a list of function names applied to the [liger object](../reference/liger-class.html) in time order
+
+```{R}
+commands(pbmc)
+```
+
+A unique suffix is added to each function name to keep track of calls of the same function with different parameters. 
 
+- Detailed information of a function call can be retrieved with partial matching.
 
+```{R}
+commands(pbmc, "runINMF")
+```
+
+A function can be applied to an object several times with parameter tweaks. For example, different `lambda` for iNMF integration. If `runINMF()` is called several times, calling `commands(pbmc, "runINMF")` returns a list of records of all such calls, as all record names starting with `"runINMF"` are matched. So listing names first and using the unique record name will be required for getting the information of one specific call among all of such. For another example, given that `runCluster()` and `runUMAP()` are also in the record, the following result would be returned if we do matching with only `"run"`
+
+```{R}
+commands(pbmc, "run")
+```