From 373990284ca4b3ff1f958527150781257ce70244 Mon Sep 17 00:00:00 2001 From: Yichen Wang Date: Tue, 14 Nov 2023 14:53:16 -0800 Subject: [PATCH] Update onlineINMF vig, with all bugs related fixed --- NAMESPACE | 1 + R/clustering.R | 49 ++++ R/import.R | 10 +- R/integration.R | 136 ++++++--- R/liger-class.R | 5 + R/preprocess.R | 5 +- R/subsetObject.R | 309 +++++++++++++------- R/util.R | 40 ++- man/createLiger.Rd | 9 +- man/liger-class.Rd | 4 +- man/mapCellMeta.Rd | 33 +++ man/removeMissing.Rd | 1 + man/subsetLiger.Rd | 5 +- man/subsetLigerDataset.Rd | 7 +- vignettes/articles/online_iNMF_tutorial.Rmd | 233 +++++++-------- 15 files changed, 559 insertions(+), 288 deletions(-) create mode 100644 man/mapCellMeta.Rd diff --git a/NAMESPACE b/NAMESPACE index 05f83578..85042f74 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -88,6 +88,7 @@ export(louvainCluster) export(makeFeatureMatrix) export(makeInteractTrack) export(makeRiverplot) +export(mapCellMeta) export(mergeDenseAll) export(mergeH5) export(mergeSparseAll) diff --git a/R/clustering.R b/R/clustering.R index 0d491615..e5385a80 100644 --- a/R/clustering.R +++ b/R/clustering.R @@ -245,3 +245,52 @@ groupSingletons <- function( newClusts <- factor(newClusts) return(newClusts) } + + + +#' Create new variable from categories in cellMeta +#' @description +#' Designed for fast variable creation when a new variable is going to be +#' created from existing variable. For example, multiple samples can be mapped +#' to the same study design condition, clusters can be mapped to cell types. +#' @param object A \linkS4class{liger} object. +#' @param from The name of the original variable to be mapped from. +#' @param newTo The name of the new variable to store the mapped result. Default +#' \code{NULL} returns the new variable (factor class). +#' @param ... Mapping criteria, argument names are original existing categories +#' in the \code{from} and values are new categories in the new variable. +#' @return When \code{newTo = NULL}, a factor object of the new variable. +#' Otherwise, the input object with variable \code{newTo} updated in +#' \code{cellMeta(object)}. +#' @export +#' @examples +#' pbmc <- mapCellMeta(pbmc, from = "dataset", newTo = "modal", +#' ctrl = "rna", stim = "rna") +mapCellMeta <- function( + object, + from, + newTo = NULL, + ... +) { + object <- recordCommand(object) + from <- cellMeta(object, from) + if (!is.factor(from)) stop("`from` must be a factor class variable.") + mapping <- list(...) + fromCats <- names(mapping) + notFound <- fromCats[!fromCats %in% levels(from)] + if (length(notFound) > 0) { + stop("The following categories requested not found: ", + paste0(notFound, collapse = ", ")) + } + + toCats <- unlist(mapping) + unchangedCats <- levels(from) + unchangedCats <- unchangedCats[!unchangedCats %in% fromCats] + names(unchangedCats) <- unchangedCats + if (length(unchangedCats) > 0) toCats <- c(toCats, unchangedCats) + to <- toCats[as.character(from)] + to <- factor(unname(to), levels = unique(toCats)) + if (is.null(newTo)) return(to) + cellMeta(object, newTo) <- to + return(object) +} diff --git a/R/import.R b/R/import.R index 6c49922c..30c7ec94 100644 --- a/R/import.R +++ b/R/import.R @@ -11,8 +11,7 @@ #' \code{NULL}. #' @param removeMissing Logical. Whether to remove cells that do not have any #' counts and features not expressed in any cells from each dataset. Default -#' \code{TRUE}. H5 based dataset with less than 8000 cells will be subset into -#' memory. +#' \code{TRUE}. #' @param addPrefix Logical. 
Whether to add "_" as a prefix of #' cell identifiers (e.g. barcodes) to avoid duplicates in multiple libraries ( #' common with 10X data). Default \code{"auto"} detects if matrix columns @@ -27,6 +26,10 @@ #' object. Default \code{NULL} uses \code{formatType} preset. #' @param genesName,barcodesName The path in a H5 file for the gene names and #' cell barcodes. Default \code{NULL} uses \code{formatType} preset. +#' @param newH5 When using HDF5 based data and subsets created after removing +#' missing cells/features, whether to create new HDF5 files for the subset. +#' Default \code{TRUE}. If \code{FALSE}, data will be subset into memory and +#' can be dangerous for large scale analysis. #' @param verbose Logical. Whether to show information of the progress. Default #' \code{getOption("ligerVerbose")} which is \code{TRUE} if users have not set. #' @param ... Additional slot values that should be directly placed in object. @@ -46,6 +49,7 @@ createLiger <- function( indptrName = NULL, genesName = NULL, barcodesName = NULL, + newH5 = TRUE, verbose = getOption("ligerVerbose"), ..., # Deprecated coding style @@ -135,7 +139,7 @@ createLiger <- function( obj <- runGeneralQC(obj, verbose = verbose) if (isTRUE(removeMissing)) { obj <- removeMissing(obj, "both", filenameSuffix = "qc", - verbose = verbose) + verbose = verbose, newH5 = newH5) } return(obj) diff --git a/R/integration.R b/R/integration.R index 93cf36c9..32d30875 100644 --- a/R/integration.R +++ b/R/integration.R @@ -271,10 +271,15 @@ runINMF.liger <- function( ) object@W <- out$W + rownames(object@W) <- varFeatures(object) for (d in names(object)) { ld <- dataset(object, d) ld@H <- out$H[[d]] ld@V <- out$V[[d]] + if (isH5Liger(ld)) { + colnames(ld@H) <- colnames(ld) + rownames(ld@V) <- varFeatures(object) + } datasets(object, check = FALSE)[[d]] <- ld } object@uns$factorization <- list(k = k, lambda = lambda) @@ -696,31 +701,47 @@ runOnlineINMF.liger <- function( "datasets. 
Please run `runOnlineINMF()` without `newDataset` ", "first.") } - # Put new datasets into input liger object + + newNames <- names(newDatasets) + if (any(newNames %in% names(object))) { + stop("Names of `newDatasets` overlap with existing datasets.") + } if (is.list(newDatasets)) { - allType <- sapply(newDatasets, function(x) class(x)[1]) - if (!all(allType == "dgCMatrix")) { - stop("`newDatasets` must contain only `dgCMatrix`.") - } + # A list of raw data if (is.null(names(newDatasets))) { - stop("`newDatasets` must be a NAMED list.") + stop("The list of new datasets must be named.") } - allNewNames <- character() for (i in seq_along(newDatasets)) { - if (names(newDatasets)[i] %in% names(object)) { - newName <- paste0(names(newDatasets)[i], ".1") - } else newName <- names(newDatasets)[i] - dataset(object, newName) <- newDatasets[[i]] - allNewNames <- c(allNewNames, newName) + if (inherits(newDatasets[[i]], "dgCMatrix")) { + dataset(object, names(newDatasets)[i]) <- newDatasets[[i]] + } else if (is.character(newDatasets[[i]])) { + # Assume is H5 filename + ld <- createH5LigerDataset(newDatasets[[i]]) + dataset(object, names(newDatasets[i])) <- ld + } else { + stop("Cannot interpret `newDatasets` element ", i) + } } - object <- normalize(object, useDatasets = allNewNames, - verbose = verbose) - object <- scaleNotCenter(object, useDatasets = allNewNames, - verbose = verbose) - newDatasets <- getMatrix(object, slot = "scaleData", - dataset = allNewNames, returnList = TRUE) + } else if (inherits(newDatasets, "liger")) { + # A liger object with all new datasets + object <- c(object, newDatasets) } else { - stop("`newDatasets` must be a named list of dgCMatrix.") + stop("`newDatasets` must be either a named list or a liger object") + } + + object <- normalize(object, useDatasets = newNames) + object <- scaleNotCenter(object, useDatasets = newNames) + newDatasets <- list() + for (d in newNames) { + ld <- dataset(object, d) + sd <- scaleData(ld) + if (inherits(sd, "H5D")) { + newDatasets[[d]] <- .H5DToH5Mat(sd) + } else if (inherits(sd, "H5Group")) { + newDatasets[[d]] <- .H5GroupToH5SpMat(sd, c(length(varFeatures(object)), ncol(ld))) + } else { + newDatasets[[d]] <- sd + } } } object <- closeAllH5(object) @@ -731,16 +752,35 @@ runOnlineINMF.liger <- function( HALSiter = HALSiter, verbose = verbose, WInit = WInit, VInit = VInit, AInit = AInit, BInit = BInit, seed = seed) - - for (i in seq_along(object)) { - ld <- dataset(object, i) - ld@H <- res$H[[i]] - ld@V <- res$V[[i]] - ld@A <- res$A[[i]] - ld@B <- res$B[[i]] - datasets(object, check = FALSE)[[i]] <- ld + if (!isTRUE(projection)) { + # Scenario 1&2, everything updated + for (i in seq_along(object)) { + ld <- dataset(object, i) + ld@H <- res$H[[i]] + ld@V <- res$V[[i]] + ld@A <- res$A[[i]] + ld@B <- res$B[[i]] + if (isH5Liger(ld)) { + colnames(ld@H) <- colnames(ld) + rownames(ld@V) <- varFeatures(object) + rownames(ld@B) <- varFeatures(object) + } + datasets(object, check = FALSE)[[i]] <- ld + } + object@W <- res$W + rownames(object@W) <- varFeatures(object) + } else { + # Scenario 3, only H of newDatasets returned + for (i in seq_along(newDatasets)) { + dname <- names(newDatasets)[i] + ld <- dataset(object, dname) + ld@H <- res$H[[i]] + if (isH5Liger(ld)) { + colnames(ld@H) <- colnames(ld) + } + datasets(object, check = FALSE)[[dname]] <- ld + } } - object@W <- res$W object@uns$factorization <- list(k = k, lambda = lambda) suppressMessages({object <- restoreH5Liger(object)}) return(object) @@ -769,6 +809,7 @@ runOnlineINMF.liger <- function( 
"devtools::install_github(\"welch-lab/RcppPlanc\")") nDatasets <- length(object) + length(newDatasets) barcodeList <- c(lapply(object, colnames), lapply(newDatasets, colnames)) + names(barcodeList) <- c(names(object), names(newDatasets)) features <- rownames(object[[1]]) if (!is.null(seed)) set.seed(seed) @@ -793,16 +834,27 @@ runOnlineINMF.liger <- function( Winit = WInit, Ainit = AInit, Binit = BInit, verbose = verbose) factorNames <- paste0("Factor_", seq(k)) - for (i in seq(nDatasets)) { - res$H[[i]] <- t(res$H[[i]]) - dimnames(res$H[[i]]) <- list(factorNames, barcodeList[[i]]) - dimnames(res$V[[i]]) <- list(features, factorNames) - dimnames(res$A[[i]]) <- list(factorNames, factorNames) - dimnames(res$B[[i]]) <- list(features, factorNames) + if (isTRUE(projection)) { + # Scenario 3 only got H for new datasets + for (i in seq_along(newDatasets)) { + dname <- names(newDatasets)[i] + res$H[[i]] <- t(res$H[[i]]) + dimnames(res$H[[i]]) <- list(factorNames, barcodeList[[dname]]) + names(res$H) <- names(newDatasets) + } + } else { + # Scenario 1&2 got everything + for (i in seq(nDatasets)) { + res$H[[i]] <- t(res$H[[i]]) + dimnames(res$H[[i]]) <- list(factorNames, barcodeList[[i]]) + dimnames(res$V[[i]]) <- list(features, factorNames) + dimnames(res$A[[i]]) <- list(factorNames, factorNames) + dimnames(res$B[[i]]) <- list(features, factorNames) + } + names(res$B) <- names(res$A) <- names(res$V) <- names(res$H) <- + names(barcodeList) + dimnames(res$W) <- list(features, factorNames) } - names(res$A) <- names(res$B) <- names(res$V) <- - names(res$H) <- c(names(object), names(newDatasets)) - dimnames(res$W) <- list(features, factorNames) return(res) } @@ -1108,10 +1160,16 @@ runUINMF.liger <- function( ld <- dataset(object, i) ld@H <- res$H[[i]] ld@V <- res$V[[i]] - if (!is.null(ld@scaleUnsharedData)) ld@U <- res$U[[i]] + if (!is.null(ld@scaleUnsharedData)) { + ld@U <- res$U[[i]] + rownames(ld@U) <- ld@varUnsharedFeatures + } + colnames(ld@H) <- colnames(ld) + rownames(ld@V) <- varFeatures(object) datasets(object, check = FALSE)[[i]] <- ld } object@W <- res$W + rownames(ld@W) <- varFeatures(object) object@uns$factorization <- list(k = k, lambda = lambda) return(object) } @@ -1271,7 +1329,7 @@ quantileNorm.liger <- function( ... ) { .checkObjVersion(object) - .checkValidFactorResult(object, useDatasets = names(object)) + .checkValidFactorResult(object, checkV = FALSE) if (is.null(reference)) { # If ref_dataset not given, set the one with the largest number of # cells as reference. diff --git a/R/liger-class.R b/R/liger-class.R index 2ad7034e..4a05ee5c 100644 --- a/R/liger-class.R +++ b/R/liger-class.R @@ -133,6 +133,11 @@ liger <- setClass( "dimension of scaleData in dataset ", "(H5)", d)) } + scaleDataIdx <- scaleData(ld)[["featureIdx"]][] + if (!identical(rownames(ld)[scaleDataIdx], varFeatures(x))) { + return("HDF5 scaled data feature index does not ", + "match variable features") + } } } } diff --git a/R/preprocess.R b/R/preprocess.R index 302ff3f5..df76bd3f 100644 --- a/R/preprocess.R +++ b/R/preprocess.R @@ -254,6 +254,7 @@ removeMissing <- function( minFeatures = NULL, useDatasets = NULL, filenameSuffix = "removeMissing", + newH5 = TRUE, verbose = getOption("ligerVerbose"), ... ) { @@ -296,7 +297,9 @@ removeMissing <- function( featureIdx = featureIdx, cellIdx = cellIdx, filenameSuffix = filenameSuffix, - verbose = verbose, ... + verbose = verbose, + newH5 = newH5, + ... 
) } else { datasets.new[[d]] <- ld diff --git a/R/subsetObject.R b/R/subsetObject.R index 36c4cf03..d1c8c89b 100644 --- a/R/subsetObject.R +++ b/R/subsetObject.R @@ -18,8 +18,7 @@ #' \code{NULL} subsets the whole object including analysis result matrices. #' @param newH5 Whether to create new H5 files on disk for the subset datasets #' if involved datasets in the \code{object} is HDF5 based. \code{TRUE} writes a -#' new ones, \code{FALSE} returns in memory data. Default \code{"auto"} writes a -#' new dataset when more than 8000 cells from that dataset is subscribed. +#' new ones, \code{FALSE} returns in memory data. #' @param chunkSize Integer. Number of maximum number of cells in each chunk, #' Default \code{1000}. #' @param verbose Logical. Whether to show information of the progress. Default @@ -42,7 +41,7 @@ subsetLiger <- function( useSlot = NULL, chunkSize = 1000, verbose = getOption("ligerVerbose"), - newH5 = "auto", + newH5 = TRUE, returnObject = TRUE, ... ) { @@ -99,9 +98,9 @@ subsetLiger <- function( } if (isTRUE(returnObject)) { if (!is.null(featureIdx)) { - W <- object@W[featureIdx, , drop = FALSE] - varFeature <- varFeatures(object)[varFeatures(object) %in% - featureIdx] + featureIdxVar <- featureIdx[featureIdx %in% varFeatures(object)] + W <- object@W[featureIdxVar, , drop = FALSE] + varFeature <- featureIdxVar } else { W <- object@W varFeature <- varFeatures(object) @@ -238,8 +237,7 @@ retrieveCellFeature <- function( #' \code{NULL} subsets the whole object including analysis result matrices. #' @param newH5 Whether to create a new H5 file on disk for the subset dataset #' if \code{object} is HDF5 based. \code{TRUE} writes a new one, \code{FALSE} -#' returns in memory data. Default \code{"auto"} writes a new one when more than -#' 8000 cells. +#' returns in memory data. #' @param filename Filename of the new H5 file if being created. Default #' \code{NULL} adds suffix \code{".subset_{yymmdd_HHMMSS}.h5"} to the original #' name. 
@@ -266,7 +264,7 @@ subsetLigerDataset <- function( featureIdx = NULL, cellIdx = NULL, useSlot = NULL, - newH5 = "auto", + newH5 = TRUE, filename = NULL, filenameSuffix = NULL, chunkSize = 1000, @@ -293,18 +291,18 @@ subsetH5LigerDataset <- function( featureIdx = NULL, cellIdx = NULL, useSlot = NULL, - newH5 = "auto", + newH5 = TRUE, filename = NULL, filenameSuffix = NULL, chunkSize = 1000, verbose = getOption("ligerVerbose"), returnObject = TRUE ) { - if (newH5 == "auto") { - cellIdx <- .idxCheck(object, cellIdx, "cell") - if (length(cellIdx) > 8000) newH5 <- TRUE - else newH5 <- FALSE - } + # if (newH5 == "auto") { + # cellIdx <- .idxCheck(object, cellIdx, "cell") + # if (length(cellIdx) > 8000) newH5 <- TRUE + # else newH5 <- FALSE + # } if (isTRUE(newH5)) { if (isFALSE(returnObject)) warning("Cannot set `returnObject = FALSE` when subsetting", @@ -379,50 +377,84 @@ subsetH5LigerDatasetToMem <- function( } # Process scaled data #### + secondIdx <- NULL + if (!is.null(scaleData(object))) { + # See comments in h5ToH5 for what the following two mean + scaledFeatureIdx <- scaleData(object)[["featureIdx"]][] + secondIdx <- as.numeric(na.omit(match(featureIdx, scaledFeatureIdx))) + } if ("scaleData" %in% slotInvolved & !is.null(scaleData(object))) { if (isTRUE(verbose)) .log("Subsetting `scaleData`", level = 2) - scaledFeatureIdx <- NULL - if (getH5File(object)$exists("scaleData.featureIdx")) { - scaledFeatureIdx <- getH5File(object)[["scaleData.featureIdx"]][] - } else if ("selected" %in% names(featureMeta(object))) { - scaledFeatureIdx <- which(featureMeta(object)$selected) - } else { - warning("Unable to know what features are included scaled data. ", - "Skipped.") - } - if (!is.null(scaledFeatureIdx) && length(scaledFeatureIdx) > 0) { - if (length(scaledFeatureIdx) == scaleData(object)$dims[1]) { - # Before: scaledFeatureIdx is based on all features, selecting - # what features are in the scaleData - # After: based on variable features, selecting what features - # from variable features are being involved in featureIdx. - scaledFeatureIdx2 <- unlist(lapply(featureIdx, function(i) { - which(scaledFeatureIdx == i)}), use.names = FALSE) - if (isTRUE(verbose)) - .log(length(scaledFeatureIdx2), - " features used in scaleData were selected. ", - level = 3) - scaleData <- scaleData(object)[scaledFeatureIdx2, cellIdx, - drop = FALSE] - rownames(scaleData) <- rownames(object)[scaledFeatureIdx][scaledFeatureIdx2] - colnames(scaleData) <- colnames(object)[cellIdx] - } else { - scaleData <- NULL - warning("Row dimension of scaleData does not match with ", - "feature selection. Unable to subset from H5.") - } + # scaledFeatureIdx <- NULL + # if (getH5File(object)$exists("scaleData.featureIdx")) { + # scaledFeatureIdx <- getH5File(object)[["scaleData.featureIdx"]][] + # } else if ("selected" %in% names(featureMeta(object))) { + # scaledFeatureIdx <- which(featureMeta(object)$selected) + # } else { + # warning("Unable to know what features are included scaled data. 
", + # "Skipped.") + # } + scaledFeatureIdxNew <- which(featureIdx %in% scaledFeatureIdx) + scaleDataSubset <- NULL + nChunk <- ceiling(ncol(object) / chunkSize) + arrayDims <- c(0, 1) # for recording the current size of ix and p + for (i in seq_len(nChunk)) { + # Column range of a chunk + start <- (i - 1)*chunkSize + 1 + end <- i*chunkSize + if (end > ncol(object)) end <- ncol(object) + p.range <- seq(start, end + 1) + chunk.p <- scaleData(object)[["indptr"]][p.range] + ix.range <- seq(chunk.p[1] + 1, chunk.p[length(chunk.p)]) + chunk.i <- scaleData(object)[["indices"]][ix.range] + chunk.x <- scaleData(object)[["data"]][ix.range] + chunk.p <- chunk.p - chunk.p[1] + chunkSpMat <- Matrix::sparseMatrix( + i = chunk.i + 1, p = chunk.p, x = chunk.x, + dims = c(length(scaledFeatureIdx), end - start + 1) + ) + chunkCellIdx <- seq(start, end) + chunkSubset <- chunkSpMat[secondIdx, chunkCellIdx %in% cellIdx] + if (is.null(scaleDataSubset)) scaleDataSubset <- chunkSubset + else scaleDataSubset <- cbind(scaleDataSubset, chunkSubset) } - value$scaleData <- scaleData + dimnames(scaleDataSubset) <- list( + rownames(object)[scaledFeatureIdx][secondIdx], + colnames(object)[cellIdx] + ) + # if (!is.null(scaledFeatureIdx) && length(scaledFeatureIdx) > 0) { + # if (length(scaledFeatureIdx) == scaleData(object)$dims[1]) { + # # Before: scaledFeatureIdx is based on all features, selecting + # # what features are in the scaleData + # # After: based on variable features, selecting what features + # # from variable features are being involved in featureIdx. + # scaledFeatureIdx2 <- unlist(lapply(featureIdx, function(i) { + # which(scaledFeatureIdx == i)}), use.names = FALSE) + # if (isTRUE(verbose)) + # .log(length(scaledFeatureIdx2), + # " features used in scaleData were selected. ", + # level = 3) + # scaleData <- scaleData(object)[scaledFeatureIdx2, cellIdx, + # drop = FALSE] + # rownames(scaleData) <- rownames(object)[scaledFeatureIdx][scaledFeatureIdx2] + # colnames(scaleData) <- colnames(object)[cellIdx] + # } else { + # scaleData <- NULL + # warning("Row dimension of scaleData does not match with ", + # "feature selection. Unable to subset from H5.") + # } + # } + value$scaleData <- scaleDataSubset } # `NULL[idx1, idx2]` returns `NULL` # V: k x genes - if (is.null(useSlot)) { - if (exists("scaledFeatureIdx2")) sfi <- scaledFeatureIdx2 - else sfi <- NULL + # if (is.null(useSlot)) { + # if (exists("scaledFeatureIdx2")) sfi <- scaledFeatureIdx2 + # else sfi <- NULL value$H <- object@H[, cellIdx, drop = FALSE] - value$V <- object@V[sfi, , drop = FALSE] + value$V <- object@V[secondIdx, , drop = FALSE] value$A <- object@A - value$B <- object@B[sfi, , drop = FALSE] + value$B <- object@B[secondIdx, , drop = FALSE] value$U <- object@U value$featureMeta <- featureMeta(object)[featureIdx, , drop = FALSE] # Additional subsetting for sub-classes, if applicable @@ -430,7 +462,7 @@ subsetH5LigerDatasetToMem <- function( value$rawPeak <- rawPeak(object)[, cellIdx, drop = FALSE] value$normPeak <- normPeak(object)[, cellIdx, drop = FALSE] } - } + # } if (isTRUE(returnObject)) { value$modal <- modal do.call(createLigerDataset, value) @@ -498,11 +530,14 @@ subsetH5LigerDatasetToH5 <- function( if ("rawData" %in% useSlot & !is.null(rawData(object))) { # 1. 
Create paths to store i, p, x of sparse matrix if (isTRUE(verbose)) .log("Subsetting `rawData`", level = 2) - safeH5Create(newH5File, newH5Meta$indicesName, dims = 1, dtype = "int") + safeH5Create(newH5File, newH5Meta$indicesName, dims = 1, + chunkSize = 4096, dtype = "int") i.h5d <- newH5File[[newH5Meta$indicesName]] - safeH5Create(newH5File, newH5Meta$indptrName, dims = 1, dtype = "int") + safeH5Create(newH5File, newH5Meta$indptrName, dims = 1, + chunkSize = 2048, dtype = "int") p.h5d <- newH5File[[newH5Meta$indptrName]] - safeH5Create(newH5File, newH5Meta$rawData, dims = 1, dtype = "double") + safeH5Create(newH5File, newH5Meta$rawData, dims = 1, + chunkSize = 4096, dtype = "double") x.h5d <- newH5File[[newH5Meta$rawData]] # 2. Go into chunks subsetSizes <- list(ix = 0, p = 1) @@ -510,7 +545,7 @@ subsetH5LigerDatasetToH5 <- function( object, init = subsetSizes, useData = "rawData", chunkSize = chunkSize, verbose = verbose, FUN = function(chunk, sparseXIdx, chunkCellIdx, values) { - if (sum(chunkCellIdx %in% cellIdx) > 0) { + if (any(chunkCellIdx %in% cellIdx)) { subset <- chunk[featureIdx, chunkCellIdx %in% cellIdx] # Length of `subset@i` and `subset@x` should always be the # same. @@ -546,7 +581,8 @@ subsetH5LigerDatasetToH5 <- function( # Process Normalized Data #### if ("normData" %in% useSlot & !is.null(normData(object))) { if (isTRUE(verbose)) .log("Subsetting `normData`", level = 2) - safeH5Create(newH5File, newH5Meta$normData, dims = 1, dtype = "double") + safeH5Create(newH5File, newH5Meta$normData, dims = 1, + chunkSize = 4096, dtype = "double") x.h5d <- newH5File[[newH5Meta$normData]] ipProcessedBefore <- exists("i.h5d") & exists("p.h5d") if (!ipProcessedBefore) { @@ -554,10 +590,10 @@ subsetH5LigerDatasetToH5 <- function( # have to recreate them for normData. (normData and rawData share # the same `i` and `p` vectors as in the form of sparse matrices.) safeH5Create(newH5File, newH5Meta$indicesName, - dims = 1, dtype = "int") + dims = 1, chunkSize = 4096, dtype = "int") i.h5d <- newH5File[[newH5Meta$indicesName]] safeH5Create(newH5File, newH5Meta$indptrName, - dims = 1, dtype = "int") + dims = 1, chunkSize = 2048, dtype = "int") p.h5d <- newH5File[[newH5Meta$indptrName]] } subsetSizes <- list(ix = 0, p = 1) @@ -589,45 +625,120 @@ subsetH5LigerDatasetToH5 <- function( ) } # Process Scaled Data #### - scaleFeatureIdx <- NULL - if (getH5File(object)$exists("scaleData.featureIdx")) { - scaledFeatureIdx <- getH5File(object)[["scaleData.featureIdx"]][] - } else if ("selected" %in% names(featureMeta(object))) { - scaledFeatureIdx <- which(featureMeta(object)$selected) - } else if (!is.null(object@V)) { - scaleFeatureIdx <- rownames(object@V) %in% rownames(object)[featureIdx] - } else { - if ("scaleData" %in% useSlot & !is.null(scaleData(object))) { - warning("Unable to know what features are included scaled data. ", - "Skipped.") - } + secondIdx <- NULL + # if (getH5File(object)$exists("scaleData.featureIdx")) { + # scaledFeatureIdx <- getH5File(object)[["scaleData.featureIdx"]][] + # } else if ("selected" %in% names(featureMeta(object))) { + # scaledFeatureIdx <- which(featureMeta(object)$selected) + # } else if (!is.null(object@V)) { + # scaleFeatureIdx <- rownames(object@V) %in% rownames(object)[featureIdx] + # } else { + # if ("scaleData" %in% useSlot & !is.null(scaleData(object))) { + # warning("Unable to know what features are included scaled data. 
", + # "Skipped.") + # } + # } + if (!is.null(scaleData(object))) { + # This asserts that + # identical(rownames(object)[scaledFeatureIdx], rownames(scaleData)) + scaledFeatureIdx <- scaleData(object)[["featureIdx"]][] + # This asserts that + # rownames(scaleData)[secondIdx] returns a subset that follows the order + # specified by featureIdx + secondIdx <- as.numeric(na.omit(match(featureIdx, scaledFeatureIdx))) } if ("scaleData" %in% useSlot & !is.null(scaleData(object))) { - if (isTRUE(verbose)) .log("Subsetting `scaleData`", level = 2) - if (!is.null(scaledFeatureIdx) && length(scaledFeatureIdx) > 0) { - if (length(scaledFeatureIdx) == scaleData(object)$dims[1]) { - scaledFeatureIdx2 <- unlist(lapply(featureIdx, function(i) - which(scaledFeatureIdx == i)), use.names = FALSE) - ng <- length(scaledFeatureIdx2) - nc <- length(cellIdx) - if (isTRUE(verbose)) - .log(length(scaledFeatureIdx2), - " features used in scaleData were selected. ", - level = 3) - safeH5Create(newH5File, dataPath = newH5Meta$scaleData, - dims = c(ng, nc), dtype = "double", - chunkSize = c(ng, 1000)) - newH5File[[newH5Meta$scaleData]][1:ng,1:nc] <- - scaleData(object)[scaledFeatureIdx2, cellIdx, drop = FALSE] - safeH5Create(newH5File, dataPath = "scaleData.featureIdx", - dims = ng, dtype = "int", chunkSize = ng) - newH5File[["scaleData.featureIdx"]][1:ng] <- - .getOrderedSubsetIdx(featureIdx, scaledFeatureIdx) - } else { - warning("Row dimension of scaleData does not match with ", - "feature selection. Unable to subset from H5.") - } + scaledFeatureIdxNew <- which(featureIdx %in% scaledFeatureIdx) + if (isTRUE(verbose)) + .log(length(secondIdx), + " features used in scaleData were selected. ", level = 3) + newH5File$create_group(newH5Meta$scaleData) + safeH5Create( + newH5File, + dataPath = file.path(newH5Meta$scaleData, "data", fsep = "/"), + dims = 1, dtype = "double", chunkSize = 4096 + ) + x.h5d <- newH5File[[file.path(newH5Meta$scaleData, "data", fsep = "/")]] + safeH5Create( + newH5File, + dataPath = file.path(newH5Meta$scaleData, "indices", fsep = "/"), + dims = 1, dtype = "int", chunkSize = 4096 + ) + i.h5d <- newH5File[[file.path(newH5Meta$scaleData, "indices", fsep = "/")]] + safeH5Create( + newH5File, + dataPath = file.path(newH5Meta$scaleData, "indptr", fsep = "/"), + dims = 1, dtype = "int", chunkSize = 2048 + ) + p.h5d <- newH5File[[file.path(newH5Meta$scaleData, "indptr", fsep = "/")]] + p.h5d[1] <- 0 + nChunk <- ceiling(ncol(object) / chunkSize) + arrayDims <- c(0, 1) # for recording the current size of ix and p + for (i in seq_len(nChunk)) { + # Column range of a chunk + start <- (i - 1)*chunkSize + 1 + end <- i*chunkSize + if (end > ncol(object)) end <- ncol(object) + p.range <- seq(start, end + 1) + chunk.p <- scaleData(object)[["indptr"]][p.range] + ix.range <- seq(chunk.p[1] + 1, chunk.p[length(chunk.p)]) + chunk.i <- scaleData(object)[["indices"]][ix.range] + chunk.x <- scaleData(object)[["data"]][ix.range] + chunk.p <- chunk.p - chunk.p[1] + chunkSpMat <- Matrix::sparseMatrix( + i = chunk.i + 1, p = chunk.p, x = chunk.x, + dims = c(length(scaledFeatureIdx), end - start + 1) + ) + chunkCellIdx <- seq(start, end) + chunkSubset <- chunkSpMat[secondIdx, chunkCellIdx %in% cellIdx] + # Write chunk subset to H5 now + newChunkIXRange <- seq(arrayDims[1] + 1, + arrayDims[1] + length(chunkSubset@i)) + newChunkPRange <- seq(arrayDims[2] + 1, + arrayDims[2] + ncol(chunkSubset)) + arrayDimsNew <- arrayDims + c(length(chunkSubset@i), ncol(chunkSubset)) + hdf5r::extendDataSet(i.h5d, arrayDimsNew[1]) + 
hdf5r::extendDataSet(x.h5d, arrayDimsNew[1]) + i.h5d[newChunkIXRange] <- chunkSubset@i + x.h5d[newChunkIXRange] <- chunkSubset@x + hdf5r::extendDataSet(p.h5d, arrayDimsNew[2]) + p.h5d[newChunkPRange] <- chunkSubset@p[seq(2, ncol(chunkSubset) + 1)] + p.h5d[arrayDims[2]] + arrayDims <- arrayDimsNew } + safeH5Create( + newH5File, + file.path(newH5Meta$scaleData, "featureIdx", fsep = "/"), + dims = length(scaledFeatureIdxNew), chunkSize = 1024, dtype = "int" + ) + newH5File[[file.path(newH5Meta$scaleData, "featureIdx", fsep = "/")]][seq_along(scaledFeatureIdxNew)] <- scaledFeatureIdxNew + # Below is the previous version where we use dense H5 scaled data + # Can remove if no longer used + # if (isTRUE(verbose)) .log("Subsetting `scaleData`", level = 2) + # if (!is.null(scaledFeatureIdx) && length(scaledFeatureIdx) > 0) { + # scaleDataDim <- scaleData(object)[["featureIdx"]]$dims + # if (length(scaledFeatureIdx) == scaleDataDim) { + # scaledFeatureIdx2 <- unlist(lapply(featureIdx, function(i) + # which(scaledFeatureIdx == i)), use.names = FALSE) + # ng <- length(scaledFeatureIdx2) + # nc <- length(cellIdx) + # if (isTRUE(verbose)) + # .log(length(scaledFeatureIdx2), + # " features used in scaleData were selected. ", + # level = 3) + # safeH5Create(newH5File, dataPath = newH5Meta$scaleData, + # dims = c(ng, nc), dtype = "double", + # chunkSize = c(ng, 1000)) + # newH5File[[newH5Meta$scaleData]][1:ng,1:nc] <- + # scaleData(object)[scaledFeatureIdx2, cellIdx, drop = FALSE] + # safeH5Create(newH5File, dataPath = "scaleData.featureIdx", + # dims = ng, dtype = "int", chunkSize = ng) + # newH5File[["scaleData.featureIdx"]][1:ng] <- + # .getOrderedSubsetIdx(featureIdx, scaledFeatureIdx) + # } else { + # warning("Row dimension of scaleData does not match with ", + # "feature selection. 
Unable to subset from H5.") + # } + # } } newH5File$close() if (!"rawData" %in% useSlot) newH5Meta$rawData <- NULL @@ -651,9 +762,9 @@ subsetH5LigerDatasetToH5 <- function( ) h5fileInfo(newObj, info = "formatType") <- newH5Meta$formatType newObj@H <- object@H[, cellIdx, drop = FALSE] - newObj@V <- object@V[scaleFeatureIdx, , drop = FALSE] + newObj@V <- object@V[secondIdx, , drop = FALSE] newObj@A <- object@A - newObj@B <- object@B[scaleFeatureIdx, , drop = FALSE] + newObj@B <- object@B[secondIdx, , drop = FALSE] newObj@U <- object@U newObj @@ -707,7 +818,7 @@ subsetMemLigerDataset <- function(object, featureIdx = NULL, cellIdx = NULL, drop = FALSE] } } - if (is.null(useSlot)) { + # if (is.null(useSlot)) { sfi <- scaleFeatureIdx subsetData <- c(subsetData, list(H = object@H[, cellIdx, drop = FALSE], @@ -723,7 +834,7 @@ subsetMemLigerDataset <- function(object, featureIdx = NULL, cellIdx = NULL, subsetData$rawPeak <- rawPeak(object)[, cellIdx, drop = FALSE] subsetData$normPeak <- normPeak(object)[, cellIdx, drop = FALSE] } - } + # } if (isTRUE(returnObject)) { subsetData$modal <- modal return(do.call("createLigerDataset", subsetData)) diff --git a/R/util.R b/R/util.R index 2272f03e..edb97b51 100644 --- a/R/util.R +++ b/R/util.R @@ -245,19 +245,17 @@ return(TRUE) } -.checkValidFactorResult <- function(object, useDatasets = NULL) { +.checkValidFactorResult <- function(object, useDatasets = NULL, + checkV = TRUE) { result <- TRUE useDatasets <- .checkUseDatasets(object, useDatasets) - if (is.null(object@W)) { - warning("W matrix does not exist.") - result <- FALSE - } else { - # nGenes <- nrow(object@W) - k <- ncol(object@W) + if (is.null(object@W)) stop("W matrix does not exist.") + k <- ncol(object@W) - for (d in useDatasets) { - ld <- dataset(object, d) - nCells <- ncol(ld) + for (d in useDatasets) { + ld <- dataset(object, d) + nCells <- ncol(ld) + if (isTRUE(checkV)) { if (is.null(ld@V)) { warning("V matrix does not exist for dataset '", d, "'.") result <- FALSE @@ -268,23 +266,23 @@ result <- FALSE } } - if (is.null(ld@H)) { - warning("H matrix does not exist for dataset '", d, "'.") + } + if (is.null(ld@H)) { + warning("H matrix does not exist for dataset '", d, "'.") + result <- FALSE + } else { + if (!identical(dim(ld@H), c(k, nCells))) { + warning("Dimensionality of H matrix for dataset '", d, + "' is not valid") result <- FALSE - } else { - if (!identical(dim(ld@H), c(k, nCells))) { - warning("Dimensionality of H matrix for dataset '", d, - "' is not valid") - result <- FALSE - } } } - if (k != object@uns$factorization$k) - warning("Number of factors does not match with object `k` slot. ") } + if (k != object@uns$factorization$k) + warning("Number of factors does not match with object `k` slot. ") if (isFALSE(result)) stop("Cannot detect valid existing factorization result. ", - "Please run factorization first.") + "Please run factorization first. Check warnings.") } # !!!MaintainerDeveloperNOTE: diff --git a/man/createLiger.Rd b/man/createLiger.Rd index d4176cb3..a6a20014 100644 --- a/man/createLiger.Rd +++ b/man/createLiger.Rd @@ -16,6 +16,7 @@ createLiger( indptrName = NULL, genesName = NULL, barcodesName = NULL, + newH5 = TRUE, verbose = getOption("ligerVerbose"), ..., remove.missing = removeMissing, @@ -41,8 +42,7 @@ an HDF5 file. See detail for HDF5 reading.} \item{removeMissing}{Logical. Whether to remove cells that do not have any counts and features not expressed in any cells from each dataset. Default -\code{TRUE}. 
H5 based dataset with less than 8000 cells will be subset into -memory.} +\code{TRUE}.} \item{addPrefix}{Logical. Whether to add "_" as a prefix of cell identifiers (e.g. barcodes) to avoid duplicates in multiple libraries ( @@ -62,6 +62,11 @@ object. Default \code{NULL} uses \code{formatType} preset.} \item{genesName, barcodesName}{The path in a H5 file for the gene names and cell barcodes. Default \code{NULL} uses \code{formatType} preset.} +\item{newH5}{When using HDF5 based data and subsets created after removing +missing cells/features, whether to create new HDF5 files for the subset. +Default \code{TRUE}. If \code{FALSE}, data will be subset into memory and +can be dangerous for large scale analysis.} + \item{verbose}{Logical. Whether to show information of the progress. Default \code{getOption("ligerVerbose")} which is \code{TRUE} if users have not set.} diff --git a/man/liger-class.Rd b/man/liger-class.Rd index bd037ec3..b4007ecb 100644 --- a/man/liger-class.Rd +++ b/man/liger-class.Rd @@ -36,7 +36,7 @@ \alias{cellMeta<-,liger,missing-method} \alias{cellMeta<-,liger,character-method} \alias{[[,liger,ANY,missing-method} -\alias{[[<-,liger,ANY,missing-method} +\alias{[[<-,liger,ANY,missing,ANY-method} \alias{$,liger-method} \alias{$<-,liger-method} \alias{defaultCluster} @@ -169,7 +169,7 @@ cellMeta(x, columns = NULL, useDataset = NULL, cellIdx = NULL, check = FALSE) <- \S4method{[[}{liger,ANY,missing}(x, i, j, ...) -\S4method{[[}{liger,ANY,missing}(x, i, j, ...) <- value +\S4method{[[}{liger,ANY,missing,ANY}(x, i, j, ...) <- value \S4method{$}{liger}(x, name) diff --git a/man/mapCellMeta.Rd b/man/mapCellMeta.Rd new file mode 100644 index 00000000..8c2351a4 --- /dev/null +++ b/man/mapCellMeta.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/clustering.R +\name{mapCellMeta} +\alias{mapCellMeta} +\title{Create new variable from categories in cellMeta} +\usage{ +mapCellMeta(object, from, newTo = NULL, ...) +} +\arguments{ +\item{object}{A \linkS4class{liger} object.} + +\item{from}{The name of the original variable to be mapped from.} + +\item{newTo}{The name of the new variable to store the mapped result. Default +\code{NULL} returns the new variable (factor class).} + +\item{...}{Mapping criteria, argument names are original existing categories +in the \code{from} and values are new categories in the new variable.} +} +\value{ +When \code{newTo = NULL}, a factor object of the new variable. +Otherwise, the input object with variable \code{newTo} updated in +\code{cellMeta(object)}. +} +\description{ +Designed for fast variable creation when a new variable is going to be +created from existing variable. For example, multiple samples can be mapped +to the same study design condition, clusters can be mapped to cell types. +} +\examples{ +pbmc <- mapCellMeta(pbmc, from = "dataset", newTo = "modal", + ctrl = "rna", stim = "rna") +} diff --git a/man/removeMissing.Rd b/man/removeMissing.Rd index 80ad705b..d3edeca2 100644 --- a/man/removeMissing.Rd +++ b/man/removeMissing.Rd @@ -12,6 +12,7 @@ removeMissing( minFeatures = NULL, useDatasets = NULL, filenameSuffix = "removeMissing", + newH5 = TRUE, verbose = getOption("ligerVerbose"), ... ) diff --git a/man/subsetLiger.Rd b/man/subsetLiger.Rd index 81410974..e2b62939 100644 --- a/man/subsetLiger.Rd +++ b/man/subsetLiger.Rd @@ -11,7 +11,7 @@ subsetLiger( useSlot = NULL, chunkSize = 1000, verbose = getOption("ligerVerbose"), - newH5 = "auto", + newH5 = TRUE, returnObject = TRUE, ... 
)
@@ -37,8 +37,7 @@ Default \code{1000}.}

\item{newH5}{Whether to create new H5 files on disk for the subset datasets
if involved datasets in the \code{object} are HDF5 based. \code{TRUE} writes
-new ones, \code{FALSE} returns in memory data. Default \code{"auto"} writes a
-new dataset when more than 8000 cells from that dataset is subscribed.}
+new ones, \code{FALSE} returns in-memory data.}

\item{returnObject}{Logical, whether to return a \linkS4class{liger} object
for result. Default \code{TRUE}. \code{FALSE} returns a list containing
diff --git a/man/subsetLigerDataset.Rd b/man/subsetLigerDataset.Rd
index a879f371..4fb6528d 100644
--- a/man/subsetLigerDataset.Rd
+++ b/man/subsetLigerDataset.Rd
@@ -11,7 +11,7 @@ subsetLigerDataset(
   featureIdx = NULL,
   cellIdx = NULL,
   useSlot = NULL,
-  newH5 = "auto",
+  newH5 = TRUE,
   filename = NULL,
   filenameSuffix = NULL,
   chunkSize = 1000,
@@ -25,7 +25,7 @@ subsetH5LigerDataset(
   featureIdx = NULL,
   cellIdx = NULL,
   useSlot = NULL,
-  newH5 = "auto",
+  newH5 = TRUE,
   filename = NULL,
   filenameSuffix = NULL,
   chunkSize = 1000,
@@ -57,8 +57,7 @@ Missing or \code{NULL} for all cells.}

\item{newH5}{Whether to create a new H5 file on disk for the subset dataset
if \code{object} is HDF5 based. \code{TRUE} writes a new one, \code{FALSE}
-returns in memory data. Default \code{"auto"} writes a new one when more than
-8000 cells.}
+returns in-memory data.}

\item{filename}{Filename of the new H5 file if being created. Default
\code{NULL} adds suffix \code{".subset_{yymmdd_HHMMSS}.h5"} to the original
diff --git a/vignettes/articles/online_iNMF_tutorial.Rmd b/vignettes/articles/online_iNMF_tutorial.Rmd
index f1cfd7a8..c9ecb03c 100644
--- a/vignettes/articles/online_iNMF_tutorial.Rmd
+++ b/vignettes/articles/online_iNMF_tutorial.Rmd
@@ -8,165 +8,170 @@ output:
---

```{r setup, include=FALSE}
-knitr::opts_chunk$set(echo = TRUE)
+knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE, results = "hide")
```

-## Scenario 1: sampling minibatches from fully observed datasets
+## Load data and preprocess

LIGER supports integrating datasets based on HDF5 files. For example, the 10X CellRanger output file located at `sample/outs/filtered_feature_bc_matrix.h5` can be used with LIGER. Here's the link to learn more about [CellRanger output HDF5 files](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/advanced/h5_matrices). Because we read only one chunk of data into memory at a time, large-scale datasets stored on disk can be analyzed with a rather small memory footprint.

>Large datasets are often generated over multiple 10X runs (for example, multiple biological replicates). In such cases it may be necessary to merge the HDF5 files from each run into a single HDF5 file. We provide the `mergeH5()` function for this purpose, as sketched below.
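The call below is a minimal sketch of such a merge, not something this tutorial runs: the argument names (`file.list`, `library.names`, `new.filename`) follow the interface documented for earlier LIGER releases and the file names are placeholders, so check `?mergeH5` in your installed version before use. All files to be merged must include exactly the same set of genes.

```{r, eval=FALSE}
# Hypothetical run files; sample names are prepended to cell barcodes.
mergeH5(file.list = list("run1.h5", "run2.h5"),
        library.names = c("run1", "run2"),
        new.filename = "merged")
```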
To integrate datasets stored in HDF5 files, we first create a [liger object](../reference/liger-class.html) using the file names of the HDF5 files. We prepared the following example datasets for demonstration:
-We first create a Liger object by passing the filenames of HDF5 files containing the raw count data. The data can be downloaded [here](https://www.dropbox.com/sh/ay5r0k6to6ck1dz/AADNzhqtztBr5kTQdE-CsU5Ja?dl=0). Liger assumes by default that the HDF5 files are formatted by the 10X CellRanger pipeline. Large datasets are often generated over multiple 10X runs (for example, multiple biological replicates). In such cases it may be necessary to merge the HDF5 files from each run into a single HDF5 file. We provide the `mergeH5` function for this purpose (see below for details).

+- [pbmcs_ctrl.h5](https://figshare.com/ndownloader/files/43108963)
+- [pbmcs_stim.h5](https://figshare.com/ndownloader/files/43108966)

-```{r create.object, message=FALSE, results='hide'}
-pbmcs <- createLiger(list(stim = "pbmcs_stim.h5", ctrl = "pbmcs_ctrl.h5"), removeMissing = TRUE)
-```
```{R}
library(rliger2)

dataSource <- data.frame(
    filenames = c("pbmcs_ctrl_vignette.h5",
                  "pbmcs_stim_vignette.h5"),
    urls = c("https://figshare.com/ndownloader/files/43108963",
             "https://figshare.com/ndownloader/files/43108966")
)
for (i in seq_len(nrow(dataSource))) {
    if (!file.exists(dataSource$filenames[i])) {
        download.file(dataSource$urls[i], destfile = dataSource$filenames[i])
    }
}

pbmcs <- createLiger(list(ctrl = dataSource$filenames[1],
                          stim = dataSource$filenames[2]))
```

-We then perform the normalization, gene selection, and gene scaling in an online fashion, reading the data from disk in small batches.
-```{r preprocess, message=FALSE, results='hide'}
-pbmcs = normalize(pbmcs)
-pbmcs = selectGenes(pbmcs, var.thresh = 0.2)
-pbmcs = scaleNotCenter(pbmcs)
-```
As in the other iNMF integration tutorials, we also need to preprocess the data by (1) normalizing each cell by its library size, (2) selecting variable genes for each dataset, and (3) scaling the gene expression within each dataset in a non-negative way.

```{R}
pbmcs <- pbmcs %>%
    normalize() %>%
    selectGenes(var.thresh = 0.2) %>%
    scaleNotCenter()
```

## Scenario 1: sampling minibatches from fully observed datasets

-## Online Integrative Nonnegative Matrix Factorization
### Online Integrative Nonnegative Matrix Factorization

Now we can use online iNMF to factorize the data, again using only minibatches that we read from the HDF5 files on demand (default mini-batch size = 5000). A sufficient number of iterations is crucial for obtaining an ideal factorization result. If the size of the mini-batch is set close to the size of the whole dataset (i.e. an epoch contains only one iteration), the maximum number of epochs (`max.epochs` in the old interface) needs to be increased accordingly to allow more iterations.

-```{r factorize, message=FALSE, results='hide'}
-pbmcs = online_iNMF(pbmcs, k = 20, miniBatch_size = 5000, max.epochs = 5)
-```
```{R}
pbmcs <- runIntegration(pbmcs, k = 20, method = "online")
```

-## Quantile Normalization and Downstream Analysis
-After performing the factorization, we can perform quantile normalization to align the datasets.
-```{r norm, message=FALSE, results='hide'}
-pbmcs = quantile_norm(pbmcs)
-```
### Joint clustering and visualization

After performing the factorization, we can perform quantile normalization to align the datasets, and then use a graph-based community detection algorithm to find clusters.

```{R}
pbmcs <- quantileNorm(pbmcs)
pbmcs <- runCluster(pbmcs, nNeighbors = 30, resolution = .6, nRandomStarts = 2)
```

We can also visualize the cell factor loadings in two dimensions using t-SNE or UMAP.

-```{r visualize}
-pbmcs = runUMAP(pbmcs)
-plotByDatasetAndCluster(pbmcs, axis.labels = c("UMAP1","UMAP2"))
-```
```{r, fig.width=10}
pbmcs <- runUMAP(pbmcs, nNeighbors = 30, minDist = .3)
plotByDatasetAndCluster(pbmcs)
```
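Since this patch also introduces `mapCellMeta()` (see R/clustering.R above), annotated clusters can be renamed in one call right after clustering. The sketch below is illustrative only: `leiden_cluster` is the cluster variable name used elsewhere in this tutorial, but the cluster levels and cell type labels are hypothetical placeholders.

```{r, eval=FALSE}
# Hypothetical labels; each argument maps an existing cluster level to a
# new category, and unmentioned levels are carried over unchanged.
pbmcs <- mapCellMeta(pbmcs, from = "leiden_cluster", newTo = "cell_type",
                     `0` = "CD14 Mono", `1` = "CD4 T", `2` = "B")
```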
-Let's first evaluate the quality of data alignment. The alignment score ranges from 0 (no alignment) to 1 (perfect alignment).
-```{r alignmentScore}
-calcAlignment(pbmcs)
-```
-With HDF5 files as input, we need to sample the raw, normalized, or scaled data from the full dataset on disk and load them in memory. Some plotting functions and downstream analyses are designed to operate on a subset of cells sampled from the full dataset. This enables rapid analysis using limited memory. The readSubset function allows either uniform random sampling or sampling balanced by cluster. Here we extract the normalized count data of 5000 sampled cells.
-```{r subset, message=FALSE, results='hide'}
-pbmcs = readSubset(pbmcs, slot.use = "norm.data", max.cells = 5000)
-```
### Downstream analysis

For downstream analyses that require expression data, such as differential expression tests, we currently do not support direct computation on on-disk HDF5 data. In order to conduct such analyses, users need to create a downsampled subset of the large-scale on-disk data and read it into memory. By default, the `downsample()` function randomly samples `maxCells` cells. Alternatively, we can set balanced sampling based on given categorical variables. Note that `newH5 = FALSE` has to be set in order to take the subsample into memory; otherwise, another downsampled HDF5 file would be created, which still could not be used for those downstream analysis methods.

```{R}
pbmcs.sample <- downsample(pbmcs, maxCells = 5000, useSlot = "normData", newH5 = FALSE)
```

-Using the sampled data stored in memory, we can now compare clusters or datasets (within each cluster) to identify differentially expressed genes. The runWilcoxon function
-performs differential expression analysis by performing an in-memory Wilcoxon rank-sum test on this subset. Thus, users can still analyze large datasets with a fixed amount of memory.
-```{r wilcox 1, message=FALSE, results='hide'}
-de_genes = runWilcoxon(pbmcs, compare.method = "datasets")
-```
Using the sampled data stored in memory, we can now compare clusters or datasets to identify differentially expressed genes. The `runMarkerDEG()` function performs differential expression analysis, by default with the Wilcoxon rank-sum test. The default mode of marker detection finds markers of each cluster by testing each cluster against all the others. Here, we demonstrate finding the dataset-specific markers within each cluster.

```{R}
markerTableList <- runMarkerDEG(pbmcs.sample, conditionBy = "dataset", splitBy = "leiden_cluster")
```

-Here we show the top 10 genes in cluster 1 whose expression level significantly differ between two dataset.
-```{r wilcox 2}
-de_genes = de_genes[order(de_genes$padj), ]
-head(de_genes[de_genes$group == "1-stim",], n = 10)
-```
The returned object is a list where each element is a dataset-specific marker table derived within one cluster, containing the statistics for all genes without filtering or sorting. We strongly suggest using the package "dplyr" for data frame operations, for its simplicity. Here, we show an example of getting the top 10 markers for "stim" within cluster 2.

```{R}
library(dplyr)
markerTable2 <- markerTableList$`2`
markerTable2 %>%
    filter(logFC > 1, padj < 0.05, group == "stim") %>%
    arrange(padj, -logFC) %>%
    slice_head(n = 10) %>%
    select(feature, logFC, padj)
```
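As a quick follow-up, the per-cluster tables can be summarized with plain base R. The sketch below simply counts, for each cluster, how many genes pass the same significance cutoffs used above; the column names (`logFC`, `padj`, `group`) follow the table structure shown in the previous chunk.

```{r, eval=FALSE}
# Count significant "stim" markers per cluster.
sapply(markerTableList, function(tbl) {
    sum(tbl$group == "stim" & tbl$padj < 0.05 & tbl$logFC > 1, na.rm = TRUE)
})
```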
-We can show UMAP coordinates of sampled cells by their loadings on each factor (Factor 1 as an example). Underneath it displays the
-most highly loading shared and dataset-specific genes, with the size of the marker indicating the magnitude of the loading.
-```{r wordClouds, warning=FALSE, results='hide', fig.keep = 'last'}
-p_wordClouds = plotWordClouds(pbmcs, num.genes = 5, return.plots = T)
-p_wordClouds[[1]]
-```
-We can generate plots of dimensional reduction coordinates colored by expression of specified gene. The first two UMAP dimensions and gene ISG15 (identified by Wilcoxon test in the previous step) is used as an example here.
-```{r gene}
-plotGene(pbmcs, "ISG15", return.plots = F)
-```
-Furthermore, we can make violin plots of expression of specified gene for each dataset (ISG15 as an example).
-```{r violin, warning=FALSE}
-plotGeneViolin(pbmcs, "ISG15", return.plots = F)
-```
-The online algorithm can be implemented on datasets loaded in memory as well. The same analysis is performed on the PBMCs, shown below.
-```{r , message=FALSE, results='hide'}
-stim = readRDS("pbmcs_stim.RDS")
-ctrl = readRDS("pbmcs_ctrl.RDS")
-pbmcs_mem = createLiger(list(stim = stim, ctrl = ctrl), remove.missing = F)
-pbmcs_mem = normalize(pbmcs_mem)
-pbmcs_mem = selectGenes(pbmcs_mem, var.thresh = 0.2, do.plot = F)
-pbmcs_mem = scaleNotCenter(pbmcs_mem)
-pbmcs_mem = online_iNMF(pbmcs_mem, k = 20, miniBatch_size = 5000, max.epochs = 5)
-```
-```{r ,message=FALSE}
-pbmcs_mem = quantile_norm(pbmcs_mem)
-pbmcs_mem = runUMAP(pbmcs_mem)
-plotByDatasetAndCluster(pbmcs_mem, axis.labels = c("UMAP1","UMAP2"))
-```
-As mentioned above, it is sometimes necessary to merge multiple HDF5 files (such as multiple 10X runs from the same tissue or condition) into a single file. We provide the `mergeH5` function for this purpose. The function takes as input a list of filenames to merge (file.list), a vector of sample names that are prepended to the cell barcodes (library.names), and the name of the merged HDF5 file. The function requires that all files to be merged include exactly the same set of genes. For example, we can merge the cells and nuclei datasets used in the examples below (note that merging these particular two datasets is not something you would like to do, and is purely to demonstrate the `mergeH5` function).
-```{r merge,message=FALSE}
-mergeH5(file.list = list("allen_smarter_cells.h5", "allen_smarter_nuclei.h5"),
-        library.names = c("cells","nuclei"),
-        new.filename = "cells_nuclei")
-```
We can show UMAP coordinates of sampled cells by their loadings on each factor (Factor 4 as an example). The lower part of the figure shows the ranking of gene loadings for the factor, with the shared gene loadings ranked in the middle and the dataset-specific gene loadings ranked on the two sides.

```{r, fig.height=6}
factorMarker <- getFactorMarkers(pbmcs.sample, dataset1 = "ctrl", dataset2 = "stim")
plotGeneLoadings(pbmcs.sample, markerTable = factorMarker, useFactor = 4)
```

### Visualization with expression

With the subsampled data loaded into memory, we can then also generate plots of dimensional reduction coordinates colored by the expression of a specified gene. For example, the expression of the gene ISG15 is visualized below.

```{r, fig.width=6}
plotGeneDimRed(pbmcs.sample, "ISG15")
```

Furthermore, we can make violin plots of the expression of a specified gene. Taking ISG15 as an example again, we make one violin plot per dataset and group each by cluster.

```{r}
plotGeneViolin(pbmcs.sample, "ISG15", byDataset = TRUE, title = names(pbmcs.sample))
```
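Before moving on to the next scenario, a quick sanity check on the integration never hurts. The one-liner below is a small base R sketch: `cellMeta()` is the accessor used elsewhere in this package, and `dataset` and `leiden_cluster` are the variable names used in this tutorial, but verify both names in your own object before running it.

```{r, eval=FALSE}
# Cross-tabulate dataset of origin against cluster assignment to inspect
# how well the two datasets mix within each cluster.
table(cellMeta(pbmcs, "dataset"), cellMeta(pbmcs, "leiden_cluster"))
```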
-## Scenario 2: iterative refinement by incorporating new datasets
-We can also perform online iNMF with continually arriving datasets.
-```{r prep1, message=FALSE, results='hide'}
-MOp = createLiger(list(cells = "allen_smarter_cells.h5"))
-MOp = normalize(MOp)
-MOp = selectGenes(MOp, var.thresh = 2)
-MOp.vargenes = MOp@var.genes
-MOp = scaleNotCenter(MOp)
-```
-```{r fact1, message=FALSE, results='hide'}
-MOp = online_iNMF(MOp, k = 40, max.epochs = 1)
-```
-```{r vis1}
-MOp = quantile_norm(MOp)
-MOp = runUMAP(MOp)
-plotByDatasetAndCluster(MOp, axis.labels = c("UMAP1","UMAP2"))
-```
-```{r prep2, message=FALSE, results='hide'}
-MOp2 = createLiger(list(nuclei = "allen_smarter_nuclei.h5"))
-MOp2 = normalize(MOp2)
-MOp2@var.genes = MOp@var.genes
-MOp2 = scaleNotCenter(MOp2)
-```
-```{r fact2, message=FALSE, results='hide'}
-MOp = online_iNMF(MOp, X_new = list(nuclei = MOp2), k = 40, max.epochs = 1)
-```
-```{r vis2}
-MOp = quantile_norm(MOp)
-MOp = runUMAP(MOp)
-plotByDatasetAndCluster(MOp, axis.labels = c("UMAP1","UMAP2"))
-```
-## Scenario 3: projecting new datasets
-```{r prep3, message=FALSE, results='hide'}
-MOp = createLiger(list(cells = "allen_smarter_cells.h5"))
-MOp@var.genes = MOp.vargenes
-MOp = online_iNMF(MOp, k = 40, max.epochs = 1)
-MOp = quantile_norm(MOp)
-MOp = runUMAP(MOp)
-plotByDatasetAndCluster(MOp, axis.labels = c("UMAP1","UMAP2"))
-```
-```{r project, message=FALSE, results='hide'}
-MOp = online_iNMF(MOp, X_new = list(nuclei = MOp2), k = 40, project = TRUE)
-```
-```{r vis3}
-MOp = quantile_norm(MOp)
-MOp = runUMAP(MOp)
-plotByDatasetAndCluster(MOp, axis.labels = c("UMAP1","UMAP2"))
-```
## Scenario 2: iterative refinement by incorporating new datasets

The second scenario of the online learning algorithm is one where we have already performed a factorization on the existing dataset(s), and we then add continually arriving datasets to the pool and update the factorization starting from the existing result.

For example, still with the same PBMC data, we first perform online iNMF on the control dataset alone.

```{r}
pbmcs <- createLiger(list(ctrl = "pbmcs_ctrl_vignette.h5")) %>%
    normalize() %>%
    selectGenes() %>%
    scaleNotCenter() %>%
    runIntegration(method = "online")
```

Then we run online iNMF again, adding the newly obtained dataset as a new arriving dataset. There are a few points to note:

1. There are different ways of specifying new datasets. We accept a named list of raw data: if users work with HDF5-based data all the way, it is best to provide a named list of HDF5 filenames; if users work with in-memory matrices, a named list of matrix objects (dgCMatrix class).
2. New datasets are always regarded as raw count input and will be preprocessed internally. The variable genes identified from the original existing datasets will always be used when producing the scaled data, so users need to make sure they are available in the newly arriving datasets.
3. Alternatively, a [liger object](../reference/liger-class.html) can also be used as the input for the argument `newDatasets`. Users need to be aware of the second point above, because we remove missing features by default, which can cause an error when variable features obtained from the existing datasets are identified as missing while creating the object for the new datasets.
4. For HDF5 filename inputs, it is assumed that the program can find the raw counts data in the 10X CellRanger format. If the HDF5 file stores the raw counts data in a different way (e.g. H5AD data), users will need to create a separate [liger object](../reference/liger-class.html) with the proper specification, and then use the constructed object as the input, while being aware of the 2nd and 3rd points above.
5. After successfully integrating the new datasets with the existing factorization, the returned [liger object](../reference/liger-class.html) will have them inserted.

```{R, results="hide"}
pbmcs.new <- runOnlineINMF(pbmcs, newDatasets = list(stim = "pbmcs_stim_vignette.h5"))
```

After the factorization, we can go through the same procedure to align the factor loadings and jointly cluster the cells, as well as visualize the integration result.

```{r, fig.width=10}
pbmcs.new <- quantileNorm(pbmcs.new) %>%
    runCluster(nNeighbors = 30, resolution = .6, nRandomStarts = 2) %>%
    runUMAP(nNeighbors = 30, minDist = .3)
plotByDatasetAndCluster(pbmcs.new)
```

## Scenario 3: Projecting new datasets

The third scenario serves as a fast inspection of new datasets. We do not update the factorization with the newly arriving data; instead, we compute the factor loadings of the new cells (i.e. the $H$ matrices for the new datasets) using the existing gene loadings in the factors (the $W$ matrix from the previous factorization). Therefore, no dataset-specific gene loadings in the factors ($V$ matrices) are produced for the new datasets in this approach. Scenario 3 can be triggered by specifying the new datasets, following the same guidance as in scenario 2 above, and setting `projection = TRUE`.

```{r, fig.width=10}
pbmcs <- createLiger(list(ctrl = "pbmcs_ctrl_vignette.h5")) %>%
    normalize() %>%
    selectGenes() %>%
    scaleNotCenter() %>%
    runIntegration(method = "online")
pbmcs.new <- runOnlineINMF(pbmcs, newDatasets = list(stim = "pbmcs_stim_vignette.h5"), projection = TRUE)
pbmcs.new <- quantileNorm(pbmcs.new) %>%
    runCluster(nNeighbors = 30, resolution = .6, nRandomStarts = 2) %>%
    runUMAP(nNeighbors = 30, minDist = .3)
plotByDatasetAndCluster(pbmcs.new)
```
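As an optional last step, one hedged suggestion: if your installed version still exports `calcAlignment()` (it was part of the older tutorial removed above, and its availability in rliger2 is an assumption here), the alignment score can quantify how well the projected dataset mixes with the reference, ranging from 0 (no alignment) to 1 (perfect alignment).

```{r, eval=FALSE}
# Guarded call, since the function's presence is not confirmed for rliger2.
if (exists("calcAlignment")) calcAlignment(pbmcs.new)
```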