Skip to content

Commit

Permalink
Update onlineINMF vig, with all bugs related fixed
Browse files Browse the repository at this point in the history
  • Loading branch information
mvfki committed Nov 14, 2023
1 parent 3fdc9e8 commit 3739902
Show file tree
Hide file tree
Showing 15 changed files with 559 additions and 288 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ export(louvainCluster)
export(makeFeatureMatrix)
export(makeInteractTrack)
export(makeRiverplot)
export(mapCellMeta)
export(mergeDenseAll)
export(mergeH5)
export(mergeSparseAll)
Expand Down
49 changes: 49 additions & 0 deletions R/clustering.R
Original file line number Diff line number Diff line change
Expand Up @@ -245,3 +245,52 @@ groupSingletons <- function(
newClusts <- factor(newClusts)
return(newClusts)
}



#' Create new variable from categories in cellMeta
#' @description
#' Designed for fast variable creation when a new variable is going to be
#' created from existing variable. For example, multiple samples can be mapped
#' to the same study design condition, clusters can be mapped to cell types.
#' @param object A \linkS4class{liger} object.
#' @param from The name of the original variable to be mapped from.
#' @param newTo The name of the new variable to store the mapped result. Default
#' \code{NULL} returns the new variable (factor class).
#' @param ... Mapping criteria, argument names are original existing categories
#' in the \code{from} and values are new categories in the new variable.
#' @return When \code{newTo = NULL}, a factor object of the new variable.
#' Otherwise, the input object with variable \code{newTo} updated in
#' \code{cellMeta(object)}.
#' @export
#' @examples
#' pbmc <- mapCellMeta(pbmc, from = "dataset", newTo = "modal",
#' ctrl = "rna", stim = "rna")
mapCellMeta <- function(
object,
from,
newTo = NULL,
...
) {
object <- recordCommand(object)
from <- cellMeta(object, from)
if (!is.factor(from)) stop("`from` must be a factor class variable.")
mapping <- list(...)
fromCats <- names(mapping)
notFound <- fromCats[!fromCats %in% levels(from)]
if (length(notFound) > 0) {
stop("The following categories requested not found: ",
paste0(notFound, collapse = ", "))
}

toCats <- unlist(mapping)
unchangedCats <- levels(from)
unchangedCats <- unchangedCats[!unchangedCats %in% fromCats]
names(unchangedCats) <- unchangedCats
if (length(unchangedCats) > 0) toCats <- c(toCats, unchangedCats)
to <- toCats[as.character(from)]
to <- factor(unname(to), levels = unique(toCats))
if (is.null(newTo)) return(to)
cellMeta(object, newTo) <- to
return(object)
}
10 changes: 7 additions & 3 deletions R/import.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,7 @@
#' \code{NULL}.
#' @param removeMissing Logical. Whether to remove cells that do not have any
#' counts and features not expressed in any cells from each dataset. Default
#' \code{TRUE}. H5 based dataset with less than 8000 cells will be subset into
#' memory.
#' \code{TRUE}.
#' @param addPrefix Logical. Whether to add "<dataset name>_" as a prefix of
#' cell identifiers (e.g. barcodes) to avoid duplicates in multiple libraries (
#' common with 10X data). Default \code{"auto"} detects if matrix columns
Expand All @@ -27,6 +26,10 @@
#' object. Default \code{NULL} uses \code{formatType} preset.
#' @param genesName,barcodesName The path in a H5 file for the gene names and
#' cell barcodes. Default \code{NULL} uses \code{formatType} preset.
#' @param newH5 When using HDF5 based data and subsets created after removing
#' missing cells/features, whether to create new HDF5 files for the subset.
#' Default \code{TRUE}. If \code{FALSE}, data will be subset into memory and
#' can be dangerous for large scale analysis.
#' @param verbose Logical. Whether to show information of the progress. Default
#' \code{getOption("ligerVerbose")} which is \code{TRUE} if users have not set.
#' @param ... Additional slot values that should be directly placed in object.
Expand All @@ -46,6 +49,7 @@ createLiger <- function(
indptrName = NULL,
genesName = NULL,
barcodesName = NULL,
newH5 = TRUE,
verbose = getOption("ligerVerbose"),
...,
# Deprecated coding style
Expand Down Expand Up @@ -135,7 +139,7 @@ createLiger <- function(
obj <- runGeneralQC(obj, verbose = verbose)
if (isTRUE(removeMissing)) {
obj <- removeMissing(obj, "both", filenameSuffix = "qc",
verbose = verbose)
verbose = verbose, newH5 = newH5)
}

return(obj)
Expand Down
136 changes: 97 additions & 39 deletions R/integration.R
Original file line number Diff line number Diff line change
Expand Up @@ -271,10 +271,15 @@ runINMF.liger <- function(
)

object@W <- out$W
rownames(object@W) <- varFeatures(object)
for (d in names(object)) {
ld <- dataset(object, d)
ld@H <- out$H[[d]]
ld@V <- out$V[[d]]
if (isH5Liger(ld)) {
colnames(ld@H) <- colnames(ld)
rownames(ld@V) <- varFeatures(object)
}
datasets(object, check = FALSE)[[d]] <- ld
}
object@uns$factorization <- list(k = k, lambda = lambda)
Expand Down Expand Up @@ -696,31 +701,47 @@ runOnlineINMF.liger <- function(
"datasets. Please run `runOnlineINMF()` without `newDataset` ",
"first.")
}
# Put new datasets into input liger object

newNames <- names(newDatasets)
if (any(newNames %in% names(object))) {
stop("Names of `newDatasets` overlap with existing datasets.")
}
if (is.list(newDatasets)) {
allType <- sapply(newDatasets, function(x) class(x)[1])
if (!all(allType == "dgCMatrix")) {
stop("`newDatasets` must contain only `dgCMatrix`.")
}
# A list of raw data
if (is.null(names(newDatasets))) {
stop("`newDatasets` must be a NAMED list.")
stop("The list of new datasets must be named.")
}
allNewNames <- character()
for (i in seq_along(newDatasets)) {
if (names(newDatasets)[i] %in% names(object)) {
newName <- paste0(names(newDatasets)[i], ".1")
} else newName <- names(newDatasets)[i]
dataset(object, newName) <- newDatasets[[i]]
allNewNames <- c(allNewNames, newName)
if (inherits(newDatasets[[i]], "dgCMatrix")) {
dataset(object, names(newDatasets)[i]) <- newDatasets[[i]]
} else if (is.character(newDatasets[[i]])) {
# Assume is H5 filename
ld <- createH5LigerDataset(newDatasets[[i]])
dataset(object, names(newDatasets[i])) <- ld
} else {
stop("Cannot interpret `newDatasets` element ", i)
}
}
object <- normalize(object, useDatasets = allNewNames,
verbose = verbose)
object <- scaleNotCenter(object, useDatasets = allNewNames,
verbose = verbose)
newDatasets <- getMatrix(object, slot = "scaleData",
dataset = allNewNames, returnList = TRUE)
} else if (inherits(newDatasets, "liger")) {
# A liger object with all new datasets
object <- c(object, newDatasets)
} else {
stop("`newDatasets` must be a named list of dgCMatrix.")
stop("`newDatasets` must be either a named list or a liger object")
}

object <- normalize(object, useDatasets = newNames)
object <- scaleNotCenter(object, useDatasets = newNames)
newDatasets <- list()
for (d in newNames) {
ld <- dataset(object, d)
sd <- scaleData(ld)
if (inherits(sd, "H5D")) {
newDatasets[[d]] <- .H5DToH5Mat(sd)
} else if (inherits(sd, "H5Group")) {
newDatasets[[d]] <- .H5GroupToH5SpMat(sd, c(length(varFeatures(object)), ncol(ld)))
} else {
newDatasets[[d]] <- sd
}
}
}
object <- closeAllH5(object)
Expand All @@ -731,16 +752,35 @@ runOnlineINMF.liger <- function(
HALSiter = HALSiter, verbose = verbose,
WInit = WInit, VInit = VInit, AInit = AInit,
BInit = BInit, seed = seed)

for (i in seq_along(object)) {
ld <- dataset(object, i)
ld@H <- res$H[[i]]
ld@V <- res$V[[i]]
ld@A <- res$A[[i]]
ld@B <- res$B[[i]]
datasets(object, check = FALSE)[[i]] <- ld
if (!isTRUE(projection)) {
# Scenario 1&2, everything updated
for (i in seq_along(object)) {
ld <- dataset(object, i)
ld@H <- res$H[[i]]
ld@V <- res$V[[i]]
ld@A <- res$A[[i]]
ld@B <- res$B[[i]]
if (isH5Liger(ld)) {
colnames(ld@H) <- colnames(ld)
rownames(ld@V) <- varFeatures(object)
rownames(ld@B) <- varFeatures(object)
}
datasets(object, check = FALSE)[[i]] <- ld
}
object@W <- res$W
rownames(object@W) <- varFeatures(object)
} else {
# Scenario 3, only H of newDatasets returned
for (i in seq_along(newDatasets)) {
dname <- names(newDatasets)[i]
ld <- dataset(object, dname)
ld@H <- res$H[[i]]
if (isH5Liger(ld)) {
colnames(ld@H) <- colnames(ld)
}
datasets(object, check = FALSE)[[dname]] <- ld
}
}
object@W <- res$W
object@uns$factorization <- list(k = k, lambda = lambda)
suppressMessages({object <- restoreH5Liger(object)})
return(object)
Expand Down Expand Up @@ -769,6 +809,7 @@ runOnlineINMF.liger <- function(
"devtools::install_github(\"welch-lab/RcppPlanc\")")
nDatasets <- length(object) + length(newDatasets)
barcodeList <- c(lapply(object, colnames), lapply(newDatasets, colnames))
names(barcodeList) <- c(names(object), names(newDatasets))
features <- rownames(object[[1]])
if (!is.null(seed)) set.seed(seed)

Expand All @@ -793,16 +834,27 @@ runOnlineINMF.liger <- function(
Winit = WInit, Ainit = AInit, Binit = BInit,
verbose = verbose)
factorNames <- paste0("Factor_", seq(k))
for (i in seq(nDatasets)) {
res$H[[i]] <- t(res$H[[i]])
dimnames(res$H[[i]]) <- list(factorNames, barcodeList[[i]])
dimnames(res$V[[i]]) <- list(features, factorNames)
dimnames(res$A[[i]]) <- list(factorNames, factorNames)
dimnames(res$B[[i]]) <- list(features, factorNames)
if (isTRUE(projection)) {
# Scenario 3 only got H for new datasets
for (i in seq_along(newDatasets)) {
dname <- names(newDatasets)[i]
res$H[[i]] <- t(res$H[[i]])
dimnames(res$H[[i]]) <- list(factorNames, barcodeList[[dname]])
names(res$H) <- names(newDatasets)
}
} else {
# Scenario 1&2 got everything
for (i in seq(nDatasets)) {
res$H[[i]] <- t(res$H[[i]])
dimnames(res$H[[i]]) <- list(factorNames, barcodeList[[i]])
dimnames(res$V[[i]]) <- list(features, factorNames)
dimnames(res$A[[i]]) <- list(factorNames, factorNames)
dimnames(res$B[[i]]) <- list(features, factorNames)
}
names(res$B) <- names(res$A) <- names(res$V) <- names(res$H) <-
names(barcodeList)
dimnames(res$W) <- list(features, factorNames)
}
names(res$A) <- names(res$B) <- names(res$V) <-
names(res$H) <- c(names(object), names(newDatasets))
dimnames(res$W) <- list(features, factorNames)
return(res)
}

Expand Down Expand Up @@ -1108,10 +1160,16 @@ runUINMF.liger <- function(
ld <- dataset(object, i)
ld@H <- res$H[[i]]
ld@V <- res$V[[i]]
if (!is.null(ld@scaleUnsharedData)) ld@U <- res$U[[i]]
if (!is.null(ld@scaleUnsharedData)) {
ld@U <- res$U[[i]]
rownames(ld@U) <- ld@varUnsharedFeatures
}
colnames(ld@H) <- colnames(ld)
rownames(ld@V) <- varFeatures(object)
datasets(object, check = FALSE)[[i]] <- ld
}
object@W <- res$W
rownames(ld@W) <- varFeatures(object)
object@uns$factorization <- list(k = k, lambda = lambda)
return(object)
}
Expand Down Expand Up @@ -1271,7 +1329,7 @@ quantileNorm.liger <- function(
...
) {
.checkObjVersion(object)
.checkValidFactorResult(object, useDatasets = names(object))
.checkValidFactorResult(object, checkV = FALSE)
if (is.null(reference)) {
# If ref_dataset not given, set the one with the largest number of
# cells as reference.
Expand Down
5 changes: 5 additions & 0 deletions R/liger-class.R
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,11 @@ liger <- setClass(
"dimension of scaleData in dataset ",
"(H5)", d))
}
scaleDataIdx <- scaleData(ld)[["featureIdx"]][]
if (!identical(rownames(ld)[scaleDataIdx], varFeatures(x))) {
return("HDF5 scaled data feature index does not ",
"match variable features")
}
}
}
}
Expand Down
5 changes: 4 additions & 1 deletion R/preprocess.R
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,7 @@ removeMissing <- function(
minFeatures = NULL,
useDatasets = NULL,
filenameSuffix = "removeMissing",
newH5 = TRUE,
verbose = getOption("ligerVerbose"),
...
) {
Expand Down Expand Up @@ -296,7 +297,9 @@ removeMissing <- function(
featureIdx = featureIdx,
cellIdx = cellIdx,
filenameSuffix = filenameSuffix,
verbose = verbose, ...
verbose = verbose,
newH5 = newH5,
...
)
} else {
datasets.new[[d]] <- ld
Expand Down
Loading

0 comments on commit 3739902

Please sign in to comment.