diff --git a/CMakeLists.txt b/CMakeLists.txt index 159969b9..d363dd51 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -13,7 +13,7 @@ set(CMAKE_CXX_FLAGS_RELEASE "") set(CMAKE_VERBOSE_MAKEFILE TRUE) set(srclist ModularityOptimizer.cpp RModularityOptimizer.cpp RcppExports.cpp -data_processing.cpp fast_wilcox.cpp feature_mat.cpp quantile_norm.cpp run_nmf.cpp snn.cpp) +data_processing.cpp fast_wilcox.cpp feature_mat.cpp quantile_norm.cpp snn.cpp) list(TRANSFORM srclist PREPEND "${PROJECT_SOURCE_DIR}/src/") add_library(rliger2 SHARED ${srclist}) set_target_properties(rliger2 PROPERTIES PREFIX "") diff --git a/DESCRIPTION b/DESCRIPTION index 5c25e57e..04fd260c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -27,7 +27,7 @@ Author: Joshua Welch [aut], Robert Lee [ctb], Yichen Wang [aut, cre], Andrew Robbins [ctb] -Maintainer: Chao Gao +Maintainer: Yichen Wang BugReports: https://github.com/welch-lab/liger/issues URL: https://github.com/welch-lab/liger License: GPL-3 | file LICENSE diff --git a/NAMESPACE b/NAMESPACE index 539de6f3..5f648a7e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -11,9 +11,12 @@ S3method(as.ligerDataset,matrixLike) S3method(c,liger) S3method(cbind,ligerDataset) S3method(fortify,liger) -S3method(runBPPINMF,Seurat) -S3method(runBPPINMF,liger) -S3method(runBPPINMF,list) +S3method(runINMF,Seurat) +S3method(runINMF,liger) +S3method(runINMF,list) +S3method(runOnlineINMF,Seurat) +S3method(runOnlineINMF,liger) +S3method(runOnlineINMF,list) S3method(runUINMF,Seurat) S3method(runUINMF,liger) S3method(runUINMF,list) @@ -39,6 +42,7 @@ export(calcDatasetSpecificity) export(calcNormLoadings) export(calcPurity) export(cellMeta) +export(closeAllH5) export(commandDiff) export(commands) export(convertOldLiger) @@ -73,7 +77,6 @@ export(normPeak) export(normalize) export(normalizePeak) export(online_iNMF) -export(optimizeALS) export(optimizeNewData) export(optimizeNewK) export(optimizeNewLambda) @@ -117,14 +120,15 @@ export(removeMissing) export(restoreH5Liger) export(retrieveCellFeature) export(reverseMethData) -export(runBPPINMF) export(runCluster) export(runDoubletFinder) export(runGOEnrich) export(runGSEA) export(runGeneralQC) export(runINMF) +export(runINMF_R) export(runMarkerDEG) +export(runOnlineINMF) export(runPairwiseDEG) export(runPseudoBulkDEG) export(runTSNE) @@ -180,12 +184,11 @@ exportMethods(length) exportMethods(names) exportMethods(normData) exportMethods(normPeak) -exportMethods(optimizeALS) exportMethods(quantileNorm) exportMethods(quantile_norm) exportMethods(rawData) exportMethods(rawPeak) -exportMethods(runINMF) +exportMethods(runINMF_R) exportMethods(scaleData) exportMethods(scaleUnsharedData) exportMethods(show) diff --git a/R/RcppExports.R b/R/RcppExports.R index 22504899..fe87fba9 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -115,10 +115,6 @@ max_factor <- function(H, dims_use, center_cols = FALSE) { .Call(`_rliger2_max_factor`, H, dims_use, center_cols) } -solveNNLS <- function(C, B) { - .Call(`_rliger2_solveNNLS`, C, B) -} - ComputeSNN <- function(nn_ranked, prune) { .Call(`_rliger2_ComputeSNN`, nn_ranked, prune) } diff --git a/R/h5Utility.R b/R/h5Utility.R index 9cb768e1..94eda029 100644 --- a/R/h5Utility.R +++ b/R/h5Utility.R @@ -195,11 +195,14 @@ restoreH5Liger <- function(object, filePath = NULL) { } h5.meta <- h5fileInfo(object) if (is.null(filePath)) filePath <- h5.meta$filename + if (is.null(filePath)) { + stop("No filename identified") + } if (!file.exists(filePath)) { stop("HDF5 file path does not exist:\n", filePath) } - .log("Restoring ligerDataset from: ", filePath) + .log("filename identified: ", filePath) h5file <- hdf5r::H5File$new(filePath, mode = "r+") h5.meta$filename <- h5file$filename pathChecks <- unlist(lapply(h5.meta[4:10], function(x) { @@ -228,16 +231,15 @@ restoreH5Liger <- function(object, filePath = NULL) { rawData(object, check = FALSE) <- h5file[[h5.meta$rawData]] if (!is.null(h5.meta$normData)) normData(object, check = FALSE) <- h5file[[h5.meta$normData]] - if (!is.null(h5.meta$scaleData)) + if (!is.null(h5.meta$scaleData)) { scaleData(object, check = FALSE) <- h5file[[h5.meta$scaleData]] + } methods::validObject(object) } else { # Working for liger object if (!is.null(filePath)) { if (!is.list(filePath) || is.null(names(filePath))) stop("`filePath` has to be a named list for liger object.") - if (!any(names(filePath) %in% names(object))) - stop("names of `filePath` must be found in `names(object)`.") } for (d in names(object)) { if (isH5Liger(object, d)) { @@ -249,7 +251,8 @@ restoreH5Liger <- function(object, filePath = NULL) { filePath[[d]]) else path <- filePath[[d]] } - dataset(object, d, qc = FALSE) <- + .log("Restoring dataset \"", d, "\"") + datasets(object, check = FALSE)[[d]] <- restoreH5Liger(dataset(object, d), filePath[[d]]) } } @@ -270,3 +273,39 @@ restoreH5Liger <- function(object, filePath = NULL) { list(folder = NULL, data = path) } } + +#' Close all links (to HDF5 files) of a liger object +#' @description When need to interact with the data embedded in HDF5 files out +#' of the currect R session, the HDF5 files has to be closed in order to be +#' available to other processes. +#' @param object liger object. +#' @return \code{object} with links closed. +#' @export +closeAllH5 <- function(object) { + if (!isH5Liger(object)) return(object) + for (dn in names(object)) { + if (!isH5Liger(object, dn)) next + ld <- dataset(object, dn) + # if (!is.null(rawData(ld))) rawData(ld)$close() + # if (!is.null(normData(ld))) normData(ld)$close() + # if (!is.null(scaleData(ld))) scaleData(ld)$close() + # if (!is.null(scaleUnsharedData(ld))) scaleUnsharedData(ld)$close() + h5f <- getH5File(object, dn) + h5f$close_all() + } + return(object) +} + +.H5GroupToH5SpMat <- function(obj, dims) { + groupPath <- obj$get_obj_name() + RcppPlanc::H5SpMat(filename = obj$get_filename(), + valuePath = paste0(groupPath, "/data"), + rowindPath = paste0(groupPath, "/indices"), + colptrPath = paste0(groupPath, "/indptr"), + nrow = dims[1], ncol = dims[2]) +} + +.H5DToH5Mat <- function(obj) { + RcppPlanc::H5Mat(filename = obj$get_filename(), + dataPath = obj$get_obj_name()) +} diff --git a/R/import.R b/R/import.R index 4b138db8..05cd5974 100644 --- a/R/import.R +++ b/R/import.R @@ -324,21 +324,24 @@ readLiger <- function( clusterName = "clusters", h5FilePath = NULL, update = TRUE) { - oldObj <- readRDS(filename) - if (!inherits(oldObj, "liger")) + obj <- readRDS(filename) + if (!inherits(obj, "liger")) stop("Object is not of class \"liger\".") - oldVer <- oldObj@version - if (oldVer >= package_version("1.99.0")) return(oldObj) - .log("Older version (", oldVer, ") of liger object detected.") + ver <- obj@version + if (ver >= package_version("1.99.0")) { + if (isH5Liger(obj)) obj <- restoreH5Liger(obj) + return(obj) + } + .log("Older version (", ver, ") of liger object detected.") if (isTRUE(update)) { .log("Updating the object structure to make it compatible ", "with current version (", utils::packageVersion("rliger2"), ")") - return(convertOldLiger(oldObj, dimredName = dimredName, + return(convertOldLiger(obj, dimredName = dimredName, clusterName = clusterName, h5FilePath = h5FilePath)) } else { .log("`update = FALSE` specified. Returning the original object.") - return(oldObj) + return(obj) } } diff --git a/R/liger-class.R b/R/liger-class.R index 355ff205..74247ab6 100644 --- a/R/liger-class.R +++ b/R/liger-class.R @@ -93,7 +93,6 @@ liger <- setClass( for (d in names(x)) { ld <- dataset(x, d) if (!is.null(ld@V)) { - print(all.equal(rownames(ld@V), varFeatures(x))) if (!identical(rownames(ld@V), varFeatures(x))) return(paste("Variable features do not match dimension", "of V matrix in dataset", d)) @@ -105,9 +104,18 @@ liger <- setClass( return(paste("Variable features do not match dimension", "of scaleData in dataset", d)) } else { - if (scaleData(ld)$dims[1] != length(varFeatures(x))) - return(paste("Variable features do not match dimension", - "of scaleData in dataset (H5)", d)) + if (inherits(scaleData(ld), "H5D")) { + if (scaleData(ld)$dims[1] != length(varFeatures(x))) + return(paste("Variable features do not match ", + "dimension of scaleData in dataset ", + "(H5)", d)) + } else if (inherits(scaleData(ld), "H5Group")) { + if (scaleData(ld)[["featureIdx"]]$dims != length(varFeatures(x))) { + return(paste("Variable features do not match ", + "dimension of scaleData in dataset ", + "(H5)", d)) + } + } } } } @@ -116,7 +124,7 @@ liger <- setClass( } .valid.liger <- function(object) { - message("Checking liger object validity") + # message("Checking liger object validity") res <- .checkLigerBarcodes(object) if (!is.null(res)) return(res) res <- .checkLigerVarFeature(object) @@ -372,7 +380,7 @@ setGeneric("dataset", function(x, dataset = NULL) standardGeneric("dataset")) #' @export #' @rdname liger-class -setGeneric("dataset<-", function(x, dataset, type = NULL, qc = TRUE, value) { +setGeneric("dataset<-", function(x, dataset, type = NULL, qc = FALSE, value) { standardGeneric("dataset<-") }) @@ -404,6 +412,56 @@ setMethod("dataset", signature(x = "liger", dataset = "numeric"), datasets(x)[[dataset]] }) +.mergeCellMeta <- function(cm1, cm2) { + newDF <- S4Vectors::DataFrame(row.names = c(rownames(cm1), rownames(cm2))) + for (var in names(cm1)) { + value <- cm1[[var]] + if (var %in% names(cm2)) { + # TODO: check type + tryCatch( + expr = { + if (is.null(dim(value))) { + value <- c(value, cm2[[var]]) + } else { + value <- rbind(value, cm2[[var]]) + } + }, + finally = { + cm2Idx <- seq(nrow(cm1) + 1, nrow(cm1) + nrow(cm2)) + if (is.null(dim(value))) value[cm2Idx] <- NA + else { + empty <- matrix(NA, nrow = nrow(cm2), ncol = ncol(value)) + value <- rbind(value, empty) + } + } + ) + } else { + cm2Idx <- seq(nrow(cm1) + 1, nrow(cm1) + nrow(cm2)) + if (is.null(dim(value))) value[cm2Idx] <- NA + else { + empty <- matrix(NA, nrow = nrow(cm2), ncol = ncol(value)) + value <- rbind(value, empty) + } + } + newDF[[var]] <- value + } + for (var in names(cm2)[!names(cm2) %in% names(cm1)]) { + value <- cm2[[var]] + if (is.null(dim(value))) { + if (is.factor(value)) { + value <- factor(c(rep(NA, nrow(cm1)), value), + levels = levels(value)) + } else { + value <- c(rep(NA, nrow(cm1)), value) + } + } else { + empty <- matrix(NA, nrow = nrow(cm1), ncol = ncol(value)) + value <- rbind(empty, value) + } + newDF[[var]] <- value + } + return(newDF) +} .expandDataFrame <- function(df, idx) { dfList <- as.list(df) dfList <- lapply(dfList, function(x, idx) { @@ -428,19 +486,20 @@ setMethod("dataset", signature(x = "liger", dataset = "numeric"), setReplaceMethod("dataset", signature(x = "liger", dataset = "character", type = "missing", qc = "ANY", value = "ligerDataset"), - function(x, dataset, type = NULL, qc = TRUE, value) { + function(x, dataset, type = NULL, qc = FALSE, value) { if (dataset %in% names(x)) { dataset(x, dataset) <- NULL } new.idx <- seq(ncol(x) + 1, ncol(x) + ncol(value)) x@datasets[[dataset]] <- value - # TODO also add rows to cellMeta and H.norm cm <- x@cellMeta remainingRowname <- rownames(cm) cm <- .expandDataFrame(cm, new.idx) rownames(cm) <- c(remainingRowname, colnames(value)) + levels(cm$dataset) <- c(levels(cm$dataset), dataset) cm$dataset[new.idx] <- dataset + cm$barcode[new.idx] <- colnames(value) x@cellMeta <- cm # x@W is genes x k, no need to worry if (!is.null(x@H.norm)) { @@ -462,11 +521,12 @@ setReplaceMethod("dataset", signature(x = "liger", dataset = "character", value = "matrixLike"), function(x, dataset, type = c("rawData", "normData", "scaleData"), - qc = TRUE, + qc = FALSE, value) { type <- match.arg(type) if (type == "rawData") { ld <- createLigerDataset(rawData = value) + colnames(ld) <- paste0(dataset, "_", colnames(ld)) } else if (type == "normData") { ld <- createLigerDataset(normData = value) } else if (type == "scaleData") { diff --git a/R/ligerDataset-class.R b/R/ligerDataset-class.R index 06e55430..c427326a 100644 --- a/R/ligerDataset-class.R +++ b/R/ligerDataset-class.R @@ -5,9 +5,10 @@ setClassUnion("matrixLike_OR_NULL", c("matrixLike", "NULL")) # It is quite hard to handle "H5D here, which is indeed defined as an R6 class. # I'm not sure if this is a proper solution setOldClass("H5D") +setOldClass("H5Group") suppressWarnings(setClassUnion("dgCMatrix_OR_H5D_OR_NULL", c("dgCMatrix", "H5D", "NULL"))) setClassUnion("matrix_OR_H5D_OR_NULL", c("matrix", "H5D", "NULL")) -setClassUnion("matrixLike_OR_H5D_OR_NULL", c("matrixLike", "H5D", "NULL")) +setClassUnion("matrixLike_OR_H5D_OR_H5Group_OR_NULL", c("matrixLike", "H5D", "H5Group", "NULL")) setClassUnion("index", members = c("logical", "numeric", "character")) @@ -39,8 +40,8 @@ ligerDataset <- setClass( representation( rawData = "dgCMatrix_OR_H5D_OR_NULL", normData = "dgCMatrix_OR_H5D_OR_NULL", - scaleData = "matrixLike_OR_H5D_OR_NULL", - scaleUnsharedData = "matrixLike_OR_H5D_OR_NULL", + scaleData = "matrixLike_OR_H5D_OR_H5Group_OR_NULL", + scaleUnsharedData = "matrixLike_OR_H5D_OR_H5Group_OR_NULL", varUnsharedFeatures = "character", H = "matrix_OR_NULL", V = "matrix_OR_NULL", @@ -182,10 +183,10 @@ modalOf <- function(object) { .valid.ligerDataset <- function(object) { if (isH5Liger(object)) { - message("Checking h5 ligerDataset validity") + # message("Checking h5 ligerDataset validity") .checkH5LigerDatasetLink(object) } else { - message("Checking in memory ligerDataset validity") + # message("Checking in memory ligerDataset validity") .checkLigerDatasetBarcodes(object) } # TODO more checks @@ -276,8 +277,18 @@ setMethod( cat(paste0(slot, ":"), nrow(data), "features\n") } if (inherits(data, "H5D")) { - cat(paste0(slot, ":"), paste(data$dims, collapse = " x "), - "values in H5D object\n") + if (length(data$dims) == 1) { + cat(paste0(slot, ":"), data$dims, + "non-zero values in H5D object\n") + } else { + cat(paste0(slot, ":"), + paste(data$dims, collapse = " x "), + "values in H5D object\n") + } + } + if (inherits(data, "H5Group")) { + cat(paste0(slot, ":"), data[["data"]]$dims, + "non-zero values in H5Group object\n") } } } @@ -583,6 +594,20 @@ setReplaceMethod( } ) +#' @export +#' @rdname ligerDataset-class +setReplaceMethod( + "scaleData", + signature(x = "ligerDataset", check = "ANY", value = "H5Group"), + function(x, check = TRUE, value) { + if (!isH5Liger(x)) + stop("Cannot replace slot with on-disk data for in-memory object.") + x@scaleData <- value + if (isTRUE(check)) methods::validObject(x) + x + } +) + #' @export #' @rdname ligerDataset-class setGeneric( @@ -650,6 +675,20 @@ setReplaceMethod( } ) +#' @export +#' @rdname ligerDataset-class +setReplaceMethod( + "scaleUnsharedData", + signature(x = "ligerDataset", check = "ANY", value = "H5Group"), + function(x, check = TRUE, value) { + if (!isH5Liger(x)) + stop("Cannot replace slot with on-disk data for in-memory object.") + x@scaleUnsharedData <- value + if (isTRUE(check)) methods::validObject(x) + x + } +) + #' @export #' @rdname ligerDataset-class setGeneric( diff --git a/R/optimizeUANLS.R b/R/optimizeUANLS.R deleted file mode 100644 index b01d1631..00000000 --- a/R/optimizeUANLS.R +++ /dev/null @@ -1,289 +0,0 @@ -#' Perform iNMF on scaled datasets, and include unshared features -#' @param object \linkS4class{liger} object. Should run -#' \code{\link{selectGenes}} with \code{unshared = TRUE} and then run -#' \code{\link{scaleNotCenter}} in advance. -#' @param k Integer, inner dimension of factorization (number of factors). -#' Default \code{30}. -#' @param lambda Numeric, the lambda penalty. Default \code{5}. -#' @param thresh Numeric, convergence threshold. Convergence occurs when -#' \eqn{|obj_0-obj|/(mean(obj_0,obj)) < thresh}. Default \code{1e-10}. -#' @param maxIter Maximum number of block coordinate descent iterations to -#' perform. Default \code{30}. -#' @param nrep Number of restarts to perform (iNMF objective function is -#' non-convex, so taking the best objective from multiple successive -#' initializations is recommended). For easier reproducibility, this increments -#' the random seed by 1 for each consecutive restart, so future factorizations -#' of the same dataset can be run with one rep if necessary. Default \code{1}. -#' @param seed Random seed to allow reproducible results. Default \code{1}. -#' @param verbose Logical. Whether to show information of the progress. Default -#' \code{getOption("ligerVerbose")} which is \code{TRUE} if users have not set. -#' @noRd -optimizeUANLS <- function( - object, - k = 30, - lambda = 5, - maxIter = 30, - nrep = 1, - thresh = 1e-10, - seed = 1, - verbose = getOption("ligerVerbose") -) { - set.seed(seed) - if (isTRUE(verbose)) - .log('Performing Factorization using UINMF and unshared features') - if (length(lambda) == 1) - lambda <- rep(lambda, length(object)) - else if (length(lambda) != length(object)) - stop("Vectorized lambda should have one value for each dataset.") - - # Get a list of all the matrices - # scaleData in mlist: gene x cell - mlist <- getMatrix(object, "scaleData", returnList = TRUE) - mlist <- lapply(mlist, as.matrix) - xdim <- lapply(mlist, dim) - - # Return what datasets have unshared features, - # and the dimensions of those unshared features - ulist <- getMatrix(object, "scaleUnsharedData", returnList = TRUE) - ulist <- lapply(ulist, as.matrix) - udim <- lapply(ulist, dim) - unshared <- which(sapply(ulist, function(x) !is.null(x))) - max_feats <- max(unlist(lapply(ulist, nrow))) - - # For every set of additional features less than the maximum, - # append an additional zero matrix s.t. it matches the maximum - for (i in seq_along(object)) { - if (i %in% unshared) - # X: (g+u) x c - mlist[[i]] <- rbind(mlist[[i]], ulist[[i]]) - } - - X <- mlist - - # Create an 0 matrix the size of U for all U's, s.t. it can be stacked to W - zero_matrix_u_full <- c() - zero_matrix_u_partial <- c() - for (i in seq_along(object)) { - if (i %in% unshared) { - # u x c - zero_matrix_u_full[[i]] <- - matrix(0, nrow = udim[[i]][1], ncol = udim[[i]][2]) - # u x k - zero_matrix_u_partial[[i]] <- - matrix(0, nrow = udim[[i]][1], ncol = k) - } - } - - nCells <- sapply(X, ncol) - nGenes <- length(varFeatures(object)) - bestObj <- Inf - - for (i in seq(nrep)) { - current <- seed + i - 1 - # initialization - idX <- list() - - for (i in seq_along(X)) idX[[i]] <- sample(nCells[i], k) - - # Establish V from only the RNA dimensions - # V matrices: g x k - V <- list() - for (i in seq_along(X)) - V[[i]] <- as.matrix(scaleData(object, i)[, idX[[i]]]) - # Establish W from the shared gene dimensions - # W matrices: g x k - W <- matrix(stats::runif(nGenes*k, 0, 2), nGenes, k) - H <- list() - # Initialize U - U <- list() - for (i in seq_along(X)) { - if (i %in% unshared) { - # u x k - U[[i]] <- ulist[[i]][, idX[[i]]] - } - } - - iter <- 0 - total_time <- 0 - if (isTRUE(verbose)) - pb <- utils::txtProgressBar(min = 0, max = maxIter, style = 3) - sqrtLambda <- lapply(lambda, sqrt) - - # Initial Training Objects - - obj_train_approximation <- 0 - obj_train_penalty <- 0 - - for (i in seq_along(X)) { - # H: k x c - H[[i]] = matrix(stats::runif(k * nCells[i], 0, 2), k, nCells[i]) - if (i %in% unshared) { - obj_train_approximation <- obj_train_approximation + - norm(X[[i]] - (rbind(W, zero_matrix_u_partial[[i]]) + - rbind(V[[i]], U[[i]])) %*% H[[i]], - "F") ^ 2 - obj_train_penalty <- obj_train_penalty + - lambda[[i]] * norm(rbind(V[[i]], U[[i]]) %*% H[[i]], - "F") ^ 2 - } else { - obj_train_approximation <- obj_train_approximation + - norm(X[[i]] - (W + V[[i]]) %*% H[[i]], "F") ^ 2 - obj_train_penalty <- obj_train_penalty + - lambda[[i]] * norm(V[[i]] %*% H[[i]], "F") ^ 2 - - } - } - obj_train <- obj_train_approximation + obj_train_penalty - - #### Initialize Object Complete - #### Begin Updates - delta <- Inf - iter <- 1 - while (delta > thresh & iter <= maxIter) { - iter_start_time <- Sys.time() - - # H - Updates - # H: k x c - for (i in seq_along(X)) { - if (!(i %in% unshared)) { - H[[i]] <- solveNNLS( - rbind(W + V[[i]], sqrtLambda[[i]] * V[[i]]), - rbind(X[[i]], matrix(0, nGenes, xdim[[i]][2])) - ) - } else { - H[[i]] = solveNNLS( - rbind( - rbind(W, zero_matrix_u_partial[[i]]) + - rbind(V[[i]], U[[i]]), - sqrtLambda[[i]]*rbind(V[[i]], U[[i]]) - ), - rbind(X[[i]], - matrix(0, nGenes + udim[[i]][1], xdim[[i]][2])) - ) - } - } - - # V - updates - for (i in seq_along(X)) { - V[[i]] = t(solveNNLS( - rbind(t(H[[i]]), sqrtLambda[[i]] * t(H[[i]])), - rbind( - t(X[[i]][seq(nGenes), ] - W %*% H[[i]]), - matrix(0, nCells[i], nGenes) - ) - )) - } - - # U - updates - # U: u x k - for (i in seq_along(X)) { - if (i %in% unshared) { - U[[i]] = t(solveNNLS( - rbind(t(H[[i]]), sqrtLambda[[i]] * t(H[[i]])), - rbind( - t(X[[i]][seq(nGenes + 1, udim[[i]][1] + nGenes),]), - t(zero_matrix_u_full[[i]]) - ) - )) - } - } - - - # W - updates - # H_t_stack: C x k - # ("C" for all cells, "c" for number of cells in each dataset) - H_t_stack <- t(Reduce(cbind, H)) - # diff_stack_w: C x g - diff_stack_w = c() - for (i in seq_along(X)) { - diff_stack_w = cbind( - diff_stack_w, - X[[i]][seq(nGenes), ] - V[[i]] %*% H[[i]] - ) - } - diff_stack_w <- t(diff_stack_w) - # W: g x k - W = t(solveNNLS(H_t_stack, diff_stack_w)) - - ## End of updates - iter_end_time <- Sys.time() - iter_time <- as.numeric(difftime(iter_end_time, iter_start_time, - units = "secs")) - total_time <- total_time + iter_time - - #Updating training object - obj_train_prev <- obj_train - obj_train_approximation <- 0 - obj_train_penalty <- 0 - - for (i in seq_along(X)) { - if (i %in% unshared) { - obj_train_approximation <- obj_train_approximation + - norm( - X[[i]] - (rbind(W, zero_matrix_u_partial[[i]]) + - rbind(V[[i]], U[[i]])) %*% H[[i]], - "F") ^ 2 - obj_train_penalty <- obj_train_penalty + lambda[[i]] * - norm(rbind(V[[i]], U[[i]]) %*% H[[i]], "F") ^ 2 - } - else { - obj_train_approximation <- obj_train_approximation + - norm(X[[i]] - (W + V[[i]]) %*% H[[i]], "F") ^ 2 - obj_train_penalty = obj_train_penalty + lambda[[i]] * - norm(V[[i]] %*% H[[i]], "F") ^ 2 - } - } - - obj_train <- obj_train_approximation + obj_train_penalty - delta <- abs(obj_train_prev - obj_train) / - mean(c(obj_train_prev, obj_train)) - iter <- iter + 1 - if (isTRUE(verbose)) utils::setTxtProgressBar(pb = pb, value = iter) - } - if (isTRUE(verbose)) { - utils::setTxtProgressBar(pb = pb, value = maxIter) - cat("\n") - .log("Current seed: ", current, "; Current objective: ", obj_train) - } - if (obj_train < bestObj) { - W_m <- W - H_m <- H - V_m <- V - U_m <- U - bestObj <- obj_train - best_seed <- current - } - } - - factorNames <- paste0("factor_", seq(k)) - - rownames(W_m) <- varFeatures(object) - colnames(W_m) <- factorNames - object@W <- W_m - - for (i in seq_along(object)) { - ld <- dataset(object, i) - - rownames(V_m[[i]]) <- varFeatures(object) - colnames(V_m[[i]]) <- factorNames - ld@V <- V_m[[i]] - - rownames(H_m[[i]]) <- factorNames - colnames(H_m[[i]]) <- colnames(X[[i]]) - ld@H <- H_m[[i]] - - if (i %in% unshared) { - rownames(U_m[[i]]) <- ld@varUnsharedFeatures - colnames(U_m[[i]]) <- factorNames - ld@U <- U_m[[i]] - } - datasets(object, check = FALSE)[[i]] <- ld - } - param <- list(k = k, lambda = lambda) - object@uns$factorization <- param - - if (isTRUE(verbose)) - .log("Objective: ", bestObj, "\nBest results with seed: ", best_seed) - - return(object) -} diff --git a/R/runBPPINMF.R b/R/runBPPINMF.R deleted file mode 100644 index 87d1fe93..00000000 --- a/R/runBPPINMF.R +++ /dev/null @@ -1,311 +0,0 @@ -#' Perform iNMF on scaled datasets -#' @description -#' Performs integrative non-negative matrix factorization (iNMF) (J.D. Welch, -#' 2019) to return factorized \eqn{H}, \eqn{W}, and \eqn{V} matrices. The -#' objective function is stated as -#' -#' \deqn{\arg\min_{H\ge0,W\ge0,V\ge0}\sum_{i}^{d}||E_i-(W+V_i)Hi||^2_F+\lambda\sum_{i}^{d}||V_iH_i||_F^2} -#' -#' where \eqn{E_i} is the input non-negative matrix of the i'th dataset, \eqn{d} -#' is the total number of datasets. -#' -#' The factorization produces a shared \eqn{W} matrix (genes by k), and for each -#' dataset, an \eqn{H} matrix (k by cells) and a \eqn{V} matrix (genes by k). -#' The \eqn{H} matrices represent the cell factor loadings. \eqn{W} is held -#' consistent among all datasets, as it represents the shared components of the -#' metagenes across datasets. The \eqn{V} matrices represent the -#' dataset-specific components of the metagenes. -#' -#' This function adopts highly optimized fast and memory efficient -#' implementation extended from Planc (Kannan, 2016). Pre-installation of -#' extension package \code{RcppPlanc} is required. The underlying algorithm -#' adopts the identical ANLS strategy as \code{\link{optimizeALS}} in the old -#' version of LIGER. -#' @param object A \linkS4class{liger} object, a Seurat object or a named list -#' of matrix, dgCMatrix, H5D objects, where the names represents dataset names -#' and matrices are scaled on the same set of variable features, with rows as -#' features and columns as cells. -#' @param k Inner dimension of factorization (number of factors). Run -#' \code{\link{suggestK}} to determine appropriate value; a general rule of -#' thumb is that a higher \code{k} will be needed for datasets with more -#' sub-structure. -#' @param lambda Regularization parameter. Larger values penalize -#' dataset-specific effects more strongly (i.e. alignment should increase as -#' \code{lambda} increases). Default \code{5}. -#' @param nIteration Total number of block coordinate descent iterations to -#' perform. Default \code{30}. -#' @param nRandomStarts Number of restarts to perform (iNMF objective function -#' is non-convex, so taking the best objective from multiple successive -#' initialization is recommended). For easier reproducibility, this increments -#' the random seed by 1 for each consecutive restart, so future factorization -#' of the same dataset can be run with one rep if necessary. Default \code{1}. -#' @param HInit Initial values to use for \eqn{H} matrices. A list object where -#' each element is the initial \eqn{H} matrix of each dataset. Default -#' \code{NULL}. -#' @param WInit Initial values to use for \eqn{W} matrix. A matrix object. -#' Default \code{NULL}. -#' @param VInit Initial values to use for \eqn{V} matrices. A list object where -#' each element is the initial \eqn{V} matrix of each dataset. Default -#' \code{NULL}. -#' @param seed Random seed to allow reproducible results. Default \code{1}. -#' @param verbose Logical. Whether to show information of the progress. Default -#' \code{getOption("ligerVerbose")} which is \code{TRUE} if users have not set. -#' @param ... Arguments passed to methods. -#' @rdname runBPPINMF -#' @export -#' @examples -#' pbmc <- normalize(pbmc) -#' pbmc <- selectGenes(pbmc) -#' pbmc <- scaleNotCenter(pbmc) -#' pbmc <- runBPPINMF(pbmc, k = 20) -runBPPINMF <- function( - object, - k, - lambda = 5.0, - nIteration = 30, - nRandomStarts = 1, - HInit = NULL, - WInit = NULL, - VInit = NULL, - seed = 1, - verbose = getOption("ligerVerbose"), - ... -) { - UseMethod("runBPPINMF", object) -} - -#' @rdname runBPPINMF -#' @export -#' @param readH5 \code{TRUE} to force reading H5 based data into memory and -#' conduct factorization. \code{"auto"} reads H5 dataset with less than 8000 -#' cells. \code{FALSE} will stop users from running if H5 data presents. -#' @method runBPPINMF liger -#' @return The liger method returns the input \linkS4class{liger} object with -#' factorization result updated. A list of all \eqn{H} matrices can be accessed -#' with \code{getMatrix(object, "H")}, a list of all \eqn{V} matrices can be -#' accessed with \code{getMatrix(object, "V")}, and the \eqn{W} matrix can be -#' accessed with \code{getMatrix(object, "W")}. -runBPPINMF.liger <- function( - object, - k, - lambda = 5.0, - nIteration = 30, - nRandomStarts = 1, - HInit = NULL, - WInit = NULL, - VInit = NULL, - seed = 1, - verbose = getOption("ligerVerbose"), - readH5 = "auto", - ... -) { - .checkObjVersion(object) - object <- recordCommand(object, dependencies = "RcppPlanc") - object <- removeMissing(object, orient = "cell", verbose = verbose) - data <- lapply(datasets(object), function(ld) { - if (is.null(scaleData(ld))) - stop("Scaled data not available. ", - "Run `scaleNotCenter(object)` first") - return(scaleData(ld)) - }) - dataClasses <- sapply(data, function(x) class(x)[1]) - if (!all(dataClasses == dataClasses[1])) { - stop("Currently the scaledData of all datasets have to be of the same class.") - } - out <- runBPPINMF.list( - object = data, - k = k, - lambda = lambda, - nIteration = nIteration, - nRandomStarts = nRandomStarts, - HInit = HInit, - WInit = WInit, - VInit = VInit, - seed = seed, - verbose = verbose, - barcodeList = lapply(datasets(object), colnames), - features = varFeatures(object) - ) - - object@W <- out$W - for (d in names(object)) { - ld <- dataset(object, d) - ld@H <- out$H[[d]] - ld@V <- out$V[[d]] - datasets(object, check = FALSE)[[d]] <- ld - } - object@uns$factorization$k <- k - object@uns$factorization$lambda <- lambda - return(object) -} - -#' @rdname runBPPINMF -#' @export -#' @param barcodeList List object of barcodes for each datasets, for setting -#' dimnames of output \eqn{H} matrices. Default \code{NULL} uses \code{colnames} -#' of matrices in the \code{object}. -#' @param features Character vector of feature names, for setting dimnames of -#' output \eqn{V} and \eqn{W} matrices. Default \code{NULL} uses \code{rownames} -#' of matrices in the \code{object}. -#' @return The list method returns a list of entries \code{H}, \code{V} and -#' \code{W}. \code{H} is a list of \eqn{H} matrices for each dataset. \code{V} -#' is a list of \eqn{V} matrices for each dataset. \code{W} is the shared -#' \eqn{W} matrix. -#' @method runBPPINMF list -runBPPINMF.list <- function( - object, - k, - lambda = 5.0, - nIteration = 30, - nRandomStarts = 1, - HInit = NULL, - WInit = NULL, - VInit = NULL, - seed = 1, - verbose = getOption("ligerVerbose"), - barcodeList = NULL, - features = NULL, - ... -) { - if (!requireNamespace("RcppPlanc", quietly = TRUE)) - stop("RcppPlanc installation required. Currently, please get the ", - "GitHub private repository access from the lab and run: \n", - "devtools::install_github(\"welch-lab/RcppPlanc\")") - - if (is.null(barcodeList)) { - barcodeList <- lapply(object, colnames) - } else if (!identical(sapply(barcodeList, length), sapply(object, ncol))) { - stop("Given `barcodeList` cannot match to columns of all matrices.") - } - - if (is.null(features)) features <- rownames(object[[1]]) - else { - if (!identical(length(features), unique(sapply(object, nrow)))) { - stop("Either the length of `features` does not match with the ", - "rows of all matrices, or the matrices have varied numbers ", - "of rows.") - } - } - - nCells <- sapply(object, ncol) - nGenes <- nrow(object[[1]]) - nDatasets <- length(object) - datasetNames <- names(object) - if (k >= nGenes) { - stop("Select k lower than the number of variable genes: ", nGenes) - } - Wm <- Vm <- Hm <- NULL - bestObj <- Inf - bestSeed <- seed - for (i in seq(nRandomStarts)) { - if (isTRUE(verbose) && nRandomStarts > 1) { - .log("Replicate run ", i, "...") - } - set.seed(seed = seed + i - 1) - if (!is.null(WInit)) - W <- t(.checkInit(WInit, nCells, nGenes, k, "W")) - else W <- t(matrix(stats::runif(nGenes * k, 0, 2), k, nGenes)) - if (!is.null(VInit)) { - V <- .checkInit(VInit, nCells, nGenes, k, "V") - V <- lapply(V, t) - } else { - V <- lapply(seq(nDatasets), function(i) { - t(matrix(stats::runif(nGenes * k, 0, 2), k, nGenes))}) - } - if (!is.null(HInit)) { - H <- .checkInit(HInit, nCells, nGenes, k, "H") - H <- lapply(H, t) - } else { - H <- lapply(nCells, function(n) { - matrix(stats::runif(n * k, 0, 2), n, k) - }) - } - if (inherits(object[[1]], "H5D")) { - # RcppPlanc::bppinmf_h5dense() - stop("TODO: Push Yichen to test bppinmf_h5sparse/bppinmf_h5dense!") - } else { - out <- RcppPlanc::bppinmf( - objectList = object, k = k, lambda = lambda, niter = nIteration, - verbose = verbose) - } - if (out$objErr < bestObj) { - Wm <- out$W - Hm <- out$H - Vm <- out$V - bestObj <- out$objErr - bestSeed <- seed + i - 1 - } - } - if (isTRUE(verbose) && nRandomStarts > 1) { - .log("Best objective error: ", bestObj, "\nBest seed: ", bestSeed) - } - factorNames <- paste0("Factor_", seq(k)) - for (i in seq(nDatasets)) { - Hm[[i]] <- t(Hm[[i]]) - colnames(Hm[[i]]) <- barcodeList[[i]] - rownames(Hm[[i]]) <- factorNames - rownames(Vm[[i]]) <- features - colnames(Vm[[i]]) <- factorNames - } - names(Vm) <- names(Hm) <- names(object) - rownames(Wm) <- features - colnames(Wm) <- factorNames - return(list(H = Hm, V = Vm, W = Wm)) -} - -#' @rdname runBPPINMF -#' @export -#' @param datasetVar Variable name in metadata indicating a factor of dataset -#' belonging, or directly a factor that match with the number of cells. -#' @return The Seurat method returns a list of entries \code{H}, \code{V} and -#' \code{W}. \code{H} is a list of \eqn{H} matrices for each dataset. \code{V} -#' is a list of \eqn{V} matrices for each dataset. \code{W} is the shared -#' \eqn{W} matrix. -#' @method runBPPINMF Seurat -runBPPINMF.Seurat <- function( - object, - datasetVar, - k, - lambda = 5.0, - nIteration = 30, - nRandomStarts = 1, - HInit = NULL, - WInit = NULL, - VInit = NULL, - seed = 1, - verbose = getOption("ligerVerbose"), - ... -) { - if (!requireNamespace("Seurat", quietly = TRUE)) { - stop("Seurat installation required. Please run\n", - "install.packages(\"Seurat\")") - } - EBind <- Seurat::GetAssayData(object, "scale.data") - if (any(EBind < 0)) { - stop("Non-negative Matrix Factorization requires non-negative data. ", - "Please scale the library-size-normalized data without centering.") - } - if (is.character(datasetVar) && length(datasetVar) == 1) { - datasetVar <- object[[datasetVar]][[1]] - } - if (!is.factor(datasetVar) || length(datasetVar) != ncol(EBind)) { - stop("Invalid `datasetVar`. Please see `?runBPPINMF` for instruction.") - } - datasetVar <- droplevels(datasetVar) - Es <- lapply(levels(datasetVar), function(d) { - as(EBind[, datasetVar == d], "CsparseMatrix") - }) - names(Es) <- levels(datasetVar) - runBPPINMF.list( - object = Es, - k = k, - lambda = lambda, - nIteration = nIteration, - nRandomStarts = nRandomStarts, - HInit = HInit, - WInit = WInit, - VInit = VInit, - seed = seed, - verbose = verbose - ) -} diff --git a/R/runINMF.R b/R/runINMF.R index 5373e27c..d1e5b169 100644 --- a/R/runINMF.R +++ b/R/runINMF.R @@ -1,413 +1,272 @@ #' Perform iNMF on scaled datasets #' @description -#' Performs integrative non-negative matrix (iNMF) factorization to return -#' factorized \eqn{H}, \eqn{W}, and \eqn{V} matrices. It optimizes the iNMF -#' objective function using block coordinate descent (alternating non-negative -#' least squares), where the number of factors is set by \code{k}. TODO: include -#' objective function equation here in documentation (using deqn) +#' Performs integrative non-negative matrix factorization (iNMF) (J.D. Welch, +#' 2019) to return factorized \eqn{H}, \eqn{W}, and \eqn{V} matrices. The +#' objective function is stated as #' -#' For each dataset, this factorization produces an \eqn{H} matrix (cells by k), -#' a \eqn{V} matrix (k by genes), and a shared \eqn{W} matrix (k by genes). The -#' \eqn{H} matrices represent the cell factor loadings. \eqn{W} is held +#' \deqn{\arg\min_{H\ge0,W\ge0,V\ge0}\sum_{i}^{d}||E_i-(W+V_i)Hi||^2_F+\lambda\sum_{i}^{d}||V_iH_i||_F^2} +#' +#' where \eqn{E_i} is the input non-negative matrix of the i'th dataset, \eqn{d} +#' is the total number of datasets. \eqn{E_i} is of size \eqn{m \times n_i} for +#' \eqn{m} variable genes and \eqn{n_i} cells, \eqn{H_i} is of size +#' \eqn{n_i \times k}, \eqn{V_i} is of size \eqn{m \times k}, and \eqn{W} is of +#' size \eqn{m \times k}. +#' +#' The factorization produces a shared \eqn{W} matrix (genes by k), and for each +#' dataset, an \eqn{H} matrix (k by cells) and a \eqn{V} matrix (genes by k). +#' The \eqn{H} matrices represent the cell factor loadings. \eqn{W} is held #' consistent among all datasets, as it represents the shared components of the #' metagenes across datasets. The \eqn{V} matrices represent the #' dataset-specific components of the metagenes. -#' @param object A \linkS4class{liger} object or a named list of matrix object, -#' where the names represents dataset names and matrices are scaled on the same -#' set of variable features, with rows as features and columns as cells. +#' +#' This function adopts highly optimized fast and memory efficient +#' implementation extended from Planc (Kannan, 2016). Pre-installation of +#' extension package \code{RcppPlanc} is required. The underlying algorithm +#' adopts the identical ANLS strategy as \code{\link{optimizeALS}} in the old +#' version of LIGER. +#' @param object A \linkS4class{liger} object, a Seurat object or a named list +#' of matrix, dgCMatrix, H5D objects, where the names represents dataset names +#' and matrices are scaled on the same set of variable features, with rows as +#' features and columns as cells. #' @param k Inner dimension of factorization (number of factors). Run #' \code{\link{suggestK}} to determine appropriate value; a general rule of #' thumb is that a higher \code{k} will be needed for datasets with more -#' sub-structure. +#' sub-structure. Default \code{20}. #' @param lambda Regularization parameter. Larger values penalize #' dataset-specific effects more strongly (i.e. alignment should increase as #' \code{lambda} increases). Default \code{5}. -#' @param thresh Convergence threshold. Convergence occurs when -#' \eqn{|obj_0-obj|/(mean(obj_0,obj)) < thresh}. Default \code{1e-6}. -#' @param maxIter Maximum number of block coordinate descent iterations to +#' @param nIteration Total number of block coordinate descent iterations to #' perform. Default \code{30}. -#' @param nrep Number of restarts to perform (iNMF objective function is -#' non-convex, so taking the best objective from multiple successive +#' @param nRandomStarts Number of restarts to perform (iNMF objective function +#' is non-convex, so taking the best objective from multiple successive #' initialization is recommended). For easier reproducibility, this increments #' the random seed by 1 for each consecutive restart, so future factorization #' of the same dataset can be run with one rep if necessary. Default \code{1}. -#' @param H.init Initial values to use for \eqn{H} matrices. A list object where +#' @param HInit Initial values to use for \eqn{H} matrices. A list object where #' each element is the initial \eqn{H} matrix of each dataset. Default #' \code{NULL}. -#' @param W.init Initial values to use for \eqn{W} matrix. A matrix object. +#' @param WInit Initial values to use for \eqn{W} matrix. A matrix object. #' Default \code{NULL}. -#' @param V.init Initial values to use for \eqn{V} matrices. A list object where +#' @param VInit Initial values to use for \eqn{V} matrices. A list object where #' each element is the initial \eqn{V} matrix of each dataset. Default #' \code{NULL}. -#' @param method NNLS subproblem solver. Choose from \code{"liger"} (default -#' original implementation), \code{"planc"} or \code{"rcppml"}. -#' @param useUnshared Logical, whether to include unshared variable features and -#' run optimizeUANLS algorithm. Defaul \code{FALSE}. Running -#' \code{\link{selectGenes}} with \code{unshared = TRUE} and then running -#' \code{\link{scaleNotCenter}} is required. #' @param seed Random seed to allow reproducible results. Default \code{1}. -#' @param readH5 \code{TRUE} to force reading H5 based data into memory and -#' conduct factorization. \code{"auto"} reads H5 dataset with less than 8000 -#' cells. \code{FALSE} will stop users from running if H5 data presents. #' @param verbose Logical. Whether to show information of the progress. Default #' \code{getOption("ligerVerbose")} which is \code{TRUE} if users have not set. -#' @param max.iters,use.unshared,rand.seed \bold{Deprecated}. See Usage section -#' for replacement. -#' @param print.obj \bold{Defunct}. Whether to print objective function values -#' after convergence when \code{verbose = TRUE}. Now always print when verbose. -#' @return \code{object} with \code{W} slot updated with the result \eqn{W} -#' matrix, and the \code{H} and \code{V} slots of each -#' \linkS4class{ligerDataset} object in the \code{datasets} slot updated with -#' the dataset specific \eqn{H} and \eqn{V} matrix, respectively. +#' @param ... Arguments passed to methods. #' @rdname runINMF #' @export #' @examples #' pbmc <- normalize(pbmc) #' pbmc <- selectGenes(pbmc) #' pbmc <- scaleNotCenter(pbmc) -#' # Only running a few iterations for fast examples -#' pbmc <- runINMF(pbmc, k = 20, maxIter = 2) -setGeneric( - "runINMF", - function( +#' pbmc <- runINMF(pbmc, k = 20) +runINMF <- function( object, - k, + k = 20, lambda = 5.0, - thresh = 1e-6, - maxIter = 30, - nrep = 1, - H.init = NULL, - W.init = NULL, - V.init = NULL, - method = c("planc", "liger", "rcppml"), - useUnshared = FALSE, + nIteration = 30, + nRandomStarts = 1, + HInit = NULL, + WInit = NULL, + VInit = NULL, seed = 1, - readH5 = "auto", verbose = getOption("ligerVerbose"), - # Deprecated coding style - max.iters = maxIter, - use.unshared = useUnshared, - rand.seed = seed, - # Deprecated functionality - print.obj = NULL - ) standardGeneric("runINMF") -) + ... +) { + UseMethod("runINMF", object) +} #' @rdname runINMF #' @export -setMethod( - "runINMF", - signature(object = "liger"), - function( +#' @param readH5 \code{TRUE} to force reading H5 based data into memory and +#' conduct factorization. \code{"auto"} reads H5 dataset with less than 8000 +#' cells. \code{FALSE} will stop users from running if H5 data presents. +#' @method runINMF liger +#' @return The liger method returns the input \linkS4class{liger} object with +#' factorization result updated. A list of all \eqn{H} matrices can be accessed +#' with \code{getMatrix(object, "H")}, a list of all \eqn{V} matrices can be +#' accessed with \code{getMatrix(object, "V")}, and the \eqn{W} matrix can be +#' accessed with \code{getMatrix(object, "W")}. +runINMF.liger <- function( object, - k, + k = 20, lambda = 5.0, - thresh = 1e-6, - maxIter = 30, - nrep = 1, - H.init = NULL, - W.init = NULL, - V.init = NULL, - method = c("planc", "liger", "rcppml"), - useUnshared = FALSE, + nIteration = 30, + nRandomStarts = 1, + HInit = NULL, + WInit = NULL, + VInit = NULL, seed = 1, - readH5 = "auto", verbose = getOption("ligerVerbose"), - # Deprecated coding style - max.iters = maxIter, - use.unshared = useUnshared, - rand.seed = seed, - # Deprecated functionality - print.obj = NULL - ) { - .deprecateArgs(list(max.iters = "maxIter", use.unshared = "useUnshared", - rand.seed = "seed"), defunct = "print.obj") - .checkObjVersion(object) - method <- match.arg(method) - object <- recordCommand(object) - if (isFALSE(useUnshared)) { - object <- removeMissing(object, orient = "cell", - verbose = verbose) - data <- lapply(datasets(object), function(ld) { - if (is.null(scaleData(ld))) - stop("Scaled data not available. ", - "Run `scaleNotCenter(object)` first") - if (isH5Liger(ld)) { - if (!isFALSE(readH5)) { - h5d <- scaleData(ld) - if (readH5 == "auto") { - if (h5d$dims[2] <= 8000) { - warning("Automatically reading H5 based ", - "scaled dense matrix into memory. ", - "Dim: ", h5d$dims[1], "x", h5d$dims[2], - immediate. = verbose) - return(h5d[,]) - } else { - stop("Scaled data in H5 based dataset with ", - "more than 8000 cells will not be ", - "automatically read into memory. Use ", - "`readH5 = TRUE` to force reading, or ", - "try `online_iNMF()` instead.") - } - } else if (isTRUE(readH5)) { - return(h5d[,]) - } else { - stop("Can only set `readH5` to TRUE, FALSE, ", - "or 'auto'.") - } - } else { - stop("H5 based dataset detected while `readH5` is ", - "set to FALSE.") - } - } else { - return(scaleData(ld)) - } - }) - out <- runINMF( - object = data, - k = k, - lambda = lambda, - thresh = thresh, - maxIter = maxIter, - nrep = nrep, - H.init = H.init, - W.init = W.init, - V.init = V.init, - method = method, - useUnshared = FALSE, - seed = seed, - verbose = verbose - ) - # return(out) - object@W <- out$W - rownames(object@W) <- varFeatures(object) - for (d in names(object)) { - ld <- dataset(object, d) - ld@H <- out$H[[d]] - colnames(ld@H) <- colnames(ld) - ld@V <- out$V[[d]] - rownames(ld@V) <- varFeatures(object) - datasets(object, check = FALSE)[[d]] <- ld - } - object@uns$factorization$k <- k - object@uns$factorization$lambda <- lambda - } else { - object <- optimizeUANLS( - object = object, - k = k, - lambda = lambda, - thresh = thresh, - maxIter = maxIter, - nrep = nrep, - seed = seed, - verbose = verbose - ) - } - return(object) + readH5 = "auto", + ... +) { + .checkObjVersion(object) + object <- recordCommand(object, dependencies = "RcppPlanc") + object <- removeMissing(object, orient = "cell", verbose = verbose) + data <- lapply(datasets(object), function(ld) { + if (is.null(scaleData(ld))) + stop("Scaled data not available. ", + "Run `scaleNotCenter(object)` first") + return(scaleData(ld)) + }) + dataClasses <- sapply(data, function(x) class(x)[1]) + if (!all(dataClasses == dataClasses[1])) { + stop("Currently the scaledData of all datasets have to be of the ", + "same class.") } -) + out <- runINMF.list( + object = data, + k = k, + lambda = lambda, + nIteration = nIteration, + nRandomStarts = nRandomStarts, + HInit = HInit, + WInit = WInit, + VInit = VInit, + seed = seed, + verbose = verbose, + barcodeList = lapply(datasets(object), colnames), + features = varFeatures(object) + ) + + object@W <- out$W + for (d in names(object)) { + ld <- dataset(object, d) + ld@H <- out$H[[d]] + ld@V <- out$V[[d]] + datasets(object, check = FALSE)[[d]] <- ld + } + object@uns$factorization <- list(k = k, lambda = lambda) + return(object) +} #' @rdname runINMF #' @export -setMethod( - "runINMF", - signature(object = "list"), - function( +#' @param barcodeList List object of barcodes for each datasets, for setting +#' dimnames of output \eqn{H} matrices. Default \code{NULL} uses \code{colnames} +#' of matrices in the \code{object}. +#' @param features Character vector of feature names, for setting dimnames of +#' output \eqn{V} and \eqn{W} matrices. Default \code{NULL} uses \code{rownames} +#' of matrices in the \code{object}. +#' @return The list method returns a list of entries \code{H}, \code{V} and +#' \code{W}. \code{H} is a list of \eqn{H} matrices for each dataset. \code{V} +#' is a list of \eqn{V} matrices for each dataset. \code{W} is the shared +#' \eqn{W} matrix. +#' @method runINMF list +runINMF.list <- function( object, - k, + k = 20, lambda = 5.0, - thresh = 1e-6, - maxIter = 30, - nrep = 1, - H.init = NULL, - W.init = NULL, - V.init = NULL, - method = c("planc", "liger", "rcppml"), - useUnshared = FALSE, + nIteration = 30, + nRandomStarts = 1, + HInit = NULL, + WInit = NULL, + VInit = NULL, seed = 1, - readH5 = "auto", verbose = getOption("ligerVerbose"), - # Deprecated coding style - max.iters = maxIter, - use.unshared = useUnshared, - rand.seed = seed, - # Deprecated functionality - print.obj = NULL - ) { - .deprecateArgs(list(max.iters = "maxIter", use.unshared = "useUnshared", - rand.seed = "seed"), defunct = "print.obj") - # if (!all(sapply(object, is.matrix))) { - # stop("All values in 'object' must be a matrix") - # } - # E ==> cell x gene scaled matrices - method <- match.arg(method) - if (method == "planc" && !requireNamespace("RcppPlanc", quietly = TRUE)) - stop("RcppPlanc installation required") - if (method == "rcppml" && !requireNamespace("RcppML", quietly = TRUE)) - stop("RcppML installation required") - E <- lapply(object, t) - nDatasets <- length(E) - nCells <- sapply(E, nrow) - tmp <- gc() # nolint - nGenes <- ncol(E[[1]]) - if (k >= nGenes) { - stop("Select k lower than the number of variable genes: ", nGenes) - } - Wm <- matrix(0, k, nGenes) - Vm <- rep(list(matrix(0, k, nGenes)), nDatasets) - Hm <- lapply(nCells, function(n) matrix(0, n, k)) - - bestObj <- Inf - bestSeed <- seed - runStats <- matrix(0, nrep, 2) - for (i in seq(nrep)) { - set.seed(seed = seed + i - 1) - startTime <- Sys.time() - if (!is.null(W.init)) - W <- t(.checkInit(W.init, nCells, nGenes, k, "W")) - else W <- matrix(stats::runif(nGenes * k, 0, 2), k, nGenes) - if (!is.null(V.init)) { - V <- .checkInit(V.init, nCells, nGenes, k, "V") - V <- lapply(V, t) - } else - V <- lapply(seq(nDatasets), function(i) { - matrix(stats::runif(nGenes * k, 0, 2), k, nGenes)}) - if (!is.null(H.init)) { - H <- .checkInit(H.init, nCells, nGenes, k, "H") - H <- lapply(H, t) - } else - H <- lapply(nCells, function(n) { - matrix(stats::runif(n * k, 0, 2), n, k) - }) + barcodeList = NULL, + features = NULL, + ... +) { + if (!requireNamespace("RcppPlanc", quietly = TRUE)) + stop("RcppPlanc installation required. Currently, please get the ", + "GitHub private repository access from the lab and run: \n", + "devtools::install_github(\"welch-lab/RcppPlanc\")") - delta <- 1 - iters <- 0 - sqrtLambda <- sqrt(lambda) - obj0 <- inmf_calcObj(E, H, W, V, lambda) - # tmp <- gc() - if (isTRUE(verbose)) { - .log("Start iNMF with seed: ", seed + i - 1, "...") - pb <- utils::txtProgressBar(0, maxIter, style = 3) - } - # return(list(E = E, H = H, W = W, V = V)) - while (delta > thresh & iters < maxIter) { - # .log("Iter: ", iters) - # .log("Solving for H") - H <- lapply( - seq(nDatasets), - function(i) { - t(callNNLS( - C = rbind(t(W + V[[i]]), sqrtLambda*t(V[[i]])), - B = expandSpZeroRow(t(E[[i]])), - method = method - )) - } - ) - # tmp <- gc() - # .log("Solving for V") - # c x k, c x g -> g x k - V <- lapply( - seq(nDatasets), - function(i) { - callNNLS( - C = rbind(H[[i]], sqrtLambda*H[[i]]), - B = rbind( - E[[i]] - H[[i]] %*% W, - matrix(0, nCells[[i]], nGenes) - ), - method = method - ) - } - ) - # tmp <- gc() - # .log("Solving for W") - # .log("calc B") - wB <- rbindlist( - lapply(seq(nDatasets), - function(i) E[[i]] - H[[i]] %*% V[[i]] - )) - # .log("bppnnls") - W <- callNNLS(C = rbindlist(H), B = wB, - method = method) - # tmp <- gc() - obj <- inmf_calcObj(E, H, W, V, lambda) - # tmp <- gc() - delta <- abs(obj0 - obj) / (mean(obj0, obj)) - obj0 <- obj - iters <- iters + 1 - if (isTRUE(verbose)) - utils::setTxtProgressBar(pb, value = iters) - } - if (isTRUE(verbose)) { - utils::setTxtProgressBar(pb, value = maxIter) - cat("\n") - } - # if (iters == maxIter) { - # print("Warning: failed to converge within the allowed number of iterations. - # Re-running with a higher maxIter is recommended.") - # } - if (obj < bestObj) { - Wm <- W - Hm <- H - Vm <- V - bestObj <- obj - bestSeed <- seed + i - 1 - } - endTime <- difftime(time1 = Sys.time(), time2 = startTime, - units = "auto", ) - runStats[i, 1] <- as.double(endTime) - runStats[i, 2] <- iters - if (isTRUE(verbose)) { - .log("Finished in ", runStats[i, 1], " ", units(endTime), - ", ", iters, " iterations. \nMax iterations set: ", - maxIter, "\nFinal objective delta: ", delta) - .log("Objective: ", obj) - .log("Best results with seed ", bestSeed) - } + bestResult <- list() + bestObj <- Inf + bestSeed <- seed + for (i in seq(nRandomStarts)) { + if (isTRUE(verbose) && nRandomStarts > 1) { + .log("Replicate run ", i, "...") } - out <- list(H = Hm, V = Vm, W = t(Wm)) - factorNames <- paste0("Factor_", seq(k)) - for (i in seq(nDatasets)) { - out$H[[i]] <- t(out$H[[i]]) - colnames(out$H[[i]]) <- colnames(object[[i]]) - rownames(out$H[[i]]) <- factorNames - out$V[[i]] <- t(out$V[[i]]) - rownames(out$V[[i]]) <- rownames(object[[i]]) - colnames(out$V[[i]]) <- factorNames + set.seed(seed = seed + i - 1) + if (inherits(object[[1]], "H5D")) { + # RcppPlanc::bppinmf_h5dense() + stop("TODO: Push Yichen to test bppinmf_h5sparse/bppinmf_h5dense!") + } else { + out <- RcppPlanc::inmf(objectList = object, k = k, lambda = lambda, + niter = nIteration, Hinit = HInit, + Vinit = VInit, Winit = WInit, + verbose = verbose) + } + if (out$objErr < bestObj) { + bestResult <- out + bestObj <- out$objErr + bestSeed <- seed + i - 1 } - names(out$V) <- names(out$H) <- names(object) - rownames(out$W) <- rownames(object[[1]]) - colnames(out$W) <- factorNames - return(out) } -) - -# Binds list of matrices row-wise (vertical stack) -rbindlist <- function(mat_list) do.call(rbind, mat_list) - -expandSpZeroRow <- function(E) { - dimnames(E) <- list(NULL, NULL) - E@Dim <- c(as.integer(2*nrow(E)), as.integer(ncol(E))) - return(E) -} - -inmf_calcObj <- function(E, H, W, V, lambda) { - # E - dgCMatrix - # H, W, V - matrix - obj <- 0 - for (i in seq_along(H)) { - obj <- obj + - Matrix::norm(E[[i]] - H[[i]] %*% (W + V[[i]]), "F") ^ 2 + - lambda*norm(H[[i]] %*% V[[i]], "F") ^ 2 + if (isTRUE(verbose) && nRandomStarts > 1) { + .log("Best objective error: ", bestObj, "\nBest seed: ", bestSeed) } - return(obj) + barcodeList <- lapply(object, colnames) + features <- rownames(object[[1]]) + factorNames <- paste0("Factor_", seq(k)) + for (i in seq_along(object)) { + bestResult$H[[i]] <- t(bestResult$H[[i]]) + dimnames(bestResult$H[[i]]) <- list(factorNames, barcodeList[[i]]) + dimnames(bestResult$V[[i]]) <- list(features, factorNames) + } + names(bestResult$V) <- names(bestResult$H) <- names(object) + dimnames(bestResult$W) <- list(features, factorNames) + return(bestResult) } -callNNLS <- function(C, B, method = c("planc", "liger", "rcppml")) { - method <- match.arg(method) - switch(method, - planc = RcppPlanc::bppnnls(C, methods::as(B, "CsparseMatrix")), - liger = solveNNLS(C, as.matrix(B)), - rcppml = RcppML::project(w = C, data = B) +#' @rdname runINMF +#' @export +#' @param datasetVar Variable name in metadata indicating a factor of dataset +#' belonging, or directly a factor that match with the number of cells. +#' @return The Seurat method returns a list of entries \code{H}, \code{V} and +#' \code{W}. \code{H} is a list of \eqn{H} matrices for each dataset. \code{V} +#' is a list of \eqn{V} matrices for each dataset. \code{W} is the shared +#' \eqn{W} matrix. +#' @method runINMF Seurat +runINMF.Seurat <- function( + object, + datasetVar, + k, + lambda = 5.0, + nIteration = 30, + nRandomStarts = 1, + HInit = NULL, + WInit = NULL, + VInit = NULL, + seed = 1, + verbose = getOption("ligerVerbose"), + ... +) { + if (!requireNamespace("Seurat", quietly = TRUE)) { + stop("Seurat installation required. Please run\n", + "install.packages(\"Seurat\")") + } + EBind <- Seurat::GetAssayData(object, "scale.data") + if (any(EBind < 0)) { + stop("Non-negative Matrix Factorization requires non-negative data. ", + "Please scale the library-size-normalized data without centering.") + } + if (is.character(datasetVar) && length(datasetVar) == 1) { + datasetVar <- object[[datasetVar]][[1]] + } + if (!is.factor(datasetVar) || length(datasetVar) != ncol(EBind)) { + stop("Invalid `datasetVar`. Please see `?runINMF` for instruction.") + } + datasetVar <- droplevels(datasetVar) + Es <- lapply(levels(datasetVar), function(d) { + as(EBind[, datasetVar == d], "CsparseMatrix") + }) + names(Es) <- levels(datasetVar) + runINMF.list( + object = Es, + k = k, + lambda = lambda, + nIteration = nIteration, + nRandomStarts = nRandomStarts, + HInit = HInit, + WInit = WInit, + VInit = VInit, + seed = seed, + verbose = verbose ) } diff --git a/R/optimizeALS.R b/R/runINMF_R.R similarity index 64% rename from R/optimizeALS.R rename to R/runINMF_R.R index 1bb9f2c3..399a4a9b 100644 --- a/R/optimizeALS.R +++ b/R/runINMF_R.R @@ -39,6 +39,8 @@ #' @param V.init Initial values to use for \eqn{V} matrices. A list object where #' each element is the initial \eqn{V} matrix of each dataset. Default #' \code{NULL}. +#' @param method NNLS subproblem solver. Choose from \code{"liger"} (default +#' original implementation), \code{"planc"} or \code{"rcppml"}. #' @param useUnshared Logical, whether to include unshared variable features and #' run optimizeUANLS algorithm. Defaul \code{FALSE}. Running #' \code{\link{selectGenes}} with \code{unshared = TRUE} and then running @@ -57,16 +59,16 @@ #' matrix, and the \code{H} and \code{V} slots of each #' \linkS4class{ligerDataset} object in the \code{datasets} slot updated with #' the dataset specific \eqn{H} and \eqn{V} matrix, respectively. -#' @rdname optimizeALS +#' @rdname runINMF_R #' @export #' @examples #' pbmc <- normalize(pbmc) #' pbmc <- selectGenes(pbmc) #' pbmc <- scaleNotCenter(pbmc) #' # Only running a few iterations for fast examples -#' pbmc <- optimizeALS(pbmc, k = 20, maxIter = 2) +#' pbmc <- runINMF(pbmc, k = 20, maxIter = 2) setGeneric( - "optimizeALS", + "runINMF_R", function( object, k, @@ -77,23 +79,18 @@ setGeneric( H.init = NULL, W.init = NULL, V.init = NULL, + method = c("planc", "liger", "rcppml"), useUnshared = FALSE, seed = 1, readH5 = "auto", - verbose = getOption("ligerVerbose"), - # Deprecated coding style - max.iters = maxIter, - use.unshared = useUnshared, - rand.seed = seed, - # Deprecated functionality - print.obj = NULL - ) standardGeneric("optimizeALS") + verbose = getOption("ligerVerbose") + ) standardGeneric("runINMF_R") ) -#' @rdname optimizeALS +#' @rdname runINMF_R #' @export setMethod( - "optimizeALS", + "runINMF_R", signature(object = "liger"), function( object, @@ -108,16 +105,8 @@ setMethod( useUnshared = FALSE, seed = 1, readH5 = "auto", - verbose = getOption("ligerVerbose"), - # Deprecated coding style - max.iters = maxIter, - use.unshared = useUnshared, - rand.seed = seed, - # Deprecated functionality - print.obj = NULL + verbose = getOption("ligerVerbose") ) { - .deprecateArgs(list(max.iters = "maxIter", use.unshared = "useUnshared", - rand.seed = "seed"), defunct = "print.obj") .checkObjVersion(object) object <- recordCommand(object) if (isFALSE(useUnshared)) { @@ -155,10 +144,10 @@ setMethod( "set to FALSE.") } } else { - return(as.matrix(scaleData(ld))) + return(scaleData(ld)) } }) - out <- optimizeALS( + out <- runINMF_R( object = data, k = k, lambda = lambda, @@ -173,19 +162,16 @@ setMethod( verbose = verbose ) object@W <- out$W - rownames(object@W) <- varFeatures(object) for (d in names(object)) { ld <- dataset(object, d) ld@H <- out$H[[d]] - colnames(ld@H) <- colnames(ld) ld@V <- out$V[[d]] - rownames(ld@V) <- varFeatures(object) datasets(object, check = FALSE)[[d]] <- ld } object@uns$factorization$k <- k object@uns$factorization$lambda <- lambda } else { - object <- optimizeUANLS( + object <- runUINMF( object = object, k = k, lambda = lambda, @@ -200,65 +186,53 @@ setMethod( } ) -#' @rdname optimizeALS +#' @rdname runINMF_R #' @export setMethod( - "optimizeALS", + "runINMF_R", signature(object = "list"), function( object, k, lambda = 5.0, - thresh = 1e-6, maxIter = 30, nrep = 1, H.init = NULL, W.init = NULL, V.init = NULL, + method = c("planc", "liger", "rcppml"), useUnshared = FALSE, seed = 1, readH5 = "auto", - verbose = getOption("ligerVerbose"), - # Deprecated coding style - max.iters = maxIter, - use.unshared = useUnshared, - rand.seed = seed, - # Deprecated functionality - print.obj = NULL + verbose = getOption("ligerVerbose") ) { - .deprecateArgs(list(max.iters = "maxIter", use.unshared = "useUnshared", - rand.seed = "seed"), defunct = "print.obj") - if (!all(sapply(object, is.matrix))) { - stop("All values in 'object' must be a matrix") - } # E ==> cell x gene scaled matrices - E <- lapply(object, t) + E <- object nDatasets <- length(E) - nCells <- sapply(E, nrow) - tmp <- gc() # nolint - nGenes <- ncol(E[[1]]) + nCells <- sapply(E, ncol) + nGenes <- nrow(E[[1]]) if (k >= nGenes) { stop("Select k lower than the number of variable genes: ", nGenes) } - Wm <- matrix(0, k, nGenes) - Vm <- rep(list(matrix(0, k, nGenes)), nDatasets) + Wm <- matrix(0, nGenes, k) + Vm <- rep(list(matrix(0, nGenes, k)), nDatasets) Hm <- lapply(nCells, function(n) matrix(0, n, k)) - tmp <- gc() + bestObj <- Inf bestSeed <- seed - runStats <- matrix(0, nrep, 2) for (i in seq(nrep)) { set.seed(seed = seed + i - 1) startTime <- Sys.time() if (!is.null(W.init)) - W <- t(.checkInit(W.init, nCells, nGenes, k, "W")) - else W <- matrix(stats::runif(nGenes * k, 0, 2), k, nGenes) + W <- .checkInit(W.init, nCells, nGenes, k, "W") + else W <- matrix(stats::runif(nGenes * k, 0, 2), nGenes, k) + if (!is.null(V.init)) { V <- .checkInit(V.init, nCells, nGenes, k, "V") - V <- lapply(V, t) } else V <- lapply(seq(nDatasets), function(i) { - matrix(stats::runif(nGenes * k, 0, 2), k, nGenes)}) + matrix(stats::runif(nGenes * k, 0, 2), nGenes, k)}) + if (!is.null(H.init)) { H <- .checkInit(H.init, nCells, nGenes, k, "H") H <- lapply(H, t) @@ -266,71 +240,27 @@ setMethod( H <- lapply(nCells, function(n) { matrix(stats::runif(n * k, 0, 2), n, k) }) - tmp <- gc() - delta <- 1 - iters <- 0 - sqrtLambda <- sqrt(lambda) - obj0 <- sum(sapply( - seq(nDatasets), - function(i) norm(E[[i]] - H[[i]] %*% (W + V[[i]]), "F") ^ 2 - )) + - sum(sapply( - seq(nDatasets), - function(i) lambda*norm(H[[i]] %*% V[[i]], "F") ^ 2 - )) - tmp <- gc() if (isTRUE(verbose)) { .log("Start iNMF with seed: ", seed + i - 1, "...") - pb <- utils::txtProgressBar(0, maxIter, style = 3) + if (maxIter > 0) + pb <- utils::txtProgressBar(0, maxIter, style = 3) } - while (delta > thresh & iters < maxIter) { - H <- lapply( - seq(nDatasets), - function(i) - t(solveNNLS( - C = rbind(t(W + V[[i]]), sqrtLambda*t(V[[i]])), - B = rbind(t(E[[i]]), matrix(0, nGenes, nCells[i])) - )) - ) - tmp <- gc() - V <- lapply( - seq(nDatasets), - function(i) - solveNNLS(C = rbind(H[[i]], sqrtLambda*H[[i]]), - B = rbind(E[[i]] - H[[i]] %*% W, - matrix(0, nCells[[i]], nGenes))) - ) - tmp <- gc() - W <- solveNNLS(C = rbindlist(H), - B = rbindlist(lapply(seq(nDatasets), - function(i) E[[i]] - H[[i]] %*% V[[i]] - ))) - tmp <- gc() - obj <- sum(sapply( - seq(nDatasets), - function(i) norm(E[[i]] - H[[i]] %*% (W + V[[i]]), "F") ^ 2 - )) + - sum(sapply( - seq(nDatasets), - function(i) - lambda*norm(H[[i]] %*% V[[i]], "F") ^ 2 - )) - tmp <- gc() - delta <- abs(obj0 - obj) / (mean(obj0, obj)) - obj0 <- obj - iters <- iters + 1 - if (isTRUE(verbose)) - utils::setTxtProgressBar(pb, value = iters) + iter <- 0 + while (iter < maxIter) { + H <- inmfSolveH(W = W, V = V, E = E, lambda = lambda) + V <- inmfSolveV(W = W, H = H, E = E, lambda = lambda) + W <- inmfSolveW(H = H, V = V, E = E, lambda = lambda) + iter <- iter + 1 + if (isTRUE(verbose) && maxIter > 0) + utils::setTxtProgressBar(pb, value = iter) + } - if (isTRUE(verbose)) { + if (isTRUE(verbose) && maxIter > 0) { utils::setTxtProgressBar(pb, value = maxIter) cat("\n") } - # if (iters == maxIter) { - # print("Warning: failed to converge within the allowed number of iterations. - # Re-running with a higher maxIter is recommended.") - # } + obj <- inmf_calcObj(E, H, W, V, lambda) if (obj < bestObj) { Wm <- W Hm <- H @@ -339,33 +269,68 @@ setMethod( bestSeed <- seed + i - 1 } endTime <- difftime(time1 = Sys.time(), time2 = startTime, - units = "auto") - runStats[i, 1] <- as.double(endTime) - runStats[i, 2] <- iters + units = "auto") if (isTRUE(verbose)) { - .log("Finished in ", runStats[i, 1], " ", units(endTime), - ", ", iters, " iterations. \nMax iterations set: ", - maxIter, "\nFinal objective delta: ", delta) + .log("Finished in ", endTime, " ", units(endTime), + "\nObjective error: ", bestObj) .log("Objective: ", obj) .log("Best results with seed ", bestSeed) } } - out <- list(H = Hm, V = Vm, W = t(Wm)) + out <- list(H = lapply(Hm, t), V = Vm, W = Wm) factorNames <- paste0("Factor_", seq(k)) for (i in seq(nDatasets)) { - out$H[[i]] <- t(out$H[[i]]) - colnames(out$H[[i]]) <- colnames(object[[i]]) - rownames(out$H[[i]]) <- factorNames - out$V[[i]] <- t(out$V[[i]]) - rownames(out$V[[i]]) <- rownames(object[[i]]) - colnames(out$V[[i]]) <- factorNames + dimnames(out$H[[i]]) <- list(factorNames, colnames(object[[i]])) + dimnames(out$V[[i]]) <- list(rownames(object[[i]]), factorNames) } names(out$V) <- names(out$H) <- names(object) - rownames(out$W) <- rownames(object[[1]]) - colnames(out$W) <- factorNames + dimnames(out$W) <- list(rownames(object[[1]]), factorNames) return(out) } ) -# Binds list of matrices row-wise (vertical stack) -rbindlist <- function(mat_list) do.call(rbind, mat_list) +inmf_calcObj <- function(E, H, W, V, lambda) { + # E - dgCMatrix + # H, W, V - matrix + obj <- 0 + for (i in seq_along(E)) { + obj <- obj + + Matrix::norm(E[[i]] - (W + V[[i]]) %*% t(H[[i]]), "F") ^ 2 + + lambda*norm(V[[i]] %*% t(H[[i]]), "F") ^ 2 + } + return(obj) +} + +inmfSolveH <- function(W, V, E, lambda) { + H <- list() + for (i in seq_along(E)) { + CtC <- t(W + V[[i]]) %*% (W + V[[i]]) + lambda*(t(V[[i]]) %*% V[[i]]) + CtB <- as.matrix(t(W + V[[i]]) %*% E[[i]]) + H[[i]] <- t(RcppPlanc::bppnnls_prod(CtC, CtB)) + } + return(H) +} + +inmfSolveV <- function(W, H, E, lambda) { + V <- list() + for (i in seq_along(E)) { + CtC <- (1 + lambda)*(t(H[[i]]) %*% H[[i]]) + CtB <- as.matrix(t(H[[i]]) %*% t(E[[i]])) + CtB <- CtB - t(H[[i]]) %*% H[[i]] %*% t(W) + V[[i]] <- t(RcppPlanc::bppnnls_prod(CtC, CtB)) + } + return(V) +} + +inmfSolveW <- function(H, V, E, lambda) { + m <- nrow(E[[1]]) + k <- ncol(H[[1]]) + CtC <- matrix(0, k, k) + CtB <- matrix(0, k, m) + for (i in seq_along(E)) { + CtC <- CtC + t(H[[i]]) %*% H[[i]] + CtB <- CtB + as.matrix(t(H[[i]]) %*% t(E[[i]])) - + t(H[[i]]) %*% H[[i]] %*% t(V[[i]]) + } + return(t(RcppPlanc::bppnnls_prod(CtC, CtB))) +} diff --git a/R/runOnlineINMF.R b/R/runOnlineINMF.R new file mode 100644 index 00000000..f31dbc14 --- /dev/null +++ b/R/runOnlineINMF.R @@ -0,0 +1,262 @@ +#' Perform online iNMF on scaled datasets +#' @description Perform online integrative non-negative matrix factorization to +#' represent multiple single-cell datasets in terms of \eqn{H}, \eqn{W}, and +#' \eqn{V} matrices. It optimizes the iNMF objective function using online +#' learning (non-negative least squares for H matrix, hierarchical alternating +#' least squares for \eqn{W} and \eqn{V} matrices), where the number of factors +#' is set by \code{k}. The function allows online learning in 3 scenarios: +#' +#' \enumerate{ +#' \item Fully observed datasets; +#' \item Iterative refinement using continually arriving datasets; +#' \item Projection of new datasets without updating the existing factorization +#' } +#' +#' All three scenarios require fixed memory independent of the number of cells. +#' +#' For each dataset, this factorization produces an \eqn{H} matrix (k by cell), +#' a \eqn{V} matrix (genes by k) \eqn{C^\mathsf{T}C}, and a shared \eqn{W} matrix (genes by k). The +#' \eqn{H} matrices represent the cell factor loadings. \eqn{W} is identical +#' among all datasets, as it represents the shared components of the metagenes +#' across datasets. The \eqn{V} matrices represent the dataset-specific +#' components of the metagenes. +#' @details +#' For optional initialization, \code{W.init} must be a matrix object with +#' number of rows equal to number of variable genes (denoted as \code{g}) and +#' number of columns equal to \code{k}. Any of \code{V.init}, \code{A} and +#' \code{B} must be a list object of n matrices where n is the number of +#' datasets in \code{object}. For \code{V.init}, each matrix should be of size g +#' x k. For \code{A.init}, each matrix should be k x k and for \code{B.init}, +#' each matrix should be g x k. +#' +#' Minibatch iterations is performed on small subset of cells. The exact +#' minibatch size applied on each dataset is \code{miniBatch_size} multiplied by +#' the proportion of cells in this dataset out of all cells. The setting of +#' \code{miniBatch_size} is by default \code{5000}, which is reasonable. +#' However, a smaller value such as \code{1000} may be necessary for analyzing +#' very small datasets. In general, \code{miniBatch_size} should be no larger +#' than the number of cells in the smallest dataset. An epoch is one completion +#' of calculation on all cells after a number of iterations of minibatches. +#' Therefore, the total number of iterations is determined by the setting of +#' \code{max.epochs}, total number of cells, and \code{miniBatch_size}. +#' +#' @param object \linkS4class{liger} object. Scaled data required. +#' @param newDatasets New datasets for scenario 2 or scenario 3. +#' @param projection Perform data integration with scenario 3. See description. +#' Default \code{FALSE}. +#' @param WInit Optional initialization for W. See detail. Default \code{NULL}. +#' @param VInit Optional initialization for V. See detail. Default \code{NULL}. +#' @param AInit Optional initialization for A. See detail. Default \code{NULL}. +#' @param BInit Optional initialization for B. See detail. Default \code{NULL}. +#' @param k Inner dimension of factorization--number of metagenes. A value in +#' the range 20-50 works well for most analyses. Default \code{20}. +#' @param lambda Regularization parameter. Larger values penalize +#' dataset-specific effects more strongly (i.e. alignment should increase as +#' lambda increases). We recommend always using the default value except +#' possibly for analyses with relatively small differences (biological +#' replicates, male/female comparisons, etc.) in which case a lower value such +#' as 1.0 may improve reconstruction quality. Default \code{5.0}. +#' @param maxEpochs The number of epochs to iterate through. See detail. +#' Default \code{5}. +#' @param HALSiters Maximum number of block coordinate descent (HALS +#' algorithm) iterations to perform for each update of \eqn{W} and \eqn{V}. +#' Default \code{1}. Changing this parameter is not recommended. +#' @param miniBatchSize Total number of cells in each minibatch. See detail. +#' Default \code{5000}. +#' @param seed Random seed to allow reproducible results. Default \code{123}. +#' @param verbose Logical. Whether to show information of the progress. Default +#' \code{getOption("ligerVerbose")} which is \code{TRUE} if users have not set. +#' @return \code{object} with \code{W} slot updated with resulting \eqn{W} +#' matrix; the \code{H}, \code{V}, \code{A} and \code{B} slots of each +#' \linkS4class{ligerDataset} object in \code{datasets} slot is updated with the +#' corresponding result matrices. +#' @export +#' @rdname runOnlineINMF +#' @examples +#' pbmc <- normalize(pbmc) +#' pbmc <- selectGenes(pbmc) +#' pbmc <- scaleNotCenter(pbmc) +#' # Minibatch size has to be less than number of cell in the smallest dataset +#' # Scenario 1 +#' pbmc <- online_iNMF(pbmc, miniBatch_size = 100) +#' # Scenario 2 +#' # Fake new dataset by increasing all non-zero value in "ctrl" by 1 +#' ctrl2 <- rawData(dataset(pbmc, "ctrl")) +#' ctrl2@x <- ctrl2@x + 1 +#' colnames(ctrl2) <- paste0(colnames(ctrl2), 2) +#' pbmc2 <- online_iNMF(pbmc, k = 20, X_new = list(ctrl2 = ctrl2), +#' miniBatch_size = 100) +#' # Scenario 3 +#' pbmc3 <- online_iNMF(pbmc, k = 20, X_new = list(ctrl2 = ctrl2), +#' miniBatch_size = 100, projection = TRUE) +runOnlineINMF <- function( + object, + k = 20, + lambda = 5, + newDatasets = NULL, + projection = FALSE, + maxEpochs = 5, + HALSiter = 1, + miniBatchSize = 5000, + seed = 123, + verbose = getOption("ligerVerbose"), + ... +) { + UseMethod("runOnlineINMF", object) +} + +#' @export +#' @rdname runOnlineINMF +#' @method runOnlineINMF liger +runOnlineINMF.liger <- function( + object, + k = 20, + lambda = 5, + newDatasets = NULL, + projection = FALSE, + maxEpochs = 5, + HALSiter = 1, + miniBatchSize = 5000, + seed = 123, + verbose = getOption("ligerVerbose"), + ... +) { + .checkObjVersion(object) + object <- recordCommand(object, dependencies = c("hdf5r", "RcppPlanc")) + Es <- getMatrix(object, "scaleData", returnList = TRUE) + Es <- lapply(datasets(object), function(ld) { + sd <- scaleData(ld) + if (is.null(sd)) + stop("Scaled data not available. Run `scaleNotCenter()` first") + if (inherits(sd, "H5D")) return(.H5DToH5Mat(sd)) + else if (inherits(sd, "H5Group")) + return(.H5GroupToH5SpMat(sd, c(length(varFeatures(object)), ncol(ld)))) + else return(scaleData(ld)) + }) + WInit <- VInit <- AInit <- BInit <- NULL + if (!is.null(newDatasets)) { + WInit <- getMatrix(object, "W") + VInit <- getMatrix(object, "V") + AInit <- getMatrix(object, "A") + BInit <- getMatrix(object, "B") + if (is.null(WInit) || any(sapply(VInit, is.null)) || + any(sapply(AInit, is.null)) || any(sapply(BInit, is.null))) { + stop("Cannot find complete online iNMF result for current ", + "datasets. Please run `runOnlineINMF()` without `newDataset` ", + "first.") + } + # Put new datasets into input liger object + if (is.list(newDatasets)) { + allType <- sapply(newDatasets, function(x) class(x)[1]) + if (!all(allType == "dgCMatrix")) { + stop("`newDatasets` must be a list of dgCMatrix for now.") + } + if (is.null(names(newDatasets))) { + stop("`newDatasets` must be a named list.") + } + allNewNames <- character() + for (i in seq_along(newDatasets)) { + if (names(newDatasets)[i] %in% names(object)) { + newName <- paste0(names(newDatasets)[i], ".1") + } else newName <- names(newDatasets)[i] + dataset(object, newName) <- newDatasets[[i]] + allNewNames <- c(allNewNames, newName) + } + object <- normalize(object, useDatasets = allNewNames, + verbose = verbose) + object <- scaleNotCenter(object, useDatasets = allNewNames, + verbose = verbose) + newDatasets <- getMatrix(object, slot = "scaleData", + dataset = allNewNames, returnList = TRUE) + } + } + object <- closeAllH5(object) + res <- runOnlineINMF.list(Es, newDatasets = newDatasets, + projection = projection, k = k, lambda = lambda, + maxEpochs = maxEpochs, + miniBatchSize = miniBatchSize, + HALSiter = HALSiter, verbose = verbose, + WInit = WInit, VInit = VInit, AInit = AInit, + BInit = BInit, seed = seed) + + for (i in seq_along(object)) { + ld <- dataset(object, i) + ld@H <- res$H[[i]] + ld@V <- res$V[[i]] + ld@A <- res$A[[i]] + ld@B <- res$B[[i]] + datasets(object, check = FALSE)[[i]] <- ld + } + object@W <- res$W + object@uns$factorization <- list(k = k, lambda = lambda) + suppressMessages({object <- restoreH5Liger(object)}) + return(object) +} + +#' @export +#' @rdname runOnlineINMF +#' @method runOnlineINMF list +runOnlineINMF.list <- function( + object, + k = 20, + lambda = 5, + newDatasets = NULL, + projection = FALSE, + maxEpochs = 5, + WInit = NULL, + VInit = NULL, + AInit = NULL, + BInit = NULL, + HALSiter = 1, + miniBatchSize = 5000, + seed = 123, + verbose = getOption("ligerVerbose"), + ... +) { + if (!requireNamespace("RcppPlanc", quietly = TRUE)) + stop("RcppPlanc installation required. Currently, please get the ", + "GitHub private repository access from the lab and run: \n", + "devtools::install_github(\"welch-lab/RcppPlanc\")") + nDatasets <- length(object) + length(newDatasets) + barcodeList <- c(lapply(object, colnames), lapply(newDatasets, colnames)) + features <- rownames(object[[1]]) + if (!is.null(seed)) set.seed(seed) + res <- RcppPlanc::onlineINMF(objectList = object, newDatasets = newDatasets, + project = projection, k = k, lambda = lambda, + maxEpoch = maxEpochs, + minibatchSize = miniBatchSize, + maxHALSIter = HALSiter, Vinit = VInit, + Winit = WInit, Ainit = AInit, Binit = BInit, + verbose = verbose) + factorNames <- paste0("Factor_", seq(k)) + for (i in seq(nDatasets)) { + res$H[[i]] <- t(res$H[[i]]) + dimnames(res$H[[i]]) <- list(factorNames, barcodeList[[i]]) + dimnames(res$V[[i]]) <- list(features, factorNames) + dimnames(res$A[[i]]) <- list(factorNames, factorNames) + dimnames(res$B[[i]]) <- list(features, factorNames) + } + names(res$A) <- names(res$B) <- names(res$V) <- + names(res$H) <- c(names(object), names(newDatasets)) + dimnames(res$W) <- list(features, factorNames) + return(res) +} + +#' @export +#' @rdname runOnlineINMF +#' @method runOnlineINMF Seurat +runOnlineINMF.Seurat <- function( + object, + k = 20, + lambda = 5, + newDatasets = NULL, + projection = FALSE, + maxEpochs = 5, + HALSiter = 1, + miniBatchSize = 5000, + seed = 123, + verbose = getOption("ligerVerbose"), + ... +) { + +} diff --git a/R/runUINMF.R b/R/runUINMF.R index 2c111ccf..9037cb9c 100644 --- a/R/runUINMF.R +++ b/R/runUINMF.R @@ -31,6 +31,7 @@ runUINMF <- function( #' @export #' @rdname runUINMF +#' @method runUINMF liger runUINMF.liger <- function( object, k = 20, @@ -41,25 +42,40 @@ runUINMF.liger <- function( verbose = getOption("ligerVerbose"), ... ) { + .checkObjVersion(object) object <- recordCommand(object, dependencies = "RcppPlanc") - Elist <- getMatrix(object, "scaleData", returnList = TRUE) + object <- removeMissing(object, orient = "cell", verbose = verbose) + # Elist <- getMatrix(object, "scaleData", returnList = TRUE) + + Elist <- lapply(datasets(object), function(ld) { + if (is.null(scaleData(ld))) + stop("Scaled data not available. ", + "Run `scaleNotCenter(object)` first") + return(scaleData(ld)) + }) Ulist <- getMatrix(object, "scaleUnsharedData", returnList = TRUE) + if (all(sapply(Ulist, is.null))) { + stop("No scaled data for unshared feature found. Run `selectGenes()` ", + "with `unshared = TRUE` and then `scaleNotCenter()`.") + } res <- runUINMF.list(Elist, Ulist, k = k, lambda = lambda, nIteration = nIteration, nRandomStarts = nRandomStarts, seed = seed, verbose = verbose, ...) for (i in seq_along(object)) { ld <- dataset(object, i) - ld@H <- t(res$H[[i]]) + ld@H <- res$H[[i]] ld@V <- res$V[[i]] if (!is.null(ld@scaleUnsharedData)) ld@U <- res$U[[i]] datasets(object, check = FALSE)[[i]] <- ld } object@W <- res$W + object@uns$factorization <- list(k = k, lambda = lambda) return(object) } #' @export #' @rdname runUINMF +#' @method runUINMF list #' @param unsharedList List of matrices for unshared features runUINMF.list <- function( object, @@ -72,27 +88,9 @@ runUINMF.list <- function( verbose = getOption("ligerVerbose"), ... ) { - nGene <- sapply(object, nrow) - if (!all(nGene == nGene[1])) { - stop("Number of rows must be the same in all matrices in `object`") - } - for (i in seq_along(object)) { - # Force sparse, and create 0xN matrix for dataset without unshared - object[[i]] <- as(object[[i]], "CsparseMatrix") - if (!is.null(unsharedList[[i]])) { - if (ncol(unsharedList[[i]]) != ncol(object[[i]])) { - stop("Number of columns in each matrix from `unsharedList` ", - "must match with the corresponding matrix from `object`") - } - unsharedList[[i]] <- as(unsharedList[[i]], "CsparseMatrix") - } else { - unsharedList[[i]] <- as(as(as(Matrix::Matrix( - nrow = 0, ncol = ncol(object[[i]]) - ), "dMatrix"), "generalMatrix"), "CsparseMatrix") - } - } bestObj <- Inf bestRes <- NULL + bestSeed <- NULL for (i in seq(nRandomStarts)) { seed <- seed + i - 1 set.seed(seed) @@ -101,28 +99,32 @@ runUINMF.list <- function( if (res$objErr < bestObj) { bestRes <- res bestObj <- res$objErr + bestSeed <- seed } } + if (isTRUE(verbose) && nRandomStarts > 1) { + .log("Best objective error: ", bestObj, "\nBest seed: ", bestSeed) + } rm(res) - genes <- rownames(object[[1]]) + features <- rownames(object[[1]]) unsharedFeatures <- lapply(unsharedList, rownames) factorNames <- paste0("Factor_", seq(k)) barcodes <- lapply(object, colnames) for (i in seq_along(object)) { - rownames(bestRes$H[[i]]) <- barcodes[[i]] - colnames(bestRes$H[[i]]) <- factorNames - rownames(bestRes$V[[i]]) <- genes - colnames(bestRes$V[[i]]) <- factorNames + bestRes$H[[i]] <- t(bestRes$H[[i]]) + dimnames(bestRes$H[[i]]) <- list(factorNames, barcodes[[i]]) + dimnames(bestRes$V[[i]]) <- list(features, factorNames) + dimnames(bestRes$U[[i]]) <- list(unsharedFeatures[[i]], factorNames) rownames(bestRes$U[[i]]) <- unsharedFeatures[[i]] colnames(bestRes$U[[i]]) <- factorNames } - rownames(bestRes$W) <- genes - colnames(bestRes$W) <- factorNames + dimnames(bestRes$W) <- list(features, factorNames) return(bestRes) } #' @export #' @rdname runUINMF +#' @method runUINMF Seurat #' @param datasetVar Variable for dataset belonging. runUINMF.Seurat <- function( object, diff --git a/R/scaleNotCenter.R b/R/scaleNotCenter.R index 99cc1a6d..a0e592ba 100644 --- a/R/scaleNotCenter.R +++ b/R/scaleNotCenter.R @@ -137,6 +137,7 @@ scaleNotCenter <- function( dims = length(features), dtype = "int") h5file[[paste0(resultH5Path, "/featureIdx")]][1:length(featureIdx)] <- featureIdx + h5fileInfo(ld, "scaleData", check = FALSE) <- resultH5Path return(ld) } @@ -169,7 +170,7 @@ scaleNotCenter <- function( cellIdx] <- chunk } ) - h5fileInfo(ld, resultH5Path, check = FALSE) <- resultH5Path + h5fileInfo(ld, "scaleData", check = FALSE) <- resultH5Path safeH5Create( ld, dataPath = paste0(resultH5Path, ".featureIdx"), @@ -188,9 +189,8 @@ scaleNotCenter <- function( dimnames = list(NULL, colnames(norm.subset)))) scaled <- t(scaleNotCenterFast(t(norm.subset))) # scaled: g x c - # scaled <- as.matrix(t(scaled)) - scaled[is.na(scaled)] <- 0 - scaled[scaled == Inf] = 0 + scaled@x[is.na(scaled@x)] <- 0 + scaled@x[scaled@x == Inf] <- 0 rownames(scaled) <- rownames(norm.subset) colnames(scaled) <- colnames(norm.subset) return(scaled) diff --git a/man/closeAllH5.Rd b/man/closeAllH5.Rd new file mode 100644 index 00000000..5be7bf7d --- /dev/null +++ b/man/closeAllH5.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/h5Utility.R +\name{closeAllH5} +\alias{closeAllH5} +\title{Close all links (to HDF5 files) of a liger object} +\usage{ +closeAllH5(object) +} +\arguments{ +\item{object}{liger object.} +} +\value{ +\code{object} with links closed. +} +\description{ +When need to interact with the data embedded in HDF5 files out +of the currect R session, the HDF5 files has to be closed in order to be +available to other processes. +} diff --git a/man/liger-class.Rd b/man/liger-class.Rd index 00c81d82..ca2d3e46 100644 --- a/man/liger-class.Rd +++ b/man/liger-class.Rd @@ -36,7 +36,7 @@ \alias{cellMeta<-,liger,missing-method} \alias{cellMeta<-,liger,character-method} \alias{[[,liger,ANY,missing-method} -\alias{[[<-,liger,ANY,missing,ANY-method} +\alias{[[<-,liger,ANY,missing-method} \alias{$,liger-method} \alias{$<-,liger-method} \alias{varFeatures} @@ -81,7 +81,7 @@ datasets(x, check = TRUE) <- value dataset(x, dataset = NULL) -dataset(x, dataset, type = NULL, qc = TRUE) <- value +dataset(x, dataset, type = NULL, qc = FALSE) <- value \S4method{dataset}{liger,character_OR_NULL}(x, dataset = NULL) @@ -89,9 +89,9 @@ dataset(x, dataset, type = NULL, qc = TRUE) <- value \S4method{dataset}{liger,numeric}(x, dataset = NULL) -\S4method{dataset}{liger,character,missing,ANY,ligerDataset}(x, dataset, type = NULL, qc = TRUE) <- value +\S4method{dataset}{liger,character,missing,ANY,ligerDataset}(x, dataset, type = NULL, qc = FALSE) <- value -\S4method{dataset}{liger,character,ANY,ANY,matrixLike}(x, dataset, type = c("rawData", "normData", "scaleData"), qc = TRUE) <- value +\S4method{dataset}{liger,character,ANY,ANY,matrixLike}(x, dataset, type = c("rawData", "normData", "scaleData"), qc = FALSE) <- value \S4method{dataset}{liger,character,missing,ANY,`NULL`}(x, dataset, type = NULL, qc = TRUE) <- value @@ -117,7 +117,7 @@ cellMeta(x, columns = NULL, check = FALSE) <- value \S4method{[[}{liger,ANY,missing}(x, i, j, ...) -\S4method{[[}{liger,ANY,missing,ANY}(x, i, j, ...) <- value +\S4method{[[}{liger,ANY,missing}(x, i, j, ...) <- value \S4method{$}{liger}(x, name) diff --git a/man/ligerDataset-class.Rd b/man/ligerDataset-class.Rd index deecb4d9..1d19d0b3 100644 --- a/man/ligerDataset-class.Rd +++ b/man/ligerDataset-class.Rd @@ -26,11 +26,13 @@ \alias{scaleData,ligerDataset,missing-method} \alias{scaleData<-,ligerDataset,ANY,matrixLike_OR_NULL-method} \alias{scaleData<-,ligerDataset,ANY,H5D-method} +\alias{scaleData<-,ligerDataset,ANY,H5Group-method} \alias{scaleUnsharedData} \alias{scaleUnsharedData<-} \alias{scaleUnsharedData,ligerDataset,missing-method} \alias{scaleUnsharedData<-,ligerDataset,ANY,matrixLike_OR_NULL-method} \alias{scaleUnsharedData<-,ligerDataset,ANY,H5D-method} +\alias{scaleUnsharedData<-,ligerDataset,ANY,H5Group-method} \alias{getMatrix} \alias{getMatrix,ligerDataset,ANY,missing,missing-method} \alias{h5fileInfo} @@ -90,6 +92,8 @@ scaleData(x, check = TRUE) <- value \S4method{scaleData}{ligerDataset,ANY,H5D}(x, check = TRUE) <- value +\S4method{scaleData}{ligerDataset,ANY,H5Group}(x, check = TRUE) <- value + scaleUnsharedData(x, dataset = NULL) scaleUnsharedData(x, check = TRUE) <- value @@ -100,6 +104,8 @@ scaleUnsharedData(x, check = TRUE) <- value \S4method{scaleUnsharedData}{ligerDataset,ANY,H5D}(x, check = TRUE) <- value +\S4method{scaleUnsharedData}{ligerDataset,ANY,H5Group}(x, check = TRUE) <- value + getMatrix(x, slot = "rawData", dataset = NULL, returnList = FALSE) \S4method{getMatrix}{ligerDataset,ANY,missing,missing}( diff --git a/man/runBPPINMF.Rd b/man/runBPPINMF.Rd deleted file mode 100644 index e2a7c37e..00000000 --- a/man/runBPPINMF.Rd +++ /dev/null @@ -1,172 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/runBPPINMF.R -\name{runBPPINMF} -\alias{runBPPINMF} -\alias{runBPPINMF.liger} -\alias{runBPPINMF.list} -\alias{runBPPINMF.Seurat} -\title{Perform iNMF on scaled datasets} -\usage{ -runBPPINMF( - object, - k, - lambda = 5, - nIteration = 30, - nRandomStarts = 1, - HInit = NULL, - WInit = NULL, - VInit = NULL, - seed = 1, - verbose = getOption("ligerVerbose"), - ... -) - -\method{runBPPINMF}{liger}( - object, - k, - lambda = 5, - nIteration = 30, - nRandomStarts = 1, - HInit = NULL, - WInit = NULL, - VInit = NULL, - seed = 1, - verbose = getOption("ligerVerbose"), - readH5 = "auto", - ... -) - -\method{runBPPINMF}{list}( - object, - k, - lambda = 5, - nIteration = 30, - nRandomStarts = 1, - HInit = NULL, - WInit = NULL, - VInit = NULL, - seed = 1, - verbose = getOption("ligerVerbose"), - barcodeList = NULL, - features = NULL, - ... -) - -\method{runBPPINMF}{Seurat}( - object, - datasetVar, - k, - lambda = 5, - nIteration = 30, - nRandomStarts = 1, - HInit = NULL, - WInit = NULL, - VInit = NULL, - seed = 1, - verbose = getOption("ligerVerbose"), - ... -) -} -\arguments{ -\item{object}{A \linkS4class{liger} object, a Seurat object or a named list -of matrix, dgCMatrix, H5D objects, where the names represents dataset names -and matrices are scaled on the same set of variable features, with rows as -features and columns as cells.} - -\item{k}{Inner dimension of factorization (number of factors). Run -\code{\link{suggestK}} to determine appropriate value; a general rule of -thumb is that a higher \code{k} will be needed for datasets with more -sub-structure.} - -\item{lambda}{Regularization parameter. Larger values penalize -dataset-specific effects more strongly (i.e. alignment should increase as -\code{lambda} increases). Default \code{5}.} - -\item{nIteration}{Total number of block coordinate descent iterations to -perform. Default \code{30}.} - -\item{nRandomStarts}{Number of restarts to perform (iNMF objective function -is non-convex, so taking the best objective from multiple successive -initialization is recommended). For easier reproducibility, this increments -the random seed by 1 for each consecutive restart, so future factorization -of the same dataset can be run with one rep if necessary. Default \code{1}.} - -\item{HInit}{Initial values to use for \eqn{H} matrices. A list object where -each element is the initial \eqn{H} matrix of each dataset. Default -\code{NULL}.} - -\item{WInit}{Initial values to use for \eqn{W} matrix. A matrix object. -Default \code{NULL}.} - -\item{VInit}{Initial values to use for \eqn{V} matrices. A list object where -each element is the initial \eqn{V} matrix of each dataset. Default -\code{NULL}.} - -\item{seed}{Random seed to allow reproducible results. Default \code{1}.} - -\item{verbose}{Logical. Whether to show information of the progress. Default -\code{getOption("ligerVerbose")} which is \code{TRUE} if users have not set.} - -\item{...}{Arguments passed to methods.} - -\item{readH5}{\code{TRUE} to force reading H5 based data into memory and -conduct factorization. \code{"auto"} reads H5 dataset with less than 8000 -cells. \code{FALSE} will stop users from running if H5 data presents.} - -\item{barcodeList}{List object of barcodes for each datasets, for setting -dimnames of output \eqn{H} matrices. Default \code{NULL} uses \code{colnames} -of matrices in the \code{object}.} - -\item{features}{Character vector of feature names, for setting dimnames of -output \eqn{V} and \eqn{W} matrices. Default \code{NULL} uses \code{rownames} -of matrices in the \code{object}.} - -\item{datasetVar}{Variable name in metadata indicating a factor of dataset -belonging, or directly a factor that match with the number of cells.} -} -\value{ -The liger method returns the input \linkS4class{liger} object with -factorization result updated. A list of all \eqn{H} matrices can be accessed -with \code{getMatrix(object, "H")}, a list of all \eqn{V} matrices can be -accessed with \code{getMatrix(object, "V")}, and the \eqn{W} matrix can be -accessed with \code{getMatrix(object, "W")}. - -The list method returns a list of entries \code{H}, \code{V} and -\code{W}. \code{H} is a list of \eqn{H} matrices for each dataset. \code{V} -is a list of \eqn{V} matrices for each dataset. \code{W} is the shared -\eqn{W} matrix. - -The Seurat method returns a list of entries \code{H}, \code{V} and -\code{W}. \code{H} is a list of \eqn{H} matrices for each dataset. \code{V} -is a list of \eqn{V} matrices for each dataset. \code{W} is the shared -\eqn{W} matrix. -} -\description{ -Performs integrative non-negative matrix factorization (iNMF) (J.D. Welch, -2019) to return factorized \eqn{H}, \eqn{W}, and \eqn{V} matrices. The -objective function is stated as - -\deqn{\arg\min_{H\ge0,W\ge0,V\ge0}\sum_{i}^{d}||E_i-(W+V_i)Hi||^2_F+\lambda\sum_{i}^{d}||V_iH_i||_F^2} - -where \eqn{E_i} is the input non-negative matrix of the i'th dataset, \eqn{d} -is the total number of datasets. - -The factorization produces a shared \eqn{W} matrix (genes by k), and for each -dataset, an \eqn{H} matrix (k by cells) and a \eqn{V} matrix (genes by k). -The \eqn{H} matrices represent the cell factor loadings. \eqn{W} is held -consistent among all datasets, as it represents the shared components of the -metagenes across datasets. The \eqn{V} matrices represent the -dataset-specific components of the metagenes. - -This function adopts highly optimized fast and memory efficient -implementation extended from Planc (Kannan, 2016). Pre-installation of -extension package \code{RcppPlanc} is required. The underlying algorithm -adopts the identical ANLS strategy as \code{\link{optimizeALS}} in the old -version of LIGER. -} -\examples{ -pbmc <- normalize(pbmc) -pbmc <- selectGenes(pbmc) -pbmc <- scaleNotCenter(pbmc) -pbmc <- runBPPINMF(pbmc, k = 20) -} diff --git a/man/runINMF.Rd b/man/runINMF.Rd index 8646d4ec..acc84bb8 100644 --- a/man/runINMF.Rd +++ b/man/runINMF.Rd @@ -2,157 +2,174 @@ % Please edit documentation in R/runINMF.R \name{runINMF} \alias{runINMF} -\alias{runINMF,liger-method} -\alias{runINMF,list-method} +\alias{runINMF.liger} +\alias{runINMF.list} +\alias{runINMF.Seurat} \title{Perform iNMF on scaled datasets} \usage{ runINMF( object, - k, + k = 20, lambda = 5, - thresh = 1e-06, - maxIter = 30, - nrep = 1, - H.init = NULL, - W.init = NULL, - V.init = NULL, - method = c("planc", "liger", "rcppml"), - useUnshared = FALSE, + nIteration = 30, + nRandomStarts = 1, + HInit = NULL, + WInit = NULL, + VInit = NULL, seed = 1, - readH5 = "auto", verbose = getOption("ligerVerbose"), - max.iters = maxIter, - use.unshared = useUnshared, - rand.seed = seed, - print.obj = NULL + ... ) -\S4method{runINMF}{liger}( +\method{runINMF}{liger}( object, - k, + k = 20, lambda = 5, - thresh = 1e-06, - maxIter = 30, - nrep = 1, - H.init = NULL, - W.init = NULL, - V.init = NULL, - method = c("planc", "liger", "rcppml"), - useUnshared = FALSE, + nIteration = 30, + nRandomStarts = 1, + HInit = NULL, + WInit = NULL, + VInit = NULL, seed = 1, + verbose = getOption("ligerVerbose"), readH5 = "auto", + ... +) + +\method{runINMF}{list}( + object, + k = 20, + lambda = 5, + nIteration = 30, + nRandomStarts = 1, + HInit = NULL, + WInit = NULL, + VInit = NULL, + seed = 1, verbose = getOption("ligerVerbose"), - max.iters = maxIter, - use.unshared = useUnshared, - rand.seed = seed, - print.obj = NULL + barcodeList = NULL, + features = NULL, + ... ) -\S4method{runINMF}{list}( +\method{runINMF}{Seurat}( object, + datasetVar, k, lambda = 5, - thresh = 1e-06, - maxIter = 30, - nrep = 1, - H.init = NULL, - W.init = NULL, - V.init = NULL, - method = c("planc", "liger", "rcppml"), - useUnshared = FALSE, + nIteration = 30, + nRandomStarts = 1, + HInit = NULL, + WInit = NULL, + VInit = NULL, seed = 1, - readH5 = "auto", verbose = getOption("ligerVerbose"), - max.iters = maxIter, - use.unshared = useUnshared, - rand.seed = seed, - print.obj = NULL + ... ) } \arguments{ -\item{object}{A \linkS4class{liger} object or a named list of matrix object, -where the names represents dataset names and matrices are scaled on the same -set of variable features, with rows as features and columns as cells.} +\item{object}{A \linkS4class{liger} object, a Seurat object or a named list +of matrix, dgCMatrix, H5D objects, where the names represents dataset names +and matrices are scaled on the same set of variable features, with rows as +features and columns as cells.} \item{k}{Inner dimension of factorization (number of factors). Run \code{\link{suggestK}} to determine appropriate value; a general rule of thumb is that a higher \code{k} will be needed for datasets with more -sub-structure.} +sub-structure. Default \code{20}.} \item{lambda}{Regularization parameter. Larger values penalize dataset-specific effects more strongly (i.e. alignment should increase as \code{lambda} increases). Default \code{5}.} -\item{thresh}{Convergence threshold. Convergence occurs when -\eqn{|obj_0-obj|/(mean(obj_0,obj)) < thresh}. Default \code{1e-6}.} - -\item{maxIter}{Maximum number of block coordinate descent iterations to +\item{nIteration}{Total number of block coordinate descent iterations to perform. Default \code{30}.} -\item{nrep}{Number of restarts to perform (iNMF objective function is -non-convex, so taking the best objective from multiple successive +\item{nRandomStarts}{Number of restarts to perform (iNMF objective function +is non-convex, so taking the best objective from multiple successive initialization is recommended). For easier reproducibility, this increments the random seed by 1 for each consecutive restart, so future factorization of the same dataset can be run with one rep if necessary. Default \code{1}.} -\item{H.init}{Initial values to use for \eqn{H} matrices. A list object where +\item{HInit}{Initial values to use for \eqn{H} matrices. A list object where each element is the initial \eqn{H} matrix of each dataset. Default \code{NULL}.} -\item{W.init}{Initial values to use for \eqn{W} matrix. A matrix object. +\item{WInit}{Initial values to use for \eqn{W} matrix. A matrix object. Default \code{NULL}.} -\item{V.init}{Initial values to use for \eqn{V} matrices. A list object where +\item{VInit}{Initial values to use for \eqn{V} matrices. A list object where each element is the initial \eqn{V} matrix of each dataset. Default \code{NULL}.} -\item{method}{NNLS subproblem solver. Choose from \code{"liger"} (default -original implementation), \code{"planc"} or \code{"rcppml"}.} +\item{seed}{Random seed to allow reproducible results. Default \code{1}.} -\item{useUnshared}{Logical, whether to include unshared variable features and -run optimizeUANLS algorithm. Defaul \code{FALSE}. Running -\code{\link{selectGenes}} with \code{unshared = TRUE} and then running -\code{\link{scaleNotCenter}} is required.} +\item{verbose}{Logical. Whether to show information of the progress. Default +\code{getOption("ligerVerbose")} which is \code{TRUE} if users have not set.} -\item{seed}{Random seed to allow reproducible results. Default \code{1}.} +\item{...}{Arguments passed to methods.} \item{readH5}{\code{TRUE} to force reading H5 based data into memory and conduct factorization. \code{"auto"} reads H5 dataset with less than 8000 cells. \code{FALSE} will stop users from running if H5 data presents.} -\item{verbose}{Logical. Whether to show information of the progress. Default -\code{getOption("ligerVerbose")} which is \code{TRUE} if users have not set.} +\item{barcodeList}{List object of barcodes for each datasets, for setting +dimnames of output \eqn{H} matrices. Default \code{NULL} uses \code{colnames} +of matrices in the \code{object}.} -\item{max.iters, use.unshared, rand.seed}{\bold{Deprecated}. See Usage section -for replacement.} +\item{features}{Character vector of feature names, for setting dimnames of +output \eqn{V} and \eqn{W} matrices. Default \code{NULL} uses \code{rownames} +of matrices in the \code{object}.} -\item{print.obj}{\bold{Defunct}. Whether to print objective function values -after convergence when \code{verbose = TRUE}. Now always print when verbose.} +\item{datasetVar}{Variable name in metadata indicating a factor of dataset +belonging, or directly a factor that match with the number of cells.} } \value{ -\code{object} with \code{W} slot updated with the result \eqn{W} -matrix, and the \code{H} and \code{V} slots of each -\linkS4class{ligerDataset} object in the \code{datasets} slot updated with -the dataset specific \eqn{H} and \eqn{V} matrix, respectively. +The liger method returns the input \linkS4class{liger} object with +factorization result updated. A list of all \eqn{H} matrices can be accessed +with \code{getMatrix(object, "H")}, a list of all \eqn{V} matrices can be +accessed with \code{getMatrix(object, "V")}, and the \eqn{W} matrix can be +accessed with \code{getMatrix(object, "W")}. + +The list method returns a list of entries \code{H}, \code{V} and +\code{W}. \code{H} is a list of \eqn{H} matrices for each dataset. \code{V} +is a list of \eqn{V} matrices for each dataset. \code{W} is the shared +\eqn{W} matrix. + +The Seurat method returns a list of entries \code{H}, \code{V} and +\code{W}. \code{H} is a list of \eqn{H} matrices for each dataset. \code{V} +is a list of \eqn{V} matrices for each dataset. \code{W} is the shared +\eqn{W} matrix. } \description{ -Performs integrative non-negative matrix (iNMF) factorization to return -factorized \eqn{H}, \eqn{W}, and \eqn{V} matrices. It optimizes the iNMF -objective function using block coordinate descent (alternating non-negative -least squares), where the number of factors is set by \code{k}. TODO: include -objective function equation here in documentation (using deqn) - -For each dataset, this factorization produces an \eqn{H} matrix (cells by k), -a \eqn{V} matrix (k by genes), and a shared \eqn{W} matrix (k by genes). The -\eqn{H} matrices represent the cell factor loadings. \eqn{W} is held +Performs integrative non-negative matrix factorization (iNMF) (J.D. Welch, +2019) to return factorized \eqn{H}, \eqn{W}, and \eqn{V} matrices. The +objective function is stated as + +\deqn{\arg\min_{H\ge0,W\ge0,V\ge0}\sum_{i}^{d}||E_i-(W+V_i)Hi||^2_F+\lambda\sum_{i}^{d}||V_iH_i||_F^2} + +where \eqn{E_i} is the input non-negative matrix of the i'th dataset, \eqn{d} +is the total number of datasets. \eqn{E_i} is of size \eqn{m \times n_i} for +\eqn{m} variable genes and \eqn{n_i} cells, \eqn{H_i} is of size +\eqn{n_i \times k}, \eqn{V_i} is of size \eqn{m \times k}, and \eqn{W} is of +size \eqn{m \times k}. + +The factorization produces a shared \eqn{W} matrix (genes by k), and for each +dataset, an \eqn{H} matrix (k by cells) and a \eqn{V} matrix (genes by k). +The \eqn{H} matrices represent the cell factor loadings. \eqn{W} is held consistent among all datasets, as it represents the shared components of the metagenes across datasets. The \eqn{V} matrices represent the dataset-specific components of the metagenes. + +This function adopts highly optimized fast and memory efficient +implementation extended from Planc (Kannan, 2016). Pre-installation of +extension package \code{RcppPlanc} is required. The underlying algorithm +adopts the identical ANLS strategy as \code{\link{optimizeALS}} in the old +version of LIGER. } \examples{ pbmc <- normalize(pbmc) pbmc <- selectGenes(pbmc) pbmc <- scaleNotCenter(pbmc) -# Only running a few iterations for fast examples -pbmc <- runINMF(pbmc, k = 20, maxIter = 2) +pbmc <- runINMF(pbmc, k = 20) } diff --git a/man/optimizeALS.Rd b/man/runINMF_R.Rd similarity index 86% rename from man/optimizeALS.Rd rename to man/runINMF_R.Rd index 37681b04..624769d0 100644 --- a/man/optimizeALS.Rd +++ b/man/runINMF_R.Rd @@ -1,12 +1,12 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/optimizeALS.R -\name{optimizeALS} -\alias{optimizeALS} -\alias{optimizeALS,liger-method} -\alias{optimizeALS,list-method} +% Please edit documentation in R/runINMF_R.R +\name{runINMF_R} +\alias{runINMF_R} +\alias{runINMF_R,liger-method} +\alias{runINMF_R,list-method} \title{Perform iNMF on scaled datasets} \usage{ -optimizeALS( +runINMF_R( object, k, lambda = 5, @@ -16,17 +16,14 @@ optimizeALS( H.init = NULL, W.init = NULL, V.init = NULL, + method = c("planc", "liger", "rcppml"), useUnshared = FALSE, seed = 1, readH5 = "auto", - verbose = getOption("ligerVerbose"), - max.iters = maxIter, - use.unshared = useUnshared, - rand.seed = seed, - print.obj = NULL + verbose = getOption("ligerVerbose") ) -\S4method{optimizeALS}{liger}( +\S4method{runINMF_R}{liger}( object, k, lambda = 5, @@ -39,31 +36,23 @@ optimizeALS( useUnshared = FALSE, seed = 1, readH5 = "auto", - verbose = getOption("ligerVerbose"), - max.iters = maxIter, - use.unshared = useUnshared, - rand.seed = seed, - print.obj = NULL + verbose = getOption("ligerVerbose") ) -\S4method{optimizeALS}{list}( +\S4method{runINMF_R}{list}( object, k, lambda = 5, - thresh = 1e-06, maxIter = 30, nrep = 1, H.init = NULL, W.init = NULL, V.init = NULL, + method = c("planc", "liger", "rcppml"), useUnshared = FALSE, seed = 1, readH5 = "auto", - verbose = getOption("ligerVerbose"), - max.iters = maxIter, - use.unshared = useUnshared, - rand.seed = seed, - print.obj = NULL + verbose = getOption("ligerVerbose") ) } \arguments{ @@ -103,6 +92,9 @@ Default \code{NULL}.} each element is the initial \eqn{V} matrix of each dataset. Default \code{NULL}.} +\item{method}{NNLS subproblem solver. Choose from \code{"liger"} (default +original implementation), \code{"planc"} or \code{"rcppml"}.} + \item{useUnshared}{Logical, whether to include unshared variable features and run optimizeUANLS algorithm. Defaul \code{FALSE}. Running \code{\link{selectGenes}} with \code{unshared = TRUE} and then running @@ -148,5 +140,5 @@ pbmc <- normalize(pbmc) pbmc <- selectGenes(pbmc) pbmc <- scaleNotCenter(pbmc) # Only running a few iterations for fast examples -pbmc <- optimizeALS(pbmc, k = 20, maxIter = 2) +pbmc <- runINMF(pbmc, k = 20, maxIter = 2) } diff --git a/man/runOnlineINMF.Rd b/man/runOnlineINMF.Rd new file mode 100644 index 00000000..bb6f9507 --- /dev/null +++ b/man/runOnlineINMF.Rd @@ -0,0 +1,177 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/runOnlineINMF.R +\name{runOnlineINMF} +\alias{runOnlineINMF} +\alias{runOnlineINMF.liger} +\alias{runOnlineINMF.list} +\alias{runOnlineINMF.Seurat} +\title{Perform online iNMF on scaled datasets} +\usage{ +runOnlineINMF( + object, + k = 20, + lambda = 5, + newDatasets = NULL, + projection = FALSE, + maxEpochs = 5, + HALSiter = 1, + miniBatchSize = 5000, + seed = 123, + verbose = getOption("ligerVerbose"), + ... +) + +\method{runOnlineINMF}{liger}( + object, + k = 20, + lambda = 5, + newDatasets = NULL, + projection = FALSE, + maxEpochs = 5, + HALSiter = 1, + miniBatchSize = 5000, + seed = 123, + verbose = getOption("ligerVerbose"), + ... +) + +\method{runOnlineINMF}{list}( + object, + k = 20, + lambda = 5, + newDatasets = NULL, + projection = FALSE, + maxEpochs = 5, + WInit = NULL, + VInit = NULL, + AInit = NULL, + BInit = NULL, + HALSiter = 1, + miniBatchSize = 5000, + seed = 123, + verbose = getOption("ligerVerbose"), + ... +) + +\method{runOnlineINMF}{Seurat}( + object, + k = 20, + lambda = 5, + newDatasets = NULL, + projection = FALSE, + maxEpochs = 5, + HALSiter = 1, + miniBatchSize = 5000, + seed = 123, + verbose = getOption("ligerVerbose"), + ... +) +} +\arguments{ +\item{object}{\linkS4class{liger} object. Scaled data required.} + +\item{k}{Inner dimension of factorization--number of metagenes. A value in +the range 20-50 works well for most analyses. Default \code{20}.} + +\item{lambda}{Regularization parameter. Larger values penalize +dataset-specific effects more strongly (i.e. alignment should increase as +lambda increases). We recommend always using the default value except +possibly for analyses with relatively small differences (biological +replicates, male/female comparisons, etc.) in which case a lower value such +as 1.0 may improve reconstruction quality. Default \code{5.0}.} + +\item{newDatasets}{New datasets for scenario 2 or scenario 3.} + +\item{projection}{Perform data integration with scenario 3. See description. +Default \code{FALSE}.} + +\item{maxEpochs}{The number of epochs to iterate through. See detail. +Default \code{5}.} + +\item{miniBatchSize}{Total number of cells in each minibatch. See detail. +Default \code{5000}.} + +\item{seed}{Random seed to allow reproducible results. Default \code{123}.} + +\item{verbose}{Logical. Whether to show information of the progress. Default +\code{getOption("ligerVerbose")} which is \code{TRUE} if users have not set.} + +\item{WInit}{Optional initialization for W. See detail. Default \code{NULL}.} + +\item{VInit}{Optional initialization for V. See detail. Default \code{NULL}.} + +\item{AInit}{Optional initialization for A. See detail. Default \code{NULL}.} + +\item{BInit}{Optional initialization for B. See detail. Default \code{NULL}.} + +\item{HALSiters}{Maximum number of block coordinate descent (HALS +algorithm) iterations to perform for each update of \eqn{W} and \eqn{V}. +Default \code{1}. Changing this parameter is not recommended.} +} +\value{ +\code{object} with \code{W} slot updated with resulting \eqn{W} +matrix; the \code{H}, \code{V}, \code{A} and \code{B} slots of each +\linkS4class{ligerDataset} object in \code{datasets} slot is updated with the +corresponding result matrices. +} +\description{ +Perform online integrative non-negative matrix factorization to +represent multiple single-cell datasets in terms of \eqn{H}, \eqn{W}, and +\eqn{V} matrices. It optimizes the iNMF objective function using online +learning (non-negative least squares for H matrix, hierarchical alternating +least squares for \eqn{W} and \eqn{V} matrices), where the number of factors +is set by \code{k}. The function allows online learning in 3 scenarios: + +\enumerate{ + \item Fully observed datasets; + \item Iterative refinement using continually arriving datasets; + \item Projection of new datasets without updating the existing factorization +} + +All three scenarios require fixed memory independent of the number of cells. + +For each dataset, this factorization produces an \eqn{H} matrix (k by cell), +a \eqn{V} matrix (genes by k) \eqn{C^\mathsf{T}C}, and a shared \eqn{W} matrix (genes by k). The +\eqn{H} matrices represent the cell factor loadings. \eqn{W} is identical +among all datasets, as it represents the shared components of the metagenes +across datasets. The \eqn{V} matrices represent the dataset-specific +components of the metagenes. +} +\details{ +For optional initialization, \code{W.init} must be a matrix object with +number of rows equal to number of variable genes (denoted as \code{g}) and +number of columns equal to \code{k}. Any of \code{V.init}, \code{A} and +\code{B} must be a list object of n matrices where n is the number of +datasets in \code{object}. For \code{V.init}, each matrix should be of size g +x k. For \code{A.init}, each matrix should be k x k and for \code{B.init}, +each matrix should be g x k. + +Minibatch iterations is performed on small subset of cells. The exact +minibatch size applied on each dataset is \code{miniBatch_size} multiplied by +the proportion of cells in this dataset out of all cells. The setting of +\code{miniBatch_size} is by default \code{5000}, which is reasonable. +However, a smaller value such as \code{1000} may be necessary for analyzing +very small datasets. In general, \code{miniBatch_size} should be no larger +than the number of cells in the smallest dataset. An epoch is one completion +of calculation on all cells after a number of iterations of minibatches. +Therefore, the total number of iterations is determined by the setting of +\code{max.epochs}, total number of cells, and \code{miniBatch_size}. +} +\examples{ +pbmc <- normalize(pbmc) +pbmc <- selectGenes(pbmc) +pbmc <- scaleNotCenter(pbmc) +# Minibatch size has to be less than number of cell in the smallest dataset +# Scenario 1 +pbmc <- online_iNMF(pbmc, miniBatch_size = 100) +# Scenario 2 +# Fake new dataset by increasing all non-zero value in "ctrl" by 1 +ctrl2 <- rawData(dataset(pbmc, "ctrl")) +ctrl2@x <- ctrl2@x + 1 +colnames(ctrl2) <- paste0(colnames(ctrl2), 2) +pbmc2 <- online_iNMF(pbmc, k = 20, X_new = list(ctrl2 = ctrl2), + miniBatch_size = 100) +# Scenario 3 +pbmc3 <- online_iNMF(pbmc, k = 20, X_new = list(ctrl2 = ctrl2), + miniBatch_size = 100, projection = TRUE) +} diff --git a/man/runUINMF.Rd b/man/runUINMF.Rd index efdcae17..5f656cf5 100644 --- a/man/runUINMF.Rd +++ b/man/runUINMF.Rd @@ -66,8 +66,8 @@ Default \code{20}.} \item{nIteration}{Total number of iterations to perform. Default \code{30}.} -\item{nRandomStarts}{Number of restarts to perform (iNMF objective function is -non-convex, so taking the best objective from multiple successive +\item{nRandomStarts}{Number of restarts to perform (iNMF objective function +is non-convex, so taking the best objective from multiple successive initializations is recommended). For easier reproducibility, this increments the random seed by 1 for each consecutive restart, so future factorizations of the same dataset can be run with one rep if necessary. Default \code{1}.} diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 7988da58..9c20dc2a 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -343,18 +343,6 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } -// solveNNLS -arma::mat solveNNLS(const arma::mat& C, const arma::mat& B); -RcppExport SEXP _rliger2_solveNNLS(SEXP CSEXP, SEXP BSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< const arma::mat& >::type C(CSEXP); - Rcpp::traits::input_parameter< const arma::mat& >::type B(BSEXP); - rcpp_result_gen = Rcpp::wrap(solveNNLS(C, B)); - return rcpp_result_gen; -END_RCPP -} // ComputeSNN Eigen::SparseMatrix ComputeSNN(Eigen::MatrixXd nn_ranked, double prune); RcppExport SEXP _rliger2_ComputeSNN(SEXP nn_rankedSEXP, SEXP pruneSEXP) { @@ -420,7 +408,6 @@ static const R_CallMethodDef CallEntries[] = { {"_rliger2_cluster_vote", (DL_FUNC) &_rliger2_cluster_vote, 2}, {"_rliger2_scale_columns_fast", (DL_FUNC) &_rliger2_scale_columns_fast, 3}, {"_rliger2_max_factor", (DL_FUNC) &_rliger2_max_factor, 3}, - {"_rliger2_solveNNLS", (DL_FUNC) &_rliger2_solveNNLS, 2}, {"_rliger2_ComputeSNN", (DL_FUNC) &_rliger2_ComputeSNN, 2}, {"_rliger2_WriteEdgeFile", (DL_FUNC) &_rliger2_WriteEdgeFile, 3}, {"_rliger2_DirectSNNToFile", (DL_FUNC) &_rliger2_DirectSNNToFile, 4}, diff --git a/src/cpm-package-lock.cmake b/src/cpm-package-lock.cmake new file mode 100644 index 00000000..67eeaba6 --- /dev/null +++ b/src/cpm-package-lock.cmake @@ -0,0 +1,3 @@ +# CPM Package Lock +# This file should be committed to version control + diff --git a/src/entities.h b/src/entities.h deleted file mode 100755 index eca0df02..00000000 --- a/src/entities.h +++ /dev/null @@ -1,472 +0,0 @@ -#pragma once - -#include -#include - -#include -#if __cplusplus > 199711L - #include -#endif - -#include "parameters.h" - -using std::endl; - -typedef struct DenseMatrix -{ - dtype **rowmajor,**colmajor; - int rows,cols,totalsize; - bool dependent; - int originalrows,originalcols; - DenseMatrix(); - //DenseMatrix() : rows(0),cols(0),totalsize(0),dependent(false),rowmajor(NULL),colmajor(NULL) {} - - DenseMatrix(int rows_,int cols_,bool dependent_=false) : - rows(rows_),cols(cols_),totalsize(rows*cols),dependent(dependent_), - originalrows(rows_),originalcols(cols_) - { - rowmajor = new dtype*[rows]; - colmajor = new dtype*[cols]; - if (not dependent) - { - for(int row=0;row vs; - Mask() : value(NULL),dim(0) {} - Mask(int dim_,bool init) : dim(dim_) - { - value = new bool[dim](); - for(int i=0;icols](); - } - ~NNLS_Single_Input() - { - delete[] x; - } -} NNLS_Single_Input; - -typedef struct NNLS_Single_State -{//Cx = b, min norm Cx-b - DenseMatrix *C_xmask,*C_ymask; - Mask *xmask,*infeasiblemask; - dtype *y_masked,*x_masked,*y_masked_intermediate; - int full_exchange_buffer,infeasible,lowest_infeasible; - int iterations; - bool full_exchange_mode; - NNLS_Single_State(int rows,int cols) - { - xmask = new Mask(cols,false); - infeasiblemask = new Mask(cols,false); - C_xmask = new DenseMatrix(rows,cols,true); - C_xmask->cols = 0; - C_ymask = new DenseMatrix(rows,cols,true); - x_masked = new dtype[cols](); - y_masked = new dtype[cols](); - y_masked_intermediate = new dtype[rows](); - full_exchange_buffer = FULL_EXCHANGE_BUFFER; - full_exchange_mode = true; - lowest_infeasible = cols+1; - iterations = 0; - } - ~NNLS_Single_State() - { - delete C_xmask; - delete C_ymask; - delete xmask; - delete infeasiblemask; - delete[] x_masked; - delete[] y_masked; - delete[] y_masked_intermediate; - } -} NNLS_Single_State; - - -#if __cplusplus > 199711L - // typedef std::unordered_map CholeskyMap; - typedef std::map CholeskyMap; - // typedef std::unordered_map,LowerTriangularMatrix*> CholeskyMap2; -#else - typedef std::map CholeskyMap; - // typedef std::map,LowerTriangularMatrix*> CholeskyMap2; -#endif - -inline void destroyCholeskyMap(CholeskyMap& cm) -{ - for(CholeskyMap::iterator it = cm.begin(); it != cm.end(); ++it) - { - - delete it->second; - } - cm.clear(); -} -// void destroyCholeskyMap2(CholeskyMap2& cm) -// { -// for(CholeskyMap2::iterator it = cm.begin(); it != cm.end(); ++it) -// { -// delete it->second; -// } -// cm.clear(); -// } - - - -typedef struct NNLS_Multiple_State -{//CX = B, min norm CX-B - Mask **xmasks,**infeasiblemasks; - int *full_exchange_buffer; - int *lowest_infeasible; - bool *full_exchange_mode; - int *infeasible; - int iterations; - const int cols,cols_rhs; - LowerTriangularMatrix **G; - CholeskyMap choleskyMap; - // CholeskyMap2 choleskyMap2; - - LowerTriangularMatrix **CFTCF; - DenseMatrix **CGTCF; - dtype **y_masked,**x_masked,**CGTb; - - int totalfeasible; - - NNLS_Multiple_State(int cols_, int cols_rhs_) : cols(cols_),cols_rhs(cols_rhs_) - { - xmasks = new Mask*[cols_rhs]; - for(int i=0;idim,cols_rhs); - } - NNLS_Multiple_Input(LowerTriangularMatrix* CTC_,dtype**CTB_,int cols_rhs_,int max_iter_) : - CTC(CTC_),CTB(CTB_),cols_rhs(cols_rhs_),max_iter(max_iter_), - allocate_time(0),init_time(0), - advance_time(0),switch_time(0),determine_time(0), - apply_time(0),normal_time(0),generateCGTCF_time(0), - matvec_time(0),generateCGTb_time(0),vector_time(0),mark_time(0),overwrite_time(0), - choleskyCFTCF_time(0),choleskyFactorization_time(0),choleskyMapInsert_time(0),choleskyCheck_time(0), - choleskyAllocate_time(0),choleskyKeyFind_time(0),choleskyMaskToString_time(0) - { - allocateX = true; - X = new dtype*[cols_rhs]; - for(int i=0;idim]; - state = new NNLS_Multiple_State(CTC->dim,cols_rhs); - } - ~NNLS_Multiple_Input() - { - if(allocateX) - { - for(int i=0;irows; - k = W->cols; - n = H->cols; - } - ~NMF_Input() - { - } -} NMF_Input; - -typedef struct NMF_State -{//A: m * n, W : m * k, H: k * n - LowerTriangularMatrix *HHT,*WTW; - dtype **WTA,**HAT; - int m,k,n;//data size, dimensions, data points - NMF_State(int m_,int k_,int n_) : m(m_),k(k_),n(n_) - { - HHT = new LowerTriangularMatrix(k); - WTW = new LowerTriangularMatrix(k); - WTA = new dtype*[n]; - for(int col=0;col -#include - -void normal_equations_cpu(DenseMatrix& C,dtype*x,dtype*b) -{ - //Algorithm from 5.3.2 (pg. 262) in Golub and Van Loan. - LowerTriangularMatrix G = LowerTriangularMatrix(C.cols); - LowerTriangularMatrix CTC = LowerTriangularMatrix(C.cols); - - matmult_ata_lowertriangular_cpu(CTC,C); - matvecmult_transpose_cpu(C,b,x); - cholesky_lowertriangular_cpu(G,CTC); - - forwardsubstitution(G,x); - backsubstitution(G,x); - -} - -void normal_equations_precomputedCholesky_cpu(LowerTriangularMatrix& G,dtype*x) -{//x needs to be initialized with appropriately masked CTb. - forwardsubstitution(G,x); - backsubstitution(G,x); -} - -void normal_equations_precomputedCholesky_cpu_test(LowerTriangularMatrix& G,dtype*x) -{//x needs to be initialized with appropriately masked CTb. - forwardsubstitution(G,x); - backsubstitution(G,x); -} - - diff --git a/src/mask.h b/src/mask.h deleted file mode 100755 index dbdb6778..00000000 --- a/src/mask.h +++ /dev/null @@ -1,268 +0,0 @@ -#pragma once - -#include -#include - -#include "entities.h" - -void applyColumnMask(DenseMatrix& original,DenseMatrix& masked,Mask& mask,bool toggle=false) -{ - masked.cols = 0; - masked.totalsize = 0; - for(int i=0;i=0;--i) - { - if( infeasiblemask.value[i] ) - { - xmask.value[i] = not xmask.value[i]; - if( not bpp ) return; - } - } -} - -void generateCGTCF(DenseMatrix& CGTCF,LowerTriangularMatrix& CTC,Mask& xmask) -{ - //int rowmap[xmask.dim]; - //int colmap[xmask.dim]; - std::vector rowmap(xmask.dim); - std::vector colmap(xmask.dim); - int newrows = 0; - for(int row = 0; row < xmask.dim; ++row) - { - if( not xmask.value[row] ) - { - rowmap[newrows] = row; - ++newrows; - } - } - CGTCF.rows = newrows; - int newcols = 0; - for(int col = 0; col < xmask.dim; ++col) - { - if( xmask.value[col] ) - { - colmap[newcols] = col; - ++newcols; - } - } - CGTCF.cols = newcols; - - CGTCF.totalsize = CGTCF.rows*CGTCF.cols; - - for(int row = 0; row < CGTCF.rows; ++row) - { - for(int col = 0; col < CGTCF.cols; ++col ) - { - int oldrow = rowmap[row]; - int oldcol = colmap[col]; - int index = (oldcol > oldrow) ? ((oldcol*(oldcol+1))/2+oldrow) : ((oldrow*(oldrow+1))/2+oldcol); - CGTCF.colmajor[col][row] = CTC.rowmajor[index]; - } - } -} -void generateCFTCF(LowerTriangularMatrix& CFTCF,LowerTriangularMatrix& CTC,Mask& columnmask) -{ - //int colmap[columnmask.dim]; - std::vector colmap(columnmask.dim); - int newdim = 0; - for(int i = 0; i < columnmask.dim; ++i) - { - if( columnmask.value[i] ) - { - colmap[newdim] = i; - ++newdim; - } - } - CFTCF.dim = newdim; - CFTCF.totalsize = (newdim*(newdim+1))/2; - - for(int row = 0; row < CFTCF.dim; ++row) - { - int index = (row*(row+1))/2; - for(int col = 0; col <= row; ++col ) - { - int oldrow = colmap[row]; - int oldcol = colmap[col]; - int oldindex = (oldcol > oldrow) ? ((oldcol*(oldcol+1))/2+oldrow) : ((oldrow*(oldrow+1))/2+oldcol); - CFTCF.rowmajor[index + col] = CTC.rowmajor[oldindex]; - - } - } -} -void generateCGTb(dtype* CTb,dtype* CGTb,Mask& xmask) -{ - int newdim = 0; - for(int i = 0; i < xmask.dim; ++i) - { - if( not xmask.value[i] ) - { - CGTb[newdim] = CTb[i]; - ++newdim; - } - } -} -std::string maskToString(Mask& mask) -{ - std::stringstream stream; - // std::string key = ""; - for(int i=0;i 0) - { - str += (char)((num%10) + 48); - num/=10; - } -} -void maskToString4(Mask& mask) -{ - mask.tmp = ""; - for(int i=0;i& vec) -{ - while(num > 0) - { - vec.push_back( (char)((num%10) + 48) ); - num/=10; - } -} -void maskToString5(Mask& mask) -{ - mask.vs.clear(); - for(int i=0;i& key) -{ - for(int i=0;i - -#include "vector.h" -#include "entities.h" - -void forwardsubstitution(LowerTriangularMatrix& G,dtype*rhs) -{//rhs is overwritten with the answer. - for (int row = 0; row < G.dim; ++row) - { - int startindex = (row*(row+1))/2; - dtype rowsum = 0; - for(int col = 0; col < row; ++col) rowsum += G.rowmajor[startindex + col]*rhs[col]; - int diagonalindex = startindex + row; - rhs[row] = ( rhs[row] - rowsum ) / G.rowmajor[diagonalindex]; - } -} -void backsubstitution(LowerTriangularMatrix& G,dtype*rhs) -{//column-oriented back substitutes from the transpose of G. - //rhs is overwritten with the answer. - for(int col = G.dim-1; col >= 0; --col) - { - int startindex = (col*(col+1))/2; - int diagonalindex = startindex+col; - rhs[col] = rhs[col] / G.rowmajor[diagonalindex]; - for(int row=0;row < col;++row) rhs[row] = rhs[row] - rhs[col]*G.rowmajor[startindex+row]; - } -} -void matmult_ata_lowertriangular_cpu( - LowerTriangularMatrix& C, DenseMatrix& A) -{//computes C=ATA from A. - for(int Crow=0;Crowrows;++row) - { - for(int col = 0;colcols;++col) - { - sum += A->colmajor[col][row]*A->colmajor[col][row]; - } - } - return sqrt(sum); -} - -dtype sparsity(DenseMatrix* A) -{ - int zeros = 0; - for(int row = 0;rowrows;++row) - { - for(int col = 0;colcols;++col) - { - if(A->colmajor[col][row] == 0) ++zeros; - } - } - return (dtype)zeros/(dtype)(A->cols*A->rows); -} -void check_colmajor(dtype**standard,dtype**check,int cols,int rows,dtype tol=1e-5) -{ - int numwrong = 0; - for(int col=0;col tol ) - { - ++numwrong; - } - } - } - if(numwrong > 0) - { - //uh oh - } - else - { - //ok - } -} -void copy_colmajor(dtype**from,dtype**to,int cols,int rows) -{ - for(int col=0;col -#include -#include - -#include "ls.h" -#include "mask.h" - -void advanceInfeasibilityState_single(NNLS_Single_State& state) -{ - if( state.infeasible < state.lowest_infeasible ) - { - state.lowest_infeasible = state.infeasible; - state.full_exchange_buffer = FULL_EXCHANGE_BUFFER; - state.full_exchange_mode = true; - } - else - { - --state.full_exchange_buffer; - if ( state.full_exchange_buffer <= 0 ) - { - state.full_exchange_mode = false; - } - } -} - -int nnls_single_cpu(NNLS_Single_Input& input) -{//nonnegative least squares of Cx - b - int cols = input.C->cols; - int rows = input.C->rows; - NNLS_Single_State state = NNLS_Single_State(rows,cols); - - matvecmult_transpose_cpu(*input.C,input.b,state.y_masked,-1); - state.infeasible = markInfeasible(*state.infeasiblemask,state.x_masked,state.y_masked,*state.xmask); - while( state.infeasible > 0 and state.iterations < input.max_iter ) - { - advanceInfeasibilityState_single(state); - - switchSets(*state.infeasiblemask,*state.xmask,state.full_exchange_mode); - // printf("infeasiblemask: %d,%d,%d,%d\n",state.infeasiblemask->value[0],state.infeasiblemask->value[1], - // state.infeasiblemask->value[2],state.infeasiblemask->value[3]); - applyColumnMask(*input.C,*state.C_xmask,*state.xmask); - applyColumnMask(*input.C,*state.C_ymask,*state.xmask,true); - - normal_equations_cpu(*state.C_xmask,state.x_masked,input.b); - // printf("x_masked, xmask: %f,%f,%f,%f; %d,%d,%d,%d\n",state.x_masked[0],state.x_masked[1],state.x_masked[2],state.x_masked[3], - // state.xmask->value[0],state.xmask->value[1],state.xmask->value[2],state.xmask->value[3]); - - matvecmult_colmajor_cpu(*state.C_xmask,state.x_masked,state.y_masked_intermediate); - vectorsub(input.b,state.y_masked_intermediate,rows); - matvecmult_transpose_cpu(*state.C_ymask,state.y_masked_intermediate,state.y_masked); - // printf("y_masked: %f,%f,%f,%f\n",state.y_masked[0],state.y_masked[1],state.y_masked[2],state.y_masked[3]); - - state.infeasible = markInfeasible(*state.infeasiblemask,state.x_masked,state.y_masked,*state.xmask); - ++state.iterations; - } - - overwriteOriginalWithMask(input.x,state.x_masked,*state.xmask); - - return state.iterations; -} - -void initialize_multiple_cpu(NNLS_Multiple_Input& input,NNLS_Multiple_State& state) -{ - for(int i=0;idim;++j) state.xmasks[i]->value[j] = false; - for(int j=0;jdim;++j) state.infeasiblemasks[i]->value[j] = false; - state.CFTCF[i]->dim = state.CGTCF[i]->originalcols; - state.CFTCF[i]->totalsize = (state.CGTCF[i]->originalcols*(state.CGTCF[i]->originalcols+1))/2; - state.CGTCF[i]->rows = state.CGTCF[i]->originalrows; - state.CGTCF[i]->cols = state.CGTCF[i]->originalcols; - state.CGTCF[i]->totalsize = state.CGTCF[i]->originalrows*state.CGTCF[i]->originalcols; - //printf("%f,%f,%f,%f\n",state.y_masked[i][0],state.y_masked[i][1],state.y_masked[i][2],state.y_masked[i][3]); - } - state.totalfeasible = 0; - state.iterations = 0; - destroyCholeskyMap(state.choleskyMap); -} -void markInfeasible_multiple_cpu(NNLS_Multiple_State& state) -{ - for(int i=0;i 0) - { - state.infeasible[i] = markInfeasible(*state.infeasiblemasks[i],state.x_masked[i],state.y_masked[i],*state.xmasks[i]); - if(state.infeasible[i] == 0) ++state.totalfeasible; - } - } -} - -void determineCholeskyFactors_cpu(NNLS_Multiple_Input& input,NNLS_Multiple_State& state) -{ - for(int i=0;i 0) - { - std::string key = maskToString(*state.xmasks[i]); - CholeskyMap::iterator it = state.choleskyMap.find(key); - if( it != state.choleskyMap.end() ) - { - state.G[i] = it->second; - } - else - { - generateCFTCF(*state.CFTCF[i],*input.CTC,*state.xmasks[i]); - LowerTriangularMatrix* G = new LowerTriangularMatrix(state.CFTCF[i]->dim); - - cholesky_lowertriangular_cpu(*G,*state.CFTCF[i]); - state.G[i] = G; - std::pair ret = - state.choleskyMap.insert ( std::pair(key,G) ); - if ( not ret.second ) - { - //ERROR! Duplicate Cholesky factor was inserted - } - - } - } - } -} -void determineCholeskyFactors_cpu_profile(NNLS_Multiple_Input& input,NNLS_Multiple_State& state) -{ - std::clock_t start; - std::clock_t checkStart = std::clock(); - // #pragma omp parallel for - for(int i=0;i 0) - { - start = std::clock(); - // maskToString3(*state.xmasks[i]); - // std::string& key = state.xmasks[i]->tmp; - // maskToString4(*state.xmasks[i]);std::string& key = state.xmasks[i]->tmp; - maskToString5(*state.xmasks[i]);std::string key = &(*(state.xmasks[i]->vs.begin())); - // std::string key; maskToString2(*state.xmasks[i],key); - // std::vector key; maskToVector(*state.xmasks[i],key); - input.choleskyMaskToString_time += ( std::clock() - start ) / (double) CLOCKS_PER_SEC; - - start = std::clock(); - CholeskyMap::iterator it = state.choleskyMap.find(key); - // CholeskyMap2::iterator it = state.choleskyMap2.find(key); - input.choleskyKeyFind_time += ( std::clock() - start ) / (double) CLOCKS_PER_SEC; - - // if( it != state.choleskyMap2.end() ) state.G[i] = it->second; - if( it != state.choleskyMap.end() ) state.G[i] = it->second; - else - { - start = std::clock(); - generateCFTCF(*state.CFTCF[i],*input.CTC,*state.xmasks[i]); - input.choleskyCFTCF_time += ( std::clock() - start ) / (double) CLOCKS_PER_SEC; - - start = std::clock(); - LowerTriangularMatrix* G = new LowerTriangularMatrix(state.CFTCF[i]->dim); - input.choleskyAllocate_time += ( std::clock() - start ) / (double) CLOCKS_PER_SEC; - - start = std::clock(); - cholesky_lowertriangular_cpu(*G,*state.CFTCF[i]); - input.choleskyFactorization_time += ( std::clock() - start ) / (double) CLOCKS_PER_SEC; - - state.G[i] = G; - start = std::clock(); - std::pair ret = - state.choleskyMap.insert ( std::pair(key,G) ); - // std::pair ret = - // state.choleskyMap2.insert ( std::pair,LowerTriangularMatrix*>(key,G) ); - if ( not ret.second ) - { - //ERROR! Duplicate Cholesky factor was inserted - } - input.choleskyMapInsert_time += ( std::clock() - start ) / (double) CLOCKS_PER_SEC; - } - } - } - input.choleskyCheck_time += ( std::clock() - checkStart ) / (double) CLOCKS_PER_SEC; -} - -void advanceInfeasibilityState_multiple(NNLS_Multiple_State& state) -{ - for(int i=0;idim; - // int cols_rhs = input.cols_rhs; - // NNLS_Multiple_State state = NNLS_Multiple_State(cols,cols_rhs); - NNLS_Multiple_State& state = *input.state; - - initialize_multiple_cpu(input,state); - - markInfeasible_multiple_cpu(state); - determineCholeskyFactors_cpu(input,state); - - while( state.totalfeasible < state.cols_rhs and state.iterations < input.max_iter ) - { - advanceInfeasibilityState_multiple(state); - - for(int i=0;i 0) switchSets(*state.infeasiblemasks[i],*state.xmasks[i],state.full_exchange_mode[i]); - // printf("infeasiblemask: %d,%d,%d,%d\n",state.infeasiblemasks[0]->value[0],state.infeasiblemasks[0]->value[1], - // state.infeasiblemasks[0]->value[2],state.infeasiblemasks[0]->value[3]); - determineCholeskyFactors_cpu(input,state); - - // printv(state.G[0]->rowmajor,state.G[0]->totalsize); - for(int i=0;i 0) applyVectorMask(input.CTB[i],state.x_masked[i],*state.xmasks[i]); - for(int i=0;i 0) normal_equations_precomputedCholesky_cpu(*state.G[i],state.x_masked[i]); - // printf("x_masked, xmask: %f,%f,%f,%f; %d,%d,%d,%d\n",state.x_masked[0][0],state.x_masked[0][1],state.x_masked[0][2],state.x_masked[0][3], - // state.xmasks[0]->value[0],state.xmasks[0]->value[1],state.xmasks[0]->value[2],state.xmasks[0]->value[3]); - - for(int i=0;i 0) generateCGTCF(*state.CGTCF[i],*input.CTC,*state.xmasks[i]); - for(int i=0;i 0) matvecmult_colmajor_cpu(*state.CGTCF[i],state.x_masked[i],state.y_masked[i]); - for(int i=0;i 0) generateCGTb(input.CTB[i],state.CGTb[i],*state.xmasks[i]); - for(int i=0;i 0) vectorsub(state.CGTb[i],state.y_masked[i],state.CGTCF[i]->rows); - // printf("y_masked: %f,%f,%f,%f\n",state.y_masked[0][0],state.y_masked[0][1],state.y_masked[0][2],state.y_masked[0][3]); - - markInfeasible_multiple_cpu(state); - ++state.iterations; - } - - for(int i=0;idim; - // int cols_rhs = input.cols_rhs; - // NNLS_Multiple_State state = NNLS_Multiple_State(cols,cols_rhs); - // input.allocate_time += ( std::clock() - start ) / (double) CLOCKS_PER_SEC; - NNLS_Multiple_State& state = *input.state; - - start = std::clock(); - initialize_multiple_cpu(input,state); - input.init_time += ( std::clock() - start ) / (double) CLOCKS_PER_SEC; - - start = std::clock(); - markInfeasible_multiple_cpu(state); - input.mark_time += ( std::clock() - start ) / (double) CLOCKS_PER_SEC; - - start = std::clock(); - determineCholeskyFactors_cpu_profile(input,state); - input.determine_time += ( std::clock() - start ) / (double) CLOCKS_PER_SEC; - - while( state.totalfeasible < state.cols_rhs and state.iterations < input.max_iter ) - { - start = std::clock(); - advanceInfeasibilityState_multiple(state); - input.advance_time += ( std::clock() - start ) / (double) CLOCKS_PER_SEC; - - start = std::clock(); - for(int i=0;i 0) switchSets(*state.infeasiblemasks[i],*state.xmasks[i],state.full_exchange_mode[i]); - input.switch_time += ( std::clock() - start ) / (double) CLOCKS_PER_SEC; - // printf("infeasiblemask: %d,%d,%d,%d\n",state.infeasiblemasks[0]->value[0],state.infeasiblemasks[0]->value[1], - // state.infeasiblemasks[0]->value[2],state.infeasiblemasks[0]->value[3]); - start = std::clock(); - determineCholeskyFactors_cpu_profile(input,state); - input.determine_time += ( std::clock() - start ) / (double) CLOCKS_PER_SEC; - - // printv(state.G[0]->rowmajor,state.G[0]->totalsize); - start = std::clock(); - for(int i=0;i 0) applyVectorMask(input.CTB[i],state.x_masked[i],*state.xmasks[i]); - input.apply_time += ( std::clock() - start ) / (double) CLOCKS_PER_SEC; - - start = std::clock(); - for(int i=0;i 0) normal_equations_precomputedCholesky_cpu(*state.G[i],state.x_masked[i]); - input.normal_time += ( std::clock() - start ) / (double) CLOCKS_PER_SEC; - // printf("x_masked, xmask: %f,%f,%f,%f; %d,%d,%d,%d\n",state.x_masked[0][0],state.x_masked[0][1],state.x_masked[0][2],state.x_masked[0][3], - // state.xmasks[0]->value[0],state.xmasks[0]->value[1],state.xmasks[0]->value[2],state.xmasks[0]->value[3]); - - start = std::clock(); - for(int i=0;i 0) generateCGTCF(*state.CGTCF[i],*input.CTC,*state.xmasks[i]); - input.generateCGTCF_time += ( std::clock() - start ) / (double) CLOCKS_PER_SEC; - - start = std::clock(); - for(int i=0;i 0) matvecmult_colmajor_cpu(*state.CGTCF[i],state.x_masked[i],state.y_masked[i]); - input.matvec_time += ( std::clock() - start ) / (double) CLOCKS_PER_SEC; - - start = std::clock(); - for(int i=0;i 0) generateCGTb(input.CTB[i],state.CGTb[i],*state.xmasks[i]); - input.generateCGTb_time += ( std::clock() - start ) / (double) CLOCKS_PER_SEC; - - start = std::clock(); - for(int i=0;i 0) vectorsub(state.CGTb[i],state.y_masked[i],state.CGTCF[i]->rows); - input.vector_time += ( std::clock() - start ) / (double) CLOCKS_PER_SEC; - // printf("y_masked: %f,%f,%f,%f\n",state.y_masked[0][0],state.y_masked[0][1],state.y_masked[0][2],state.y_masked[0][3]); - - start = std::clock(); - markInfeasible_multiple_cpu(state); - input.mark_time += ( std::clock() - start ) / (double) CLOCKS_PER_SEC; - - ++state.iterations; - } - - start = std::clock(); - for(int i=0;i -#include -#include -#include - -#include "parameters.h" -#include "entities.h" -#include "vector.h" -#include "matrix.h" -#include "factorization.h" -#include "ls.h" -#include "mask.h" -#include "nnls.h" - -// NNLS implementation based on source code written by Robert Lee -// Original code available at https://github.com/rlee32/nmf -#include -// [[Rcpp::depends("RcppArmadillo")]] - -using namespace Rcpp; -using namespace arma; - -DenseMatrix r_to_cpp(const NumericMatrix& A) -{ - int n = A.nrow(); - int m = A.ncol(); - - DenseMatrix X = DenseMatrix(n,m); - for(int i = 0; i < n; ++i) - { - for (int j = 0; j < m; ++j) - { - X.rowmajor[i][j] = A(i,j); - } - } - X.copyRowToColumn(); - return X; -} - -LowerTriangularMatrix r_to_cpp_lower_triangular(const NumericMatrix& A) -{ - int n = A.nrow(); - int m = A.ncol(); - if (n != m) - { - //uh oh - } - LowerTriangularMatrix X = LowerTriangularMatrix(n); - int ind = 0; - for(int i = 0; i < n; ++i) - { - for (int j = 0; j <= i; ++j) - { - X.rowmajor[ind] = A(i,j); - ++ind; - } - } - return X; -} - -void arma_to_cpp(const mat& A, DenseMatrix& X) -{ - int n = A.n_rows; - int m = A.n_cols; - - for (int i = 0; i < n; ++i) - { - for (int j = 0; j < m; ++j) - { - X.rowmajor[i][j] = A(i,j); - } - } - X.copyRowToColumn(); -} - -void arma_to_cpp_lower_triangular(const mat& A, LowerTriangularMatrix& X) -{ - int n = A.n_rows; - int m = A.n_cols; - if (n != m) - { - //oh no - } - int ind = 0; - for (int i = 0; i < n; ++i) - { - for (int j = 0; j <= i; ++j) - { - X.rowmajor[ind] = A(i,j); - ++ind; - } - } -} - -NumericMatrix cpp_to_r(const DenseMatrix& X) -{ - int n = X.rows; - int m = X.cols; - - NumericMatrix A = NumericMatrix(n,m); - for (int i = 0; i < n; ++i) - { - for (int j = 0; j < m; ++j) - { - A(i,j) = X.rowmajor[i][j]; - } - } - return A; -} - -void cpp_to_arma(const DenseMatrix& X, mat& A) -{ - int n = X.rows; - int m = X.cols; - A.set_size(n,m); - for(int i = 0; i < n; ++i) - { - for (int j = 0; j < m; ++j) - { - A(i,j) = X.rowmajor[i][j]; - } - } -} - -// [[Rcpp::export]] -arma::mat solveNNLS(const arma::mat& C, const arma::mat& B) -{ - int max_iter_nnls = 100; - LowerTriangularMatrix CTC(C.n_cols); - DenseMatrix CTB = DenseMatrix(B.n_cols,C.n_cols); - arma_to_cpp_lower_triangular(C.t() * C,CTC); - arma_to_cpp(B.t() * C,CTB); - - DenseMatrix X(C.n_cols,B.n_cols); - NNLS_Multiple_Input nnls_input = NNLS_Multiple_Input(&CTC,X.colmajor,CTB.rowmajor,B.n_cols,max_iter_nnls); - nnls_multiple_cpu(nnls_input); - X.copyColumnToRow(); - - mat res; - cpp_to_arma(X,res); - return res; -} - diff --git a/src/vector.h b/src/vector.h deleted file mode 100755 index 57340a66..00000000 --- a/src/vector.h +++ /dev/null @@ -1,35 +0,0 @@ -#pragma once - -#include "parameters.h" - -dtype vectordot(dtype*a,dtype*b,int size) -{ - dtype sum = 0; - for(int i=0;i