diff --git a/R/RcppExports.R b/R/RcppExports.R index 929a5c7..874af8e 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -9,10 +9,6 @@ colNormalize_dense_cpp <- function(x, L) { .Call(`_rliger2_colNormalize_dense_cpp`, x, L) } -select_factor_cpp <- function(all_data, knn, threshold) { - .Call(`_rliger2_select_factor_cpp`, all_data, knn, threshold) -} - colAggregateMedian_dense_cpp <- function(x, group, n) { .Call(`_rliger2_colAggregateMedian_dense_cpp`, x, group, n) } diff --git a/R/cINMF.R b/R/cINMF.R index f8ef85c..b8d5013 100644 --- a/R/cINMF.R +++ b/R/cINMF.R @@ -3,9 +3,9 @@ #' Performs consensus integrative non-negative matrix factorization (c-iNMF) #' to return factorized \eqn{H}, \eqn{W}, and \eqn{V} matrices. We run the #' regular iNMF multiple times with different random starts, and then take the -#' consensus of the factorized matrices factor gene loading matrix \eqn{W} and -#' \eqn{V} matrices for each dataset. The cell factor loading \eqn{H} matrices -#' are eventually solved with the consensus \eqn{W} and \eqn{V} matrices. +#' consensus of frequently appearing factors from gene loading matrices, \eqn{W} +#' and \eqn{V}. The cell factor loading \eqn{H} matrices are eventually solved +#' with the consensus \eqn{W} and \eqn{V} matrices. #' #' Please see \code{\link{runINMF}} for detailed introduction to the regular #' iNMF algorithm which is run multiple times in this function. @@ -21,11 +21,13 @@ #' @param lambda Regularization parameter. Larger values penalize #' dataset-specific effects more strongly (i.e. alignment should increase as #' \code{lambda} increases). Default \code{5}. +#' @param rho Numeric number between 0 and 1. Fraction for determining the +#' number of nearest neighbors to look at for consensus (by +#' \code{rho * nRandomStarts}). Default \code{0.3}. #' @param nIteration Total number of block coordinate descent iterations to #' perform. Default \code{30}. -#' @param nRandomStarts Number of restarts to perform (iNMF objective function -#' is non-convex, so taking the best objective from multiple successive -#' initialization is recommended). Default \code{1}. +#' @param nRandomStarts Number of replicate runs for creating the pool of +#' factorization results. Default \code{10}. #' @param HInit Initial values to use for \eqn{H} matrices. A list object where #' each element is the initial \eqn{H} matrix of each dataset. Default #' \code{NULL}. @@ -35,6 +37,8 @@ #' each element is the initial \eqn{V} matrix of each dataset. Default #' \code{NULL}. #' @param seed Random seed to allow reproducible results. Default \code{1}. +#' @param nCores The number of parallel tasks to speed up the computation. +#' Default \code{2L}. Only supported for platform with OpenMP support. #' @param verbose Logical. Whether to show information of the progress. Default #' \code{getOption("ligerVerbose")} or \code{TRUE} if users have not set. #' @param ... Arguments passed to methods. @@ -63,8 +67,12 @@ #' \code{misc} slot of the same DimReduc object.} #' }} #' } -#' @references Joshua D. Welch and et al., Single-Cell Multi-omic Integration -#' Compares and Contrasts Features of Brain Cell Identity, Cell, 2019 +#' @references +#' Joshua D. 
Welch and et al., Single-Cell Multi-omic Integration Compares and +#' Contrasts Features of Brain Cell Identity, Cell, 2019 +#' +#' Dylan Kotliar and et al., Identifying gene expression programs of cell-type +#' identity and cellular activity with single-cell RNA-Seq, eLife, 2019 #' @examples #' pbmc <- normalize(pbmc) #' pbmc <- selectGenes(pbmc) @@ -77,7 +85,6 @@ runCINMF <- function( k = 20, lambda = 5.0, rho = 0.3, - tao = 0.1, ... ) { UseMethod("runCINMF", object) @@ -91,7 +98,6 @@ runCINMF.liger <- function( k = 20, lambda = 5.0, rho = 0.3, - tao = 0.1, nIteration = 30, nRandomStarts = 10, HInit = NULL, @@ -119,7 +125,6 @@ runCINMF.liger <- function( k = k, lambda = lambda, rho = rho, - tao = tao, nIteration = nIteration, nRandomStarts = nRandomStarts, HInit = HInit, @@ -129,6 +134,7 @@ runCINMF.liger <- function( nCores = nCores, verbose = verbose ) + # return(out) object@W <- out$W for (d in names(object)) { object@datasets[[d]]@H <- out$H[[d]] @@ -159,7 +165,6 @@ runCINMF.Seurat <- function( k = 20, lambda = 5.0, rho = 0.3, - tao = 0.1, datasetVar = "orig.ident", layer = "ligerScaleData", assay = NULL, @@ -201,7 +206,6 @@ runCINMF.Seurat <- function( k = k, lambda = lambda, rho = rho, - tao = tao, nIteration = nIteration, nRandomStarts = nRandomStarts, HInit = HInit, @@ -238,7 +242,6 @@ runCINMF.Seurat <- function( k = 20, lambda = 5.0, rho = 0.3, - tao = 0.1, nIteration = 30, nRandomStarts = 10, HInit = NULL, @@ -287,19 +290,70 @@ runCINMF.Seurat <- function( verbose = FALSE) Ws[[repName]] <- out$W for (n in names(object)) Vs[[n]][[repName]] <- out$V[[n]] - # for (n in names(object)) Hs[[n]][[repName]] <- out$H[[n]] + for (n in names(object)) Hs[[n]][[repName]] <- out$H[[n]] if (isTRUE(verbose)) cli::cli_progress_update( status = sprintf("[ %d / %d ]", i, nRandomStarts), set = i ) } + # return(list(W = Ws, V = Vs, H = Hs)) if (isTRUE(verbose)) cliID <- cli::cli_process_start("Taking the consensus") - W <- takeConsensus(Ws, rho = rho, tao = tao) - Vs <- lapply(Vs, takeConsensus, rho = rho, tao = tao) + # Attempt 1 #### + # W <- takeConsensus(Ws, rho = rho, tao = tao) + # Vs <- lapply(Vs, takeConsensus, rho = rho, tao = tao) + + # Attempt 2 #### + # factorSelect <- takeConsensus2(Ws, rho = rho, tao = tao) + # W <- Reduce(cbind, Ws)[, factorSelect$idx] + # W <- colNormalize_dense_cpp(W, L = 2) + # W <- colAggregateMedian_dense_cpp(W, group = factorSelect$cluster - 1, n = k) + # W <- colNormalize_dense_cpp(W, L = 1) + # for (i in seq_along(object)) { + # V <- Reduce(cbind, Vs[[i]])[, factorSelect$idx] + # V <- colNormalize_dense_cpp(V, L = 2) + # V <- colAggregateMedian_dense_cpp(V, group = factorSelect$cluster - 1, n = k) + # Vs[[i]] <- colNormalize_dense_cpp(V, L = 1) + # } + + # Attempt 3 #### + # W <- Reduce(cbind, Ws) + # selection <- cluster_sel(W, nNeighbor = 3, k = k, resolution = 0.8) + # W <- W[, selection$idx] + # W <- colNormalize_dense_cpp(W, L = 2) + # print(table(selection$cluster - 1)) + # W <- colAggregateMedian_dense_cpp(W, group = selection$cluster, n = k) + # W <- colNormalize_dense_cpp(W, L = 1) + # for (i in seq_along(Vs)) { + # V <- Reduce(cbind, Vs[[i]]) + # V <- V[, selection$idx] + # V <- colNormalize_dense_cpp(V, L = 2) + # V <- colAggregateMedian_dense_cpp(V, group = selection$cluster, n = k) + # Vs[[i]] <- colNormalize_dense_cpp(V, L = 1) + # } + + # Attempt 4 #### + nNeighbor <- round(rho * nRandomStarts) + geneLoadings <- list(Reduce(cbind, Ws)) + for (i in seq_along(object)) { + geneLoadings[[i + 1]] <- Reduce(cbind, Vs[[i]]) + } + geneLoadings <- 
lapply(geneLoadings, colNormalize_dense_cpp, L = 2) + # geneLoadings is a list, each element is a ngene by (nFactor * nRandomStart) matrix + # The first is for W, the rest are for Vs + selection <- factor_cluster_sel(geneLoadings, nNeighbor = nNeighbor, + minWeight = 0.6, k = k, resolution = 0.2) + W <- geneLoadings[[1]][, selection$idx] + W <- colAggregateMedian_dense_cpp(W, group = selection$cluster, n = k) + W <- colNormalize_dense_cpp(W, L = 1) + for (i in seq_along(Vs)) { + V <- geneLoadings[[i + 1]][, selection$idx] + V <- colAggregateMedian_dense_cpp(V, group = selection$cluster, n = k) + Vs[[i]] <- colNormalize_dense_cpp(V, L = 1) + } - # if (isTRUE(verbose)) cli::cli_process_done(id = cliID) - # - msg <- "Solving last round of NNLS problems" + if (isTRUE(verbose)) cli::cli_process_done(id = cliID) + + msg <- "Solving the last ANLS iterations" if (isTRUE(verbose)) cliID <- cli::cli_process_start(msg = msg) for (i in seq_along(object)) { Hs[[i]] <- inmf_solveH(NULL, W, Vs[[i]], object[[i]], lambda, nCores = nCores) @@ -307,15 +361,12 @@ runCINMF.Seurat <- function( # for (i in seq_along(object)) { # Vs[[i]] <- inmf_solveV(Hs[[i]], W, NULL, object[[i]], lambda, nCores = nCores) # } - # W <- inmf_solveW(Hs, NULL, Vs, object, lambda, nCores = nCores) # objErr <- sum(sapply(seq_along(object), function(i) # inmf_objErr_i(H = Hs[[i]], W = W, V = Vs[[i]], E = object[[i]], # lambda = lambda) # )) - out <- .runINMF.list(object, k = k, lambda = lambda, nIteration = 30, - HInit = Hs, WInit = W, VInit = Vs, verbose = FALSE) - if (isTRUE(verbose)) - cli::cli_process_done(id = cliID, msg_done = paste(msg, " ... objective error: {out$objErr}")) + # if (isTRUE(verbose)) + # cli::cli_process_done(id = cliID, msg_done = paste(msg, " ... objective error: {objErr}")) # factors <- paste0("Factor_", seq(k)) # dimnames(W) <- list(features, factors) @@ -326,48 +377,100 @@ runCINMF.Seurat <- function( # } # names(Hs) <- names(Vs) <- names(object) # result <- list(W = W, H = Hs, V = Vs, objErr = objErr) + # return(result) + + out <- .runINMF.list(object, k = k, lambda = lambda, nIteration = 1, + HInit = Hs, WInit = W, VInit = Vs, verbose = FALSE) + if (isTRUE(verbose)) + cli::cli_process_done(id = cliID, msg_done = paste(msg, " ... objective error: {out$objErr}")) return(out) - return(result) } -takeConsensus <- function(matList, rho = 0.3, tao = 0.1) { - ## According to cNMF method. - ## G - The program matrix. what we call W or V - ## rho - fraction parameter to set number of nearest neighbors to look at - ## tao - distance threshold to filter out factors - - ## matList are matrices of dimensionality gene x factor - # Step 1: Concatenate and l2 normalize - ## R - a number of replicates - G_R <- Reduce(cbind, matList) # ngene by (nFactor * nRandomStart) - G_R <- colNormalize_dense_cpp(G_R, L = 2) +# takeConsensus <- function(matList, rho = 0.3, tao = 0.1) { +# ## According to cNMF method. +# ## G - The program matrix. 
what we call W or V +# ## rho - fraction parameter to set number of nearest neighbors to look at +# ## tao - distance threshold to filter out factors +# +# ## matList are matrices of dimensionality gene x factor +# # Step 1: Concatenate and l2 normalize +# ## R - a number of replicates +# G_R <- Reduce(cbind, matList) # ngene by (nFactor * nRandomStart) +# G_R <- colNormalize_dense_cpp(G_R, L = 2) +# +# # Step 2: Find kNN matching for each component (factor) +# # and filter by mean distance against the kNN of each +# nNeighbor <- round(rho * length(matList)) +# knn <- RANN::nn2(t(G_R), k = nNeighbor + 1) +# +# # `select_factor_cpp` is a C++ function that returns the indices of the +# # factors that are selected by examining whether the mean euclidean distance +# # to its NNs is less than the threshold `tao`. +# selectedIdx <- rowMeans(knn$nn.dist[,2:(nNeighbor + 1)]) < tal +# # selectedIdx <- select_factor_cpp(all_data = G_R, +# # knn = knn$nn.idx - 1, +# # threshold = tao) +# +# G_R <- G_R[, selectedIdx] # genes by selected factors +# if (ncol(G_R) < ncol(matList[[1]])) { +# cli::cli_abort(c("Too few factors are selected from the pool to take the consensus.", +# "i" = "Please try: ", +# "*" = "a larger {.var tao} for loosen threshold", +# "*" = "a larger {.var nRandomStarts} for more replicates to be used.")) +# } +# # Step 3: Kmeans clustering to get the k consensus +# cluster <- stats::kmeans(t(G_R), centers = ncol(matList[[1]]))$cluster +# G_consensus <- colAggregateMedian_dense_cpp(x = G_R, group = cluster - 1, n = ncol(matList[[1]])) +# G_consensus <- colNormalize_dense_cpp(G_consensus, L = 1) +# return(G_consensus) +# } - # Step 2: Find kNN matching for each component (factor) - # and filter by mean distance against the kNN of each - nNeighbor <- round(rho * length(matList)) - knn <- RANN::nn2(t(G_R), k = nNeighbor + 1) - - # `select_factor_cpp` is a C++ function that returns the indices of the - # factors that are selected by examining whether the mean euclidean distance - # to its NNs is less than the threshold `tao`. - selectedIdx <- select_factor_cpp(all_data = G_R, - knn = knn$nn.idx - 1, - threshold = tao) - - G_R <- G_R[, selectedIdx] # genes by selected factors - if (ncol(G_R) < ncol(matList[[1]])) { - cli::cli_abort(c("Too few factors are selected from the pool to take the consensus.", - "i" = "Please try: ", - "*" = "a larger {.var tao} for loosen threshold", - "*" = "a larger {.var nRandomStarts} for more replicates to be used.")) - } - # Step 3: Kmeans clustering to get the k consensus - cluster <- stats::kmeans(t(G_R), centers = ncol(matList[[1]]))$cluster - G_consensus <- colAggregateMedian_dense_cpp(x = G_R, group = cluster - 1, n = ncol(matList[[1]])) - G_consensus <- colNormalize_dense_cpp(G_consensus, L = 1) - return(G_consensus) -} +# takeConsensus2 <- function(matList, rho = 0.3, tao = 0.1) { +# # 2nd attempt of methods +# # Return the selection of factors from the replicate pool as well as the +# # grouping on the selected ones. So this part of information will be used +# # to select the V factors that represent the same thing as what we want +# # from W +# +# ## According to cNMF method. +# ## G - The program matrix. 
what we call W or V +# ## rho - fraction parameter to set number of nearest neighbors to look at +# ## tao - distance threshold to filter out factors +# +# ## matList are matrices of dimensionality gene x factor +# # Step 1: Concatenate and l2 normalize +# ## R - a number of replicates +# G_R <- Reduce(cbind, matList) # ngene by (nFactor * nRandomStart) +# G_R <- colNormalize_dense_cpp(G_R, L = 2) +# +# # Step 2: Find kNN matching for each component (factor) +# # and filter by mean distance against the kNN of each +# nNeighbor <- round(rho * length(matList)) +# knn <- RANN::nn2(t(G_R), k = nNeighbor + 1) +# nnDistMeans <- rowMeans(knn$nn.dist[,2:(nNeighbor + 1)]) +# selectedIdx <- nnDistMeans < tao +# # `select_factor_cpp` is a C++ function that returns the indices of the +# # factors that are selected by examining whether the mean euclidean distance +# # to its NNs is less than the threshold `tao`. +# # selectedIdx <- select_factor_cpp(all_data = G_R, +# # knn = knn$nn.idx - 1, +# # threshold = tao) +# +# G_R <- G_R[, selectedIdx] # genes by selected factors +# if (ncol(G_R) < ncol(matList[[1]])) { +# cli::cli_abort(c("Too few factors are selected from the pool to take the consensus.", +# "i" = "Please try: ", +# "*" = "a larger {.var tao} for loosen threshold", +# "*" = "a larger {.var nRandomStarts} for more replicates to be used.")) +# } +# # Step 3: Kmeans clustering to get the k consensus +# cluster <- stats::kmeans(t(G_R), centers = ncol(matList[[1]]))$cluster +# return(list(idx = selectedIdx, cluster = cluster)) +# # G_consensus <- colAggregateMedian_dense_cpp(x = G_R, group = cluster - 1, n = ncol(matList[[1]])) +# # G_consensus <- colNormalize_dense_cpp(G_consensus, L = 1) +# # return(G_consensus) +# } inmf_solveH <- function(H, W, V, E, lambda, nCores = 2L) { WV <- W + V @@ -377,54 +480,116 @@ inmf_solveH <- function(H, W, V, E, lambda, nCores = 2L) { return(t(H)) # return cell x factor } -inmf_solveV <- function(H, W, V, E, lambda, nCores = 2L) { - HtH <- t(H) %*% H - CtC <- (1 + lambda) * HtH - CtB <- t(H) %*% t(E) - HtH %*% t(W) - V <- RcppPlanc::bppnnls_prod(CtC, as.matrix(CtB), nCores = nCores) - return(t(V)) -} +# inmf_solveV <- function(H, W, V, E, lambda, nCores = 2L) { +# HtH <- t(H) %*% H +# CtC <- (1 + lambda) * HtH +# CtB <- t(H) %*% t(E) - HtH %*% t(W) +# V <- RcppPlanc::bppnnls_prod(CtC, as.matrix(CtB), nCores = nCores) +# return(t(V)) +# } -# W <- solveNNLS( -# C = rbindlist(mat_list = H), -# B = rbindlist(mat_list = lapply( -# X = 1:N, -# FUN = function(i) { -# return(E[[i]] - H[[i]] %*% V[[i]]) +# inmf_solveW <- function(Hs, W, Vs, Es, lambda, nCores = 2L) { +# CtC <- matrix(0, ncol(Vs[[1]]), ncol(Vs[[1]])) +# CtB <- matrix(0, ncol(Vs[[1]]), nrow(Vs[[1]])) +# for (i in seq_along(Es)) { +# HtH <- t(Hs[[i]]) %*% Hs[[i]] +# CtC <- CtC + HtH +# CtB <- CtB + as.matrix(t(Hs[[i]]) %*% t(Es[[i]])) - HtH %*% t(Vs[[i]]) +# } +# W <- RcppPlanc::bppnnls_prod(CtC, CtB, nCores = nCores) +# return(t(W)) +# } + +# inmf_objErr_i <- function(H, W, V, E, lambda) { +# # Objective error function was originally stated as: +# # obj_i = ||E_i - (W + V_i)*H_i||_F^2 + lambda * ||V_i*H_i||_F^2 +# # (Use caution with the matrix dimensionality, as we might transpose them +# # occasionally in different implementation.) +# # +# # Let L = W + V +# # ||E - LH||_F^2 = ||E||_F^2 - 2*Tr(Ht*(Et*L)) + Tr((Lt*L)*(Ht*H)) +# # ||V*H||_F^2 = Tr((Vt*V)*(Ht*H)) +# # This improves the performance in both speed and memory usage. 
+# L <- W + V +# sqnormE <- Matrix::norm(E, "F")^2 +# LtL <- t(L) %*% L +# HtH <- t(H) %*% H +# TrLtLHtH <- sum(diag(LtL %*% HtH)) +# EtL <- t(E) %*% L +# TrHtEtL <- sum(Matrix::diag(t(H) %*% EtL)) +# VtV <- t(V) %*% V +# TrVtVHtH <- sum(diag(VtV %*% HtH)) +# obj <- sqnormE - 2 * TrHtEtL + TrLtLHtH + lambda * TrVtVHtH +# return(obj) +# } + + +# cluster_sel <- function( +# W, +# nNeighbor = 3, +# k = 20, +# resolution = 0.8 +# ) { +# knn <- RANN::nn2(t(W), k = nNeighbor + 1) +# edges <- numeric() +# for (i in 1:nrow(knn$nn.idx)) { +# for (j in 2:(nNeighbor + 1)) { +# edges <- c(edges, i, knn$nn.idx[i, j]) # } -# )) -# ) -inmf_solveW <- function(Hs, W, Vs, Es, lambda, nCores = 2L) { - CtC <- matrix(0, ncol(Vs[[1]]), ncol(Vs[[1]])) - CtB <- matrix(0, ncol(Vs[[1]]), nrow(Vs[[1]])) - for (i in seq_along(Es)) { - HtH <- t(Hs[[i]]) %*% Hs[[i]] - CtC <- CtC + HtH - CtB <- CtB + as.matrix(t(Hs[[i]]) %*% t(Es[[i]])) - HtH %*% t(Vs[[i]]) - } - W <- RcppPlanc::bppnnls_prod(CtC, CtB, nCores = nCores) - return(t(W)) -} +# } +# weights <- numeric() +# for (i in 1:nrow(knn$nn.idx)) { +# for (j in 2:(nNeighbor + 1)) { +# weights <- c(weights, knn$nn.dist[i, j]) +# } +# } +# weights <- exp(-weights/0.5) +# graph <- igraph::graph(edges, directed = FALSE) +# cluster <- clusterGraph(graph, weights, resolution, k) +# select <- names(sort(table(cluster), decreasing = TRUE))[1:k] +# idx <- cluster %in% select +# cluster <- cluster[cluster %in% select] +# cluster <- as.integer(factor(cluster)) - 1L +# return(list(idx = idx, cluster = cluster)) +# } + +# clusterGraph <- function(graph, weights, resolution, minCluster) { +# cluster <- igraph::cluster_leiden(graph, resolution_parameter = resolution, +# weights = weights, objective_function = "mod") +# cluster <- cluster$membership +# if (length(unique(cluster)) <= minCluster) { +# cluster <- clusterGraph(graph, weights, resolution + 0.1, minCluster) +# } +# return(cluster) +# } + + + +factor_cluster_sel <- function(geneLoadings, nNeighbor = 3, minWeight = 0.6, k = 20, + resolution = 0.2) { + graphList <- lapply(geneLoadings, function(w) { + knn <- RANN::nn2(t(w), k = nNeighbor + 1) + target <- as.integer(t(knn$nn.idx)) + source <- rep(seq_len(nrow(knn$nn.idx)), each = ncol(knn$nn.idx)) + weight <- as.numeric(t(knn$nn.dists)) + weight <- exp(-weight/0.5) + graphmat <- cbind(source, target, weight) + graphmat <- graphmat[graphmat[,3] > minWeight,] + return(graphmat) + }) + graphmat <- Reduce(rbind, graphList) + + # Leiden cluster. CPP implementation expects 0-based vertex index + edgevec <- as.integer(t(graphmat[,1:2])) - 1L + cluster <- leidenAlg::find_partition_with_rep_rcpp( + edgelist = edgevec, edgelist_length = length(edgevec), + num_vertices = ncol(geneLoadings[[1]]), direction = FALSE, niter = 5, + edge_weights = graphmat[,3], resolution = resolution, nrep = 20 + ) -inmf_objErr_i <- function(H, W, V, E, lambda) { - # Objective error function was originally stated as: - # obj_i = ||E_i - (W + V_i)*H_i||_F^2 + lambda * ||V_i*H_i||_F^2 - # (Use caution with the matrix dimensionality, as we might transpose them - # occasionally in different implementation.) - # - # Let L = W + V - # ||E - LH||_F^2 = ||E||_F^2 - 2*Tr(Ht*(Et*L)) + Tr((Lt*L)*(Ht*H)) - # ||V*H||_F^2 = Tr((Vt*V)*(Ht*H)) - # This improves the performance in both speed and memory usage. 
- L <- W + V - sqnormE <- Matrix::norm(E, "F")^2 - LtL <- t(L) %*% L - HtH <- t(H) %*% H - TrLtLHtH <- sum(diag(LtL %*% HtH)) - EtL <- t(E) %*% L - TrHtEtL <- sum(Matrix::diag(t(H) %*% EtL)) - VtV <- t(V) %*% V - TrVtVHtH <- sum(diag(VtV %*% HtH)) - obj <- sqnormE - 2 * TrHtEtL + TrLtLHtH + lambda * TrVtVHtH - return(obj) + # Select top k clusters + selIdx <- cluster %in% names(sort(table(cluster), decreasing = TRUE))[seq_len(k)] + cluster <- cluster[selIdx] + cluster <- as.integer(factor(cluster)) - 1L + return(list(idx = selIdx, cluster = cluster)) } diff --git a/R/clustering.R b/R/clustering.R index d6efee0..c16e0e7 100644 --- a/R/clustering.R +++ b/R/clustering.R @@ -98,9 +98,7 @@ runCluster <- function( niter = nIterations, nrep = nRandomStarts ) } else { - edgeOutPath <- paste0("edge_", sub("\\s", "_", Sys.time()), '.txt') - edgeOutPath <- gsub("-", "", edgeOutPath) - edgeOutPath <- gsub(":", "", edgeOutPath) + edgeOutPath <- tempfile(pattern = "edge_", fileext = ".txt") WriteEdgeFile(snn, edgeOutPath, display_progress = FALSE) clusts <- RunModularityClusteringCpp( snn, diff --git a/R/integration.R b/R/integration.R index e7e148b..9e817ac 100644 --- a/R/integration.R +++ b/R/integration.R @@ -176,6 +176,8 @@ runIntegration.Seurat <- function( #' each element is the initial \eqn{V} matrix of each dataset. Default #' \code{NULL}. #' @param seed Random seed to allow reproducible results. Default \code{1}. +#' @param nCores The number of parallel tasks to speed up the computation. +#' Default \code{2L}. Only supported for platform with OpenMP support. #' @param verbose Logical. Whether to show information of the progress. Default #' \code{getOption("ligerVerbose")} or \code{TRUE} if users have not set. #' @param ... Arguments passed to methods. @@ -235,6 +237,7 @@ runINMF.liger <- function( WInit = NULL, VInit = NULL, seed = 1, + nCores = 2L, verbose = getOption("ligerVerbose", TRUE), ... ) { @@ -260,6 +263,7 @@ runINMF.liger <- function( WInit = WInit, VInit = VInit, seed = seed, + nCores = nCores, verbose = verbose ) @@ -305,6 +309,7 @@ runINMF.Seurat <- function( WInit = NULL, VInit = NULL, seed = 1, + nCores = 2L, verbose = getOption("ligerVerbose", TRUE), ... ) { @@ -337,6 +342,7 @@ runINMF.Seurat <- function( WInit = WInit, VInit = VInit, seed = seed, + nCores = nCores, verbose = verbose ) Hconcat <- t(Reduce(cbind, res$H)) @@ -371,6 +377,7 @@ runINMF.Seurat <- function( WInit = NULL, VInit = NULL, seed = 1, + nCores = 2L, verbose = getOption("ligerVerbose", TRUE) ) { if (!requireNamespace("RcppPlanc", quietly = TRUE)) # nocov start @@ -393,7 +400,7 @@ runINMF.Seurat <- function( set.seed(seed = seed + i - 1) out <- RcppPlanc::inmf(objectList = object, k = k, lambda = lambda, niter = nIteration, Hinit = HInit, - Vinit = VInit, Winit = WInit, + Vinit = VInit, Winit = WInit, nCores = nCores, verbose = verbose) if (out$objErr < bestObj) { bestResult <- out @@ -591,6 +598,8 @@ optimizeALS <- function( # nocov start #' @param minibatchSize Total number of cells in each minibatch. See detail. #' Default \code{5000}. #' @param seed Random seed to allow reproducible results. Default \code{1}. +#' @param nCores The number of parallel tasks to speed up the computation. +#' Default \code{2L}. Only supported for platform with OpenMP support. #' @param verbose Logical. Whether to show information of the progress. Default #' \code{getOption("ligerVerbose")} or \code{TRUE} if users have not set. #' @param ... Arguments passed to other S3 methods of this function. 
@@ -668,6 +677,7 @@ runOnlineINMF.liger <- function( AInit = NULL, BInit = NULL, seed = 1, + nCores = 2L, verbose = getOption("ligerVerbose", TRUE), ... ) { @@ -748,7 +758,7 @@ runOnlineINMF.liger <- function( minibatchSize = minibatchSize, HALSiter = HALSiter, verbose = verbose, WInit = WInit, VInit = VInit, AInit = AInit, - BInit = BInit, seed = seed) + BInit = BInit, seed = seed, nCores = nCores) if (!isTRUE(projection)) { # Scenario 1&2, everything updated for (i in seq_along(object)) { @@ -797,6 +807,7 @@ runOnlineINMF.liger <- function( HALSiter = 1, minibatchSize = 5000, seed = 1, + nCores = 2L, verbose = getOption("ligerVerbose", TRUE), ... ) { @@ -819,7 +830,7 @@ runOnlineINMF.liger <- function( minibatchSize = minibatchSize, maxHALSIter = HALSiter, Vinit = VInit, Winit = WInit, Ainit = AInit, Binit = BInit, - verbose = verbose) + nCores = nCores, verbose = verbose) factorNames <- paste0("Factor_", seq(k)) if (isTRUE(projection)) { # Scenario 3 only got H for new datasets @@ -869,6 +880,7 @@ runOnlineINMF.Seurat <- function( HALSiter = 1, minibatchSize = 5000, seed = 1, + nCores = 2L, verbose = getOption("ligerVerbose", TRUE), ... ) { @@ -896,6 +908,7 @@ runOnlineINMF.Seurat <- function( newDatasets = NULL, projection = FALSE, maxEpochs = maxEpochs, HALSiter = HALSiter, minibatchSize = minibatchSize, seed = seed, verbose = verbose, + nCores = nCores, WInit = NULL, VInit = NULL, AInit = NULL, BInit = NULL, ) Hconcat <- t(Reduce(cbind, res$H)) @@ -1062,6 +1075,8 @@ online_iNMF <- function( # nocov start #' the random seed by 1 for each consecutive restart, so future factorization #' of the same dataset can be run with one rep if necessary. Default \code{1}. #' @param seed Random seed to allow reproducible results. Default \code{1}. +#' @param nCores The number of parallel tasks to speed up the computation. +#' Default \code{2L}. Only supported for platform with OpenMP support. #' @param verbose Logical. Whether to show information of the progress. Default #' \code{getOption("ligerVerbose")} or \code{TRUE} if users have not set. #' @param ... Arguments passed to other methods and wrapped functions. @@ -1103,10 +1118,6 @@ runUINMF <- function( object, k = 20, lambda = 5, - nIteration = 30, - nRandomStarts = 1, - seed = 1, - verbose = getOption("ligerVerbose", TRUE), ... ) { UseMethod("runUINMF", object) @@ -1122,6 +1133,7 @@ runUINMF.liger <- function( nIteration = 30, nRandomStarts = 1, seed = 1, + nCores = 2L, verbose = getOption("ligerVerbose", TRUE), ... ) { @@ -1144,7 +1156,7 @@ runUINMF.liger <- function( } res <- .runUINMF.list(Elist, Ulist, k = k, lambda = lambda, nIteration = nIteration, - nRandomStarts = nRandomStarts, + nRandomStarts = nRandomStarts, nCores = nCores, seed = seed, verbose = verbose, ...) 
for (d in names(object)) { ld <- dataset(object, d) @@ -1170,6 +1182,7 @@ runUINMF.liger <- function( nIteration = 30, nRandomStarts = 1, seed = 1, + nCores = 2L, verbose = getOption("ligerVerbose", TRUE) ) { if (!requireNamespace("RcppPlanc", quietly = TRUE)) # nocov start @@ -1189,7 +1202,8 @@ runUINMF.liger <- function( seed <- seed + i - 1 set.seed(seed) res <- RcppPlanc::uinmf(object, unsharedList, k = k, lambda = lambda, - niter = nIteration, verbose = verbose) + niter = nIteration, nCores = nCores, + verbose = verbose) if (res$objErr < bestObj) { bestRes <- res bestObj <- res$objErr diff --git a/man/runCINMF.Rd b/man/runCINMF.Rd index 783626a..8142b61 100644 --- a/man/runCINMF.Rd +++ b/man/runCINMF.Rd @@ -6,14 +6,13 @@ \alias{runCINMF.Seurat} \title{Perform consensus iNMF on scaled datasets} \usage{ -runCINMF(object, k = 20, lambda = 5, rho = 0.3, tao = 0.1, ...) +runCINMF(object, k = 20, lambda = 5, rho = 0.3, ...) \method{runCINMF}{liger}( object, k = 20, lambda = 5, rho = 0.3, - tao = 0.1, nIteration = 30, nRandomStarts = 10, HInit = NULL, @@ -30,7 +29,6 @@ runCINMF(object, k = 20, lambda = 5, rho = 0.3, tao = 0.1, ...) k = 20, lambda = 5, rho = 0.3, - tao = 0.1, datasetVar = "orig.ident", layer = "ligerScaleData", assay = NULL, @@ -59,14 +57,17 @@ higher \code{k} will be needed for datasets with more sub-structure. Default dataset-specific effects more strongly (i.e. alignment should increase as \code{lambda} increases). Default \code{5}.} +\item{rho}{Numeric number between 0 and 1. Fraction for determining the +number of nearest neighbors to look at for consensus (by +\code{rho * nRandomStarts}). Default \code{0.3}.} + \item{...}{Arguments passed to methods.} \item{nIteration}{Total number of block coordinate descent iterations to perform. Default \code{30}.} -\item{nRandomStarts}{Number of restarts to perform (iNMF objective function -is non-convex, so taking the best objective from multiple successive -initialization is recommended). Default \code{1}.} +\item{nRandomStarts}{Number of replicate runs for creating the pool of +factorization results. Default \code{10}.} \item{HInit}{Initial values to use for \eqn{H} matrices. A list object where each element is the initial \eqn{H} matrix of each dataset. Default @@ -81,6 +82,9 @@ each element is the initial \eqn{V} matrix of each dataset. Default \item{seed}{Random seed to allow reproducible results. Default \code{1}.} +\item{nCores}{The number of parallel tasks to speed up the computation. +Default \code{2L}. Only supported for platform with OpenMP support.} + \item{verbose}{Logical. Whether to show information of the progress. Default \code{getOption("ligerVerbose")} or \code{TRUE} if users have not set.} @@ -125,9 +129,9 @@ feature key. Default \code{"cinmf"}.} Performs consensus integrative non-negative matrix factorization (c-iNMF) to return factorized \eqn{H}, \eqn{W}, and \eqn{V} matrices. We run the regular iNMF multiple times with different random starts, and then take the -consensus of the factorized matrices factor gene loading matrix \eqn{W} and -\eqn{V} matrices for each dataset. The cell factor loading \eqn{H} matrices -are eventually solved with the consensus \eqn{W} and \eqn{V} matrices. +consensus of frequently appearing factors from gene loading matrices, \eqn{W} +and \eqn{V}. The cell factor loading \eqn{H} matrices are eventually solved +with the consensus \eqn{W} and \eqn{V} matrices. 
Please see \code{\link{runINMF}} for detailed introduction to the regular iNMF algorithm which is run multiple times in this function. @@ -144,6 +148,9 @@ if (requireNamespace("RcppPlanc", quietly = TRUE)) { } } \references{ -Joshua D. Welch and et al., Single-Cell Multi-omic Integration -Compares and Contrasts Features of Brain Cell Identity, Cell, 2019 +Joshua D. Welch and et al., Single-Cell Multi-omic Integration Compares and +Contrasts Features of Brain Cell Identity, Cell, 2019 + +Dylan Kotliar and et al., Identifying gene expression programs of cell-type +identity and cellular activity with single-cell RNA-Seq, eLife, 2019 } diff --git a/man/runINMF.Rd b/man/runINMF.Rd index d46a572..78dcea4 100644 --- a/man/runINMF.Rd +++ b/man/runINMF.Rd @@ -18,6 +18,7 @@ runINMF(object, k = 20, lambda = 5, ...) WInit = NULL, VInit = NULL, seed = 1, + nCores = 2L, verbose = getOption("ligerVerbose", TRUE), ... ) @@ -36,6 +37,7 @@ runINMF(object, k = 20, lambda = 5, ...) WInit = NULL, VInit = NULL, seed = 1, + nCores = 2L, verbose = getOption("ligerVerbose", TRUE), ... ) @@ -77,6 +79,9 @@ each element is the initial \eqn{V} matrix of each dataset. Default \item{seed}{Random seed to allow reproducible results. Default \code{1}.} +\item{nCores}{The number of parallel tasks to speed up the computation. +Default \code{2L}. Only supported for platform with OpenMP support.} + \item{verbose}{Logical. Whether to show information of the progress. Default \code{getOption("ligerVerbose")} or \code{TRUE} if users have not set.} diff --git a/man/runOnlineINMF.Rd b/man/runOnlineINMF.Rd index 12904b9..2751a94 100644 --- a/man/runOnlineINMF.Rd +++ b/man/runOnlineINMF.Rd @@ -22,6 +22,7 @@ runOnlineINMF(object, k = 20, lambda = 5, ...) AInit = NULL, BInit = NULL, seed = 1, + nCores = 2L, verbose = getOption("ligerVerbose", TRUE), ... ) @@ -38,6 +39,7 @@ runOnlineINMF(object, k = 20, lambda = 5, ...) HALSiter = 1, minibatchSize = 5000, seed = 1, + nCores = 2L, verbose = getOption("ligerVerbose", TRUE), ... ) @@ -79,6 +81,9 @@ See detail. Default \code{NULL}.} \item{seed}{Random seed to allow reproducible results. Default \code{1}.} +\item{nCores}{The number of parallel tasks to speed up the computation. +Default \code{2L}. Only supported for platform with OpenMP support.} + \item{verbose}{Logical. Whether to show information of the progress. Default \code{getOption("ligerVerbose")} or \code{TRUE} if users have not set.} diff --git a/man/runUINMF.Rd b/man/runUINMF.Rd index 05e0e63..3a13b1c 100644 --- a/man/runUINMF.Rd +++ b/man/runUINMF.Rd @@ -5,16 +5,7 @@ \alias{runUINMF.liger} \title{Perform Mosaic iNMF (UINMF) on scaled datasets with unshared features} \usage{ -runUINMF( - object, - k = 20, - lambda = 5, - nIteration = 30, - nRandomStarts = 1, - seed = 1, - verbose = getOption("ligerVerbose", TRUE), - ... -) +runUINMF(object, k = 20, lambda = 5, ...) \method{runUINMF}{liger}( object, @@ -23,6 +14,7 @@ runUINMF( nIteration = 30, nRandomStarts = 1, seed = 1, + nCores = 2L, verbose = getOption("ligerVerbose", TRUE), ... ) @@ -40,6 +32,8 @@ higher \code{k} will be needed for datasets with more sub-structure. Default dataset-specific effects more strongly (i.e. alignment should increase as \code{lambda} increases). Default \code{5}.} +\item{...}{Arguments passed to other methods and wrapped functions.} + \item{nIteration}{Total number of block coordinate descent iterations to perform. Default \code{30}.} @@ -51,10 +45,11 @@ of the same dataset can be run with one rep if necessary. 
Default \code{1}.} \item{seed}{Random seed to allow reproducible results. Default \code{1}.} +\item{nCores}{The number of parallel tasks to speed up the computation. +Default \code{2L}. Only supported for platform with OpenMP support.} + \item{verbose}{Logical. Whether to show information of the progress. Default \code{getOption("ligerVerbose")} or \code{TRUE} if users have not set.} - -\item{...}{Arguments passed to other methods and wrapped functions.} } \value{ \itemize{ diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 62e6146..c179d26 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -43,19 +43,6 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } -// select_factor_cpp -Rcpp::LogicalVector select_factor_cpp(arma::mat all_data, arma::mat knn, double threshold); -RcppExport SEXP _rliger2_select_factor_cpp(SEXP all_dataSEXP, SEXP knnSEXP, SEXP thresholdSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< arma::mat >::type all_data(all_dataSEXP); - Rcpp::traits::input_parameter< arma::mat >::type knn(knnSEXP); - Rcpp::traits::input_parameter< double >::type threshold(thresholdSEXP); - rcpp_result_gen = Rcpp::wrap(select_factor_cpp(all_data, knn, threshold)); - return rcpp_result_gen; -END_RCPP -} // colAggregateMedian_dense_cpp arma::mat colAggregateMedian_dense_cpp(const arma::mat& x, const arma::uvec& group, const arma::uword n); RcppExport SEXP _rliger2_colAggregateMedian_dense_cpp(SEXP xSEXP, SEXP groupSEXP, SEXP nSEXP) { @@ -336,7 +323,6 @@ END_RCPP static const R_CallMethodDef CallEntries[] = { {"_rliger2_RunModularityClusteringCpp", (DL_FUNC) &_rliger2_RunModularityClusteringCpp, 9}, {"_rliger2_colNormalize_dense_cpp", (DL_FUNC) &_rliger2_colNormalize_dense_cpp, 2}, - {"_rliger2_select_factor_cpp", (DL_FUNC) &_rliger2_select_factor_cpp, 3}, {"_rliger2_colAggregateMedian_dense_cpp", (DL_FUNC) &_rliger2_colAggregateMedian_dense_cpp, 3}, {"_rliger2_scaleNotCenter_byRow_rcpp", (DL_FUNC) &_rliger2_scaleNotCenter_byRow_rcpp, 1}, {"_rliger2_scaleNotCenter_byRow_perDataset_rcpp", (DL_FUNC) &_rliger2_scaleNotCenter_byRow_perDataset_rcpp, 3}, diff --git a/src/cinmf_util.cpp b/src/cinmf_util.cpp index dfe8f57..d11b73e 100644 --- a/src/cinmf_util.cpp +++ b/src/cinmf_util.cpp @@ -16,40 +16,6 @@ arma::mat colNormalize_dense_cpp(arma::mat& x, const arma::uword L) { return result; } -// all_data: n features by nFactor * nRep factors -// knn: nFactor * nRep factors by k + 1 nearest neighbors. 
The first column is -// the index of the factor itself -// threshold: threshold of maximum mean euclidean distance from a factor to its -// nearest neighbors -// Return: The one-based index of selected factors -// [[Rcpp::export()]] -Rcpp::LogicalVector select_factor_cpp( - arma::mat all_data, - arma::mat knn, - double threshold) { - int k = knn.n_cols - 1; - arma::vec obs(all_data.n_rows); - arma::vec neighbor(all_data.n_rows); - double dist, mean_dist; - Rcpp::LogicalVector selected_factors(all_data.n_cols); - for (int j = 0; j < knn.n_rows; ++j) { - obs = all_data.col(j); - mean_dist = 0; - for (int i = 1; i < knn.n_cols; ++i) { - neighbor = all_data.col(knn(j, i)); - dist = arma::norm(obs - neighbor, 2); - mean_dist += dist; - } - mean_dist /= k; - if (mean_dist < threshold) { - selected_factors(j) = true; - } else { - selected_factors(j) = false; - } - } - return selected_factors; -} - // x: n features by n selected factors // group: n-selected-factor integer vector, pre-transformed to 0-base, // from upstream kmeans clustering.
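
Note (not part of the patch): the consensus step introduced above — pool the gene-loading columns from every random start, L2-normalize them, group recurring factors, and take per-cluster medians as the consensus loadings — can be summarized with a minimal, self-contained R sketch. It uses only base R and stats (kmeans stands in for the kNN-graph + Leiden clustering that factor_cluster_sel performs), and the helper name poolConsensus is hypothetical, for illustration only.

    # Illustrative sketch of the consensus idea; assumes nonnegative loadings
    # as produced by iNMF. `poolConsensus` is a hypothetical name.
    poolConsensus <- function(matList, k) {
        # matList: replicate gene-loading matrices (genes x k each);
        # pool the columns into one genes x (k * nRandomStarts) matrix
        pool <- Reduce(cbind, matList)
        # L2-normalize each column so distances reflect direction, not scale
        pool <- sweep(pool, 2, sqrt(colSums(pool^2)), "/")
        # Group similar factors across replicates (kmeans here stands in for
        # the kNN-graph + Leiden clustering used in the patch)
        cl <- stats::kmeans(t(pool), centers = k, nstart = 10)$cluster
        # Median of each cluster gives one robust consensus factor
        consensus <- sapply(seq_len(k), function(i) {
            apply(pool[, cl == i, drop = FALSE], 1, stats::median)
        })
        # L1-normalize columns, matching the scaling applied downstream
        sweep(consensus, 2, colSums(consensus), "/")
    }

    # Toy usage: 10 random "replicates" of a 200-gene x 5-factor loading matrix
    set.seed(1)
    reps <- replicate(10, matrix(runif(200 * 5), 200, 5), simplify = FALSE)
    W_consensus <- poolConsensus(reps, k = 5)
    dim(W_consensus)  # 200 x 5

In the actual code path, the pooled W and per-dataset V loadings are clustered jointly, the top k clusters are kept, and the resulting consensus W and V matrices seed a final ANLS pass that solves the H matrices.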