diff --git a/R/RcppExports.R b/R/RcppExports.R index 929a5c7..874af8e 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -9,10 +9,6 @@ colNormalize_dense_cpp <- function(x, L) { .Call(`_rliger2_colNormalize_dense_cpp`, x, L) } -select_factor_cpp <- function(all_data, knn, threshold) { - .Call(`_rliger2_select_factor_cpp`, all_data, knn, threshold) -} - colAggregateMedian_dense_cpp <- function(x, group, n) { .Call(`_rliger2_colAggregateMedian_dense_cpp`, x, group, n) } diff --git a/R/cINMF.R b/R/cINMF.R index f8ef85c..b8d5013 100644 --- a/R/cINMF.R +++ b/R/cINMF.R @@ -3,9 +3,9 @@ #' Performs consensus integrative non-negative matrix factorization (c-iNMF) #' to return factorized \eqn{H}, \eqn{W}, and \eqn{V} matrices. We run the #' regular iNMF multiple times with different random starts, and then take the -#' consensus of the factorized matrices factor gene loading matrix \eqn{W} and -#' \eqn{V} matrices for each dataset. The cell factor loading \eqn{H} matrices -#' are eventually solved with the consensus \eqn{W} and \eqn{V} matrices. +#' consensus of frequently appearing factors from gene loading matrices, \eqn{W} +#' and \eqn{V}. The cell factor loading \eqn{H} matrices are eventually solved +#' with the consensus \eqn{W} and \eqn{V} matrices. #' #' Please see \code{\link{runINMF}} for detailed introduction to the regular #' iNMF algorithm which is run multiple times in this function. @@ -21,11 +21,13 @@ #' @param lambda Regularization parameter. Larger values penalize #' dataset-specific effects more strongly (i.e. alignment should increase as #' \code{lambda} increases). Default \code{5}. +#' @param rho Numeric number between 0 and 1. Fraction for determining the +#' number of nearest neighbors to look at for consensus (by +#' \code{rho * nRandomStarts}). Default \code{0.3}. #' @param nIteration Total number of block coordinate descent iterations to #' perform. Default \code{30}. -#' @param nRandomStarts Number of restarts to perform (iNMF objective function -#' is non-convex, so taking the best objective from multiple successive -#' initialization is recommended). Default \code{1}. +#' @param nRandomStarts Number of replicate runs for creating the pool of +#' factorization results. Default \code{10}. #' @param HInit Initial values to use for \eqn{H} matrices. A list object where #' each element is the initial \eqn{H} matrix of each dataset. Default #' \code{NULL}. @@ -35,6 +37,8 @@ #' each element is the initial \eqn{V} matrix of each dataset. Default #' \code{NULL}. #' @param seed Random seed to allow reproducible results. Default \code{1}. +#' @param nCores The number of parallel tasks to speed up the computation. +#' Default \code{2L}. Only supported for platform with OpenMP support. #' @param verbose Logical. Whether to show information of the progress. Default #' \code{getOption("ligerVerbose")} or \code{TRUE} if users have not set. #' @param ... Arguments passed to methods. @@ -63,8 +67,12 @@ #' \code{misc} slot of the same DimReduc object.} #' }} #' } -#' @references Joshua D. Welch and et al., Single-Cell Multi-omic Integration -#' Compares and Contrasts Features of Brain Cell Identity, Cell, 2019 +#' @references +#' Joshua D. 
Welch and et al., Single-Cell Multi-omic Integration Compares and +#' Contrasts Features of Brain Cell Identity, Cell, 2019 +#' +#' Dylan Kotliar and et al., Identifying gene expression programs of cell-type +#' identity and cellular activity with single-cell RNA-Seq, eLife, 2019 #' @examples #' pbmc <- normalize(pbmc) #' pbmc <- selectGenes(pbmc) @@ -77,7 +85,6 @@ runCINMF <- function( k = 20, lambda = 5.0, rho = 0.3, - tao = 0.1, ... ) { UseMethod("runCINMF", object) @@ -91,7 +98,6 @@ runCINMF.liger <- function( k = 20, lambda = 5.0, rho = 0.3, - tao = 0.1, nIteration = 30, nRandomStarts = 10, HInit = NULL, @@ -119,7 +125,6 @@ runCINMF.liger <- function( k = k, lambda = lambda, rho = rho, - tao = tao, nIteration = nIteration, nRandomStarts = nRandomStarts, HInit = HInit, @@ -129,6 +134,7 @@ runCINMF.liger <- function( nCores = nCores, verbose = verbose ) + # return(out) object@W <- out$W for (d in names(object)) { object@datasets[[d]]@H <- out$H[[d]] @@ -159,7 +165,6 @@ runCINMF.Seurat <- function( k = 20, lambda = 5.0, rho = 0.3, - tao = 0.1, datasetVar = "orig.ident", layer = "ligerScaleData", assay = NULL, @@ -201,7 +206,6 @@ runCINMF.Seurat <- function( k = k, lambda = lambda, rho = rho, - tao = tao, nIteration = nIteration, nRandomStarts = nRandomStarts, HInit = HInit, @@ -238,7 +242,6 @@ runCINMF.Seurat <- function( k = 20, lambda = 5.0, rho = 0.3, - tao = 0.1, nIteration = 30, nRandomStarts = 10, HInit = NULL, @@ -287,19 +290,70 @@ runCINMF.Seurat <- function( verbose = FALSE) Ws[[repName]] <- out$W for (n in names(object)) Vs[[n]][[repName]] <- out$V[[n]] - # for (n in names(object)) Hs[[n]][[repName]] <- out$H[[n]] + for (n in names(object)) Hs[[n]][[repName]] <- out$H[[n]] if (isTRUE(verbose)) cli::cli_progress_update( status = sprintf("[ %d / %d ]", i, nRandomStarts), set = i ) } + # return(list(W = Ws, V = Vs, H = Hs)) if (isTRUE(verbose)) cliID <- cli::cli_process_start("Taking the consensus") - W <- takeConsensus(Ws, rho = rho, tao = tao) - Vs <- lapply(Vs, takeConsensus, rho = rho, tao = tao) + # Attempt 1 #### + # W <- takeConsensus(Ws, rho = rho, tao = tao) + # Vs <- lapply(Vs, takeConsensus, rho = rho, tao = tao) + + # Attempt 2 #### + # factorSelect <- takeConsensus2(Ws, rho = rho, tao = tao) + # W <- Reduce(cbind, Ws)[, factorSelect$idx] + # W <- colNormalize_dense_cpp(W, L = 2) + # W <- colAggregateMedian_dense_cpp(W, group = factorSelect$cluster - 1, n = k) + # W <- colNormalize_dense_cpp(W, L = 1) + # for (i in seq_along(object)) { + # V <- Reduce(cbind, Vs[[i]])[, factorSelect$idx] + # V <- colNormalize_dense_cpp(V, L = 2) + # V <- colAggregateMedian_dense_cpp(V, group = factorSelect$cluster - 1, n = k) + # Vs[[i]] <- colNormalize_dense_cpp(V, L = 1) + # } + + # Attempt 3 #### + # W <- Reduce(cbind, Ws) + # selection <- cluster_sel(W, nNeighbor = 3, k = k, resolution = 0.8) + # W <- W[, selection$idx] + # W <- colNormalize_dense_cpp(W, L = 2) + # print(table(selection$cluster - 1)) + # W <- colAggregateMedian_dense_cpp(W, group = selection$cluster, n = k) + # W <- colNormalize_dense_cpp(W, L = 1) + # for (i in seq_along(Vs)) { + # V <- Reduce(cbind, Vs[[i]]) + # V <- V[, selection$idx] + # V <- colNormalize_dense_cpp(V, L = 2) + # V <- colAggregateMedian_dense_cpp(V, group = selection$cluster, n = k) + # Vs[[i]] <- colNormalize_dense_cpp(V, L = 1) + # } + + # Attempt 4 #### + nNeighbor <- round(rho * nRandomStarts) + geneLoadings <- list(Reduce(cbind, Ws)) + for (i in seq_along(object)) { + geneLoadings[[i + 1]] <- Reduce(cbind, Vs[[i]]) + } + geneLoadings <- 
lapply(geneLoadings, colNormalize_dense_cpp, L = 2) + # geneLoadings is a list, each element is a ngene by (nFactor * nRandomStart) matrix + # The first is for W, the rest are for Vs + selection <- factor_cluster_sel(geneLoadings, nNeighbor = nNeighbor, + minWeight = 0.6, k = k, resolution = 0.2) + W <- geneLoadings[[1]][, selection$idx] + W <- colAggregateMedian_dense_cpp(W, group = selection$cluster, n = k) + W <- colNormalize_dense_cpp(W, L = 1) + for (i in seq_along(Vs)) { + V <- geneLoadings[[i + 1]][, selection$idx] + V <- colAggregateMedian_dense_cpp(V, group = selection$cluster, n = k) + Vs[[i]] <- colNormalize_dense_cpp(V, L = 1) + } - # if (isTRUE(verbose)) cli::cli_process_done(id = cliID) - # - msg <- "Solving last round of NNLS problems" + if (isTRUE(verbose)) cli::cli_process_done(id = cliID) + + msg <- "Solving the last ANLS iterations" if (isTRUE(verbose)) cliID <- cli::cli_process_start(msg = msg) for (i in seq_along(object)) { Hs[[i]] <- inmf_solveH(NULL, W, Vs[[i]], object[[i]], lambda, nCores = nCores) @@ -307,15 +361,12 @@ runCINMF.Seurat <- function( # for (i in seq_along(object)) { # Vs[[i]] <- inmf_solveV(Hs[[i]], W, NULL, object[[i]], lambda, nCores = nCores) # } - # W <- inmf_solveW(Hs, NULL, Vs, object, lambda, nCores = nCores) # objErr <- sum(sapply(seq_along(object), function(i) # inmf_objErr_i(H = Hs[[i]], W = W, V = Vs[[i]], E = object[[i]], # lambda = lambda) # )) - out <- .runINMF.list(object, k = k, lambda = lambda, nIteration = 30, - HInit = Hs, WInit = W, VInit = Vs, verbose = FALSE) - if (isTRUE(verbose)) - cli::cli_process_done(id = cliID, msg_done = paste(msg, " ... objective error: {out$objErr}")) + # if (isTRUE(verbose)) + # cli::cli_process_done(id = cliID, msg_done = paste(msg, " ... objective error: {objErr}")) # factors <- paste0("Factor_", seq(k)) # dimnames(W) <- list(features, factors) @@ -326,48 +377,100 @@ runCINMF.Seurat <- function( # } # names(Hs) <- names(Vs) <- names(object) # result <- list(W = W, H = Hs, V = Vs, objErr = objErr) + # return(result) + + out <- .runINMF.list(object, k = k, lambda = lambda, nIteration = 1, + HInit = Hs, WInit = W, VInit = Vs, verbose = FALSE) + if (isTRUE(verbose)) + cli::cli_process_done(id = cliID, msg_done = paste(msg, " ... objective error: {out$objErr}")) return(out) - return(result) } -takeConsensus <- function(matList, rho = 0.3, tao = 0.1) { - ## According to cNMF method. - ## G - The program matrix. what we call W or V - ## rho - fraction parameter to set number of nearest neighbors to look at - ## tao - distance threshold to filter out factors - - ## matList are matrices of dimensionality gene x factor - # Step 1: Concatenate and l2 normalize - ## R - a number of replicates - G_R <- Reduce(cbind, matList) # ngene by (nFactor * nRandomStart) - G_R <- colNormalize_dense_cpp(G_R, L = 2) +# takeConsensus <- function(matList, rho = 0.3, tao = 0.1) { +# ## According to cNMF method. +# ## G - The program matrix. 
what we call W or V +# ## rho - fraction parameter to set number of nearest neighbors to look at +# ## tao - distance threshold to filter out factors +# +# ## matList are matrices of dimensionality gene x factor +# # Step 1: Concatenate and l2 normalize +# ## R - a number of replicates +# G_R <- Reduce(cbind, matList) # ngene by (nFactor * nRandomStart) +# G_R <- colNormalize_dense_cpp(G_R, L = 2) +# +# # Step 2: Find kNN matching for each component (factor) +# # and filter by mean distance against the kNN of each +# nNeighbor <- round(rho * length(matList)) +# knn <- RANN::nn2(t(G_R), k = nNeighbor + 1) +# +# # `select_factor_cpp` is a C++ function that returns the indices of the +# # factors that are selected by examining whether the mean euclidean distance +# # to its NNs is less than the threshold `tao`. +# selectedIdx <- rowMeans(knn$nn.dist[,2:(nNeighbor + 1)]) < tal +# # selectedIdx <- select_factor_cpp(all_data = G_R, +# # knn = knn$nn.idx - 1, +# # threshold = tao) +# +# G_R <- G_R[, selectedIdx] # genes by selected factors +# if (ncol(G_R) < ncol(matList[[1]])) { +# cli::cli_abort(c("Too few factors are selected from the pool to take the consensus.", +# "i" = "Please try: ", +# "*" = "a larger {.var tao} for loosen threshold", +# "*" = "a larger {.var nRandomStarts} for more replicates to be used.")) +# } +# # Step 3: Kmeans clustering to get the k consensus +# cluster <- stats::kmeans(t(G_R), centers = ncol(matList[[1]]))$cluster +# G_consensus <- colAggregateMedian_dense_cpp(x = G_R, group = cluster - 1, n = ncol(matList[[1]])) +# G_consensus <- colNormalize_dense_cpp(G_consensus, L = 1) +# return(G_consensus) +# } - # Step 2: Find kNN matching for each component (factor) - # and filter by mean distance against the kNN of each - nNeighbor <- round(rho * length(matList)) - knn <- RANN::nn2(t(G_R), k = nNeighbor + 1) - - # `select_factor_cpp` is a C++ function that returns the indices of the - # factors that are selected by examining whether the mean euclidean distance - # to its NNs is less than the threshold `tao`. - selectedIdx <- select_factor_cpp(all_data = G_R, - knn = knn$nn.idx - 1, - threshold = tao) - - G_R <- G_R[, selectedIdx] # genes by selected factors - if (ncol(G_R) < ncol(matList[[1]])) { - cli::cli_abort(c("Too few factors are selected from the pool to take the consensus.", - "i" = "Please try: ", - "*" = "a larger {.var tao} for loosen threshold", - "*" = "a larger {.var nRandomStarts} for more replicates to be used.")) - } - # Step 3: Kmeans clustering to get the k consensus - cluster <- stats::kmeans(t(G_R), centers = ncol(matList[[1]]))$cluster - G_consensus <- colAggregateMedian_dense_cpp(x = G_R, group = cluster - 1, n = ncol(matList[[1]])) - G_consensus <- colNormalize_dense_cpp(G_consensus, L = 1) - return(G_consensus) -} +# takeConsensus2 <- function(matList, rho = 0.3, tao = 0.1) { +# # 2nd attempt of methods +# # Return the selection of factors from the replicate pool as well as the +# # grouping on the selected ones. So this part of information will be used +# # to select the V factors that represent the same thing as what we want +# # from W +# +# ## According to cNMF method. +# ## G - The program matrix. 
what we call W or V +# ## rho - fraction parameter to set number of nearest neighbors to look at +# ## tao - distance threshold to filter out factors +# +# ## matList are matrices of dimensionality gene x factor +# # Step 1: Concatenate and l2 normalize +# ## R - a number of replicates +# G_R <- Reduce(cbind, matList) # ngene by (nFactor * nRandomStart) +# G_R <- colNormalize_dense_cpp(G_R, L = 2) +# +# # Step 2: Find kNN matching for each component (factor) +# # and filter by mean distance against the kNN of each +# nNeighbor <- round(rho * length(matList)) +# knn <- RANN::nn2(t(G_R), k = nNeighbor + 1) +# nnDistMeans <- rowMeans(knn$nn.dist[,2:(nNeighbor + 1)]) +# selectedIdx <- nnDistMeans < tao +# # `select_factor_cpp` is a C++ function that returns the indices of the +# # factors that are selected by examining whether the mean euclidean distance +# # to its NNs is less than the threshold `tao`. +# # selectedIdx <- select_factor_cpp(all_data = G_R, +# # knn = knn$nn.idx - 1, +# # threshold = tao) +# +# G_R <- G_R[, selectedIdx] # genes by selected factors +# if (ncol(G_R) < ncol(matList[[1]])) { +# cli::cli_abort(c("Too few factors are selected from the pool to take the consensus.", +# "i" = "Please try: ", +# "*" = "a larger {.var tao} for loosen threshold", +# "*" = "a larger {.var nRandomStarts} for more replicates to be used.")) +# } +# # Step 3: Kmeans clustering to get the k consensus +# cluster <- stats::kmeans(t(G_R), centers = ncol(matList[[1]]))$cluster +# return(list(idx = selectedIdx, cluster = cluster)) +# # G_consensus <- colAggregateMedian_dense_cpp(x = G_R, group = cluster - 1, n = ncol(matList[[1]])) +# # G_consensus <- colNormalize_dense_cpp(G_consensus, L = 1) +# # return(G_consensus) +# } inmf_solveH <- function(H, W, V, E, lambda, nCores = 2L) { WV <- W + V @@ -377,54 +480,116 @@ inmf_solveH <- function(H, W, V, E, lambda, nCores = 2L) { return(t(H)) # return cell x factor } -inmf_solveV <- function(H, W, V, E, lambda, nCores = 2L) { - HtH <- t(H) %*% H - CtC <- (1 + lambda) * HtH - CtB <- t(H) %*% t(E) - HtH %*% t(W) - V <- RcppPlanc::bppnnls_prod(CtC, as.matrix(CtB), nCores = nCores) - return(t(V)) -} +# inmf_solveV <- function(H, W, V, E, lambda, nCores = 2L) { +# HtH <- t(H) %*% H +# CtC <- (1 + lambda) * HtH +# CtB <- t(H) %*% t(E) - HtH %*% t(W) +# V <- RcppPlanc::bppnnls_prod(CtC, as.matrix(CtB), nCores = nCores) +# return(t(V)) +# } -# W <- solveNNLS( -# C = rbindlist(mat_list = H), -# B = rbindlist(mat_list = lapply( -# X = 1:N, -# FUN = function(i) { -# return(E[[i]] - H[[i]] %*% V[[i]]) +# inmf_solveW <- function(Hs, W, Vs, Es, lambda, nCores = 2L) { +# CtC <- matrix(0, ncol(Vs[[1]]), ncol(Vs[[1]])) +# CtB <- matrix(0, ncol(Vs[[1]]), nrow(Vs[[1]])) +# for (i in seq_along(Es)) { +# HtH <- t(Hs[[i]]) %*% Hs[[i]] +# CtC <- CtC + HtH +# CtB <- CtB + as.matrix(t(Hs[[i]]) %*% t(Es[[i]])) - HtH %*% t(Vs[[i]]) +# } +# W <- RcppPlanc::bppnnls_prod(CtC, CtB, nCores = nCores) +# return(t(W)) +# } + +# inmf_objErr_i <- function(H, W, V, E, lambda) { +# # Objective error function was originally stated as: +# # obj_i = ||E_i - (W + V_i)*H_i||_F^2 + lambda * ||V_i*H_i||_F^2 +# # (Use caution with the matrix dimensionality, as we might transpose them +# # occasionally in different implementation.) +# # +# # Let L = W + V +# # ||E - LH||_F^2 = ||E||_F^2 - 2*Tr(Ht*(Et*L)) + Tr((Lt*L)*(Ht*H)) +# # ||V*H||_F^2 = Tr((Vt*V)*(Ht*H)) +# # This improves the performance in both speed and memory usage. 
+# L <- W + V +# sqnormE <- Matrix::norm(E, "F")^2 +# LtL <- t(L) %*% L +# HtH <- t(H) %*% H +# TrLtLHtH <- sum(diag(LtL %*% HtH)) +# EtL <- t(E) %*% L +# TrHtEtL <- sum(Matrix::diag(t(H) %*% EtL)) +# VtV <- t(V) %*% V +# TrVtVHtH <- sum(diag(VtV %*% HtH)) +# obj <- sqnormE - 2 * TrHtEtL + TrLtLHtH + lambda * TrVtVHtH +# return(obj) +# } + + +# cluster_sel <- function( +# W, +# nNeighbor = 3, +# k = 20, +# resolution = 0.8 +# ) { +# knn <- RANN::nn2(t(W), k = nNeighbor + 1) +# edges <- numeric() +# for (i in 1:nrow(knn$nn.idx)) { +# for (j in 2:(nNeighbor + 1)) { +# edges <- c(edges, i, knn$nn.idx[i, j]) # } -# )) -# ) -inmf_solveW <- function(Hs, W, Vs, Es, lambda, nCores = 2L) { - CtC <- matrix(0, ncol(Vs[[1]]), ncol(Vs[[1]])) - CtB <- matrix(0, ncol(Vs[[1]]), nrow(Vs[[1]])) - for (i in seq_along(Es)) { - HtH <- t(Hs[[i]]) %*% Hs[[i]] - CtC <- CtC + HtH - CtB <- CtB + as.matrix(t(Hs[[i]]) %*% t(Es[[i]])) - HtH %*% t(Vs[[i]]) - } - W <- RcppPlanc::bppnnls_prod(CtC, CtB, nCores = nCores) - return(t(W)) -} +# } +# weights <- numeric() +# for (i in 1:nrow(knn$nn.idx)) { +# for (j in 2:(nNeighbor + 1)) { +# weights <- c(weights, knn$nn.dist[i, j]) +# } +# } +# weights <- exp(-weights/0.5) +# graph <- igraph::graph(edges, directed = FALSE) +# cluster <- clusterGraph(graph, weights, resolution, k) +# select <- names(sort(table(cluster), decreasing = TRUE))[1:k] +# idx <- cluster %in% select +# cluster <- cluster[cluster %in% select] +# cluster <- as.integer(factor(cluster)) - 1L +# return(list(idx = idx, cluster = cluster)) +# } + +# clusterGraph <- function(graph, weights, resolution, minCluster) { +# cluster <- igraph::cluster_leiden(graph, resolution_parameter = resolution, +# weights = weights, objective_function = "mod") +# cluster <- cluster$membership +# if (length(unique(cluster)) <= minCluster) { +# cluster <- clusterGraph(graph, weights, resolution + 0.1, minCluster) +# } +# return(cluster) +# } + + + +factor_cluster_sel <- function(geneLoadings, nNeighbor = 3, minWeight = 0.6, k = 20, + resolution = 0.2) { + graphList <- lapply(geneLoadings, function(w) { + knn <- RANN::nn2(t(w), k = nNeighbor + 1) + target <- as.integer(t(knn$nn.idx)) + source <- rep(seq_len(nrow(knn$nn.idx)), each = ncol(knn$nn.idx)) + weight <- as.numeric(t(knn$nn.dists)) + weight <- exp(-weight/0.5) + graphmat <- cbind(source, target, weight) + graphmat <- graphmat[graphmat[,3] > minWeight,] + return(graphmat) + }) + graphmat <- Reduce(rbind, graphList) + + # Leiden cluster. CPP implementation expects 0-based vertex index + edgevec <- as.integer(t(graphmat[,1:2])) - 1L + cluster <- leidenAlg::find_partition_with_rep_rcpp( + edgelist = edgevec, edgelist_length = length(edgevec), + num_vertices = ncol(geneLoadings[[1]]), direction = FALSE, niter = 5, + edge_weights = graphmat[,3], resolution = resolution, nrep = 20 + ) -inmf_objErr_i <- function(H, W, V, E, lambda) { - # Objective error function was originally stated as: - # obj_i = ||E_i - (W + V_i)*H_i||_F^2 + lambda * ||V_i*H_i||_F^2 - # (Use caution with the matrix dimensionality, as we might transpose them - # occasionally in different implementation.) - # - # Let L = W + V - # ||E - LH||_F^2 = ||E||_F^2 - 2*Tr(Ht*(Et*L)) + Tr((Lt*L)*(Ht*H)) - # ||V*H||_F^2 = Tr((Vt*V)*(Ht*H)) - # This improves the performance in both speed and memory usage. 
- L <- W + V - sqnormE <- Matrix::norm(E, "F")^2 - LtL <- t(L) %*% L - HtH <- t(H) %*% H - TrLtLHtH <- sum(diag(LtL %*% HtH)) - EtL <- t(E) %*% L - TrHtEtL <- sum(Matrix::diag(t(H) %*% EtL)) - VtV <- t(V) %*% V - TrVtVHtH <- sum(diag(VtV %*% HtH)) - obj <- sqnormE - 2 * TrHtEtL + TrLtLHtH + lambda * TrVtVHtH - return(obj) + # Select top k clusters + selIdx <- cluster %in% names(sort(table(cluster), decreasing = TRUE))[seq_len(k)] + cluster <- cluster[selIdx] + cluster <- as.integer(factor(cluster)) - 1L + return(list(idx = selIdx, cluster = cluster)) } diff --git a/R/clustering.R b/R/clustering.R index d6efee0..c16e0e7 100644 --- a/R/clustering.R +++ b/R/clustering.R @@ -98,9 +98,7 @@ runCluster <- function( niter = nIterations, nrep = nRandomStarts ) } else { - edgeOutPath <- paste0("edge_", sub("\\s", "_", Sys.time()), '.txt') - edgeOutPath <- gsub("-", "", edgeOutPath) - edgeOutPath <- gsub(":", "", edgeOutPath) + edgeOutPath <- tempfile(pattern = "edge_", fileext = ".txt") WriteEdgeFile(snn, edgeOutPath, display_progress = FALSE) clusts <- RunModularityClusteringCpp( snn, diff --git a/R/integration.R b/R/integration.R index e7e148b..9e817ac 100644 --- a/R/integration.R +++ b/R/integration.R @@ -176,6 +176,8 @@ runIntegration.Seurat <- function( #' each element is the initial \eqn{V} matrix of each dataset. Default #' \code{NULL}. #' @param seed Random seed to allow reproducible results. Default \code{1}. +#' @param nCores The number of parallel tasks to speed up the computation. +#' Default \code{2L}. Only supported for platform with OpenMP support. #' @param verbose Logical. Whether to show information of the progress. Default #' \code{getOption("ligerVerbose")} or \code{TRUE} if users have not set. #' @param ... Arguments passed to methods. @@ -235,6 +237,7 @@ runINMF.liger <- function( WInit = NULL, VInit = NULL, seed = 1, + nCores = 2L, verbose = getOption("ligerVerbose", TRUE), ... ) { @@ -260,6 +263,7 @@ runINMF.liger <- function( WInit = WInit, VInit = VInit, seed = seed, + nCores = nCores, verbose = verbose ) @@ -305,6 +309,7 @@ runINMF.Seurat <- function( WInit = NULL, VInit = NULL, seed = 1, + nCores = 2L, verbose = getOption("ligerVerbose", TRUE), ... ) { @@ -337,6 +342,7 @@ runINMF.Seurat <- function( WInit = WInit, VInit = VInit, seed = seed, + nCores = nCores, verbose = verbose ) Hconcat <- t(Reduce(cbind, res$H)) @@ -371,6 +377,7 @@ runINMF.Seurat <- function( WInit = NULL, VInit = NULL, seed = 1, + nCores = 2L, verbose = getOption("ligerVerbose", TRUE) ) { if (!requireNamespace("RcppPlanc", quietly = TRUE)) # nocov start @@ -393,7 +400,7 @@ runINMF.Seurat <- function( set.seed(seed = seed + i - 1) out <- RcppPlanc::inmf(objectList = object, k = k, lambda = lambda, niter = nIteration, Hinit = HInit, - Vinit = VInit, Winit = WInit, + Vinit = VInit, Winit = WInit, nCores = nCores, verbose = verbose) if (out$objErr < bestObj) { bestResult <- out @@ -591,6 +598,8 @@ optimizeALS <- function( # nocov start #' @param minibatchSize Total number of cells in each minibatch. See detail. #' Default \code{5000}. #' @param seed Random seed to allow reproducible results. Default \code{1}. +#' @param nCores The number of parallel tasks to speed up the computation. +#' Default \code{2L}. Only supported for platform with OpenMP support. #' @param verbose Logical. Whether to show information of the progress. Default #' \code{getOption("ligerVerbose")} or \code{TRUE} if users have not set. #' @param ... Arguments passed to other S3 methods of this function. 
@@ -668,6 +677,7 @@ runOnlineINMF.liger <- function( AInit = NULL, BInit = NULL, seed = 1, + nCores = 2L, verbose = getOption("ligerVerbose", TRUE), ... ) { @@ -748,7 +758,7 @@ runOnlineINMF.liger <- function( minibatchSize = minibatchSize, HALSiter = HALSiter, verbose = verbose, WInit = WInit, VInit = VInit, AInit = AInit, - BInit = BInit, seed = seed) + BInit = BInit, seed = seed, nCores = nCores) if (!isTRUE(projection)) { # Scenario 1&2, everything updated for (i in seq_along(object)) { @@ -797,6 +807,7 @@ runOnlineINMF.liger <- function( HALSiter = 1, minibatchSize = 5000, seed = 1, + nCores = 2L, verbose = getOption("ligerVerbose", TRUE), ... ) { @@ -819,7 +830,7 @@ runOnlineINMF.liger <- function( minibatchSize = minibatchSize, maxHALSIter = HALSiter, Vinit = VInit, Winit = WInit, Ainit = AInit, Binit = BInit, - verbose = verbose) + nCores = nCores, verbose = verbose) factorNames <- paste0("Factor_", seq(k)) if (isTRUE(projection)) { # Scenario 3 only got H for new datasets @@ -869,6 +880,7 @@ runOnlineINMF.Seurat <- function( HALSiter = 1, minibatchSize = 5000, seed = 1, + nCores = 2L, verbose = getOption("ligerVerbose", TRUE), ... ) { @@ -896,6 +908,7 @@ runOnlineINMF.Seurat <- function( newDatasets = NULL, projection = FALSE, maxEpochs = maxEpochs, HALSiter = HALSiter, minibatchSize = minibatchSize, seed = seed, verbose = verbose, + nCores = nCores, WInit = NULL, VInit = NULL, AInit = NULL, BInit = NULL, ) Hconcat <- t(Reduce(cbind, res$H)) @@ -1062,6 +1075,8 @@ online_iNMF <- function( # nocov start #' the random seed by 1 for each consecutive restart, so future factorization #' of the same dataset can be run with one rep if necessary. Default \code{1}. #' @param seed Random seed to allow reproducible results. Default \code{1}. +#' @param nCores The number of parallel tasks to speed up the computation. +#' Default \code{2L}. Only supported for platform with OpenMP support. #' @param verbose Logical. Whether to show information of the progress. Default #' \code{getOption("ligerVerbose")} or \code{TRUE} if users have not set. #' @param ... Arguments passed to other methods and wrapped functions. @@ -1103,10 +1118,6 @@ runUINMF <- function( object, k = 20, lambda = 5, - nIteration = 30, - nRandomStarts = 1, - seed = 1, - verbose = getOption("ligerVerbose", TRUE), ... ) { UseMethod("runUINMF", object) @@ -1122,6 +1133,7 @@ runUINMF.liger <- function( nIteration = 30, nRandomStarts = 1, seed = 1, + nCores = 2L, verbose = getOption("ligerVerbose", TRUE), ... ) { @@ -1144,7 +1156,7 @@ runUINMF.liger <- function( } res <- .runUINMF.list(Elist, Ulist, k = k, lambda = lambda, nIteration = nIteration, - nRandomStarts = nRandomStarts, + nRandomStarts = nRandomStarts, nCores = nCores, seed = seed, verbose = verbose, ...) 
for (d in names(object)) { ld <- dataset(object, d) @@ -1170,6 +1182,7 @@ runUINMF.liger <- function( nIteration = 30, nRandomStarts = 1, seed = 1, + nCores = 2L, verbose = getOption("ligerVerbose", TRUE) ) { if (!requireNamespace("RcppPlanc", quietly = TRUE)) # nocov start @@ -1189,7 +1202,8 @@ runUINMF.liger <- function( seed <- seed + i - 1 set.seed(seed) res <- RcppPlanc::uinmf(object, unsharedList, k = k, lambda = lambda, - niter = nIteration, verbose = verbose) + niter = nIteration, nCores = nCores, + verbose = verbose) if (res$objErr < bestObj) { bestRes <- res bestObj <- res$objErr diff --git a/man/runCINMF.Rd b/man/runCINMF.Rd index 783626a..8142b61 100644 --- a/man/runCINMF.Rd +++ b/man/runCINMF.Rd @@ -6,14 +6,13 @@ \alias{runCINMF.Seurat} \title{Perform consensus iNMF on scaled datasets} \usage{ -runCINMF(object, k = 20, lambda = 5, rho = 0.3, tao = 0.1, ...) +runCINMF(object, k = 20, lambda = 5, rho = 0.3, ...) \method{runCINMF}{liger}( object, k = 20, lambda = 5, rho = 0.3, - tao = 0.1, nIteration = 30, nRandomStarts = 10, HInit = NULL, @@ -30,7 +29,6 @@ runCINMF(object, k = 20, lambda = 5, rho = 0.3, tao = 0.1, ...) k = 20, lambda = 5, rho = 0.3, - tao = 0.1, datasetVar = "orig.ident", layer = "ligerScaleData", assay = NULL, @@ -59,14 +57,17 @@ higher \code{k} will be needed for datasets with more sub-structure. Default dataset-specific effects more strongly (i.e. alignment should increase as \code{lambda} increases). Default \code{5}.} +\item{rho}{Numeric number between 0 and 1. Fraction for determining the +number of nearest neighbors to look at for consensus (by +\code{rho * nRandomStarts}). Default \code{0.3}.} + \item{...}{Arguments passed to methods.} \item{nIteration}{Total number of block coordinate descent iterations to perform. Default \code{30}.} -\item{nRandomStarts}{Number of restarts to perform (iNMF objective function -is non-convex, so taking the best objective from multiple successive -initialization is recommended). Default \code{1}.} +\item{nRandomStarts}{Number of replicate runs for creating the pool of +factorization results. Default \code{10}.} \item{HInit}{Initial values to use for \eqn{H} matrices. A list object where each element is the initial \eqn{H} matrix of each dataset. Default @@ -81,6 +82,9 @@ each element is the initial \eqn{V} matrix of each dataset. Default \item{seed}{Random seed to allow reproducible results. Default \code{1}.} +\item{nCores}{The number of parallel tasks to speed up the computation. +Default \code{2L}. Only supported for platform with OpenMP support.} + \item{verbose}{Logical. Whether to show information of the progress. Default \code{getOption("ligerVerbose")} or \code{TRUE} if users have not set.} @@ -125,9 +129,9 @@ feature key. Default \code{"cinmf"}.} Performs consensus integrative non-negative matrix factorization (c-iNMF) to return factorized \eqn{H}, \eqn{W}, and \eqn{V} matrices. We run the regular iNMF multiple times with different random starts, and then take the -consensus of the factorized matrices factor gene loading matrix \eqn{W} and -\eqn{V} matrices for each dataset. The cell factor loading \eqn{H} matrices -are eventually solved with the consensus \eqn{W} and \eqn{V} matrices. +consensus of frequently appearing factors from gene loading matrices, \eqn{W} +and \eqn{V}. The cell factor loading \eqn{H} matrices are eventually solved +with the consensus \eqn{W} and \eqn{V} matrices. 
Please see \code{\link{runINMF}} for detailed introduction to the regular iNMF algorithm which is run multiple times in this function. @@ -144,6 +148,9 @@ if (requireNamespace("RcppPlanc", quietly = TRUE)) { } } \references{ -Joshua D. Welch and et al., Single-Cell Multi-omic Integration -Compares and Contrasts Features of Brain Cell Identity, Cell, 2019 +Joshua D. Welch and et al., Single-Cell Multi-omic Integration Compares and +Contrasts Features of Brain Cell Identity, Cell, 2019 + +Dylan Kotliar and et al., Identifying gene expression programs of cell-type +identity and cellular activity with single-cell RNA-Seq, eLife, 2019 } diff --git a/man/runINMF.Rd b/man/runINMF.Rd index d46a572..78dcea4 100644 --- a/man/runINMF.Rd +++ b/man/runINMF.Rd @@ -18,6 +18,7 @@ runINMF(object, k = 20, lambda = 5, ...) WInit = NULL, VInit = NULL, seed = 1, + nCores = 2L, verbose = getOption("ligerVerbose", TRUE), ... ) @@ -36,6 +37,7 @@ runINMF(object, k = 20, lambda = 5, ...) WInit = NULL, VInit = NULL, seed = 1, + nCores = 2L, verbose = getOption("ligerVerbose", TRUE), ... ) @@ -77,6 +79,9 @@ each element is the initial \eqn{V} matrix of each dataset. Default \item{seed}{Random seed to allow reproducible results. Default \code{1}.} +\item{nCores}{The number of parallel tasks to speed up the computation. +Default \code{2L}. Only supported for platform with OpenMP support.} + \item{verbose}{Logical. Whether to show information of the progress. Default \code{getOption("ligerVerbose")} or \code{TRUE} if users have not set.} diff --git a/man/runOnlineINMF.Rd b/man/runOnlineINMF.Rd index 12904b9..2751a94 100644 --- a/man/runOnlineINMF.Rd +++ b/man/runOnlineINMF.Rd @@ -22,6 +22,7 @@ runOnlineINMF(object, k = 20, lambda = 5, ...) AInit = NULL, BInit = NULL, seed = 1, + nCores = 2L, verbose = getOption("ligerVerbose", TRUE), ... ) @@ -38,6 +39,7 @@ runOnlineINMF(object, k = 20, lambda = 5, ...) HALSiter = 1, minibatchSize = 5000, seed = 1, + nCores = 2L, verbose = getOption("ligerVerbose", TRUE), ... ) @@ -79,6 +81,9 @@ See detail. Default \code{NULL}.} \item{seed}{Random seed to allow reproducible results. Default \code{1}.} +\item{nCores}{The number of parallel tasks to speed up the computation. +Default \code{2L}. Only supported for platform with OpenMP support.} + \item{verbose}{Logical. Whether to show information of the progress. Default \code{getOption("ligerVerbose")} or \code{TRUE} if users have not set.} diff --git a/man/runUINMF.Rd b/man/runUINMF.Rd index 05e0e63..3a13b1c 100644 --- a/man/runUINMF.Rd +++ b/man/runUINMF.Rd @@ -5,16 +5,7 @@ \alias{runUINMF.liger} \title{Perform Mosaic iNMF (UINMF) on scaled datasets with unshared features} \usage{ -runUINMF( - object, - k = 20, - lambda = 5, - nIteration = 30, - nRandomStarts = 1, - seed = 1, - verbose = getOption("ligerVerbose", TRUE), - ... -) +runUINMF(object, k = 20, lambda = 5, ...) \method{runUINMF}{liger}( object, @@ -23,6 +14,7 @@ runUINMF( nIteration = 30, nRandomStarts = 1, seed = 1, + nCores = 2L, verbose = getOption("ligerVerbose", TRUE), ... ) @@ -40,6 +32,8 @@ higher \code{k} will be needed for datasets with more sub-structure. Default dataset-specific effects more strongly (i.e. alignment should increase as \code{lambda} increases). Default \code{5}.} +\item{...}{Arguments passed to other methods and wrapped functions.} + \item{nIteration}{Total number of block coordinate descent iterations to perform. Default \code{30}.} @@ -51,10 +45,11 @@ of the same dataset can be run with one rep if necessary. 
Default \code{1}.} \item{seed}{Random seed to allow reproducible results. Default \code{1}.} +\item{nCores}{The number of parallel tasks to speed up the computation. +Default \code{2L}. Only supported for platform with OpenMP support.} + \item{verbose}{Logical. Whether to show information of the progress. Default \code{getOption("ligerVerbose")} or \code{TRUE} if users have not set.} - -\item{...}{Arguments passed to other methods and wrapped functions.} } \value{ \itemize{ diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 62e6146..c179d26 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -43,19 +43,6 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } -// select_factor_cpp -Rcpp::LogicalVector select_factor_cpp(arma::mat all_data, arma::mat knn, double threshold); -RcppExport SEXP _rliger2_select_factor_cpp(SEXP all_dataSEXP, SEXP knnSEXP, SEXP thresholdSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< arma::mat >::type all_data(all_dataSEXP); - Rcpp::traits::input_parameter< arma::mat >::type knn(knnSEXP); - Rcpp::traits::input_parameter< double >::type threshold(thresholdSEXP); - rcpp_result_gen = Rcpp::wrap(select_factor_cpp(all_data, knn, threshold)); - return rcpp_result_gen; -END_RCPP -} // colAggregateMedian_dense_cpp arma::mat colAggregateMedian_dense_cpp(const arma::mat& x, const arma::uvec& group, const arma::uword n); RcppExport SEXP _rliger2_colAggregateMedian_dense_cpp(SEXP xSEXP, SEXP groupSEXP, SEXP nSEXP) { @@ -336,7 +323,6 @@ END_RCPP static const R_CallMethodDef CallEntries[] = { {"_rliger2_RunModularityClusteringCpp", (DL_FUNC) &_rliger2_RunModularityClusteringCpp, 9}, {"_rliger2_colNormalize_dense_cpp", (DL_FUNC) &_rliger2_colNormalize_dense_cpp, 2}, - {"_rliger2_select_factor_cpp", (DL_FUNC) &_rliger2_select_factor_cpp, 3}, {"_rliger2_colAggregateMedian_dense_cpp", (DL_FUNC) &_rliger2_colAggregateMedian_dense_cpp, 3}, {"_rliger2_scaleNotCenter_byRow_rcpp", (DL_FUNC) &_rliger2_scaleNotCenter_byRow_rcpp, 1}, {"_rliger2_scaleNotCenter_byRow_perDataset_rcpp", (DL_FUNC) &_rliger2_scaleNotCenter_byRow_perDataset_rcpp, 3}, diff --git a/src/cinmf_util.cpp b/src/cinmf_util.cpp index dfe8f57..d11b73e 100644 --- a/src/cinmf_util.cpp +++ b/src/cinmf_util.cpp @@ -16,40 +16,6 @@ arma::mat colNormalize_dense_cpp(arma::mat& x, const arma::uword L) { return result; } -// all_data: n features by nFactor * nRep factors -// knn: nFactor * nRep factors by k + 1 nearest neighbors. 
The first column is -// the index of the factor itself -// threshold: threshold of maximum mean euclidean distance from a factor to its -// nearest neighbors -// Return: The one-based index of selected factors -// [[Rcpp::export()]] -Rcpp::LogicalVector select_factor_cpp( - arma::mat all_data, - arma::mat knn, - double threshold) { - int k = knn.n_cols - 1; - arma::vec obs(all_data.n_rows); - arma::vec neighbor(all_data.n_rows); - double dist, mean_dist; - Rcpp::LogicalVector selected_factors(all_data.n_cols); - for (int j = 0; j < knn.n_rows; ++j) { - obs = all_data.col(j); - mean_dist = 0; - for (int i = 1; i < knn.n_cols; ++i) { - neighbor = all_data.col(knn(j, i)); - dist = arma::norm(obs - neighbor, 2); - mean_dist += dist; - } - mean_dist /= k; - if (mean_dist < threshold) { - selected_factors(j) = true; - } else { - selected_factors(j) = false; - } - } - return selected_factors; -} - // x: n features by n selected factors // group: n-selected-factor integer vector, pre-transformed to 0-base, // from upstream kmeans clustering.
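
Note (not part of the patch): the consensus step introduced above — pool the gene-loading columns from every random start, L2-normalize them, group recurring factors, and take per-cluster medians as the consensus loadings — can be summarized with a minimal, self-contained R sketch. It uses only base R and stats (kmeans stands in for the kNN-graph + Leiden clustering that factor_cluster_sel performs), and the helper name poolConsensus is hypothetical, for illustration only.

    # Illustrative sketch of the consensus idea; assumes nonnegative loadings
    # as produced by iNMF. `poolConsensus` is a hypothetical name.
    poolConsensus <- function(matList, k) {
        # matList: replicate gene-loading matrices (genes x k each);
        # pool the columns into one genes x (k * nRandomStarts) matrix
        pool <- Reduce(cbind, matList)
        # L2-normalize each column so distances reflect direction, not scale
        pool <- sweep(pool, 2, sqrt(colSums(pool^2)), "/")
        # Group similar factors across replicates (kmeans here stands in for
        # the kNN-graph + Leiden clustering used in the patch)
        cl <- stats::kmeans(t(pool), centers = k, nstart = 10)$cluster
        # Median of each cluster gives one robust consensus factor
        consensus <- sapply(seq_len(k), function(i) {
            apply(pool[, cl == i, drop = FALSE], 1, stats::median)
        })
        # L1-normalize columns, matching the scaling applied downstream
        sweep(consensus, 2, colSums(consensus), "/")
    }

    # Toy usage: 10 random "replicates" of a 200-gene x 5-factor loading matrix
    set.seed(1)
    reps <- replicate(10, matrix(runif(200 * 5), 200, 5), simplify = FALSE)
    W_consensus <- poolConsensus(reps, k = 5)
    dim(W_consensus)  # 200 x 5

In the actual code path, the pooled W and per-dataset V loadings are clustered jointly, the top k clusters are kept, and the resulting consensus W and V matrices seed a final ANLS pass that solves the H matrices.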