Skip to content

Commit

Permalink
improved gene embedding and smoothed counts functions
Browse files Browse the repository at this point in the history
  • Loading branch information
jr-leary7 committed Oct 18, 2023
1 parent 719a5e5 commit b38cdf2
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 16 deletions.
55 changes: 39 additions & 16 deletions R/embedGenes.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#' Generate a table of fitted values and celltype metadata for genes of interest.
#' Generate PCA & UMAP embeddings of fitted gene dynamics.
#'
#' @name embedGenes
#' @author Jack Leary
Expand All @@ -9,8 +9,9 @@
#' @description Embed genes in dimension-reduced space given a smoothed counts matrix.
#' @param smoothed.counts The output from \code{\link{smoothedCountsMatrix}}. Defaults to NULL.
#' @param genes A character vector of genes to embed. If not specified, all genes in \code{smoothed.counts} are used. Defaults to NULL.
#' @param pca.init A boolean specifying whether or not the embedded PCs should be used as initialization for clustering and UMAP. The default is to cluster / embed the raw dynamics i.e., defaults to FALSE.
#' @param pc.embed (Optional) How many PCs should be used to cluster the genes and run UMAP? Defaults to 30.
#' @param pcs.return (Optional) How many principal components should be included in the output? Defaults to 2.
#' @param pc.return (Optional) How many principal components should be included in the output? Defaults to 2.
#' @param cluster.genes (Optional) Should genes be clustered in PCA space using the Leiden algorithm? Defaults to TRUE.
#' @param gene.meta.data (Optional) A data.frame of metadata values for each gene (HVG status, Ensembl ID, gene biotype, etc.) that will be included in the result table. Defaults to NULL.
#' @param k.param (Optional) The value of nearest-neighbors used in creating the SNN graph prior to clustering & in running UMAP. Defaults to 20.
Expand All @@ -27,8 +28,9 @@

embedGenes <- function(smoothed.counts = NULL,
genes = NULL,
pca.init = FALSE,
pc.embed = 30,
pcs.return = 2,
pc.return = 2,
cluster.genes = TRUE,
gene.meta.data = NULL,
k.param = 20,
Expand All @@ -44,20 +46,41 @@ embedGenes <- function(smoothed.counts = NULL,
n = pc.embed,
center = TRUE,
scale. = TRUE)
smoothed_counts_umap <- uwot::umap(smoothed_counts_pca$x,
n_components = 2,
metric = "cosine",
n_neighbors = k.param,
init = "spectral")
if (pca.init) {
smoothed_counts_umap <- uwot::umap(smoothed_counts_pca$x,
n_components = 2,
metric = "cosine",
n_neighbors = k.param,
init = "spectral",
nn_method = "annoy")
} else {
smoothed_counts_umap <- uwot::umap(smoothed.counts,
n_components = 2,
metric = "cosine",
n_neighbors = k.param,
init = "spectral",
nn_method = "annoy")
}
# clustering w/ silhouette score parameter tuning
if (cluster.genes) {
smoothed_counts_snn <- bluster::makeSNNGraph(smoothed_counts_pca$x,
k = k.param,
type = "jaccard",
BNPARAM = BiocNeighbors::AnnoyParam(distance = "Cosine"))
if (pca.init) {
smoothed_counts_snn <- bluster::makeSNNGraph(smoothed_counts_pca$x,
k = k.param,
type = "jaccard",
BNPARAM = BiocNeighbors::AnnoyParam(distance = "Cosine"))
} else {
smoothed_counts_snn <- bluster::makeSNNGraph(smoothed.counts,
k = k.param,
type = "jaccard",
BNPARAM = BiocNeighbors::AnnoyParam(distance = "Cosine"))
}
if (is.null(resolution.param)) {
dist_matrix <- stats::as.dist(1 - coop::tcosine(x = smoothed_counts_pca$x))
clust_runs <- purrr::map(c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7), \(r) {
if (pca.init) {
dist_matrix <- stats::as.dist(1 - coop::tcosine(x = smoothed_counts_pca$x))
} else {
dist_matrix <- stats::as.dist(1 - coop::tcosine(x = smoothed.counts))
}
clust_runs <- purrr::map(seq(0.1, 0.7, by = 0.1), \(r) {
smoothed_counts_clust <- igraph::cluster_leiden(smoothed_counts_snn,
objective_function = "modularity",
resolution_parameter = r)
Expand All @@ -83,8 +106,8 @@ embedGenes <- function(smoothed.counts = NULL,
cluster_vec <- NA_integer_
}
# prepare results
pca_df <- as.data.frame(smoothed_counts_pca$x[, seq(pcs.return)])
colnames(pca_df) <- paste0("pc", seq(pcs.return))
pca_df <- as.data.frame(smoothed_counts_pca$x[, seq(pc.return)])
colnames(pca_df) <- paste0("pc", seq(pc.return))
gene_df <- data.frame(gene = genes,
leiden = cluster_vec,
umap1 = smoothed_counts_umap[, 1],
Expand Down
5 changes: 5 additions & 0 deletions R/smoothedCountsMatrix.R
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#' @param pt A data.frame of pseudotime values for each cell. Defaults to NULL.
#' @param size.factor.offset (Optional) An offset to be used to rescale the fitted values. Can be generated easily with \code{\link{createCellOffset}}. No need to provide if the GEE backend was used. Defaults to NULL.
#' @param genes (Optional) A character vector of genes with which to subset the results. Defaults to NULL.
#' @param log1p.norm A boolean specifying whether the smoothed counts should be log1p-transformed after depth normalization. Defaults to FALSE.
#' @param parallel.exec Should \code{furrr} be used to speed up execution? Defaults to TRUE.
#' @param n.cores If parallel execution is desired, how many cores should be utilized? Defaults to 2.
#' @return A list of matrices of smoothed counts, with each element of the list being a single pseudotime lineage.
Expand All @@ -31,6 +32,7 @@ smoothedCountsMatrix <- function(test.dyn.res = NULL,
size.factor.offset = NULL,
pt = NULL,
genes = NULL,
log1p.norm = FALSE,
parallel.exec = TRUE,
n.cores = 2) {
# check inputs
Expand Down Expand Up @@ -65,6 +67,9 @@ smoothedCountsMatrix <- function(test.dyn.res = NULL,
}) %>%
purrr::reduce(cbind)
colnames(fitted_vals_mat) <- names(fitted_vals_list)
if (log1p.norm) {
fitted_vals_mat <- log1p(fitted_vals_mat)
}
return(fitted_vals_mat)
})
names(lineage_mat_list) <- paste0("Lineage_", lineages)
Expand Down

0 comments on commit b38cdf2

Please sign in to comment.