Skip to content

Commit

Permalink
Merge pull request #127 from jr-leary7/dev
Browse files Browse the repository at this point in the history
updated tests, docs, and added gene embedding fn -- closes #126
  • Loading branch information
jr-leary7 authored Sep 25, 2023
2 parents 63c3c18 + 3e83ffa commit 5248ffe
Show file tree
Hide file tree
Showing 8 changed files with 206 additions and 65 deletions.
2 changes: 2 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ URL: https://github.com/jr-leary7/scLANE
BugReports: https://github.com/jr-leary7/scLANE/issues
Suggests:
covr,
coop,
uwot,
ggh4x,
knitr,
irlba,
Expand Down
3 changes: 2 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

export(clusterGenes)
export(createCellOffset)
export(embedGenes)
export(enrichDynamicGenes)
export(extractBreakpoints)
export(fitGLMM)
Expand Down Expand Up @@ -95,11 +96,11 @@ importFrom(purrr,reduce)
importFrom(scales,label_comma)
importFrom(scales,label_number)
importFrom(splines,bs)
importFrom(stats,as.dist)
importFrom(stats,as.formula)
importFrom(stats,coef)
importFrom(stats,cutree)
importFrom(stats,deviance)
importFrom(stats,dist)
importFrom(stats,fitted)
importFrom(stats,fitted.values)
importFrom(stats,hclust)
Expand Down
15 changes: 11 additions & 4 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,14 +1,21 @@
# scLANE 0.7.2
# `scLANE` 0.7.3

* Added a function named `embedGenes()` that takes a smoothed counts matrix as input & returns PCA & UMAP embeddings along with a graph-based clustering.
* Updated the `clusterGenes()` function to be much more efficient as well as changing the distance metric used to be cosine dissimilarity.
* Added `theme_scLANE()` for output plots.
* Enhanced documentation.

# `scLANE` 0.7.2

* Added a function named `sortGenesHeatmap()` that aids in the creation of expression cascade heatmaps by sorting genes according to where in pseudotime their peak expression is.
* Changed the parameter `approx.knot` in the `testDynamic()` function to use (stochasticity-controlled) subsampling instead of `seq()` to reduce candidate knot space.
* Added `summarizeModels()` to sum up slopes across pseudotime intervals
* Added `summarizeModels()` to sum up slopes across pseudotime intervals.

# scLANE 0.7.1
# `scLANE` 0.7.1

* Changed input format of all functions to allow counts matrices formatted as `SingleCellExperiment` or `Seurat` objects, sparse matrices, or dense matrices.
* Updated visualization functions to reflect changes made in `ggplot2` v3.4 (mostly changing the `size` parameter in line-based geoms to be `linewidth` instead).

# scLANE 0.6.3
# `scLANE` 0.6.3

* Added a `NEWS.md` file to track changes to the package.
71 changes: 25 additions & 46 deletions R/clusterGenes.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@
#'
#' @name clusterGenes
#' @author Jack Leary
#' @description This function takes as input the output from \code{\link{testDynamic}} and clusters the fitted values from the model for each gene using one of several user-chosen algorithms. An approximately optimal clustering is determined by iterating over reasonable hyperparameter values & choosing the value with the highest mean silhouette score.
#' @description This function takes as input the output from \code{\link{testDynamic}} and clusters the fitted values from the model for each gene using one of several user-chosen algorithms. An approximately optimal clustering is determined by iterating over reasonable hyperparameter values & choosing the value with the highest mean silhouette score based on the cosine distance.
#' @import magrittr
#' @importFrom purrr map discard map2 reduce
#' @importFrom stats setNames hclust cutree kmeans dist
#' @importFrom stats setNames hclust cutree kmeans as.dist
#' @param test.dyn.res The list returned by \code{\link{testDynamic}} - no extra processing required. Defaults to NULL.
#' @param pt A data.frame containing the pseudotime or latent time estimates for each cell. Defaults to NULL.
#' @param size.factor.offset (Optional) An offset to be used to rescale the fitted values. Can be generated easily with \code{\link{createCellOffset}}. No need to provide if the GEE backend was used. Defaults to NULL.
Expand All @@ -19,6 +19,7 @@
#' }
#' @return A data.frame of with three columns: \code{Gene}, \code{Lineage}, and \code{Cluster}.
#' @seealso \code{\link{testDynamic}}
#' @seealso \code{\link{embedGenes}}
#' @seealso \code{\link{plotClusteredGenes}}
#' @export
#' @examples
Expand Down Expand Up @@ -74,102 +75,80 @@ clusterGenes <- function(test.dyn.res = NULL,
n = n.PC,
center = TRUE,
scale. = TRUE)
dist_matrix <- stats::as.dist(1 - coop::tcosine(x = fitted_vals_pca$x))
} else {
dist_matrix <- stats::as.dist(1 - coop::tcosine(x = fitted_vals_mat))
}
# hierarchical clustering routine w/ Ward's linkage
if (clust.algo == "hclust") {
if (use.pca) {
hclust_tree <- stats::hclust(stats::dist(fitted_vals_pca$x), method = "ward.D2")
} else {
hclust_tree <- stats::hclust(stats::dist(fitted_vals_mat), method = "ward.D2")
}
hclust_tree <- stats::hclust(dist_matrix, method = "ward.D2")
k_vals <- c(2:10)
sil_vals <- vector("numeric", 9L)
clust_list <- vector("list", 9L)
for (k in seq_along(k_vals)) {
clust_res <- stats::cutree(hclust_tree, k = k_vals[k])
if (use.pca) {
sil_res <- cluster::silhouette(clust_res, stats::dist(fitted_vals_pca$x))
} else {
sil_res <- cluster::silhouette(clust_res, stats::dist(fitted_vals_mat))
}
sil_vals[k] <- mean(sil_res[, 3]) # silhouette widths stored in third column
sil_res <- cluster::silhouette(clust_res, dist_matrix)
sil_vals[k] <- mean(sil_res[, 3])
clust_list[[k]] <- clust_res
}
k_to_use <- k_vals[which.max(sil_vals)]
clust_res <- stats::cutree(hclust_tree, k = k_to_use)
gene_clusters <- data.frame(Gene = rownames(fitted_vals_mat),
Lineage = lineages[l],
Cluster = clust_res)
Cluster = clust_list[[which.max(sil_vals)]])
# k-means clustering routine w/ Hartigan-Wong algorithm
} else if (clust.algo == "kmeans") {
k_vals <- c(2:10)
sil_vals <- vector("numeric", 9L)
clust_list <- vector("list", 9L)
for (k in seq_along(k_vals)) {
if (use.pca) {
clust_res <- stats::kmeans(fitted_vals_pca$x,
centers = k_vals[k],
nstart = 5,
algorithm = "Hartigan-Wong")
sil_res <- cluster::silhouette(clust_res$cluster, stats::dist(fitted_vals_pca$x))
} else {
clust_res <- stats::kmeans(fitted_vals_mat,
centers = k_vals[k],
nstart = 5,
algorithm = "Hartigan-Wong")
sil_res <- cluster::silhouette(clust_res$cluster, stats::dist(fitted_vals_mat))
}
sil_vals[k] <- mean(sil_res[, 3]) # silhouette widths stored in third column
}
k_to_use <- k_vals[which.max(sil_vals)]
if (use.pca) {
clust_res <- stats::kmeans(fitted_vals_pca$x,
centers = k_to_use,
nstart = 10,
algorithm = "Hartigan-Wong")
} else {
clust_res <- stats::kmeans(fitted_vals_mat,
centers = k_to_use,
nstart = 10,
algorithm = "Hartigan-Wong")
clust_list[[k]] <- clust_res
sil_res <- cluster::silhouette(clust_res$cluster, dist_matrix)
sil_vals[k] <- mean(sil_res[, 3])
}
gene_clusters <- data.frame(Gene = rownames(fitted_vals_mat),
Lineage = lineages[l],
Cluster = clust_res$cluster)
Cluster = clust_list[[which.max(sil_vals)]]$cluster)
# Leiden clustering routine
} else if (clust.algo == "leiden") {
if (use.pca) {
fitted_vals_graph <- bluster::makeSNNGraph(x = fitted_vals_pca$x,
k = 30,
k = 20,
type = "jaccard",
BNPARAM = BiocNeighbors::AnnoyParam(distance = "Euclidean"))
BNPARAM = BiocNeighbors::AnnoyParam(distance = "Cosine"))
} else {
fitted_vals_graph <- bluster::makeSNNGraph(x = fitted_vals_mat,
k = 30,
k = 20,
type = "jaccard",
BNPARAM = BiocNeighbors::AnnoyParam(distance = "Euclidean"))
BNPARAM = BiocNeighbors::AnnoyParam(distance = "Cosine"))
}
res_vals <- seq(0.1, 1, by = 0.1)
sil_vals <- vector("numeric", 10L)
clust_list <- vector("list", 10L)
for (r in seq_along(res_vals)) {
clust_res <- igraph::cluster_leiden(graph = fitted_vals_graph,
objective_function = "modularity",
resolution_parameter = res_vals[r])
if (clust_res$nb_clusters == 1) {
sil_vals[r] <- 0
} else {
if (use.pca) {
sil_res <- cluster::silhouette(clust_res$membership, stats::dist(fitted_vals_pca$x))
} else {
sil_res <- cluster::silhouette(clust_res$membership, stats::dist(fitted_vals_mat))
}
sil_res <- cluster::silhouette(clust_res$membership, dist_matrix)
sil_vals[r] <- mean(sil_res[, 3])
}
clust_list[[r]] <- clust_res$membership
}
res_to_use <- res_vals[which.max(sil_vals)]
clust_res <- igraph::cluster_leiden(graph = fitted_vals_graph,
objective_function = "modularity",
resolution_parameter = res_to_use)
gene_clusters <- data.frame(Gene = rownames(fitted_vals_mat),
Lineage = lineages[l],
Cluster = clust_res$membership)
Cluster = clust_list[[which.max(sil_vals)]])
}
gene_cluster_list[[l]] <- gene_clusters
}
Expand Down
78 changes: 78 additions & 0 deletions R/embedGenes.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
#' Generate a table of fitted values and celltype metadata for genes of interest.
#'
#' @name embedGenes
#' @author Jack Leary
#' @import magrittr
#' @importFrom purrr map map_dbl
#' @importFrom dplyr bind_cols
#' @importFrom stats as.dist
#' @description Embed genes in dimension-reduced space given a smoothed counts matrix.
#' @param smoothed.counts The output from \code{\link{smoothedCountsMatrix}}. Defaults to NULL.
#' @param genes A character vector of genes to embed. If not specified, all genes in \code{test.dyn.res} are used. Defaults to NULL.
#' @param pc.embed (Optional) How many PCs should be used to cluster the genes and run UMAP? Defaults to 30.
#' @param pcs.return (Optional) How many principal components should be included in the output? Defaults to 2.
#' @param cluster.genes (Optional) Should genes be clustered in PCA space using the Leiden algorithm? Defaults to TRUE.
#' @param gene.meta.data (Optional) A data.frame of metadata values for each gene (HVG status, Ensembl ID, gene biotype, etc.) that will be included in the result table. Defaults to NULL.
#' @param k.param (Optional) The value of nearest-neighbors used in creating the SNN graph prior to clustering & in running UMAP. Defaults to 20.
#' @param random.seed (Optional) The random seed used to control stochasticity in the clustering algorithm. Defaults to 312.
#' @return A data.frame containing embedding coordinates, cluster IDs, and metadata for each gene.
#' @export
#' @examples
#' \dontrun{
#' embedGenes(smoothed_counts$Lineage_A,
#' pcs.return = 3,
#' cluster.genes = TRUE)
#' }

embedGenes <- function(smoothed.counts = NULL,
genes = NULL,
pc.embed = 30,
pcs.return = 2,
cluster.genes = TRUE,
gene.meta.data = NULL,
k.param = 20,
random.seed = 312) {
# check inputs
if (is.null(smoothed.counts)) { stop("You forgot to provide a smoothed counts matrix to embedGenes().") }
# embeddings
set.seed(random.seed)
smoothed_counts_pca <- irlba::prcomp_irlba(t(smoothed.counts),
pc.embed = pc.embed,
center = TRUE,
scale. = TRUE)
smoothed_counts_umap <- uwot::umap(smoothed_counts_pca$x,
n_components = 2,
metric = "cosine",
n_neighbors = k.param,
init = "spectral")
# clustering w/ silhouette score parameter tuning
smoothed_counts_snn <- bluster::makeSNNGraph(smoothed_counts_pca$x,
k = k.param,
type = "jaccard",
BNPARAM = BiocNeighbors::AnnoyParam(distance = "Cosine"))
dist_matrix <- stats::as.dist(1 - coop::tcosine(x = smoothed_counts_pca$x))
clust_runs <- purrr::map(c(0.1, 0.2, 0.3, 0.4, 0.5), \(r) {
smoothed_counts_clust <- igraph::cluster_leiden(smoothed_counts_snn,
objective_function = "modularity",
resolution_parameter = r)
if (smoothed_counts_clust$nb_clusters == 1) {
sil_val <- 0
} else {
sil_val <- mean(cluster::silhouette(as.integer(smoothed_counts_clust$membership - 1L), dist_matrix)[, 3])
}
clust_res <- list(clusters = as.factor(smoothed_counts_clust$membership - 1L),
resolution = r,
silhouette = sil_val)
return(clust_res)
})
best_clustering <- which.max(purrr::map_dbl(clust_runs, \(x) x$silhouette))
# prepare results
pca_df <- as.data.frame(smoothed_counts_pca$x[, seq(pcs.return)])
colnames(pca_df) <- paste0("pc", seq(pcs.return))
gene_clust_df <- data.frame(gene = colnames(smoothed.counts),
leiden = clust_runs[[best_clustering]]$clusters,
umap1 = smoothed_counts_umap[, 1],
umap2 = smoothed_counts_umap[, 2])
gene_clust_df <- dplyr::bind_cols(gene_clust_df, pca_df)
return(gene_clust_df)
}
4 changes: 3 additions & 1 deletion man/clusterGenes.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

50 changes: 50 additions & 0 deletions man/embedGenes.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 5248ffe

Please sign in to comment.