From 408a1af4403c3642195f59b3f89ec85496363c2f Mon Sep 17 00:00:00 2001 From: jr-leary7 Date: Thu, 30 Dec 2021 14:14:31 -0500 Subject: [PATCH] Major documentation update --- .Rbuildignore | 1 + DESCRIPTION | 30 ++++--- LICENSE | 2 + LICENSE.md | 21 +++++ NAMESPACE | 15 +++- R/AnnotateMarkerGenes.R | 44 +++++----- R/ChoosePCs.R | 4 +- R/ComputeSilhouetteScores.R | 11 ++- R/ConvertGeneOrthologs.R | 7 +- R/CosineDist.R | 5 +- R/FindSpecificMarkers.R | 10 ++- R/FindSubpopulationMarkers.R | 53 ++++++------ R/IntegrateSubclusters.R | 13 ++- R/PrepareData.R | 140 ++++++++++++++++---------------- R/ReclusterCells.R | 104 ++++++++++++------------ R/ReduceDimensions.R | 7 +- R/RunPHATE.R | 5 +- R/theme_yehlab.R | 19 +++-- man/AnnotateMarkerGenes.Rd | 9 +- man/ChoosePCs.Rd | 6 +- man/ComputeSilhouetteScores.Rd | 7 +- man/ConvertGeneOrthologs.Rd | 7 +- man/CosineDist.Rd | 2 +- man/FindSpecificMarkers.Rd | 7 +- man/FindSubpopulationMarkers.Rd | 13 +-- man/IntegrateSubclusters.Rd | 9 +- man/PrepareData.Rd | 16 +++- man/ReclusterCells.Rd | 22 +++-- man/ReduceDimensions.Rd | 10 ++- man/RunPHATE.Rd | 5 +- 30 files changed, 348 insertions(+), 256 deletions(-) create mode 100644 LICENSE create mode 100644 LICENSE.md diff --git a/.Rbuildignore b/.Rbuildignore index 31f11cd..008c3d6 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -1,2 +1,3 @@ ^YehLabClust\.Rproj$ ^\.Rproj\.user$ +^LICENSE\.md$ diff --git a/DESCRIPTION b/DESCRIPTION index ef591e2..e77cd68 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,16 +1,26 @@ Package: SCISSORS -Title: A workflow to identify cell subpopulations in single cell RNA-seq. 
-Version: 0.0.2.0 -Authors@R: - person(given = "Jack", - family = "Leary", - role = c("aut", "cre"), - email = "jrleary@live.unc.edu") -Depends: R (>= 3.5.0), Seurat (>= 3.0), biomaRt, cluster, data.table, ggplot2, SingleCellExperiment -Suggests: phateR +Title: Identify cell subpopulations in single cell RNA-seq data +Version: 1.0.0 +Author: Jack Leary [aut] +Maintainer: Jack Leary Description: This package implements a method (to be published soon) that allows users to easily identify cell subtypes and / or subpopulations in scRNA-seq data. After running the necessary `Seurat` processing steps, the user decides which clusters they think are good candidates for subpopulation-detection based on cluster size, t-SNE visualizations, and / or identified cell cluster type. For example, if the user sees a large, fairly homogeneous cluster that they are fairly sure contains immune cells, they can use our method to tease out T cell subtypes, NK cells, B cells, etc. from within the original cluster. The method is predicated on the usage of a cosine-distance based silhouette score that is calculated for each cluster. Several different sets of parameters are used, and the subclustering that maximizes this mean silhouette score is considered optimal. The package is built around the `Seurat` package, but `SingleCellExperiment`-formatted data is accepted as input as well (it will be converted to `Seurat` format, though). We also support the identification of differentially-expressed marker genes for each identified subpopulation, which can be used to characterize known or novel cell subtypes at the transcriptomic level. 
-License: `use_mit_license()` +License: MIT + file LICENSE Encoding: UTF-8 LazyData: true Roxygen: list(markdown = TRUE) RoxygenNote: 7.1.1 +Depends: + magrittr +Imports: + Seurat, + stats, + cluster, + phateR, + SingleCellExperiment, + biomaRt, + Matrix, + matrixStats, + dplyr, + ggplot2, + SeuratObject +URL: https://github.com/jr-leary7/SCISSORS diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..b0381a5 --- /dev/null +++ b/LICENSE @@ -0,0 +1,2 @@ +YEAR: 2021 +COPYRIGHT HOLDER: SCISSORS authors diff --git a/LICENSE.md b/LICENSE.md new file mode 100644 index 0000000..039e22a --- /dev/null +++ b/LICENSE.md @@ -0,0 +1,21 @@ +# MIT License + +Copyright (c) 2021 SCISSORS authors + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/NAMESPACE b/NAMESPACE index a552c32..8ee2a30 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -12,22 +12,29 @@ export(ReclusterCells) export(ReduceDimensions) export(RunPHATE) export(theme_yehlab) -import(Seurat) -import(biomaRt) import(magrittr) -importClassesFrom(SingleCellExperiment,SingleCellExperiment) importFrom(Matrix,t) +importFrom(Seurat,CellCycleScoring) importFrom(Seurat,CreateDimReducObject) +importFrom(Seurat,DefaultAssay) importFrom(Seurat,DimPlot) importFrom(Seurat,Embeddings) importFrom(Seurat,FindAllMarkers) +importFrom(Seurat,FindClusters) +importFrom(Seurat,FindMarkers) +importFrom(Seurat,FindNeighbors) importFrom(Seurat,GetAssayData) importFrom(Seurat,Idents) +importFrom(Seurat,PercentageFeatureSet) importFrom(Seurat,RunPCA) importFrom(Seurat,RunTSNE) importFrom(Seurat,RunUMAP) +importFrom(Seurat,SCTransform) importFrom(Seurat,Stdev) importFrom(Seurat,VariableFeatures) +importFrom(SeuratObject,as.Seurat) +importFrom(SeuratObject,colSums) +importFrom(biomaRt,getBM) importFrom(biomaRt,getLDS) importFrom(biomaRt,useMart) importFrom(cluster,silhouette) @@ -46,4 +53,6 @@ importFrom(ggplot2,labs) importFrom(ggplot2,theme) importFrom(matrixStats,rowVars) importFrom(phateR,phate) +importFrom(stats,as.dist) importFrom(stats,dist) +importFrom(stats,quantile) diff --git a/R/AnnotateMarkerGenes.R b/R/AnnotateMarkerGenes.R index 5f3d39a..69015b6 100644 --- a/R/AnnotateMarkerGenes.R +++ b/R/AnnotateMarkerGenes.R @@ -1,8 +1,10 @@ -#' Annotate differentially expressed genes using biomaRt +#' Annotate differentially expressed genes using \code{biomaRt}. #' -#' This function uses the `biomaRt` package to fetch a user-defined list of attributes for a list of dataframes containing differentially expressed genes. Intended to be run directly after `FindSubpopulationMarkers()`. 
-#' @import biomaRt -#' @param marker.genes The dataframe of marker genes generated by `FindSubpopulationMarkers()` +#' @name AnnotateMarkerGenes +#' @author Jack Leary +#' @description This function uses the \code{biomaRt} package to fetch a user-defined list of attributes for a list of dataframes containing differentially expressed genes. Intended to be run directly after \code{\link{FindSubpopulationMarkers}}. +#' @importFrom biomaRt useMart getBM +#' @param marker.genes The dataframe of marker genes generated by \code{\link{FindSubpopulationMarkers}}. #' @param species The species of the cells being analyzed. Defaults to "human", but also supports "mouse". #' @param desired.annos The vector containing the annotations you'd like to retrieve for each gene. @@ -12,33 +14,33 @@ AnnotateMarkerGenes <- function(marker.genes = NULL, # check inputs if (is.null(marker.genes)) { stop("Please supply a list of dataframes containing marker genes.") } if (is.null(desired.attrs)) { stop("Please supply a vector of annotations you'd like to generate.") } - # run function + # create marts if (species == "human") { - mart <- useMart("ensembl", dataset = "hsapiens_gene_ensembl") + mart <- biomaRt::useMart("ensembl", dataset = "hsapiens_gene_ensembl") } else if (species == "mouse") { - mart <- useMart("ensembl", dataset = "mmusculus_gene_ensembl") + mart <- biomaRt::useMart("ensembl", dataset = "mmusculus_gene_ensembl") } + # retrieve annotations genes <- marker.genes$gene if (species == "human") { - annos <- getBM(attributes = c("hgnc_symbol", desired.annos), - mart = mart, - filters = "hgnc_symbol", - values = genes) + annos <- biomaRt::getBM(attributes = c("hgnc_symbol", desired.annos), + mart = mart, + filters = "hgnc_symbol", + values = genes) } else if (species == "mouse") { - annos <- getBM(attributes = desired.annos, - mart = mart, - filters = "mgi_symbol", - values = genes) + annos <- biomaRt::getBM(attributes = desired.annos, + mart = mart, + filters = "mgi_symbol", + 
values = genes) } - + # prepare result dataframe for (i in seq(unique(marker.genes$cluster))) { clust_df <- marker.genes[marker.genes$cluster == unique(marker.genes$cluster)[i], ] genes <- clust_df$gene - annos <- getBM(attributes = c("hgnc_symbol", desired.annos), - mart = mart, - filters = "hgnc_symbol", - values = genes) + annos <- biomaRt::getBM(attributes = c("hgnc_symbol", desired.annos), + mart = mart, + filters = "hgnc_symbol", + values = genes) } - return(anno_genes) } diff --git a/R/ChoosePCs.R b/R/ChoosePCs.R index da015c3..3e355a7 100644 --- a/R/ChoosePCs.R +++ b/R/ChoosePCs.R @@ -2,10 +2,10 @@ #' #' @name ChoosePCs #' @author Jack Leary -#' @description This function uses the eigenvalues of the principal component matrix to determine the best number of PCs. The default can be chosen automatically, or given by the user. It is intended to be run after `RunPCA()`. +#' @description This function uses the eigenvalues of the principal component matrix to determine the best number of PCs. The default can be chosen automatically, or given by the user. It is intended to be run after \code{\link[Seurat]{RunPCA}}. #' @importFrom matrixStats rowVars #' @importFrom Seurat GetAssayData Stdev -#' @param seurat.object The object containing our single cell counts and principal component matrix. Defaults to NULL. +#' @param seurat.obj The object containing our single cell counts and principal component matrix. Defaults to NULL. #' @param cutoff The cutoff value for cumulative proportion of variance explained. Can be set by the user, or can be determine automatically. Defaults to NULL. #' @export #' @examples diff --git a/R/ComputeSilhouetteScores.R b/R/ComputeSilhouetteScores.R index 1464d33..e3eb5e2 100644 --- a/R/ComputeSilhouetteScores.R +++ b/R/ComputeSilhouetteScores.R @@ -1,4 +1,4 @@ -#' Calculate the mean silhouette score of a clustering. +#' Calculate the silhouette score of a clustering. 
#' #' @name ComputeSilhouetteScores #' @author Jack Leary @@ -7,9 +7,9 @@ #' @importFrom stats dist #' @importFrom cluster silhouette #' @param seurat.obj The input object for which silhouette score will be computed. Defaults to NULL. -#' @param dist.metric Which distanec metric should be used? Defaults to "cosine", but any of the metrics used by \code{\link[stats]{dist}} will work. +#' @param dist.metric Which distance metric should be used? Defaults to "cosine", but any of the metrics used by \code{\link[stats]{dist}} will work. #' @param avg Should the average scores for each cluster be returned, or should a dataframe of every observation's cluster identity and score be returned? Defaults to TRUE. -#' @importFrom cluster silhouette +#' @seealso \code{\link{CosineDist}}. #' @export #' @examples #' \dontrun{ComputeSilhouetteScores(seurat.obj)} @@ -23,7 +23,7 @@ ComputeSilhouetteScores <- function(seurat.obj = NULL, dist.metric = "cosine", a pca_mat <- as.matrix(pca_df) # calculate distance matrix -- default is cosine dissimilarity if (dist.metric == "cosine") { - pc_dists <- CosineDist(input = pca_mat) + pc_dists <- CosineDist(input.mat = pca_mat) } else { pc_dists <- stats::dist(x = pca_mat, method = dist.metric) } @@ -35,8 +35,7 @@ ComputeSilhouetteScores <- function(seurat.obj = NULL, dist.metric = "cosine", a avg_widths <- unlist(avg_widths) val <- avg_widths } else { - val <- data.frame(Cluster = as.factor(res[, 1]), - Score = res[, 3]) + val <- data.frame(Cluster = as.factor(res[, 1]), Score = res[, 3]) } return(val) } diff --git a/R/ConvertGeneOrthologs.R b/R/ConvertGeneOrthologs.R index 1d53a73..24fb8d5 100644 --- a/R/ConvertGeneOrthologs.R +++ b/R/ConvertGeneOrthologs.R @@ -1,20 +1,23 @@ #' Convert a vector of MGI symbols to their HGNC orthologs and vice versa. #' #' @name ConvertGeneOrthologs +#' @author Jack Leary #' @description Converts gene names from human -> mouse and mouse -> human. 
#' @importFrom biomaRt useMart getLDS #' @param gene.vec A vector of genes to convert. Default to NULL. #' @param species One of "mm" or "hs". Defaults to "mm" (and thus converts to human). #' @export #' @examples -#' ConvertGeneOrthologs(gene.vec = mouse_genes) -#' ConvertGeneOrthologs(gene.vec = human_genes, species = "hs") +#' \dontrun{ConvertGeneOrthologs(gene.vec = mouse_genes)} +#' \dontrun{ConvertGeneOrthologs(gene.vec = human_genes, species = "hs")} ConvertGeneOrthologs <- function(gene.vec = NULL, species = "mm") { # check inputs species <- tolower(species) + # get marts human <- biomaRt::useMart("ensembl", dataset = "hsapiens_gene_ensembl") mouse <- biomaRt::useMart("ensembl", dataset = "mmusculus_gene_ensembl") + # convert gene names if (species == "mm") { genesV2 <- biomaRt::getLDS(attributes = c("mgi_symbol"), filters = "mgi_symbol", diff --git a/R/CosineDist.R b/R/CosineDist.R index 7328936..3da6fdc 100644 --- a/R/CosineDist.R +++ b/R/CosineDist.R @@ -3,15 +3,16 @@ #' @name CosineDist #' @author Jack Leary #' @description This function takes a matrix as input, and computes the cosine distance (1 - cosine similarity) between the observations. +#' @importFrom stats as.dist #' @param input.mat The input matrix. Defaults to NULL. 
#' @export #' @examples -#' \dontrun{CosineDist(input = pca_matrix)} +#' \dontrun{CosineDist(input.mat = pca_matrix)} CosineDist <- function(input.mat = NULL) { # check inputs -- although as this is a helper function, it should never be called incorrectly if (is.null(input.mat)) { stop("You must provide a matrix to CosineDist().") } - # run function + # compute cosine distance dist_mat <- stats::as.dist(1 - input.mat %*% t(input.mat) / (sqrt(rowSums(input.mat^2) %*% t(rowSums(input.mat^2))))) return(dist_mat) } diff --git a/R/FindSpecificMarkers.R b/R/FindSpecificMarkers.R index e9ffb87..90be9e6 100644 --- a/R/FindSpecificMarkers.R +++ b/R/FindSpecificMarkers.R @@ -7,15 +7,17 @@ #' @importFrom Matrix t #' @importFrom dplyr mutate group_by summarise across filter pull bind_rows #' @importFrom Seurat FindAllMarkers +#' @importFrom stats quantile #' @param seurat.object The \code{Seurat} object containing clusters for which you'd like marker genes identified. Defaults to NULL. #' @param ident.use The cell identity to group by. Defaults to "seurat_clusters". -#' @param de.method The differential expression method used in \code{FindAllMarkers()}. Defaults to "wilcox". +#' @param de.method The differential expression method used in \code{\link[Seurat]{FindAllMarkers}}. Defaults to "wilcox". #' @param perc.cutoff The percentile cutoff used to find highly expressed genes in other cluster. Defaults to 0.9. #' @param log2fc.cutoff The log2FC cutoff used, in part, to determine whether a gene is differentially expressed. Defaults to 0.25. #' @param fdr.cutoff The cutoff used to remove DE genes with non-significant adjusted \emph{p}-values. Defaults to 0.05. 
+#' @seealso \code{\link[Seurat]{FindAllMarkers}} #' @export #' @examples -#' FindSpecificMarkers(seurat_object, method = "wilcox") +#' \dontrun{FindSpecificMarkers(seurat_object, de.method = "wilcox")} FindSpecificMarkers <- function(seurat.object = NULL, ident.use = "seurat_clusters", @@ -30,7 +32,7 @@ FindSpecificMarkers <- function(seurat.object = NULL, as.data.frame() %>% dplyr::mutate(cell_ident = unname(unlist(seurat.object[[ident.use]]))) %>% dplyr::group_by(cell_ident) %>% - dplyr::summarise(across(where(is.numeric), mean)) + dplyr::summarise(dplyr::across(where(is.numeric), mean)) # find list of genes w/ mean expression above 90th percentile of expression in each celltype high_exp_genes <- c() cluster_labels <- c() @@ -40,7 +42,7 @@ FindSpecificMarkers <- function(seurat.object = NULL, dplyr::select(-cell_ident) %>% t() %>% as.data.frame() %>% - dplyr::filter(V1 > quantile(V1, perc.cutoff)) %>% + dplyr::filter(V1 > stats::quantile(V1, perc.cutoff)) %>% dplyr::mutate(gene = rownames(.)) %>% dplyr::pull(gene) high_exp_genes <- c(high_exp_genes, top_exp_genes) diff --git a/R/FindSubpopulationMarkers.R b/R/FindSubpopulationMarkers.R index 83525b4..51aceb3 100644 --- a/R/FindSubpopulationMarkers.R +++ b/R/FindSubpopulationMarkers.R @@ -1,20 +1,21 @@ -#' Identify marker genes for previosuly identified subpopulations. +#' Identify marker genes for previously identified subpopulations. #' #' @name FindSubpopulationMarkers #' @author Jack Leary -#' @description This function determines which cells characterize the subpopulations identified using `ReclusterCells`. It is intended to be run on a single re-clustered `Seurat` object at a time, though if you wish you could +#' @description This function determines which cells characterize the subpopulations identified using \code{\link{ReclusterCells}}. 
It is intended to be run on a single re-clustered \code{Seurat} object at a time, though if you wish you could #' iterate over the list of reclustering results, and save the outputs from this function in a matching array of lists. The function returns a list of dataframes, one dataframe per cluster, containing normal and Bonferroni-adjusted #' p-values, gene prevalence, and effect size in the form of log2 fold change. -#' @import Seurat -#' @param seurat.object The original `Seurat` object containing the entire cell population and related metadata. -#' @param reclust.data A specific`Seurat` object from the list of objects returned by `ReclusterCells`. +#' @importFrom Seurat FindMarkers FindAllMarkers VariableFeatures +#' @param seurat.object The original \code{Seurat} object containing the entire cell population and related metadata. +#' @param reclust.data A specific \code{Seurat} object from the list of objects returned by \code{\link{ReclusterCells}}. #' @param which.compare Should subpopulation marker genes be determined in the context of the entire sample, or solely the single cluster? Defaults to "all cells"; choose "within cluster" to determine marker genes at the cluster level. -#' @param diff.exp.test The test used to calculate differential expression using `FindMarkers`. Defaults to "wilcox". +#' @param diff.exp.test The test used to calculate differential expression using \code{\link[Seurat]{FindMarkers}}. Defaults to "wilcox". #' @param logfc.thresh The log2 fold-change cutoff used when performing differential expression analysis. Defaults to 2. #' @param random.seed (Optional) The seed used to control stochasticity in several functions. Defaults to 629. 
+#' @seealso \code{\link{FindSpecificMarkers}} #' @export #' @examples -#' FindSubpopulationMarkers(seurat.object, reclust.data = reclust_results) +#' \dontrun{FindSubpopulationMarkers(seurat.object, reclust.data = reclust_results)} FindSubpopulationMarkers <- function(seurat.object = NULL, reclust.data = NULL, @@ -23,7 +24,7 @@ FindSubpopulationMarkers <- function(seurat.object = NULL, logfc.thresh = 2, random.seed = 629) { # check inputs - if (is.null(seurat.object) | is.null(reclust.data)) { stop("Please provide the correct inputs.") } + if (is.null(seurat.object) | is.null(reclust.data)) { stop("Please provide 2 Seurat objects to FindSubpopulationMarkers().") } # run function temp_obj <- reclust.data marker_gene_list <- list() @@ -37,17 +38,17 @@ FindSubpopulationMarkers <- function(seurat.object = NULL, print(sprintf("Finding markers genes for subcluster %s using the %s test", unique_clusts[i], diff.exp.test)) - markers <- FindMarkers(big_temp_obj, - slot = "data", - assay = "SCT", - ident.1 = "Subpopulation", - ident.2 = "Other", - group.by = "clust_indicator", - logfc.threshold = logfc.thresh, - verbose = FALSE, - test.use = "wilcox", - only.pos = TRUE, - random.seed = random.seed) + markers <- Seurat::FindMarkers(big_temp_obj, + slot = "data", + assay = "SCT", + ident.1 = "Subpopulation", + ident.2 = "Other", + group.by = "clust_indicator", + logfc.threshold = logfc.thresh, + verbose = FALSE, + test.use = "wilcox", + only.pos = TRUE, + random.seed = random.seed) markers$cluster <- unique_clusts[i] markers$gene <- rownames(markers) marker_gene_list[[i]] <- markers @@ -56,13 +57,13 @@ FindSubpopulationMarkers <- function(seurat.object = NULL, names(marker_gene_list) <- as.character(unique_clusts) } else if (which.compare == "within cluster") { # calculate subpopulation vs. 
single cluster marker genes - markers <- FindAllMarkers(temp_obj, - features = VariableFeatures(temp_obj), - logfc.threshold = logfc.thresh, - test.use = diff.exp.test, - verbose = FALSE, - random.seed = random.seed, - only.pos = TRUE) + markers <- Seurat::FindAllMarkers(temp_obj, + features = Seurat::VariableFeatures(temp_obj), + logfc.threshold = logfc.thresh, + test.use = diff.exp.test, + verbose = FALSE, + random.seed = random.seed, + only.pos = TRUE) } return(markers) } diff --git a/R/IntegrateSubclusters.R b/R/IntegrateSubclusters.R index 489e910..b23b23e 100644 --- a/R/IntegrateSubclusters.R +++ b/R/IntegrateSubclusters.R @@ -2,24 +2,25 @@ #' #' @name IntegrateSubclusters #' @author Jack Leary -#' @description This function takes a list of outputs from ReclusterCells() and integrates the new subcluster identities into the original Seurat object. +#' @description This function takes a list of outputs from \code{\link{ReclusterCells}} and integrates the new subcluster identities into the original Seurat object. #' @importFrom Seurat Idents DimPlot #' @importFrom dplyr case_when #' @importFrom ggplot2 labs theme element_text #' @param original.object The original Seurat object. Defaults to NULL. -#' @param reclust.results A list of reclustering results as output from ReclusterCells(). Defaults to NULL. +#' @param reclust.results A list of reclustering results as output from \code{\link{ReclusterCells}}. Defaults to NULL. #' @param do.plot Should the results be plotted on a dimension reduction plot? Defaults to FALSE. 
+#' @seealso \code{\link{ReclusterCells}} #' @export #' @examples -#' IntegrateSubclusters(original.object = pbmc, reclust.results = my_subclusts) +#' \dontrun{IntegrateSubclusters(original.object = pbmc, reclust.results = my_subclusts)} IntegrateSubclusters <- function(original.object = NULL, reclust.results = NULL, do.plot = FALSE) { # check inputs - if (is.null(original.object) | is.null(reclust.results)) stop("Arguments to IntegrateSubclusters() are missing.") + if (is.null(original.object) | is.null(reclust.results)) { stop("Arguments to IntegrateSubclusters() are missing.") } if (class(reclust.results) != "list") { stop("reclust.results must be of class list.") } if (any(sapply(reclust.results, class) != "Seurat")) { stop("All elements of reclust.results must be Seurat objects.") } - max_clust <- max(as.numeric(original.object$seurat_clusters) - 1) # identify new subclusters + max_clust <- max(as.numeric(original.object$seurat_clusters) - 1) cell_df <- NULL for (i in seq_along(reclust.results)) { n_clust <- length(unique(reclust.results[[i]]$seurat_clusters)) @@ -51,7 +52,6 @@ IntegrateSubclusters <- function(original.object = NULL, reclust.results = NULL, } original.object@meta.data$seurat_clusters <- as.factor(original.object@meta.data$seurat_clusters - 1) Seurat::Idents(original.object) <- "seurat_clusters" - # plot results if desired if (do.plot) { p <- Seurat::DimPlot(original.object) + @@ -60,6 +60,5 @@ IntegrateSubclusters <- function(original.object = NULL, reclust.results = NULL, ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5)) print(p) } - return(original.object) } diff --git a/R/PrepareData.R b/R/PrepareData.R index 8ef10cc..f076a3b 100644 --- a/R/PrepareData.R +++ b/R/PrepareData.R @@ -1,28 +1,32 @@ #' Prepare scRNA-seq data for reclustering. #' +#' @name PrepareData +#' @author Jack Leary #' @description This function prepares single cell data for reclustering analysis. 
The input is a \code{Seurat} object in any stage of pre-processing, or even a \code{SingleCellExperiment} object that will be converted to \code{Seurat} format. The function checks which metadata features (% mitochondrial DNA, cell cycle scores) and assays are present (normalized counts, PCA & t-SNE embeddings), then runs an initial graph-based clustering. -#' @import Seurat -#' @importClassesFrom SingleCellExperiment SingleCellExperiment +#' @importFrom SeuratObject as.Seurat colSums +#' @importFrom Seurat GetAssayData VariableFeatures CellCycleScoring PercentageFeatureSet SCTransform RunPCA RunTSNE RunUMAP FindNeighbors FindClusters DimPlot #' @param seurat.object The object containing the cells you'd like to analyze. Defaults to NULL. #' @param n.HVG The number of highly variable genes to compute. Defaults to 4000. #' @param regress.mt Should the percentage of mitochondrial DNA be computed and regressed out? Works for mouse / human gene names. Defaults to TRUE. #' @param regress.cc Should cell cycle scores be computed & regressed out? NOTE: uses human cell cycle genes. Defaults to TRUE. -#' @param n.PC The number of PCs used as input to non-linear dimension reduction and clustering algorithms. Can be chosen by user, or set automatically using \code{ChoosePCs()}. Defaults to "auto". +#' @param n.PC The number of PCs used as input to non-linear dimension reduction and clustering algorithms. Can be chosen by user, or set automatically using \code{\link{ChoosePCs}}. Defaults to "auto". #' @param var.cutoff (Optional) The proportion of variance explained cutoff to be used when n.PC is set to "auto". Defaults to .15. #' @param which.dim.reduc (Optional) Which non-linear dimension reduction algorithms should be used? Supports "tsne", "umap", "phate", and "all". Plots will be generated using the t-SNE embedding. Defaults to c("umap"), as most users will likely not have \code{phateR} installed. 
#' @param perplexity (Optional) What perplexity value should be used when embedding cells in t-SNE space? Defaults to 30. #' @param umap.lr (Optional) What learning rate should be used for the UMAP embedding? Defaults to 0.05. -#' @param initial.resolution The initial resolution parameter used in the \code{FindClusters} function. Defaults to 0.3. +#' @param initial.resolution The initial resolution parameter used in the \code{\link[Seurat]{FindClusters}} function. Defaults to 0.3. +#' @param nn.metric (Optional) The distance metric to be used in computing the SNN graph. Defaults to "cosine". #' @param k.val (Optional) The nearest-neighbors parameter \emph{k} to be used when creating the shared nearest-neighbor graph. Defaults to \eqn{k \approx \sqrt{n}}. #' @param do.plot (Optional) The dimension reduction view you'd like plotted. Should be one of "tsne", "umap", "phate", or "pca". Defaults to NULL. #' @param random.seed The seed used to control stochasticity in several functions. Defaults to 629. +#' @seealso \code{\link{ChoosePCs}} +#' @seealso \code{\link[Seurat]{FindClusters}} #' @export #' @examples -#' PrepareData(seurat.object, n.variable.genes = 5000, do.plot = TRUE) -#' PrepareData(seurat.object, initial.resolution = .5, k.val = 25, random.seed = 100) +#' \dontrun{PrepareData(seurat.object, n.HVG = 3000, n.PC = 20, do.plot = "umap")} +#' \dontrun{PrepareData(seurat.object, initial.resolution = .5, k.val = 25, random.seed = 100)} #' @references #' Stuart *et al* (2019). Comprehensive integration of single-cell data. *Cell*. 
-#' @seealso \code{\link{ChoosePCs}} PrepareData <- function(seurat.object = NULL, n.HVG = 4000, @@ -34,92 +38,89 @@ PrepareData <- function(seurat.object = NULL, perplexity = 30, umap.lr = 0.05, initial.resolution = .3, + nn.metric = "cosine", k.val = NULL, do.plot = NULL, random.seed = 629) { # check inputs & assays present in Seurat object - if (is.null(seurat.object)) stop ("You forgot to supply a Seurat object!") - - # convert SCE object to Seurat if necessary + if (is.null(seurat.object)) { stop("You forgot to supply a Seurat object!") } + # convert SingleCellExperiment object to Seurat if necessary if (class(seurat.object)[1] == "SingleCellExperiment") { print("Converting user-supplied SingleCellExperiment object to Seurat object") - seurat.object <- as.Seurat(seurat.object, data = NULL) + seurat.object <- SeuratObject::as.Seurat(seurat.object, data = NULL) # add necessary metadata for normalization - RNA_counts <- colSums(x = seurat.object, slot = "counts") - feature_counts <- colSums(x = GetAssayData(object = seurat.object, slot = "counts") > 0) + RNA_counts <- SeuratObject::colSums(x = seurat.object, slot = "counts") + feature_counts <- colSums(x = Seurat::GetAssayData(object = seurat.object, slot = "counts") > 0) seurat.object@meta.data$nCount_RNA <- RNA_counts seurat.object@meta.data$nFeature_RNA <- feature_counts } - # add cell metadata and normalize - if (is.null(seurat.object@assays$SCT) && length(VariableFeatures(seurat.object)) == 0) { + if (is.null(seurat.object@assays$SCT) && length(Seurat::VariableFeatures(seurat.object)) == 0) { regression_vars <- c() # add cell cycle scores if (regress.cc) { - seurat.object <- CellCycleScoring(seurat.object, - s.features = cc.genes.updated.2019$s.genes, - g2m.features = cc.genes.updated.2019$g2m.genes, - set.ident = FALSE) + seurat.object <- Seurat::CellCycleScoring(seurat.object, + s.features = cc.genes.updated.2019$s.genes, + g2m.features = cc.genes.updated.2019$g2m.genes, + set.ident = FALSE) 
seurat.object$CC_difference <- seurat.object$S.Score - seurat.object$G2M.Score regression_vars <- c(regression_vars, "CC_difference") } # add % mitochondrial DNA if (regress.mt) { - seurat.object[["percent_MT"]] <- PercentageFeatureSet(seurat.object, pattern = "^MT-|^mt-") # works for human & mouse + seurat.object[["percent_MT"]] <- Seurat::PercentageFeatureSet(seurat.object, pattern = "^MT-|^mt-") # works for human & mouse regression_vars <- c(regression_vars, "percent_MT") } # normalize counts if (length(regression_vars) > 0) { - seurat.object <- SCTransform(seurat.object, - variable.features.n = n.HVG, - vars.to.regress = regression_vars, - seed.use = random.seed, - verbose = FALSE) + seurat.object <- Seurat::SCTransform(seurat.object, + variable.features.n = n.HVG, + vars.to.regress = regression_vars, + seed.use = random.seed, + verbose = FALSE) } else { - seurat.object <- SCTransform(seurat.object, - variable.features.n = n.HVG, - seed.use = random.seed, - verbose = FALSE) + seurat.object <- Seurat::SCTransform(seurat.object, + variable.features.n = n.HVG, + seed.use = random.seed, + verbose = FALSE) } - } - # dimension reduction - PCA, t-SNE, UMAP, and/or PHATE if (is.null(seurat.object@reductions$pca)) { if (n.PC != "auto") { - seurat.object <- RunPCA(seurat.object, - features = VariableFeatures(seurat.object), - npcs = n.PC, - verbose = FALSE, - seed.use = random.seed) + seurat.object <- Seurat::RunPCA(seurat.object, + features = Seurat::VariableFeatures(seurat.object), + npcs = n.PC, + verbose = FALSE, + seed.use = random.seed) } else { - seurat.object <- RunPCA(seurat.object, - features = VariableFeatures(seurat.object), - npcs = 50, - verbose = FALSE, - seed.use = random.seed) + seurat.object <- Seurat::RunPCA(seurat.object, + features = Seurat::VariableFeatures(seurat.object), + npcs = 50, + verbose = FALSE, + seed.use = random.seed) n.PC <- ChoosePCs(seurat.object, cutoff = var.cutoff) } } if ("tsne" %in% which.dim.reduc) { print(sprintf("Running t-SNE 
on %s principal components with perplexity = %s", n.PC, perplexity)) - seurat.object <- RunTSNE(seurat.object, - reduction = "pca", - dims = 1:n.PC, - dim.embed = 2, - seed.use = random.seed, - perplexity = perplexity) + seurat.object <- Seurat::RunTSNE(seurat.object, + reduction = "pca", + dims = 1:n.PC, + dim.embed = 2, + seed.use = random.seed, + perplexity = perplexity) } if ("umap" %in% which.dim.reduc) { print(sprintf("Running UMAP on %s principal components", n.PC)) - seurat.object <- RunUMAP(seurat.object, - umap.method = "uwot", - dims = 1:n.PC, - n.components = 2, - learning.rate = umap.lr, - reduction = "pca", - verbose = FALSE, - seed.use = random.seed) + seurat.object <- Seurat::RunUMAP(seurat.object, + umap.method = "uwot", + dims = 1:n.PC, + n.components = 2, + learning.rate = umap.lr, + reduction = "pca", + verbose = FALSE, + seed.use = random.seed) } if ("phate" %in% which.dim.reduc) { seurat.object <- RunPHATE(seurat.object, @@ -127,26 +128,23 @@ PrepareData <- function(seurat.object = NULL, n.PC = n.PC, random.seed = random.seed) } - # initial clustering if (is.null(k.val)) k.val <- round(sqrt(ncol(seurat.object))) # set k if not defined - seurat.object <- FindNeighbors(seurat.object, - reduction = "pca", - dims = 1:n.PC, - k.param = k.val, - annoy.metric = "cosine", - nn.method = "annoy", - verbose = FALSE) - seurat.object <- FindClusters(seurat.object, - resolution = initial.resolution, - algorithm = 1, - random.seed = random.seed, - verbose = FALSE) + seurat.object <- Seurat::FindNeighbors(seurat.object, + reduction = "pca", + dims = 1:n.PC, + k.param = k.val, + annoy.metric = nn.metric, + nn.method = "annoy", + verbose = FALSE) + seurat.object <- Seurat::FindClusters(seurat.object, + resolution = initial.resolution, + algorithm = 1, + random.seed = random.seed, + verbose = FALSE) print(sprintf("Found %s unique clusters", length(unique(seurat.object$seurat_clusters)))) - # plot results if desired - if (!is.null(do.plot)) 
print(DimPlot(seurat.object, reduction = do.plot)) - + if (!is.null(do.plot)) print(Seurat::DimPlot(seurat.object, reduction = do.plot)) # return prepared object return(seurat.object) } diff --git a/R/ReclusterCells.R b/R/ReclusterCells.R index adc8e5e..dd1feb9 100644 --- a/R/ReclusterCells.R +++ b/R/ReclusterCells.R @@ -1,23 +1,26 @@ #' Identify subpopulations in single cell clusters. #' -#' This function identifies subclusters of cell types by recalculating the *n* most highly variable genes for each cluster using `sctransform` as implemented in `Seurat`. The function returns a list of `Seurat` objects, one for each cluster the user wants to investigate. -#' @import Seurat -#' @param seurat.object The `Seurat` object containing cells and their assigned cluster IDs. +#' @name ReclusterCells +#' @author Jack Leary +#' @description This function identifies subclusters of cell types by recalculating the *n* most highly variable genes for each cluster using \code{\link[Seurat]{SCTransform}}. The function returns a list of \code{Seurat} objects, one for each cluster the user wants to investigate. +#' @importFrom Seurat DefaultAssay SCTransform FindNeighbors FindClusters +#' @param seurat.object The \code{Seurat} object containing cells and their assigned cluster IDs. #' @param which.clust Which clusters should undergo subpopulation detection analysis? A user-provided list or single integer. Defaults to NULL. -#' @param auto Should the clusters on which to run SCISSORS be determined automatically? If so, `which.clust` will be chosen through silhouette score analysis. Not recommended for large datasets as the distance matrix calculation is computationally expensive. Defaults to FALSE. -#' @param merge.clusters (Optional). If multiple clusters are specified, should the clusters be grouped as one before running SCISSORS? Defaults to FALSE. +#' @param auto Should the clusters on which to run SCISSORS be determined automatically? 
If so, \code{which.clust} will be chosen through silhouette score analysis. Not recommended for large datasets as the distance matrix calculation is computationally expensive. Defaults to FALSE. +#' @param merge.clusters (Optional) If multiple clusters are specified, should the clusters be grouped as one before running SCISSORS? Defaults to FALSE. #' @param n.HVG How many variable genes should be detected in each subcluster? Defaults to 4000. -#' @param n.PC How many PCs should be used as input to non-linear to non-linear dimension reduction and clustering algorithms. Can be provided by the user, or set automatically by `ChoosePCs()`. Defaults to "auto". +#' @param n.PC How many PCs should be used as input to non-linear to non-linear dimension reduction and clustering algorithms. Can be provided by the user, or set automatically by \code{\link{ChoosePCs}}. Defaults to "auto". #' @param redo.embedding (Optional) Should a cluster-specific dimension reduction embeddings be generated? Sometimes subpopulations appear mixed together on the original coordinates, but separate clearly when re-embedded. Defaults to TRUE. -#' @param resolution.vals (Optional) A user-defined vector of resolution values to compare when clustering cells. Defaults to c(.1, .2, .3, .4). -#' @param k.vals (Optional) The parameters *k* to be tested. Defaults to c(10, 25, 50). -#' @param cutoff.score (Optional) The lowest mean silhouette score accepted as evidence of subclusters. Defaults to .25, reasonable values are [.1, .3]. +#' @param resolution.vals A user-defined vector of resolution values to compare when clustering cells. Defaults to c(.1, .2, .3, .4). +#' @param k.vals The values of the number of nearest neighbors \emph{k} to be tested. Defaults to c(10, 25, 50). +#' @param cutoff.score The lowest mean silhouette score accepted as evidence of subclusters. Defaults to .25, reasonable values are \[.1, .3\]. 
#' @param nn.metric (Optional) The distance metric to be used in computing the SNN graph. Defaults to "cosine". #' @param random.seed The seed used to control stochasticity in several functions. Defaults to 629. +#' @seealso \code{\link{ComputeSilhouetteScores}} #' @export #' @examples -#' ReclusterCells(seurat.object, which.clust = 5, resolution.vals = c(.1, .2, .5), k.vals = c(10, 20, 30)) -#' ReclusterCells(seurat.object, which.clust = list(0, 3, 5), merge.clusters = TRUE +#' \dontrun{ReclusterCells(seurat.object, which.clust = 5, resolution.vals = c(.1, .2, .5), k.vals = c(10, 20, 30))} +#' \dontrun{ReclusterCells(seurat.object, which.clust = list(0, 3, 5), merge.clusters = TRUE)} ReclusterCells <- function(seurat.object = NULL, which.clust = NULL, @@ -32,15 +35,13 @@ ReclusterCells <- function(seurat.object = NULL, nn.metric = "cosine", random.seed = 629) { # check inputs - if (any(sapply(c(seurat.object, which.clust), is.null))) stop("Please provide a Seurat object and clusters to investigate.") - + if (is.null(seurat.object) | is.null(which.clust)) { stop("Please provide a Seurat object and clusters to investigate to ReclusterCells().") } # auto-choose clusters to investigate if desired if (auto) { print("Choosing clusters automatically.") scores <- ComputeSilhouetteScores(seurat.object) which.clust <- which(scores < .5) } - # set up result list, account for case when clusters are to be merged, identify covariates reclust_list <- list() if (merge.clusters) { @@ -64,49 +65,48 @@ ReclusterCells <- function(seurat.object = NULL, } else if ("phate" %in% names(seurat.object@reductions)) { dim_red_algs <- c(dim_red_algs, "phate") } - # iterate and recluster cells for (i in seq_along(which.clust)) { if (!merge.clusters) { temp_obj <- subset(seurat.object, subset = seurat_clusters == which.clust[[i]]) } # reprocess data - if (DefaultAssay(temp_obj) != "integrated") { + if (Seurat::DefaultAssay(temp_obj) != "integrated") { if (length(regress_vars) > 0) { - temp_obj 
<- SCTransform(temp_obj, - vars.to.regress = regress_vars, - variable.features.n = n.HVG, - seed.use = random.seed, - verbose = FALSE) + temp_obj <- Seurat::SCTransform(temp_obj, + vars.to.regress = regress_vars, + variable.features.n = n.HVG, + seed.use = random.seed, + verbose = FALSE) } else { - temp_obj <- SCTransform(temp_obj, - variable.features.n = n.HVG, - seed.use = random.seed, - verbose = FALSE) + temp_obj <- Seurat::SCTransform(temp_obj, + variable.features.n = n.HVG, + seed.use = random.seed, + verbose = FALSE) } } temp_obj <- ReduceDimensions(temp_obj, n.PC = n.PC, - which.algs = dim_red_algs, - seed = random.seed) + which.algos = dim_red_algs, + random.seed = random.seed) # silhouette score various clusterings to find best results sil_scores <- c() j <- 1 for (k in seq_along(k.vals)) { for (r in seq_along(resolution.vals)) { - temp_obj <- FindNeighbors(temp_obj, - reduction = "pca", - dims = 1:n.PC, - k.param = k.vals[k], - annoy.metric = nn.metric, - nn.method = "annoy", - verbose = FALSE) - temp_obj <- FindClusters(temp_obj, - resolution = resolution.vals[r], - random.seed = random.seed, - algorithm = 1, - verbose = FALSE) + temp_obj <- Seurat::FindNeighbors(temp_obj, + reduction = "pca", + dims = 1:n.PC, + k.param = k.vals[k], + annoy.metric = nn.metric, + nn.method = "annoy", + verbose = FALSE) + temp_obj <- Seurat::FindClusters(temp_obj, + resolution = resolution.vals[r], + random.seed = random.seed, + algorithm = 1, + verbose = FALSE) if (length(unique(levels(temp_obj$seurat_clusters))) > 1) { sil_res <- ComputeSilhouetteScores(seurat.obj = temp_obj) mean_sil <- mean(sil_res) @@ -119,7 +119,6 @@ ReclusterCells <- function(seurat.object = NULL, j <- j + 1 } } - # extract best parameter set if (max(sil_scores) > cutoff.score) { best_params <- names(sil_scores[sil_scores == max(sil_scores)]) @@ -147,18 +146,18 @@ ReclusterCells <- function(seurat.object = NULL, best_res, round(max(sil_scores), 3))) } - temp_obj <- FindNeighbors(temp_obj, - 
reduction = "pca", - dims = 1:n.PC, - annoy.metric = nn.metric, - nn.method = "annoy", - k.param = best_k, - verbose = FALSE) - temp_obj <- FindClusters(temp_obj, - resolution = best_res, - algorithm = 1, - random.seed = random.seed, - verbose = FALSE) + temp_obj <- Seurat::FindNeighbors(temp_obj, + reduction = "pca", + dims = 1:n.PC, + annoy.metric = nn.metric, + nn.method = "annoy", + k.param = best_k, + verbose = FALSE) + temp_obj <- Seurat::FindClusters(temp_obj, + resolution = best_res, + algorithm = 1, + random.seed = random.seed, + verbose = FALSE) } else { # replace new object w/ original one, as no subclusters were found if (merge.clusters) { @@ -175,7 +174,6 @@ ReclusterCells <- function(seurat.object = NULL, } reclust_list[[i]] <- temp_obj } - # add names to list of Seurat objects if (!merge.clusters && length(which.clust) > 1) { names(reclust_list) <- as.character(unlist(which.clust)) diff --git a/R/ReduceDimensions.R b/R/ReduceDimensions.R index 37de766..953e96d 100644 --- a/R/ReduceDimensions.R +++ b/R/ReduceDimensions.R @@ -1,14 +1,17 @@ #' A function to run PCA / t-SNE / UMAP / PHATE. #' -#' This function simplifies the running of various dimension reduction algorithms. It exists mostly to make the body of ReclusterCells() easier to read. +#' @name ReduceDimensions +#' @author Jack Leary +#' @description This function simplifies the running of various dimension reduction algorithms. It exists mostly to make the body of \code{\link{ReclusterCells}} easier to read. #' @importFrom Seurat RunPCA VariableFeatures RunTSNE RunUMAP #' @param obj The Seurat object to run dimension reduction algorithms on. Defaults to NULL. #' @param n.PC How many principal components should be used? Can be an integer or "auto". Defaults to NULL. #' @param which.algos Which nonlinear dimension algorithms can be used? Should be some combination of "tsne", "umap", and "phate". Defaults to "umap". #' @param random.seed The random seed to use. Defaults to 312. 
+#' @seealso \code{\link{RunPHATE}} #' @export #' @examples -#' \dontrun{ReduceDimensions(pbmc3k, n.PC = 10, which.algos = "umap", seed = 629)} +#' \dontrun{ReduceDimensions(pbmc3k, n.PC = 10, which.algos = "umap", random.seed = 629)} ReduceDimensions <- function(obj = NULL, n.PC = NULL, which.algos = "umap", diff --git a/R/RunPHATE.R b/R/RunPHATE.R index a1d2d79..9f48e08 100644 --- a/R/RunPHATE.R +++ b/R/RunPHATE.R @@ -11,9 +11,10 @@ #' @param mds.method The solver used for MDS. Defaults to SMACOF, but SGD can be used to speed up the algorithm. #' @param dist.metric The distance metric to use for KNN and MDS. Defaults to the cosine distance. #' @param random.seed The random seed used to control stochasticity. Defaults to 629. +#' @seealso \code{\link[phateR]{phate}} #' @export #' @examples -#' RunPhate(object = pbmc3k, n.components = 2, n.PC = 10) +#' \dontrun{RunPhate(object = pbmc3k, n.components = 2, n.PC = 10)} RunPHATE <- function(object = NULL, n.components = 2, @@ -23,7 +24,7 @@ RunPHATE <- function(object = NULL, random.seed = 629) { # check inputs if (is.null(object) | is.null(n.PC)) { stop("Please provide a Seurat object and a number of PCs to use to RunPHATE().") } - # run PHATE + # run PHATE on PCs pca_df <- data.frame(Seurat::Embeddings(object, reduction = "pca"))[, 1:n.PC] phate_res <- phateR::phate(pca_df, ndim = n.components, diff --git a/R/theme_yehlab.R b/R/theme_yehlab.R index 08e0354..8dc0206 100644 --- a/R/theme_yehlab.R +++ b/R/theme_yehlab.R @@ -1,4 +1,5 @@ #' A clean \code{ggplot2} theme for dimension reduction plots. +#' #' @name theme_yehlab #' @author Jack Leary #' @description This is the Yeh Lab's default \code{ggplot2} theme for dimension reduction scatterplots, and was used throughout the SCISSORS manuscript. 
@@ -9,13 +10,13 @@ #' \dontrun{DimPlot(pbmc, reduction = "umap") + theme_yehlab()} theme_yehlab <- function() { - theme(legend.position = "bottom", - axis.text = element_blank(), - axis.line = element_blank(), - panel.grid = element_blank(), - axis.ticks = element_blank(), - legend.direction = "horizontal", - legend.justification = "center", - panel.background = element_blank(), - panel.border = element_rect(colour = "black", linetype = 1, size = 1)) + ggplot2::theme(legend.position = "bottom", + axis.text = ggplot2::element_blank(), + axis.line = ggplot2::element_blank(), + panel.grid = ggplot2::element_blank(), + axis.ticks = ggplot2::element_blank(), + legend.direction = "horizontal", + legend.justification = "center", + panel.background = ggplot2::element_blank(), + panel.border = ggplot2::element_rect(colour = "black", linetype = 1, size = 1)) } diff --git a/man/AnnotateMarkerGenes.Rd b/man/AnnotateMarkerGenes.Rd index 276575a..df05428 100644 --- a/man/AnnotateMarkerGenes.Rd +++ b/man/AnnotateMarkerGenes.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/AnnotateMarkerGenes.R \name{AnnotateMarkerGenes} \alias{AnnotateMarkerGenes} -\title{Annotate differentially expressed genes using biomaRt} +\title{Annotate differentially expressed genes using \code{biomaRt}.} \usage{ AnnotateMarkerGenes( marker.genes = NULL, @@ -11,12 +11,15 @@ AnnotateMarkerGenes( ) } \arguments{ -\item{marker.genes}{The dataframe of marker genes generated by \code{FindSubpopulationMarkers()}} +\item{marker.genes}{The dataframe of marker genes generated by \code{\link{FindSubpopulationMarkers}}.} \item{species}{The species of the cells being analyzed. Defaults to "human", but also supports "mouse".} \item{desired.annos}{The vector containing the annotations you'd like to retrieve for each gene.} } \description{ -This function uses the \code{biomaRt} package to fetch a user-defined list of attributes for a list of dataframes containing differentially expressed genes. 
Intended to be run directly after \code{FindSubpopulationMarkers()}. +This function uses the \code{biomaRt} package to fetch a user-defined list of attributes for a list of dataframes containing differentially expressed genes. Intended to be run directly after \code{\link{FindSubpopulationMarkers}}. +} +\author{ +Jack Leary } diff --git a/man/ChoosePCs.Rd b/man/ChoosePCs.Rd index f75c0c4..b37f36c 100644 --- a/man/ChoosePCs.Rd +++ b/man/ChoosePCs.Rd @@ -7,12 +7,12 @@ ChoosePCs(seurat.obj = NULL, cutoff = NULL) } \arguments{ -\item{cutoff}{The cutoff value for cumulative proportion of variance explained. Can be set by the user, or can be determine automatically. Defaults to NULL.} +\item{seurat.obj}{The object containing our single cell counts and principal component matrix. Defaults to NULL.} -\item{seurat.object}{The object containing our single cell counts and principal component matrix. Defaults to NULL.} +\item{cutoff}{The cutoff value for cumulative proportion of variance explained. Can be set by the user, or can be determine automatically. Defaults to NULL.} } \description{ -This function uses the eigenvalues of the principal component matrix to determine the best number of PCs. The default can be chosen automatically, or given by the user. It is intended to be run after \code{RunPCA()}. +This function uses the eigenvalues of the principal component matrix to determine the best number of PCs. The default can be chosen automatically, or given by the user. It is intended to be run after \code{\link[Seurat]{RunPCA}}. 
} \examples{ \dontrun{ChoosePCs(seurat.obj = pbmc, cutoff = .15)} diff --git a/man/ComputeSilhouetteScores.Rd b/man/ComputeSilhouetteScores.Rd index 421a054..c2258b5 100644 --- a/man/ComputeSilhouetteScores.Rd +++ b/man/ComputeSilhouetteScores.Rd @@ -2,14 +2,14 @@ % Please edit documentation in R/ComputeSilhouetteScores.R \name{ComputeSilhouetteScores} \alias{ComputeSilhouetteScores} -\title{Calculate the mean silhouette score of a clustering.} +\title{Calculate the silhouette score of a clustering.} \usage{ ComputeSilhouetteScores(seurat.obj = NULL, dist.metric = "cosine", avg = TRUE) } \arguments{ \item{seurat.obj}{The input object for which silhouette score will be computed. Defaults to NULL.} -\item{dist.metric}{Which distanec metric should be used? Defaults to "cosine", but any of the metrics used by \code{\link[stats]{dist}} will work.} +\item{dist.metric}{Which distance metric should be used? Defaults to "cosine", but any of the metrics used by \code{\link[stats]{dist}} will work.} \item{avg}{Should the average scores for each cluster be returned, or should a dataframe of every observation's cluster identity and score be returned? Defaults to TRUE.} } @@ -19,6 +19,9 @@ This function will compute the silhouette score for each cluster identified by \ \examples{ \dontrun{ComputeSilhouetteScores(seurat.obj)} } +\seealso{ +\code{\link{CosineDist}}. +} \author{ Jack Leary } diff --git a/man/ConvertGeneOrthologs.Rd b/man/ConvertGeneOrthologs.Rd index 3c7ac4c..848d7e1 100644 --- a/man/ConvertGeneOrthologs.Rd +++ b/man/ConvertGeneOrthologs.Rd @@ -15,6 +15,9 @@ ConvertGeneOrthologs(gene.vec = NULL, species = "mm") Converts gene names from human -> mouse and mouse -> human. 
} \examples{ -ConvertGeneOrthologs(gene.vec = mouse_genes) -ConvertGeneOrthologs(gene.vec = human_genes, species = "hs") +\dontrun{ConvertGeneOrthologs(gene.vec = mouse_genes)} +\dontrun{ConvertGeneOrthologs(gene.vec = human_genes, species = "hs")} +} +\author{ +Jack Leary } diff --git a/man/CosineDist.Rd b/man/CosineDist.Rd index b83b7a0..8163a2f 100644 --- a/man/CosineDist.Rd +++ b/man/CosineDist.Rd @@ -13,7 +13,7 @@ CosineDist(input.mat = NULL) This function takes a matrix as input, and computes the cosine distance (1 - cosine similarity) between the observations. } \examples{ -\dontrun{CosineDist(input = pca_matrix)} +\dontrun{CosineDist(input.mat = pca_matrix)} } \author{ Jack Leary diff --git a/man/FindSpecificMarkers.Rd b/man/FindSpecificMarkers.Rd index 274af73..09ef9ca 100644 --- a/man/FindSpecificMarkers.Rd +++ b/man/FindSpecificMarkers.Rd @@ -18,7 +18,7 @@ FindSpecificMarkers( \item{ident.use}{The cell identity to group by. Defaults to "seurat_clusters".} -\item{de.method}{The differential expression method used in \code{FindAllMarkers()}. Defaults to "wilcox".} +\item{de.method}{The differential expression method used in \code{\link[Seurat]{FindAllMarkers}}. Defaults to "wilcox".} \item{perc.cutoff}{The percentile cutoff used to find highly expressed genes in other cluster. Defaults to 0.9.} @@ -30,7 +30,10 @@ FindSpecificMarkers( This function finds marker genes for all clusters, and then filters those markers on a per-cluster basis against the most highly expressed genes in other clusters. 
} \examples{ -FindSpecificMarkers(seurat_object, method = "wilcox") +\dontrun{FindSpecificMarkers(seurat_object, method = "wilcox")} +} +\seealso{ +\code{\link[Seurat]{FindAllMarkers}} } \author{ Jack Leary diff --git a/man/FindSubpopulationMarkers.Rd b/man/FindSubpopulationMarkers.Rd index e11c9e4..0f815d3 100644 --- a/man/FindSubpopulationMarkers.Rd +++ b/man/FindSubpopulationMarkers.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/FindSubpopulationMarkers.R \name{FindSubpopulationMarkers} \alias{FindSubpopulationMarkers} -\title{Identify marker genes for previosuly identified subpopulations.} +\title{Identify marker genes for previously identified subpopulations.} \usage{ FindSubpopulationMarkers( seurat.object = NULL, @@ -16,23 +16,26 @@ FindSubpopulationMarkers( \arguments{ \item{seurat.object}{The original \code{Seurat} object containing the entire cell population and related metadata.} -\item{reclust.data}{A specific\code{Seurat} object from the list of objects returned by \code{ReclusterCells}.} +\item{reclust.data}{A specific \code{Seurat} object from the list of objects returned by \code{\link{ReclusterCells}}.} \item{which.compare}{Should subpopulation marker genes be determined in the context of the entire sample, or solely the single cluster? Defaults to "all cells"; choose "within cluster" to determine marker genes at the cluster level.} -\item{diff.exp.test}{The test used to calculate differential expression using \code{FindMarkers}. Defaults to "wilcox".} +\item{diff.exp.test}{The test used to calculate differential expression using \code{\link[Seurat]{FindMarkers}}. Defaults to "wilcox".} \item{logfc.thresh}{The log2 fold-change cutoff used when performing differential expression analysis. Defaults to 2.} \item{random.seed}{(Optional) The seed used to control stochasticity in several functions. Defaults to 629.} } \description{ -This function determines which cells characterize the subpopulations identified using \code{ReclusterCells}. 
It is intended to be run on a single re-clustered \code{Seurat} object at a time, though if you wish you could +This function determines which cells characterize the subpopulations identified using \code{\link{ReclusterCells}}. It is intended to be run on a single re-clustered \code{Seurat} object at a time, though if you wish you could iterate over the list of reclustering results, and save the outputs from this function in a matching array of lists. The function returns a list of dataframes, one dataframe per cluster, containing normal and Bonferroni-adjusted p-values, gene prevalence, and effect size in the form of log2 fold change. } \examples{ -FindSubpopulationMarkers(seurat.object, reclust.data = reclust_results) +\dontrun{FindSubpopulationMarkers(seurat.object, reclust.data = reclust_results)} +} +\seealso{ +\code{\link{FindSpecificMarkers}} } \author{ Jack Leary diff --git a/man/IntegrateSubclusters.Rd b/man/IntegrateSubclusters.Rd index 2ca8630..bc7ab84 100644 --- a/man/IntegrateSubclusters.Rd +++ b/man/IntegrateSubclusters.Rd @@ -13,15 +13,18 @@ IntegrateSubclusters( \arguments{ \item{original.object}{The original Seurat object. Defaults to NULL.} -\item{reclust.results}{A list of reclustering results as output from ReclusterCells(). Defaults to NULL.} +\item{reclust.results}{A list of reclustering results as output from \code{\link{ReclusterCells}}. Defaults to NULL.} \item{do.plot}{Should the results be plotted on a dimension reduction plot? Defaults to FALSE.} } \description{ -This function takes a list of outputs from ReclusterCells() and integrates the new subcluster identities into the original Seurat object. +This function takes a list of outputs from \code{\link{ReclusterCells}} and integrates the new subcluster identities into the original Seurat object. 
} \examples{ -IntegrateSubclusters(original.object = pbmc, reclust.results = my_subclusts) +\dontrun{IntegrateSubclusters(original.object = pbmc, reclust.results = my_subclusts)} +} +\seealso{ +\code{\link{ReclusterCells}} } \author{ Jack Leary diff --git a/man/PrepareData.Rd b/man/PrepareData.Rd index bf32f11..4b361d3 100644 --- a/man/PrepareData.Rd +++ b/man/PrepareData.Rd @@ -15,6 +15,7 @@ PrepareData( perplexity = 30, umap.lr = 0.05, initial.resolution = 0.3, + nn.metric = "cosine", k.val = NULL, do.plot = NULL, random.seed = 629 @@ -29,7 +30,7 @@ PrepareData( \item{regress.cc}{Should cell cycle scores be computed & regressed out? NOTE: uses human cell cycle genes. Defaults to TRUE.} -\item{n.PC}{The number of PCs used as input to non-linear dimension reduction and clustering algorithms. Can be chosen by user, or set automatically using \code{ChoosePCs()}. Defaults to "auto".} +\item{n.PC}{The number of PCs used as input to non-linear dimension reduction and clustering algorithms. Can be chosen by user, or set automatically using \code{\link{ChoosePCs}}. Defaults to "auto".} \item{var.cutoff}{(Optional) The proportion of variance explained cutoff to be used when n.PC is set to "auto". Defaults to .15.} @@ -39,7 +40,9 @@ PrepareData( \item{umap.lr}{(Optional) What learning rate should be used for the UMAP embedding? Defaults to 0.05.} -\item{initial.resolution}{The initial resolution parameter used in the \code{FindClusters} function. Defaults to 0.3.} +\item{initial.resolution}{The initial resolution parameter used in the \code{\link[Seurat]{FindAllMarkers}} function. Defaults to 0.3.} + +\item{nn.metric}{(Optional) The distance metric to be used in computing the SNN graph. Defaults to "cosine".} \item{k.val}{(Optional) The nearest-neighbors parameter \emph{k} to be used when creating the shared nearest-neighbor graph. Defaults to \eqn{k \approx \sqrt{n}}.} @@ -51,12 +54,17 @@ PrepareData( This function prepares single cell data for reclustering analysis. 
The input is a \code{Seurat} object in any stage of pre-processing, or even a \code{SingleCellExperiment} object that will be converted to \code{Seurat} format. The function checks which metadata features (\% mitochondrial DNA, cell cycle scores) and assays are present (normalized counts, PCA & t-SNE embeddings), then runs an initial graph-based clustering. } \examples{ -PrepareData(seurat.object, n.variable.genes = 5000, do.plot = TRUE) -PrepareData(seurat.object, initial.resolution = .5, k.val = 25, random.seed = 100) +\dontrun{PrepareData(seurat.object, n.variable.genes = 3000, n.PC = 20, do.plot = TRUE)} +\dontrun{PrepareData(seurat.object, initial.resolution = .5, k.val = 25, random.seed = 100)} } \references{ Stuart \emph{et al} (2019). Comprehensive integration of single-cell data. \emph{Cell}. } \seealso{ \code{\link{ChoosePCs}} + +\code{\link[Seurat]{FindAllMarkers}} +} +\author{ +Jack Leary } diff --git a/man/ReclusterCells.Rd b/man/ReclusterCells.Rd index ea69842..c299320 100644 --- a/man/ReclusterCells.Rd +++ b/man/ReclusterCells.Rd @@ -26,28 +26,34 @@ ReclusterCells( \item{auto}{Should the clusters on which to run SCISSORS be determined automatically? If so, \code{which.clust} will be chosen through silhouette score analysis. Not recommended for large datasets as the distance matrix calculation is computationally expensive. Defaults to FALSE.} -\item{merge.clusters}{(Optional). If multiple clusters are specified, should the clusters be grouped as one before running SCISSORS? Defaults to FALSE.} +\item{merge.clusters}{(Optional) If multiple clusters are specified, should the clusters be grouped as one before running SCISSORS? Defaults to FALSE.} \item{n.HVG}{How many variable genes should be detected in each subcluster? Defaults to 4000.} -\item{n.PC}{How many PCs should be used as input to non-linear to non-linear dimension reduction and clustering algorithms. Can be provided by the user, or set automatically by \code{ChoosePCs()}. 
Defaults to "auto".} +\item{n.PC}{How many PCs should be used as input to non-linear to non-linear dimension reduction and clustering algorithms. Can be provided by the user, or set automatically by \code{\link{ChoosePCs}}. Defaults to "auto".} \item{redo.embedding}{(Optional) Should a cluster-specific dimension reduction embeddings be generated? Sometimes subpopulations appear mixed together on the original coordinates, but separate clearly when re-embedded. Defaults to TRUE.} -\item{resolution.vals}{(Optional) A user-defined vector of resolution values to compare when clustering cells. Defaults to c(.1, .2, .3, .4).} +\item{resolution.vals}{A user-defined vector of resolution values to compare when clustering cells. Defaults to c(.1, .2, .3, .4).} -\item{k.vals}{(Optional) The parameters \emph{k} to be tested. Defaults to c(10, 25, 50).} +\item{k.vals}{The values of the number of nearest neighbors \emph{k} to be tested. Defaults to c(10, 25, 50).} -\item{cutoff.score}{(Optional) The lowest mean silhouette score accepted as evidence of subclusters. Defaults to .25, reasonable values are \link{.1, .3}.} +\item{cutoff.score}{The lowest mean silhouette score accepted as evidence of subclusters. Defaults to .25, reasonable values are [.1, .3].} \item{nn.metric}{(Optional) The distance metric to be used in computing the SNN graph. Defaults to "cosine".} \item{random.seed}{The seed used to control stochasticity in several functions. Defaults to 629.} } \description{ -This function identifies subclusters of cell types by recalculating the \emph{n} most highly variable genes for each cluster using \code{sctransform} as implemented in \code{Seurat}. The function returns a list of \code{Seurat} objects, one for each cluster the user wants to investigate. +This function identifies subclusters of cell types by recalculating the \emph{n} most highly variable genes for each cluster using \code{\link[Seurat]{SCTransform}}. 
The function returns a list of \code{Seurat} objects, one for each cluster the user wants to investigate. } \examples{ -ReclusterCells(seurat.object, which.clust = 5, resolution.vals = c(.1, .2, .5), k.vals = c(10, 20, 30)) -ReclusterCells(seurat.object, which.clust = list(0, 3, 5), merge.clusters = TRUE +\dontrun{ReclusterCells(seurat.object, which.clust = 5, resolution.vals = c(.1, .2, .5), k.vals = c(10, 20, 30))} +\dontrun{ReclusterCells(seurat.object, which.clust = list(0, 3, 5), merge.clusters = TRUE)} +} +\seealso{ +\code{\link{ComputeSilhouetteScores}} +} +\author{ +Jack Leary } diff --git a/man/ReduceDimensions.Rd b/man/ReduceDimensions.Rd index 298b30d..addcd6b 100644 --- a/man/ReduceDimensions.Rd +++ b/man/ReduceDimensions.Rd @@ -21,8 +21,14 @@ ReduceDimensions( \item{random.seed}{The random seed to use. Defaults to 312.} } \description{ -This function simplifies the running of various dimension reduction algorithms. It exists mostly to make the body of ReclusterCells() easier to read. +This function simplifies the running of various dimension reduction algorithms. It exists mostly to make the body of \code{\link{ReclusterCells}} easier to read. } \examples{ -\dontrun{ReduceDimensions(pbmc3k, n.PC = 10, which.algos = "umap", seed = 629)} +\dontrun{ReduceDimensions(pbmc3k, n.PC = 10, which.algos = "umap", random.seed = 629)} +} +\seealso{ +\code{\link{RunPHATE}} +} +\author{ +Jack Leary } diff --git a/man/RunPHATE.Rd b/man/RunPHATE.Rd index 40d1d1b..b4557fd 100644 --- a/man/RunPHATE.Rd +++ b/man/RunPHATE.Rd @@ -30,7 +30,10 @@ RunPHATE( This function wraps the PHATE dimension reduction algorithm in the typical Seurat syntax. } \examples{ -RunPhate(object = pbmc3k, n.components = 2, n.PC = 10) +\dontrun{RunPhate(object = pbmc3k, n.components = 2, n.PC = 10)} +} +\seealso{ +\code{\link[phateR]{phate}} } \author{ Jack Leary