Major documentation update

jr-leary7 · Dec 30, 2021 · 408a1af · 408a1af
1 parent bbb2192
commit 408a1af
Show file tree

Hide file tree

Showing 30 changed files with 348 additions and 256 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -1,2 +1,3 @@
 ^YehLabClust\.Rproj$
 ^\.Rproj\.user$
+^LICENSE\.md$
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,16 +1,26 @@
 Package: SCISSORS
-Title: A workflow to identify cell subpopulations in single cell RNA-seq.
-Version: 0.0.2.0
-Authors@R: 
-    person(given = "Jack",
-           family = "Leary",
-           role = c("aut", "cre"),
-           email = "jrleary@live.unc.edu")
-Depends: R (>= 3.5.0), Seurat (>= 3.0), biomaRt, cluster, data.table, ggplot2, SingleCellExperiment
-Suggests: phateR
+Title: Identify cell subpopulations in single cell RNA-seq data
+Version: 1.0.0
+Author: Jack Leary [aut]
+Maintainer: Jack Leary <jrleary@live.unc.edu>
 Description: This package implements a method (to be published soon) that allows users to easily identify cell subtypes and / or subpopulations in scRNA-seq data. After running the necessary `Seurat` processing steps, the user decides which clusters they think are good candidates for subpopulation-detection based on cluster size, t-SNE visualizations, and / or identified cell cluster type. For example, if the user sees a large, fairly homogeneous cluster that they are fairly sure contains immune cells, they can use our method to tease out T cell subtypes, NK cells, B cells, etc. from within the original cluster. The method is predicated on the usage of a cosine-distance based silhouette score that is calculated for each cluster. Several different sets of parameters are used, and the subclustering that maximizes this mean silhouette score is considered optimal. The package is built around the `Seurat` package, but `SingleCellExperiment`-formatted data is accepted as input as well (it will be converted to `Seurat` format, though). We also support the identification of differentially-expressed marker genes for each identified subpopulation, which can be used to characterize known or novel cell subtypes at the transcriptomic level.
-License: `use_mit_license()`
+License: MIT + file LICENSE
 Encoding: UTF-8
 LazyData: true
 Roxygen: list(markdown = TRUE)
 RoxygenNote: 7.1.1
+Depends:
+  magrittr
+Imports:
+  Seurat,
+  stats, 
+  cluster, 
+  phateR, 
+  SingleCellExperiment, 
+  biomaRt, 
+  Matrix, 
+  matrixStats, 
+  dplyr, 
+  ggplot2, 
+  SeuratObject
+URL: https://github.com/jr-leary7/SCISSORS
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,2 @@
+YEAR: 2021
+COPYRIGHT HOLDER: SCISSORS authors
diff --git a/LICENSE.md b/LICENSE.md
@@ -0,0 +1,21 @@
+# MIT License
+
+Copyright (c) 2021 SCISSORS authors
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/NAMESPACE b/NAMESPACE
@@ -12,22 +12,29 @@ export(ReclusterCells)
 export(ReduceDimensions)
 export(RunPHATE)
 export(theme_yehlab)
-import(Seurat)
-import(biomaRt)
 import(magrittr)
-importClassesFrom(SingleCellExperiment,SingleCellExperiment)
 importFrom(Matrix,t)
+importFrom(Seurat,CellCycleScoring)
 importFrom(Seurat,CreateDimReducObject)
+importFrom(Seurat,DefaultAssay)
 importFrom(Seurat,DimPlot)
 importFrom(Seurat,Embeddings)
 importFrom(Seurat,FindAllMarkers)
+importFrom(Seurat,FindClusters)
+importFrom(Seurat,FindMarkers)
+importFrom(Seurat,FindNeighbors)
 importFrom(Seurat,GetAssayData)
 importFrom(Seurat,Idents)
+importFrom(Seurat,PercentageFeatureSet)
 importFrom(Seurat,RunPCA)
 importFrom(Seurat,RunTSNE)
 importFrom(Seurat,RunUMAP)
+importFrom(Seurat,SCTransform)
 importFrom(Seurat,Stdev)
 importFrom(Seurat,VariableFeatures)
+importFrom(SeuratObject,as.Seurat)
+importFrom(SeuratObject,colSums)
+importFrom(biomaRt,getBM)
 importFrom(biomaRt,getLDS)
 importFrom(biomaRt,useMart)
 importFrom(cluster,silhouette)
@@ -46,4 +53,6 @@ importFrom(ggplot2,labs)
 importFrom(ggplot2,theme)
 importFrom(matrixStats,rowVars)
 importFrom(phateR,phate)
+importFrom(stats,as.dist)
 importFrom(stats,dist)
+importFrom(stats,quantile)
diff --git a/R/AnnotateMarkerGenes.R b/R/AnnotateMarkerGenes.R
@@ -1,8 +1,10 @@
-#' Annotate differentially expressed genes using biomaRt
+#' Annotate differentially expressed genes using \code{biomaRt}.
 #'
-#' This function uses the `biomaRt` package to fetch a user-defined list of attributes for a list of dataframes containing differentially expressed genes. Intended to be run directly after `FindSubpopulationMarkers()`.
-#' @import biomaRt
-#' @param marker.genes The dataframe of marker genes generated by `FindSubpopulationMarkers()`
+#' @name AnnotateMarkerGenes
+#' @author Jack Leary
+#' @description This function uses the \code{biomaRt} package to fetch a user-defined list of attributes for a list of dataframes containing differentially expressed genes. Intended to be run directly after \code{\link{FindSubpopulationMarkers}}.
+#' @importFrom biomaRt useMart getBM
+#' @param marker.genes The dataframe of marker genes generated by \code{\link{FindSubpopulationMarkers}}.
 #' @param species The species of the cells being analyzed. Defaults to "human", but also supports "mouse".
 #' @param desired.annos The vector containing the annotations you'd like to retrieve for each gene.
 
@@ -12,33 +14,33 @@ AnnotateMarkerGenes <- function(marker.genes = NULL,
   # check inputs
   if (is.null(marker.genes)) { stop("Please supply a list of dataframes containing marker genes.") }
   if (is.null(desired.attrs)) { stop("Please supply a vector of annotations you'd like to generate.") }
-  # run function
+  # create marts
   if (species == "human") {
-    mart <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")
+    mart <- biomaRt::useMart("ensembl", dataset = "hsapiens_gene_ensembl")
   } else if (species == "mouse") {
-    mart <- useMart("ensembl", dataset = "mmusculus_gene_ensembl")
+    mart <- biomaRt::useMart("ensembl", dataset = "mmusculus_gene_ensembl")
   }
+  # retrieve annotations
   genes <- marker.genes$gene
   if (species == "human") {
-    annos <- getBM(attributes = c("hgnc_symbol", desired.annos),
-                   mart = mart,
-                   filters = "hgnc_symbol",
-                   values = genes)
+    annos <- biomaRt::getBM(attributes = c("hgnc_symbol", desired.annos),
+                            mart = mart,
+                            filters = "hgnc_symbol",
+                            values = genes)
   } else if (species == "mouse") {
-    annos <- getBM(attributes = desired.annos,
-                   mart = mart,
-                   filters = "mgi_symbol",
-                   values = genes)
+    annos <- biomaRt::getBM(attributes = desired.annos,
+                            mart = mart,
+                            filters = "mgi_symbol",
+                            values = genes)
   }
-
+  # prepare result dataframe
   for (i in seq(unique(marker.genes$cluster))) {
     clust_df <- marker.genes[marker.genes$cluster == unique(marker.genes$cluster)[i], ]
     genes <- clust_df$gene
-    annos <- getBM(attributes = c("hgnc_symbol", desired.annos),
-                   mart = mart,
-                   filters = "hgnc_symbol",
-                   values = genes)
+    annos <- biomaRt::getBM(attributes = c("hgnc_symbol", desired.annos),
+                            mart = mart,
+                            filters = "hgnc_symbol",
+                            values = genes)
   }
-
   return(anno_genes)
 }
diff --git a/R/ChoosePCs.R b/R/ChoosePCs.R
@@ -2,10 +2,10 @@
 #'
 #' @name ChoosePCs
 #' @author Jack Leary
-#' @description This function uses the eigenvalues of the principal component matrix to determine the best number of PCs. The default can be chosen automatically, or given by the user. It is intended to be run after `RunPCA()`.
+#' @description This function uses the eigenvalues of the principal component matrix to determine the best number of PCs. The default can be chosen automatically, or given by the user. It is intended to be run after \code{\link[Seurat]{RunPCA}}.
 #' @importFrom matrixStats rowVars
 #' @importFrom Seurat GetAssayData Stdev
-#' @param seurat.object The object containing our single cell counts and principal component matrix. Defaults to NULL.
+#' @param seurat.obj The object containing our single cell counts and principal component matrix. Defaults to NULL.
 #' @param cutoff The cutoff value for cumulative proportion of variance explained. Can be set by the user, or can be determine automatically. Defaults to NULL.
 #' @export
 #' @examples

diff --git a/R/ComputeSilhouetteScores.R b/R/ComputeSilhouetteScores.R
@@ -1,4 +1,4 @@
-#' Calculate the mean silhouette score of a clustering.
+#' Calculate the silhouette score of a clustering.
 #'
 #' @name ComputeSilhouetteScores
 #' @author Jack Leary
@@ -7,9 +7,9 @@
 #' @importFrom stats dist
 #' @importFrom cluster silhouette
 #' @param seurat.obj The input object for which silhouette score will be computed. Defaults to NULL.
-#' @param dist.metric Which distanec metric should be used? Defaults to "cosine", but any of the metrics used by \code{\link[stats]{dist}} will work.
+#' @param dist.metric Which distance metric should be used? Defaults to "cosine", but any of the metrics used by \code{\link[stats]{dist}} will work.
 #' @param avg Should the average scores for each cluster be returned, or should a dataframe of every observation's cluster identity and score be returned? Defaults to TRUE.
-#' @importFrom cluster silhouette
+#' @seealso \code{\link{CosineDist}}.
 #' @export
 #' @examples
 #' \dontrun{ComputeSilhouetteScores(seurat.obj)}
@@ -23,7 +23,7 @@ ComputeSilhouetteScores <- function(seurat.obj = NULL, dist.metric = "cosine", a
   pca_mat <- as.matrix(pca_df)
   # calculate distance matrix -- default is cosine dissimilarity
   if (dist.metric == "cosine") {
-    pc_dists <- CosineDist(input = pca_mat)
+    pc_dists <- CosineDist(input.mat = pca_mat)
   } else {
     pc_dists <- stats::dist(x = pca_mat, method = dist.metric)
   }
@@ -35,8 +35,7 @@ ComputeSilhouetteScores <- function(seurat.obj = NULL, dist.metric = "cosine", a
     avg_widths <- unlist(avg_widths)
     val <- avg_widths
   } else {
-    val <- data.frame(Cluster = as.factor(res[, 1]),
-                      Score = res[, 3])
+    val <- data.frame(Cluster = as.factor(res[, 1]), Score = res[, 3])
   }
   return(val)
 }
diff --git a/R/ConvertGeneOrthologs.R b/R/ConvertGeneOrthologs.R
@@ -1,20 +1,23 @@
 #' Convert a vector of MGI symbols to their HGNC orthologs and vice versa.
 #'
 #' @name ConvertGeneOrthologs
+#' @author Jack Leary
 #' @description Converts gene names from human -> mouse and mouse -> human.
 #' @importFrom biomaRt useMart getLDS
 #' @param gene.vec A vector of genes to convert. Default to NULL.
 #' @param species One of "mm" or "hs". Defaults to "mm" (and thus converts to human).
 #' @export
 #' @examples
-#' ConvertGeneOrthologs(gene.vec = mouse_genes)
-#' ConvertGeneOrthologs(gene.vec = human_genes, species = "hs")
+#' \dontrun{ConvertGeneOrthologs(gene.vec = mouse_genes)}
+#' \dontrun{ConvertGeneOrthologs(gene.vec = human_genes, species = "hs")}
 
 ConvertGeneOrthologs <- function(gene.vec = NULL, species = "mm") {
   # check inputs
   species <- tolower(species)
+  # get marts
   human <- biomaRt::useMart("ensembl", dataset = "hsapiens_gene_ensembl")
   mouse <- biomaRt::useMart("ensembl", dataset = "mmusculus_gene_ensembl")
+  # convert gene names
   if (species == "mm") {
     genesV2 <- biomaRt::getLDS(attributes = c("mgi_symbol"),
                                filters = "mgi_symbol",

diff --git a/R/CosineDist.R b/R/CosineDist.R
@@ -3,15 +3,16 @@
 #' @name CosineDist
 #' @author Jack Leary
 #' @description This function takes a matrix as input, and computes the cosine distance (1 - cosine similarity) between the observations.
+#' @importFrom stats as.dist
 #' @param input.mat The input matrix. Defaults to NULL.
 #' @export
 #' @examples
-#' \dontrun{CosineDist(input = pca_matrix)}
+#' \dontrun{CosineDist(input.mat = pca_matrix)}
 
 CosineDist <- function(input.mat = NULL) {
   # check inputs -- although as this is a helper function, it should never be called incorrectly
   if (is.null(input.mat)) { stop("You must provide a matrix to CosineDist().") }
-  # run function
+  # compute cosine distance
   dist_mat <- stats::as.dist(1 - input.mat %*% t(input.mat) / (sqrt(rowSums(input.mat^2) %*% t(rowSums(input.mat^2)))))
   return(dist_mat)
 }
diff --git a/R/FindSpecificMarkers.R b/R/FindSpecificMarkers.R
@@ -7,15 +7,17 @@
 #' @importFrom Matrix t
 #' @importFrom dplyr mutate group_by summarise across filter pull bind_rows
 #' @importFrom Seurat FindAllMarkers
+#' @importFrom stats quantile
 #' @param seurat.object The \code{Seurat} object containing clusters for which you'd like marker genes identified. Defaults to NULL.
 #' @param ident.use The cell identity to group by. Defaults to "seurat_clusters".
-#' @param de.method The differential expression method used in \code{FindAllMarkers()}. Defaults to "wilcox".
+#' @param de.method The differential expression method used in \code{\link[Seurat]{FindAllMarkers}}. Defaults to "wilcox".
 #' @param perc.cutoff The percentile cutoff used to find highly expressed genes in other cluster. Defaults to 0.9.
 #' @param log2fc.cutoff The log2FC cutoff used, in part, to determine whether a gene is differentially expressed. Defaults to 0.25.
 #' @param fdr.cutoff The cutoff used to remove DE genes with non-significant adjusted \emph{p}-values. Defaults to 0.05.
+#' @seealso \code{\link[Seurat]{FindAllMarkers}}
 #' @export
 #' @examples
-#' FindSpecificMarkers(seurat_object, method = "wilcox")
+#' \dontrun{FindSpecificMarkers(seurat_object, method = "wilcox")}
 
 FindSpecificMarkers <- function(seurat.object = NULL,
                                 ident.use = "seurat_clusters",
@@ -30,7 +32,7 @@ FindSpecificMarkers <- function(seurat.object = NULL,
                          as.data.frame() %>%
                          dplyr::mutate(cell_ident = unname(unlist(seurat.object[[ident.use]]))) %>%
                          dplyr::group_by(cell_ident) %>%
-                         dplyr::summarise(across(where(is.numeric), mean))
+                         dplyr::summarise(dplyr::across(where(is.numeric), mean))
   # find list of genes w/ mean expression above 90th percentile of expression in each celltype
   high_exp_genes <- c()
   cluster_labels <- c()
@@ -40,7 +42,7 @@ FindSpecificMarkers <- function(seurat.object = NULL,
                      dplyr::select(-cell_ident) %>%
                      t() %>%
                      as.data.frame() %>%
-                     dplyr::filter(V1 > quantile(V1, perc.cutoff)) %>%
+                     dplyr::filter(V1 > stats::quantile(V1, perc.cutoff)) %>%
                      dplyr::mutate(gene = rownames(.)) %>%
                      dplyr::pull(gene)
     high_exp_genes <- c(high_exp_genes, top_exp_genes)