diff --git a/DESCRIPTION b/DESCRIPTION index 8ed545907..95dbbc39e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: Seurat -Version: 3.1.4 -Date: 2020-02-26 +Version: 3.1.5 +Date: 2020-04-14 Title: Tools for Single Cell Genomics -Description: A toolkit for quality control, analysis, and exploration of single cell RNA sequencing data. 'Seurat' aims to enable users to identify and interpret sources of heterogeneity from single cell transcriptomic measurements, and to integrate diverse types of single cell data. See Satija R, Farrell J, Gennert D, et al (2015) , Macosko E, Basu A, Satija R, et al (2015) , and Butler A and Satija R (2017) for more details. Please note: SDMTools is available is available from the CRAN archives with install.packages("https://cran.rstudio.com//src/contrib/Archive/SDMTools/SDMTools_1.1-221.2.tar.gz", repos = NULL); it is not in the standard repositories. +Description: A toolkit for quality control, analysis, and exploration of single cell RNA sequencing data. 'Seurat' aims to enable users to identify and interpret sources of heterogeneity from single cell transcriptomic measurements, and to integrate diverse types of single cell data. See Satija R, Farrell J, Gennert D, et al (2015) , Macosko E, Basu A, Satija R, et al (2015) , and Stuart T, Butler A, et al (2019) for more details. Please note: SDMTools is available from the CRAN archives with install.packages("https://cran.rstudio.com//src/contrib/Archive/SDMTools/SDMTools_1.1-221.2.tar.gz", repos = NULL); it is not in the standard repositories. Authors@R: c( person(given = 'Rahul', family = 'Satija', email = 'rsatija@nygenome.org', role = 'aut', comment = c(ORCID = '0000-0001-9448-8833')), person(given = 'Andrew', family = 'Butler', email = 'abutler@nygenome.org', role = 'aut', comment = c(ORCID = '0000-0003-3608-0463')), @@ -42,7 +42,6 @@ Imports: lmtest, MASS, Matrix (>= 1.2-14), - metap, patchwork, pbapply, plotly, @@ -81,7 +80,7 @@ Collate: 'tree.R' 'utilities.R' 'zzz.R' -RoxygenNote: 7.0.2 +RoxygenNote: 7.1.0 Encoding: UTF-8 biocViews: Suggests: @@ -101,4 +100,6 @@ Suggests: rtracklayer, monocle, Biobase, - VGAM + VGAM, + limma, + metap diff --git a/NAMESPACE b/NAMESPACE index 52dbc5bac..f4d68b414 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -15,6 +15,7 @@ S3method("Key<-",Assay) S3method("Key<-",DimReduc) S3method("Loadings<-",DimReduc) S3method("Misc<-",Assay) +S3method("Misc<-",DimReduc) S3method("Misc<-",Seurat) S3method("Project<-",Seurat) S3method("Tool<-",Seurat) @@ -70,6 +71,7 @@ S3method(Key,Seurat) S3method(Loadings,DimReduc) S3method(Loadings,Seurat) S3method(Misc,Assay) +S3method(Misc,DimReduc) S3method(Misc,Seurat) S3method(NormalizeData,Assay) S3method(NormalizeData,Seurat) @@ -409,6 +411,7 @@ importFrom(ggplot2,guides) importFrom(ggplot2,labs) importFrom(ggplot2,layer) importFrom(ggplot2,margin) +importFrom(ggplot2,position_jitterdodge) importFrom(ggplot2,scale_color_brewer) importFrom(ggplot2,scale_color_distiller) importFrom(ggplot2,scale_color_gradient) @@ -469,7 +472,6 @@ importFrom(igraph,plot.igraph) importFrom(irlba,irlba) importFrom(leiden,leiden) importFrom(lmtest,lrtest) -importFrom(metap,minimump) importFrom(methods,"slot<-") importFrom(methods,.hasSlot) importFrom(methods,as) @@ -496,6 +498,9 @@ importFrom(reticulate,py_module_available) importFrom(reticulate,py_set_seed) importFrom(reticulate,tuple) importFrom(rlang,"!!") +importFrom(rlang,enquo) +importFrom(rlang,eval_tidy) +importFrom(rlang,is_quosure) importFrom(rsvd,rsvd)
importFrom(scales,hue_pal) importFrom(scales,zero_range) @@ -557,4 +562,5 @@ importFrom(utils,setTxtProgressBar) importFrom(utils,txtProgressBar) importFrom(utils,write.table) importFrom(uwot,umap) +importFrom(uwot,umap_transform) useDynLib(Seurat) diff --git a/NEWS.md b/NEWS.md index fe5af7825..29bae530a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,6 +2,27 @@ All notable changes to Seurat will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) +## [3.1.5] - 2020-04-14 +### Added +- New `scale` parameter in `DotPlot` +- New `keep.sparse` parameter in `CreateGeneActivityMatrix` for a more memory-efficient option +- Added ability to store the model learned by UMAP and project new data +- New `strip.suffix` option in `Read10X` +- Added `group.by` parameter to `FeatureScatter` + +### Changes +- Replace wilcox.test with the limma implementation for a faster FindMarkers default method +- Better point separation for `VlnPlot`s when using the `split.by` option +- Efficiency improvements for anchor pairing +- Deprecate redundant `sort.cell` parameter in `FeaturePlot` +- Fixes to ensure correct class of Matrix passed to C++ functions +- Fixes for underscores in ident labels for `DotPlot` +- Ensure preservation of matrix dimnames in `SampleUMI` +- Fix non-standard evaluation problems in `subset` and `WhichCells` +- Default split violin option is now a multi-group option +- Preserve alpha in `FeaturePlot` when using `blend` +- Update `assay.used` slot for `DimReduc`s when an Assay is renamed + ## [3.1.4] - 2020-02-20 ### Changes - Fixes to `DoHeatmap` to remain compatible with ggplot2 v3.3 diff --git a/R/differential_expression.R b/R/differential_expression.R index d8f728b68..8cbc3440b 100644 --- a/R/differential_expression.R +++ b/R/differential_expression.R @@ -210,8 +210,6 @@ FindAllMarkers <- function( #' associated output column (e.g. CTRL_p_val). If only one group is tested in the grouping.var, max #' and combined p-values are not returned. #' -#' @importFrom metap minimump -#' #' @export #' #' @examples @@ -229,10 +227,23 @@ FindConservedMarkers <- function( grouping.var, assay = 'RNA', slot = 'data', - meta.method = minimump, + meta.method = metap::minimump, verbose = TRUE, ... ) { + metap.installed <- PackageCheck("metap", error = FALSE) + if (!metap.installed[1]) { + stop( + "Please install the metap package to use FindConservedMarkers.", + "\nThis can be accomplished with the following commands: ", + "\n----------------------------------------", + "\ninstall.packages('BiocManager')", + "\nBiocManager::install('multtest')", + "\ninstall.packages('metap')", + "\n----------------------------------------", + call. = FALSE + ) + } if (!is.function(x = meta.method)) { stop("meta.method should be a function from the metap package.
Please see https://cran.r-project.org/web/packages/metap/metap.pdf for a detailed description of the available functions.") } @@ -354,12 +365,13 @@ FindConservedMarkers <- function( return(meta.method(x)$p) } )) - colnames(x = combined.pval) <- paste0( - as.character(x = formals()$meta.method), - "_p_val" - ) + meta.method.name <- as.character(x = formals()$meta.method) + if (length(x = meta.method.name) == 3) { + meta.method.name <- meta.method.name[3] + } + colnames(x = combined.pval) <- paste0(meta.method.name, "_p_val") markers.combined <- cbind(markers.combined, combined.pval) - markers.combined <- markers.combined[order(markers.combined[, paste0(as.character(x = formals()$meta.method), "_p_val")]), ] + markers.combined <- markers.combined[order(markers.combined[, paste0(meta.method.name, "_p_val")]), ] } else { warning("Only a single group was tested", call. = FALSE, immediate. = TRUE) } @@ -1505,7 +1517,9 @@ RegularizedTheta <- function(cm, latent.data, min.theta = 0.01, bin.size = 128) # Differential expression using Wilcoxon Rank Sum # # Identifies differentially expressed genes between two groups of cells using -# a Wilcoxon Rank Sum test +# a Wilcoxon Rank Sum test. Makes use of limma::rankSumTestWithCorrelation for a +# more efficient implementation of the Wilcoxon test. Thanks to Yunshun Chen and +# Gordon Smyth for suggesting the limma implementation. # # @param data.use Data matrix to test # @param cells.1 Group 1 cells # @param cells.2 Group 2 cells @@ -1535,21 +1549,47 @@ WilcoxDETest <- function( verbose = TRUE, ... ) { - group.info <- data.frame(row.names = c(cells.1, cells.2)) - group.info[cells.1, "group"] <- "Group1" - group.info[cells.2, "group"] <- "Group2" - group.info[, "group"] <- factor(x = group.info[, "group"]) - data.use <- data.use[, rownames(x = group.info), drop = FALSE] + data.use <- data.use[, c(cells.1, cells.2), drop = FALSE] + j <- seq_len(length.out = length(x = cells.1)) my.sapply <- ifelse( test = verbose && nbrOfWorkers() == 1, yes = pbsapply, no = future_sapply ) - p_val <- my.sapply( - X = 1:nrow(x = data.use), - FUN = function(x) { - return(wilcox.test(data.use[x, ] ~ group.info[, "group"], ...)$p.value) + limma.check <- PackageCheck("limma", error = FALSE) + if (limma.check[1]) { + p_val <- my.sapply( + X = 1:nrow(x = data.use), + FUN = function(x) { + return(min(2 * min(limma::rankSumTestWithCorrelation(index = j, statistics = data.use[x, ])), 1)) + } + ) + } else { + if (getOption('Seurat.limma.wilcox.msg', TRUE)) { + message( + "For a more efficient implementation of the Wilcoxon Rank Sum Test,", + "\n(default method for FindMarkers) please install the limma package", + "\n--------------------------------------------", + "\ninstall.packages('BiocManager')", + "\nBiocManager::install('limma')", + "\n--------------------------------------------", + "\nAfter installation of limma, Seurat will automatically use the more ", + "\nefficient implementation (no further action necessary).", + "\nThis message will be shown once per session" + ) + options(Seurat.limma.wilcox.msg = FALSE) } - ) + group.info <- data.frame(row.names = c(cells.1, cells.2)) + group.info[cells.1, "group"] <- "Group1" + group.info[cells.2, "group"] <- "Group2" + group.info[, "group"] <- factor(x = group.info[, "group"]) + data.use <- data.use[, rownames(x = group.info), drop = FALSE] + p_val <- my.sapply( + X = 1:nrow(x = data.use), + FUN = function(x) { + return(wilcox.test(data.use[x, ] ~ group.info[, "group"], ...)$p.value) + } + ) + } return(data.frame(p_val, row.names = rownames(x = data.use))) } diff
--git a/R/dimensional_reduction.R b/R/dimensional_reduction.R index 6009ec248..4e7151d51 100644 --- a/R/dimensional_reduction.R +++ b/R/dimensional_reduction.R @@ -65,6 +65,10 @@ JackStraw <- function( my.sapply <- future_sapply } assay <- assay %||% DefaultAssay(object = object) + if (IsSCT(assay = object[[assay]])) { + stop("JackStraw cannot be run on SCTransform-normalized data. + Please supply a non-SCT assay.") + } if (dims > length(x = object[[reduction]])) { dims <- length(x = object[[reduction]]) warning("Number of dimensions specified is greater than those available. Setting dims to ", dims, " and continuing", immediate. = TRUE) @@ -1137,7 +1141,7 @@ RunTSNE.Seurat <- function( } #' @importFrom reticulate py_module_available py_set_seed import -#' @importFrom uwot umap +#' @importFrom uwot umap umap_transform #' @importFrom future nbrOfWorkers #' #' @rdname RunUMAP @@ -1146,6 +1150,7 @@ RunTSNE.Seurat <- function( #' RunUMAP.default <- function( object, + reduction.model = NULL, assay = NULL, umap.method = 'uwot', n.neighbors = 30L, @@ -1245,8 +1250,75 @@ RunUMAP.default <- function( verbose = verbose ) }, + 'uwot-learn' = { + if (metric == 'correlation') { + warning( + "UWOT does not implement the correlation metric, using cosine instead", + call. = FALSE, + immediate. = TRUE + ) + metric <- 'cosine' + } + umap( + X = object, + n_threads = nbrOfWorkers(), + n_neighbors = as.integer(x = n.neighbors), + n_components = as.integer(x = n.components), + metric = metric, + n_epochs = n.epochs, + learning_rate = learning.rate, + min_dist = min.dist, + spread = spread, + set_op_mix_ratio = set.op.mix.ratio, + local_connectivity = local.connectivity, + repulsion_strength = repulsion.strength, + negative_sample_rate = negative.sample.rate, + a = a, + b = b, + fast_sgd = uwot.sgd, + verbose = verbose, + ret_model = TRUE + ) + }, + 'uwot-predict' = { + if (metric == 'correlation') { + warning( + "UWOT does not implement the correlation metric, using cosine instead", + call. = FALSE, + immediate. = TRUE + ) + metric <- 'cosine' + } + if (is.null(x = reduction.model) || !inherits(x = reduction.model, what = 'DimReduc')) { + stop( + "If using uwot-predict, please pass a DimReduc object with the model stored to reduction.model.", + call. = FALSE + ) + } + model <- Misc( + object = reduction.model, + slot = "model" + ) + if (length(x = model) == 0) { + stop( + "The provided reduction.model does not have a model stored. Please try running uwot-learn on the object first", + call. = FALSE + ) + } + umap_transform( + X = object, + model = model, + n_threads = nbrOfWorkers(), + n_epochs = n.epochs, + verbose = verbose + ) + }, stop("Unknown umap method: ", umap.method, call.
= FALSE) ) + if (umap.method == 'uwot-learn') { + umap.model <- umap.output + umap.output <- umap.output$embedding + } colnames(x = umap.output) <- paste0(reduction.key, 1:ncol(x = umap.output)) if (inherits(x = object, what = 'dist')) { rownames(x = umap.output) <- attr(x = object, "Labels") @@ -1259,6 +1331,9 @@ RunUMAP.default <- function( assay = assay, global = TRUE ) + if (umap.method == 'uwot-learn') { + Misc(umap.reduction, slot = "model") <- umap.model + } return(umap.reduction) } @@ -1350,6 +1425,7 @@ RunUMAP.Graph <- function( return(umap) } +#' @param reduction.model \code{DimReduc} object that contains the umap model #' @param dims Which dimensions to use as input features, used only if #' \code{features} is NULL #' @param reduction Which dimensional reduction (PCA or ICA) to use for the @@ -1363,6 +1439,7 @@ RunUMAP.Graph <- function( #' @param umap.method UMAP implementation to run. Can be #' \describe{ #' \item{\code{uwot}:}{Runs umap via the uwot R package} +#' \item{\code{uwot-learn}:}{Runs umap via the uwot R package and returns the learned umap model} #' \item{\code{umap-learn}:}{Run the Seurat wrapper of the python umap-learn package} #' } #' @param n.neighbors This determines the number of neighboring points used in @@ -1423,6 +1500,7 @@ RunUMAP.Graph <- function( #' RunUMAP.Seurat <- function( object, + reduction.model = NULL, dims = NULL, reduction = 'pca', features = NULL, @@ -1487,6 +1565,7 @@ RunUMAP.Seurat <- function( } object[[reduction.name]] <- RunUMAP( object = data.use, + reduction.model = reduction.model, assay = assay, umap.method = umap.method, n.neighbors = n.neighbors, diff --git a/R/integration.R b/R/integration.R index a821adc01..2a73bab3f 100644 --- a/R/integration.R +++ b/R/integration.R @@ -8,10 +8,42 @@ NULL #' Find integration anchors #' -#' Finds the integration anchors +#' Find a set of anchors between a list of \code{\link{Seurat}} objects. +#' These anchors can later be used to integrate the objects using the +#' \code{\link{IntegrateData}} function. +#' +#' The main steps of this procedure are outlined below. For a more detailed +#' description of the methodology, please see Stuart, Butler, et al Cell 2019: +#' \url{https://doi.org/10.1016/j.cell.2019.05.031}; +#' \url{https://doi.org/10.1101/460147} +#' +#' First, determine anchor.features if not explicitly specified using +#' \code{\link{SelectIntegrationFeatures}}. Then for all pairwise combinations +#' of reference and query datasets: +#' +#' \itemize{ +#' \item{Perform dimensional reduction on the dataset pair as specified via +#' the \code{reduction} parameter. If \code{l2.norm} is set to \code{TRUE}, +#' perform L2 normalization of the embedding vectors.} +#' \item{Identify anchors - pairs of cells from each dataset +#' that are contained within each other's neighborhoods (also known as mutual +#' nearest neighbors).} +#' \item{Filter low confidence anchors to ensure anchors in the low dimension +#' space are in broad agreement with the high dimensional measurements. This +#' is done by looking at the neighbors of each query cell in the reference +#' dataset using \code{max.features} to define this space. If the reference +#' cell isn't found within the first \code{k.filter} neighbors, remove the +#' anchor.} +#' \item{Assign each remaining anchor a score. For each anchor cell, determine +#' the nearest \code{k.score} anchors within its own dataset and within its +#' pair's dataset.
Based on these neighborhoods, construct an overall neighbor +#' graph and then compute the shared neighbor overlap between anchor and query +#' cells (analogous to an SNN graph). We use the 0.01 and 0.90 quantiles on +#' these scores to dampen outlier effects and rescale to the range 0-1.} +#' } #' -#' @param object.list A list of objects between which to find anchors for -#' downstream integration. +#' @param object.list A list of \code{\link{Seurat}} objects between which to +#' find anchors for downstream integration. #' @param assay A vector of assay names specifying which assay to use when #' constructing anchors. If NULL, the current default assay for each object is #' used. @@ -55,14 +87,45 @@ NULL #' @param eps Error bound on the neighbor finding algorithm (from RANN) #' @param verbose Print progress bars and output #' -#' @return Returns an AnchorSet object -#' +#' @return Returns an \code{\link{AnchorSet}} object that can be used as input to +#' \code{\link{IntegrateData}}. +#' +#' @references Stuart T, Butler A, et al. Comprehensive Integration of +#' Single-Cell Data. Cell. 2019;177:1888-1902 \url{https://doi.org/10.1016/j.cell.2019.05.031} +#' #' @importFrom pbapply pblapply #' @importFrom future.apply future_lapply #' @importFrom future nbrOfWorkers #' #' @export -#' +#' +#' @examples +#' \dontrun{ +#' # to install the SeuratData package see https://github.com/satijalab/seurat-data +#' library(SeuratData) +#' data("panc8") +#' +#' # panc8 is a merged Seurat object containing 8 separate pancreas datasets +#' # split the object by dataset +#' pancreas.list <- SplitObject(panc8, split.by = "tech") +#' +#' # perform standard preprocessing on each object +#' for (i in 1:length(pancreas.list)) { +#' pancreas.list[[i]] <- NormalizeData(pancreas.list[[i]], verbose = FALSE) +#' pancreas.list[[i]] <- FindVariableFeatures( +#' pancreas.list[[i]], selection.method = "vst", +#' nfeatures = 2000, verbose = FALSE +#' ) +#' } +#' +#' # find anchors +#' anchors <- FindIntegrationAnchors(object.list = pancreas.list) +#' +#' # integrate data +#' integrated <- IntegrateData(anchorset = anchors) +#' } +#' FindIntegrationAnchors <- function( object.list = NULL, assay = NULL, @@ -368,43 +431,119 @@ FindIntegrationAnchors <- function( #' Find transfer anchors #' -#' Finds the transfer anchors +#' Find a set of anchors between a reference and query object. These +#' anchors can later be used to transfer data from the reference to the +#' query object using the \code{\link{TransferData}} function. #' -#' @param reference Seurat object to use as the reference -#' @param query Seurat object to use as the query -#' @param reference.assay Assay to use from reference -#' @param query.assay Assay to use from query -#' @param reduction Dimensional reduction to perform when finding anchors. Options are: +#' The main steps of this procedure are outlined below. For a more detailed +#' description of the methodology, please see Stuart, Butler, et al Cell 2019. +#' \url{https://doi.org/10.1016/j.cell.2019.05.031}; +#' \url{https://doi.org/10.1101/460147} +#' #' \itemize{ -#' \item{pcaproject: Project the PCA from the reference onto the query. We recommend using PCA -#' when reference and query datasets are from scRNA-seq} +#' +#' \item{Perform dimensional reduction. Exactly what is done here depends on +#' the values set for the \code{reduction} and \code{project.query} +#' parameters.
If \code{reduction = "pcaproject"}, a PCA is performed on +#' either the reference (if \code{project.query = FALSE}) or the query (if +#' \code{project.query = TRUE}), using the \code{features} specified. The data +#' from the other dataset is then projected onto this learned PCA structure. +#' If \code{reduction = "cca"}, then CCA is performed on the reference and +#' query for this dimensional reduction step. If \code{l2.norm} is set to +#' \code{TRUE}, perform L2 normalization of the embedding vectors.} +#' \item{Identify anchors between the reference and query - pairs of cells +#' from each dataset that are contained within each other's neighborhoods +#' (also known as mutual nearest neighbors).} +#' \item{Filter low confidence anchors to ensure anchors in the low dimension +#' space are in broad agreement with the high dimensional measurements. This +#' is done by looking at the neighbors of each query cell in the reference +#' dataset using \code{max.features} to define this space. If the reference +#' cell isn't found within the first \code{k.filter} neighbors, remove the +#' anchor.} +#' \item{Assign each remaining anchor a score. For each anchor cell, determine +#' the nearest \code{k.score} anchors within its own dataset and within its +#' pair's dataset. Based on these neighborhoods, construct an overall neighbor +#' graph and then compute the shared neighbor overlap between anchor and query +#' cells (analogous to an SNN graph). We use the 0.01 and 0.90 quantiles on +#' these scores to dampen outlier effects and rescale to the range 0-1.} +#' } +#' +#' @param reference \code{\link{Seurat}} object to use as the reference +#' @param query \code{\link{Seurat}} object to use as the query +#' @param reference.assay Name of the Assay to use from reference +#' @param query.assay Name of the Assay to use from query +#' @param reduction Dimensional reduction to perform when finding anchors. +#' Options are: +#' \itemize{ +#' \item{pcaproject: Project the PCA from the reference onto the query. We +#' recommend using PCA when reference and query datasets are from scRNA-seq} #' \item{cca: Run a CCA on the reference and query } #' } -#' @param project.query Project the PCA from the query dataset onto the reference. Use only in rare -#' cases where the query dataset has a much larger cell number, but the reference dataset has a -#' unique assay for transfer. -#' @param features Features to use for dimensional reduction -#' @param normalization.method Name of normalization method used: LogNormalize or SCT -#' @param npcs Number of PCs to compute on reference. If null, then use an existing PCA structure in -#' the reference object -#' @param l2.norm Perform L2 normalization on the cell embeddings after dimensional reduction -#' @param dims Which dimensions to use from the reduction to specify the neighbor search space -#' @param k.anchor How many neighbors (k) to use when picking anchors +#' @param project.query Project the PCA from the query dataset onto the +#' reference. Use only in rare cases where the query dataset has a much larger +#' cell number, but the reference dataset has a unique assay for transfer. +#' @param features Features to use for dimensional reduction. If not specified, +#' set as variable features of the reference object which are also present in +#' the query. +#' @param normalization.method Name of normalization method used: LogNormalize +#' or SCT +#' @param npcs Number of PCs to compute on reference.
If NULL, then use an +#' existing PCA structure in the reference object +#' @param l2.norm Perform L2 normalization on the cell embeddings after +#' dimensional reduction +#' @param dims Which dimensions to use from the reduction to specify the +#' neighbor search space +#' @param k.anchor How many neighbors (k) to use when finding anchors #' @param k.filter How many neighbors (k) to use when filtering anchors #' @param k.score How many neighbors (k) to use when scoring anchors -#' @param max.features The maximum number of features to use when specifying the neighborhood search -#' space in the anchor filtering +#' @param max.features The maximum number of features to use when specifying the +#' neighborhood search space in the anchor filtering #' @param nn.method Method for nearest neighbor finding. Options include: rann, #' annoy -#' @param eps Error bound on the neighbor finding algorithm (from RANN) -#' @param approx.pca Use truncated singular value decomposition to approximate PCA +#' @param eps Error bound on the neighbor finding algorithm (from +#' \code{\link{RANN}}) +#' @param approx.pca Use truncated singular value decomposition to approximate +#' PCA #' @param verbose Print progress bars and output #' -#' @return Returns an AnchorSet object -#' -#' +#' @return Returns an \code{AnchorSet} object that can be used as input to +#' \code{\link{TransferData}} +#' +#' @references Stuart T, Butler A, et al. Comprehensive Integration of +#' Single-Cell Data. Cell. 2019;177:1888-1902 \url{https://doi.org/10.1016/j.cell.2019.05.031}; +#' #' @export -#' +#' @examples +#' \dontrun{ +#' # to install the SeuratData package see https://github.com/satijalab/seurat-data +#' library(SeuratData) +#' data("pbmc3k") +#' +#' # for demonstration, split the object into reference and query +#' pbmc.reference <- pbmc3k[, 1:1350] +#' pbmc.query <- pbmc3k[, 1351:2700] +#' +#' # perform standard preprocessing on each object +#' pbmc.reference <- NormalizeData(pbmc.reference) +#' pbmc.reference <- FindVariableFeatures(pbmc.reference) +#' pbmc.reference <- ScaleData(pbmc.reference) +#' +#' pbmc.query <- NormalizeData(pbmc.query) +#' pbmc.query <- FindVariableFeatures(pbmc.query) +#' pbmc.query <- ScaleData(pbmc.query) +#' +#' # find anchors +#' anchors <- FindTransferAnchors(reference = pbmc.reference, query = pbmc.query) +#' +#' # transfer labels +#' predictions <- TransferData( +#' anchorset = anchors, +#' refdata = pbmc.reference$seurat_annotations +#' ) +#' pbmc.query <- AddMetaData(object = pbmc.query, metadata = predictions) +#' } +#' FindTransferAnchors <- function( reference, query, @@ -611,40 +750,119 @@ FindTransferAnchors <- function( #' Integrate data #' -#' Perform dataset integration using a pre-computed anchorset +#' Perform dataset integration using a pre-computed \code{\link{AnchorSet}}. +#' +#' The main steps of this procedure are outlined below. For a more detailed +#' description of the methodology, please see Stuart, Butler, et al Cell 2019. +#' \url{https://doi.org/10.1016/j.cell.2019.05.031}; +#' \url{https://doi.org/10.1101/460147} +#' +#' For pairwise integration: +#' +#' \itemize{ +#' \item{Construct a weights matrix that defines the association between each +#' query cell and each anchor. These weights are computed as 1 - the distance +#' between the query cell and the anchor divided by the distance of the query +#' cell to the \code{k.weight}th anchor multiplied by the anchor score +#' computed in \code{\link{FindIntegrationAnchors}}.
We then apply a Gaussian +#' kernel with a bandwidth defined by \code{sd.weight} and normalize across +#' all \code{k.weight} anchors.} +#' \item{Compute the anchor integration matrix as the difference between the +#' two expression matrices for every pair of anchor cells} +#' \item{Compute the transformation matrix as the product of the integration +#' matrix and the weights matrix.} +#' \item{Subtract the transformation matrix from the original expression +#' matrix.} +#' } +#' +#' For multiple dataset integration, we perform iterative pairwise integration. +#' To determine the order of integration (if not specified via +#' \code{sample.tree}), we +#' \itemize{ +#' \item{Define a distance between datasets as the total number of cells in +#' the smaller dataset divided by the total number of anchors between the two +#' datasets.} +#' \item{Compute all pairwise distances between datasets} +#' \item{Cluster this distance matrix to determine a guide tree} +#' } +#' #' -#' @param anchorset Results from FindIntegrationAnchors +#' @param anchorset An \code{\link{AnchorSet}} object generated by +#' \code{\link{FindIntegrationAnchors}} #' @param new.assay.name Name for the new assay containing the integrated data #' @param normalization.method Name of normalization method used: LogNormalize #' or SCT -#' @param features Vector of features to use when computing the PCA to determine the weights. Only set -#' if you want a different set from those used in the anchor finding process -#' @param features.to.integrate Vector of features to integrate. By default, will use the features -#' used in anchor finding. -#' @param dims Number of PCs to use in the weighting procedure -#' @param k.weight Number of neighbors to consider when weighting -#' @param weight.reduction Dimension reduction to use when calculating anchor weights. -#' This can be either: +#' @param features Vector of features to use when computing the PCA to determine +#' the weights. Only set if you want a different set from those used in the +#' anchor finding process +#' @param features.to.integrate Vector of features to integrate. By default, +#' will use the features used in anchor finding. +#' @param dims Number of dimensions to use in the anchor weighting procedure +#' @param k.weight Number of neighbors to consider when weighting anchors +#' @param weight.reduction Dimension reduction to use when calculating anchor +#' weights.
This can be one of: #' \itemize{ -#' \item{A string, specifying the name of a dimension reduction present in all objects to be integrated} -#' \item{A vector of strings, specifying the name of a dimension reduction to use for each object to be integrated} -#' \item{A vector of Dimreduc objects, specifying the object to use for each object in the integration} -#' \item{NULL, in which case a new PCA will be calculated and used to calculate anchor weights} +#' \item{A string, specifying the name of a dimension reduction present in +#' all objects to be integrated} +#' \item{A vector of strings, specifying the name of a dimension reduction to +#' use for each object to be integrated} +#' \item{A vector of \code{\link{DimReduc}} objects, specifying the object to +#' use for each object in the integration} +#' \item{NULL, in which case a new PCA will be calculated and used to +#' calculate anchor weights} #' } -#' Note that, if specified, the requested dimension reduction will only be used for calculating anchor weights in the -#' first merge between reference and query, as the merged object will subsequently contain more cells than was in +#' Note that, if specified, the requested dimension reduction will only be used +#' for calculating anchor weights in the first merge between reference and +#' query, as the merged object will subsequently contain more cells than were in #' query, and weights will need to be calculated for all cells in the object. #' @param sd.weight Controls the bandwidth of the Gaussian kernel for weighting -#' @param sample.tree Specify the order of integration. If NULL, will compute automatically. -#' @param preserve.order Do not reorder objects based on size for each pairwise integration. +#' @param sample.tree Specify the order of integration. If NULL, will compute +#' automatically. +#' @param preserve.order Do not reorder objects based on size for each pairwise +#' integration. #' @param do.cpp Run cpp code where applicable -#' @param eps Error bound on the neighbor finding algorithm (from \code{\link{RANN}}) +#' @param eps Error bound on the neighbor finding algorithm (from +#' \code{\link{RANN}}) #' @param verbose Print progress bars and output #' -#' @return Returns a Seurat object with a new integrated Assay +#' @return Returns a \code{\link{Seurat}} object with a new integrated +#' \code{\link{Assay}}. If \code{normalization.method = "LogNormalize"}, the +#' integrated data is returned to the \code{data} slot and can be treated as +#' log-normalized, corrected data. If \code{normalization.method = "SCT"}, the +#' integrated data is returned to the \code{scale.data} slot and can be treated +#' as centered, corrected Pearson residuals. #' -#' @export +#' @references Stuart T, Butler A, et al. Comprehensive Integration of +#' Single-Cell Data. Cell.
2019;177:1888-1902 \url{https://doi.org/10.1016/j.cell.2019.05.031} #' +#' @export +#' @examples +#' \dontrun{ +#' # to install the SeuratData package see https://github.com/satijalab/seurat-data +#' library(SeuratData) +#' data("panc8") +#' +#' # panc8 is a merged Seurat object containing 8 separate pancreas datasets +#' # split the object by dataset +#' pancreas.list <- SplitObject(panc8, split.by = "tech") +#' +#' # perform standard preprocessing on each object +#' for (i in 1:length(pancreas.list)) { +#' pancreas.list[[i]] <- NormalizeData(pancreas.list[[i]], verbose = FALSE) +#' pancreas.list[[i]] <- FindVariableFeatures( +#' pancreas.list[[i]], selection.method = "vst", +#' nfeatures = 2000, verbose = FALSE +#' ) +#' } +#' +#' # find anchors +#' anchors <- FindIntegrationAnchors(object.list = pancreas.list) +#' +#' # integrate data +#' integrated <- IntegrateData(anchorset = anchors) +#' } +#' IntegrateData <- function( anchorset, new.assay.name = "integrated", @@ -671,10 +889,28 @@ IntegrateData <- function( x = object.list[[1]], y = object.list[2:length(x = object.list)] ) + if (!is.null(x = features.to.integrate)) { + features.to.integrate <- intersect( + x = features.to.integrate, + y = Reduce( + f = intersect, + x = lapply( + X = object.list, + FUN = rownames + ) + ) + ) + } if (normalization.method == "SCT") { - vst.set <- list() for (i in 1:length(x = object.list)) { assay <- DefaultAssay(object = object.list[[i]]) + if (length(x = setdiff(x = features.to.integrate, y = features)) != 0) { + object.list[[i]] <- GetResidual( + object = object.list[[i]], + features = setdiff(x = features.to.integrate, y = features), + verbose = verbose + ) + } object.list[[i]][[assay]] <- CreateAssayObject( data = GetAssayData(object = object.list[[i]], assay = assay, slot = "scale.data") ) @@ -894,8 +1130,7 @@ LocalStruct <- function( #' @param eps Error bound on the neighbor finding algorithm (from RANN) #' @param verbose Displays progress bar #' -#' @return Returns a vector of values representing the entropy metric from each -#' bootstrapped iteration. +#' @return Returns a vector of values of the mixing metric for each cell #' #' @importFrom RANN nn2 #' @importFrom pbapply pbsapply @@ -943,12 +1178,29 @@ MixingMetric <- function( return(mixing) } -#' Prepare an object list that has been run through SCTransform for integration +#' Prepare an object list normalized with sctransform for integration. +#' +#' This function takes in a list of objects that have been normalized with the +#' \code{\link{SCTransform}} method and performs the following steps: +#' \itemize{ +#' \item{If anchor.features is a numeric value, calls \code{\link{SelectIntegrationFeatures}} +#' to determine the features to use in the downstream integration procedure.} +#' \item{Ensures that the sctransform residuals for the features specified +#' to anchor.features are present in each object in the list. This is +#' necessary because the default behavior of \code{\link{SCTransform}} is to +#' only store the residuals for the features determined to be variable. +#' Residuals are recomputed for missing features using the stored model +#' parameters via the \code{\link{GetResidual}} function.} +#' \item{Subsets the \code{scale.data} slot to only contain the residuals for +#' anchor.features for efficiency in downstream processing.
} +#' } #' -#' @param object.list A list of objects to prep for integration -#' @param assay Name or vector of assay names (one for each object) that correspond -#' to the assay that SCTransform has been run on. If NULL, the current default -#' assay for each object is used. +#' @param object.list A list of \code{\link{Seurat}} objects to prepare for integration +#' @param assay The name of the \code{\link{Assay}} to use for integration. This can be a +#' single name if all the assays to be integrated have the same name, or a character vector +#' containing the name of each \code{\link{Assay}} in each object to be integrated. The +#' specified assays must have been normalized using \code{\link{SCTransform}}. +#' If NULL (default), the current default assay for each object is used. #' @param anchor.features Can be either: #' \itemize{ #' \item{A numeric value. This will call \code{\link{SelectIntegrationFeatures}} @@ -960,8 +1212,8 @@ MixingMetric <- function( #' the Pearson residual will be clipped to #' @param verbose Display output/messages #' -#' @return An object list with the \code{scale.data} slots set to the anchor -#' features +#' @return A list of \code{\link{Seurat}} objects with the appropriate \code{scale.data} slots +#' containing only the required \code{anchor.features}. #' #' @importFrom pbapply pblapply #' @importFrom methods slot slot<- @@ -969,7 +1221,35 @@ MixingMetric <- function( #' @importFrom future.apply future_lapply #' #' @export -#' +#' @examples +#' \dontrun{ +#' # to install the SeuratData package see https://github.com/satijalab/seurat-data +#' library(SeuratData) +#' data("panc8") +#' +#' # panc8 is a merged Seurat object containing 8 separate pancreas datasets +#' # split the object by dataset and take the first 2 to integrate +#' pancreas.list <- SplitObject(panc8, split.by = "tech")[1:2] +#' +#' # perform SCTransform normalization +#' pancreas.list <- lapply(X = pancreas.list, FUN = SCTransform) +#' +#' # select integration features and prep step +#' features <- SelectIntegrationFeatures(pancreas.list) +#' pancreas.list <- PrepSCTIntegration( +#' pancreas.list, +#' anchor.features = features +#' ) +#' +#' # downstream integration steps +#' anchors <- FindIntegrationAnchors( +#' pancreas.list, +#' normalization.method = "SCT", +#' anchor.features = features +#' ) +#' pancreas.integrated <- IntegrateData(anchors) +#' } +#' PrepSCTIntegration <- function( object.list, assay = NULL, @@ -1088,20 +1368,44 @@ PrepSCTIntegration <- function( #' Select integration features #' #' Choose the features to use when integrating multiple datasets. This function -#' ranks features by the number of datasets they appear in, breaking ties by the -#' median rank across datasets. It returns the highest features by this ranking. +#' ranks features by the number of datasets they are deemed variable in, +#' breaking ties by the median variable feature rank across datasets. It returns +#' the top scoring features by this ranking. +#' +#' If for any assay in the list, \code{\link{FindVariableFeatures}} hasn't been +#' run, this method will try to run it using the \code{fvf.nfeatures} parameter +#' and any additional ones specified through the \dots. #' #' @param object.list List of seurat objects #' @param nfeatures Number of features to return -#' @param assay Name of assay from which to pull the variable features. +#' @param assay Name or vector of assay names (one for each object) from which +#' to pull the variable features. 
#' @param verbose Print messages -#' @param fvf.nfeatures nfeatures for FindVariableFeatures. Used if -#' VariableFeatures have not been set for any object in object.list. +#' @param fvf.nfeatures nfeatures for \code{\link{FindVariableFeatures}}. Used +#' if \code{VariableFeatures} have not been set for any object in +#' \code{object.list}. #' @param ... Additional parameters to \code{\link{FindVariableFeatures}} #' #' @return A vector of selected features #' #' @export +#' +#' @examples +#' \dontrun{ +#' # to install the SeuratData package see https://github.com/satijalab/seurat-data +#' library(SeuratData) +#' data("panc8") +#' +#' # panc8 is a merged Seurat object containing 8 separate pancreas datasets +#' # split the object by dataset and take the first 2 +#' pancreas.list <- SplitObject(panc8, split.by = "tech")[1:2] +#' +#' # perform SCTransform normalization +#' pancreas.list <- lapply(X = pancreas.list, FUN = SCTransform) +#' +#' # select integration features +#' features <- SelectIntegrationFeatures(pancreas.list) +#' } #' SelectIntegrationFeatures <- function( object.list, @@ -1170,39 +1474,116 @@ SelectIntegrationFeatures <- function( return(features) } -#' Transfer Labels +#' Transfer data #' -#' Transfers the labels +#' Transfer categorical or continuous data across single-cell datasets. For +#' transferring categorical information, pass a vector from the reference +#' dataset (e.g. \code{refdata = reference$celltype}). For transferring +#' continuous information, pass a matrix from the reference dataset (e.g. +#' \code{refdata = GetAssayData(reference[['RNA']])}). +#' +#' The main steps of this procedure are outlined below. For a more detailed +#' description of the methodology, please see Stuart, Butler, et al Cell 2019. +#' \url{https://doi.org/10.1016/j.cell.2019.05.031}; +#' \url{https://doi.org/10.1101/460147} +#' +#' For both transferring discrete labels and feature imputation, we first +#' compute the weights matrix. +#' +#' \itemize{ +#' \item{Construct a weights matrix that defines the association between each +#' query cell and each anchor. These weights are computed as 1 - the distance +#' between the query cell and the anchor divided by the distance of the query +#' cell to the \code{k.weight}th anchor multiplied by the anchor score +#' computed in \code{\link{FindIntegrationAnchors}}. We then apply a Gaussian +#' kernel with a bandwidth defined by \code{sd.weight} and normalize across +#' all \code{k.weight} anchors.} +#' } +#' +#' The main difference between label transfer (classification) and feature +#' imputation is what gets multiplied by the weights matrix. For label transfer, +#' we perform the following steps: +#' +#' \itemize{ +#' \item{Create a binary classification matrix, the rows corresponding to each +#' possible class and the columns corresponding to the anchors. If the +#' reference cell in the anchor pair is a member of a certain class, that +#' matrix entry is filled with a 1, otherwise 0.} +#' \item{Multiply this classification matrix by the transpose of the weights +#' matrix to compute a prediction score for each class for each cell in the +#' query dataset.} +#' } +#' +#' For feature imputation, we perform the following step: +#' \itemize{ +#' \item{Multiply the expression matrix for the reference anchor cells by the +#' weights matrix.
This returns a predicted expression matrix for the +#' specified features for each cell in the query dataset.} +#' } +#' #' -#' @param anchorset Results from FindTransferAnchors +#' @param anchorset An \code{\link{AnchorSet}} object generated by +#' \code{\link{FindTransferAnchors}} #' @param refdata Data to transfer. Should be either a vector where the names #' correspond to reference cells, or a matrix, where the column names correspond #' to the reference cells. -#' @param weight.reduction Dimensional reduction to use for the weighting. -#' Options are: +#' @param weight.reduction Dimensional reduction to use for weighting +#' anchors. Options are: #' \itemize{ #' \item{pcaproject: Use the projected PCA used for anchor building} #' \item{pca: Use an internal PCA on the query only} #' \item{cca: Use the CCA used for anchor building} -#' \item{custom DimReduc: User provided DimReduc object computed on the query -#' cells} +#' \item{custom DimReduc: User provided \code{\link{DimReduc}} object +#' computed on the query cells} #' } #' @param l2.norm Perform L2 normalization on the cell embeddings after #' dimensional reduction -#' @param dims Number of PCs to use in the weighting procedure -#' @param k.weight Number of neighbors to consider when weighting +#' @param dims Number of dimensions to use in the anchor weighting procedure +#' @param k.weight Number of neighbors to consider when weighting anchors #' @param sd.weight Controls the bandwidth of the Gaussian kernel for weighting -#' @param eps Error bound on the neighbor finding algorithm (from RANN) +#' @param eps Error bound on the neighbor finding algorithm (from +#' \code{\link{RANN}}) #' @param do.cpp Run cpp code where applicable #' @param verbose Print progress bars and output -#' @param slot Slot to store the imputed data +#' @param slot Slot to store the imputed data. Must be either "data" (default) +#' or "counts" #' -#' @return If refdata is a vector, returns a dataframe with label predictions. -#' If refdata is a matrix, returns an Assay object where the imputed data has -#' been stored in the provided slot. +#' @return If \code{refdata} is a vector, returns a data.frame with label +#' predictions. If \code{refdata} is a matrix, returns an Assay object where the +#' imputed data has been stored in the provided slot. #' +#' @references Stuart T, Butler A, et al. Comprehensive Integration of +#' Single-Cell Data. Cell.
2019;177:1888-1902 \url{https://doi.org/10.1016/j.cell.2019.05.031} +#' #' @export -#' +#' @examples +#' \dontrun{ +#' # to install the SeuratData package see https://github.com/satijalab/seurat-data +#' library(SeuratData) +#' data("pbmc3k") +#' +#' # for demonstration, split the object into reference and query +#' pbmc.reference <- pbmc3k[, 1:1350] +#' pbmc.query <- pbmc3k[, 1351:2700] +#' +#' # perform standard preprocessing on each object +#' pbmc.reference <- NormalizeData(pbmc.reference) +#' pbmc.reference <- FindVariableFeatures(pbmc.reference) +#' pbmc.reference <- ScaleData(pbmc.reference) +#' +#' pbmc.query <- NormalizeData(pbmc.query) +#' pbmc.query <- FindVariableFeatures(pbmc.query) +#' pbmc.query <- ScaleData(pbmc.query) +#' +#' # find anchors +#' anchors <- FindTransferAnchors(reference = pbmc.reference, query = pbmc.query) +#' +#' # transfer labels +#' predictions <- TransferData(anchorset = anchors, refdata = pbmc.reference$seurat_annotations) +#' pbmc.query <- AddMetaData(object = pbmc.query, metadata = predictions) +#' } +#' TransferData <- function( anchorset, refdata, @@ -1693,8 +2074,6 @@ FindAnchors <- function( FindAnchorPairs <- function( object, integration.name = 'integrated', - cells1 = NULL, - cells2 = NULL, k.anchor = 5, verbose = TRUE ) { @@ -1707,23 +2086,10 @@ FindAnchorPairs <- function( if (verbose) { message("Finding anchors") } - if (is.null(x = cells1)) { - cells1 <- colnames(x = object) - } - if (is.null(x = cells2)) { - cells2 <- colnames(x = object) - } - if (!(cells1 %in% colnames(object)) || !(cells2 %in% colnames(object))) { - warning("Requested cells not contained in Seurat object. Subsetting list of cells.") - cells1 <- intersect(x = cells1, y = colnames(x = object)) - cells2 <- intersect(x = cells2, y = colnames(x = object)) - } # convert cell name to neighbor index nn.cells1 <- neighbors$cells1 nn.cells2 <- neighbors$cells2 - cell1.index <- sapply(X = cells1, FUN = function(x) return(which(x == nn.cells1))) - cell2.index <- sapply(X = cells2, FUN = function(x) return(which(x == nn.cells2))) - + cell1.index <- suppressWarnings(which(colnames(x = object) == nn.cells1, arr.ind = TRUE)) ncell <- 1:nrow(x = neighbors$nnab$nn.idx) ncell <- ncell[ncell %in% cell1.index] anchors <- list() @@ -2464,7 +2830,13 @@ ProjectCellEmbeddings <- function( if (is.null(x = feature.mean)) { feature.mean <- rowMeans(x = reference.data) - feature.sd <- sqrt(SparseRowVar2(mat = reference.data, mu = feature.mean, display_progress = FALSE)) + feature.sd <- sqrt( + x = SparseRowVar2( + mat = as(object = reference.data, Class = "dgCMatrix"), + mu = feature.mean, + display_progress = FALSE + ) + ) feature.sd[is.na(x = feature.sd)] <- 1 feature.mean[is.na(x = feature.mean)] <- 1 } @@ -2475,12 +2847,12 @@ ProjectCellEmbeddings <- function( )[features, ] store.names <- dimnames(x = proj.data) if (is.numeric(x = feature.mean) && feature.mean != "SCT") { - proj.data <- FastSparseRowScaleWithKnownStats( - mat = proj.data, - mu = feature.mean, - sigma = feature.sd, - display_progress = FALSE - ) + proj.data <- FastSparseRowScaleWithKnownStats( + mat = as(object = proj.data, Class = "dgCMatrix"), + mu = feature.mean, + sigma = feature.sd, + display_progress = FALSE + ) } dimnames(x = proj.data) <- store.names ref.feature.loadings <- Loadings(object = reference[[reduction]])[features, dims] diff --git a/R/objects.R b/R/objects.R index 0470f935f..8c6a05443 100644 --- a/R/objects.R +++ b/R/objects.R @@ -1309,6 +1309,12 @@ RenameAssays <- function(object, ...)
{ DefaultAssay(object = object) <- new } Key(object = object[[new]]) <- old.key + # change assay used in any DimReduc object + for (i in Reductions(object = object)) { + if (DefaultAssay(object = object[[i]]) == old) { + DefaultAssay(object = object[[i]]) <- new + } + } object[[old]] <- NULL } return(object) } @@ -3333,6 +3339,18 @@ Misc.Assay <- function(object, slot = NULL, ...) { return(slot(object = object, name = 'misc')[[slot]]) } +#' @rdname Misc +#' @export +#' @method Misc DimReduc +#' +Misc.DimReduc <- function(object, slot = NULL, ...) { + CheckDots(...) + if (is.null(x = slot)) { + return(slot(object = object, name = 'misc')) + } + return(slot(object = object, name = 'misc')[[slot]]) +} + #' @rdname Misc +#' @export #' @method Misc Seurat @@ -3366,6 +3384,23 @@ Misc.Seurat <- function(object, slot = NULL, ...) { return(object) } +#' @rdname Misc +#' @export +#' @method Misc<- DimReduc +#' +"Misc<-.DimReduc" <- function(object, slot, ..., value) { + CheckDots(...) + if (slot %in% names(x = Misc(object = object))) { + warning("Overwriting miscellaneous data for ", slot) + } + if (is.list(x = value)) { + slot(object = object, name = 'misc')[[slot]] <- c(value) + } else { + slot(object = object, name = 'misc')[[slot]] <- value + } + return(object) +} + #' @rdname Misc #' @export #' @method Misc<- Seurat @@ -4754,6 +4789,7 @@ VariableFeatures.Seurat <- function(object, assay = NULL, selection.method = NUL #' @param invert Invert the selection of cells #' #' @importFrom stats na.omit +#' @importFrom rlang is_quosure enquo eval_tidy #' #' @rdname WhichCells #' @export @@ -4770,12 +4806,14 @@ WhichCells.Assay <- function( cells <- cells %||% colnames(x = object) if (!missing(x = expression) && !is.null(x = substitute(expr = expression))) { key.pattern <- paste0('^', Key(object = object)) - expr <- if (is.call(x = substitute(expr = expression))) { - substitute(expr = expression) + expr <- if (tryCatch(expr = is_quosure(x = expression), error = function(...) FALSE)) { + expression + } else if (is.call(x = enquo(arg = expression))) { + enquo(arg = expression) } else { parse(text = expression) } - expr.char <- as.character(x = expr) + expr.char <- suppressWarnings(expr = as.character(x = expr)) expr.char <- unlist(x = lapply(X = expr.char, FUN = strsplit, split = ' ')) expr.char <- gsub( pattern = key.pattern, replacement = '', x = expr.char, @@ -4798,8 +4836,7 @@ WhichCells.Assay <- function( expr.char <- expr.char[vars.use] data.subset <- as.data.frame(x = t(x = as.matrix(x = object[expr.char, ]))) colnames(x = data.subset) <- expr.char - data.subset <- subset.data.frame(x = data.subset, subset = eval(expr = expr)) - cells <- rownames(x = data.subset) + cells <- rownames(x = data.subset)[eval_tidy(expr = expr, data = data.subset)] } if (invert) { cells <- colnames(x = object)[!colnames(x = object) %in% cells] @@ -4816,6 +4853,7 @@ WhichCells.Assay <- function( #' @param seed Random seed for downsampling. If NULL, does not set a seed #' #' @importFrom stats na.omit +#' @importFrom rlang is_quosure enquo eval_tidy #' #' @rdname WhichCells #' @export @@ -4870,12 +4908,14 @@ WhichCells.Seurat <- function( } ) key.pattern <- paste0('^', object.keys, collapse = '|') - expr <- if (is.call(x = substitute(expr = expression))) { - substitute(expr = expression) + expr <- if (tryCatch(expr = is_quosure(x = expression), error = function(...)
FALSE)) { + expression + } else if (is.call(x = enquo(arg = expression))) { + enquo(arg = expression) } else { parse(text = expression) } - expr.char <- as.character(x = expr) + expr.char <- suppressWarnings(expr = as.character(x = expr)) expr.char <- unlist(x = lapply(X = expr.char, FUN = strsplit, split = ' ')) expr.char <- gsub( pattern = '(', @@ -4895,12 +4935,11 @@ WhichCells.Seurat <- function( ) data.subset <- FetchData( object = object, - vars = expr.char[vars.use], + vars = unique(x = expr.char[vars.use]), cells = cells, slot = slot ) - data.subset <- subset.data.frame(x = data.subset, subset = eval(expr = expr)) - cells <- rownames(x = data.subset) + cells <- rownames(x = data.subset)[eval_tidy(expr = expr, data = data.subset)] } if (invert) { cell.order <- colnames(x = object) @@ -6058,6 +6097,8 @@ subset.DimReduc <- function(x, cells = NULL, features = NULL, ...) { #' #' @return A subsetted Seurat object #' +#' @importFrom rlang enquo +#' #' @rdname subset.Seurat #' @aliases subset #' @seealso \code{\link[base]{subset}} \code{\link{WhichCells}} @@ -6074,7 +6115,7 @@ subset.DimReduc <- function(x, cells = NULL, features = NULL, ...) { #' subset.Seurat <- function(x, subset, cells = NULL, features = NULL, idents = NULL, ...) { if (!missing(x = subset)) { - subset <- deparse(expr = substitute(expr = subset)) + subset <- enquo(arg = subset) } cells <- WhichCells( object = x, @@ -6640,7 +6681,7 @@ setMethod( cat( "Active assay:", DefaultAssay(object = object), - paste0('(', nrow(x = object), ' features)') + paste0('(', nrow(x = object), ' features, ', length(x = VariableFeatures(object = object)), ' variable features)') ) other.assays <- assays[assays != DefaultAssay(object = object)] if (length(x = other.assays) > 0) { diff --git a/R/preprocessing.R b/R/preprocessing.R index 2b8631dcb..f2805779e 100644 --- a/R/preprocessing.R +++ b/R/preprocessing.R @@ -165,6 +165,10 @@ CalculateBarcodeInflections <- function( #' @param include.body Include the gene body? #' @param upstream Number of bases upstream to consider #' @param downstream Number of bases downstream to consider +#' @param keep.sparse Leave the matrix as a sparse matrix. Setting this option to +#' TRUE will take much longer but will use less memory. This can be useful if +#' you have a very large matrix that cannot fit into memory when converted to +#' a dense form. 
#' @param verbose Print progress/messages #' #' @importFrom future nbrOfWorkers @@ -177,6 +181,7 @@ CreateGeneActivityMatrix <- function( include.body = TRUE, upstream = 2000, downstream = 0, + keep.sparse = FALSE, verbose = TRUE ) { if (!PackageCheck('GenomicRanges', error = FALSE)) { @@ -228,7 +233,9 @@ CreateGeneActivityMatrix <- function( colnames(x = annotations) <- c('feature', 'new_feature') # collapse into expression matrix - peak.matrix <- as(object = peak.matrix, Class = 'matrix') + if (!keep.sparse) { + peak.matrix <- as(object = peak.matrix, Class = 'matrix') + } all.features <- unique(x = annotations$new_feature) if (nbrOfWorkers() > 1) { @@ -236,18 +243,21 @@ CreateGeneActivityMatrix <- function( } else { mysapply <- ifelse(test = verbose, yes = pbsapply, no = sapply) } - newmat <- mysapply(X = 1:length(x = all.features), FUN = function(x){ + newmat.list <- mysapply(X = 1:length(x = all.features), FUN = function(x){ features.use <- annotations[annotations$new_feature == all.features[[x]], ]$feature submat <- peak.matrix[features.use, ] if (length(x = features.use) > 1) { - return(Matrix::colSums(x = submat)) + submat <- Matrix::colSums(submat) + } + if (keep.sparse) { + return(as(object = as.matrix(x = submat), Class = 'dgCMatrix')) } else { - return(submat) + return(as.matrix(x = submat)) } - }) + }, simplify = FALSE) + newmat <- do.call(what = cbind, args = newmat.list) newmat <- t(x = newmat) rownames(x = newmat) <- all.features - colnames(x = newmat) <- colnames(x = peak.matrix) return(as(object = newmat, Class = 'dgCMatrix')) } @@ -822,6 +832,7 @@ ReadAlevin <- function(base.path) { #' will be prefixed with the name. #' @param gene.column Specify which column of genes.tsv or features.tsv to use for gene names; default is 2 #' @param unique.features Make feature names unique (default TRUE) +#' @param strip.suffix Remove trailing "-1" if present in all cell barcodes. #' #' @return If features.csv indicates the data has multiple data types, a list #' containing a sparse matrix of the data from each type will be returned.
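A minimal sketch of how the new strip.suffix option is meant to be used, given the behavior implemented in the hunk below (the directory path is hypothetical; CreateSeuratObject is the usual Seurat constructor):
# suffixes are only trimmed when every barcode ends in "-1" and strip.suffix = TRUE
counts <- Read10X(data.dir = "filtered_feature_bc_matrix/", strip.suffix = TRUE)
# a barcode such as "AAACCCAAGAAACACT-1" is then stored as "AAACCCAAGAAACACT"
seurat_object <- CreateSeuratObject(counts = counts)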
@@ -847,7 +858,12 @@ ReadAlevin <- function(base.path) { #' seurat_object[['Protein']] = CreateAssayObject(counts = data$`Antibody Capture`) #' } #' -Read10X <- function(data.dir = NULL, gene.column = 2, unique.features = TRUE) { +Read10X <- function( + data.dir = NULL, + gene.column = 2, + unique.features = TRUE, + strip.suffix = FALSE +) { full.data <- list() for (i in seq_along(along.with = data.dir)) { run <- data.dir[i] @@ -878,7 +894,7 @@ Read10X <- function(data.dir = NULL, gene.column = 2, unique.features = TRUE) { } data <- readMM(file = matrix.loc) cell.names <- readLines(barcode.loc) - if (all(grepl(pattern = "\\-1$", x = cell.names))) { + if (all(grepl(pattern = "\\-1$", x = cell.names)) && strip.suffix) { cell.names <- as.vector(x = as.character(x = sapply( X = cell.names, FUN = ExtractField, @@ -934,7 +950,7 @@ Read10X <- function(data.dir = NULL, gene.column = 2, unique.features = TRUE) { data <- lapply( X = lvls, FUN = function(l) { - return(data[data_types == l, ]) + return(data[data_types == l, , drop = FALSE]) } ) names(x = data) <- lvls @@ -1112,24 +1128,23 @@ SampleUMI <- function( ) { data <- as(object = data, Class = "dgCMatrix") if (length(x = max.umi) == 1) { - return( - RunUMISampling( - data = data, - sample_val = max.umi, - upsample = upsample, - display_progress = verbose - ) + new_data <- RunUMISampling( + data = data, + sample_val = max.umi, + upsample = upsample, + display_progress = verbose ) } else if (length(x = max.umi) != ncol(x = data)) { stop("max.umi vector not equal to number of cells") + } else { + new_data <- RunUMISamplingPerCell( + data = data, + sample_val = max.umi, + upsample = upsample, + display_progress = verbose + ) } - new_data = RunUMISamplingPerCell( - data = data, - sample_val = max.umi, - upsample = upsample, - display_progress = verbose - ) - dimnames(new_data) <- dimnames(data) + dimnames(x = new_data) <- dimnames(x = data) return(new_data) } @@ -2188,6 +2203,21 @@ ScaleData.default <- function( ) } # Currently, RegressOutMatrix will do nothing if latent.data = NULL + notfound <- setdiff(x = vars.to.regress, y = colnames(x = latent.data)) + if (length(x = notfound) == length(x = vars.to.regress)) { + stop( + "None of the requested variables to regress are present in the object.", + call. = FALSE + ) + } else if (length(x = notfound) > 0) { + warning( + "Requested variables to regress not in object: ", + paste(notfound, collapse = ", "), + call. = FALSE, + immediate. = TRUE + ) + vars.to.regress <- colnames(x = latent.data) + } if (verbose) { message("Regressing out ", paste(vars.to.regress, collapse = ', ')) } @@ -2504,7 +2534,7 @@ ClassifyCells <- function(data, q) { message("No threshold found for ", colnames(x = data)[i], "...") } ) - if (is.character(x = model)) { + if (is.null(x = model)) { next } x <- seq.int( diff --git a/R/utilities.R b/R/utilities.R index 0e0320a8b..90b133ba2 100644 --- a/R/utilities.R +++ b/R/utilities.R @@ -1783,9 +1783,9 @@ PackageCheck <- function(..., error = TRUE) { ) if (error && any(!package.installed)) { stop( - "Cannot find ", - paste(pkgs[!package.installed], collapse = ', '), - "; please install" + "Cannot find the following packages: ", + paste(pkgs[!package.installed], collapse = ', '), + ".
Please install" ) } invisible(x = package.installed) diff --git a/R/visualization.R b/R/visualization.R index 4ea30f366..1e19939d9 100644 --- a/R/visualization.R +++ b/R/visualization.R @@ -526,8 +526,8 @@ RidgePlot <- function( #' @inheritParams RidgePlot #' @param pt.size Point size for geom_violin #' @param split.by A variable to split the violin plots by, -#' @param multi.group plot each group of the split violin plots by multiple or single violin shapes -#' see \code{\link{FetchData}} for more details +#' @param split.plot plot each group of the split violin plots by multiple or +#' single violin shapes. #' @param adjust Adjust parameter for geom_violin #' #' @return A \code{\link[patchwork]{patchwork}ed} ggplot object if @@ -557,12 +557,25 @@ VlnPlot <- function( log = FALSE, ncol = NULL, slot = 'data', - multi.group = FALSE, + split.plot = FALSE, combine = TRUE ) { + if ( + !is.null(x = split.by) & + getOption(x = 'Seurat.warn.vlnplot.split', default = TRUE) + ) { + message( + "The default behaviour of split.by has changed.\n", + "Separate violin plots are now plotted side-by-side.\n", + "To restore the old behaviour of a single split violin,\n", + "set split.plot = TRUE. + \nThis message will be shown once per session." + ) + options(Seurat.warn.vlnplot.split = FALSE) + } return(ExIPlot( object = object, - type = ifelse(test = multi.group, yes = 'multiViolin', no = 'violin'), + type = ifelse(test = split.plot, yes = 'splitViolin', no = 'violin'), features = features, idents = idents, ncol = ncol, @@ -844,7 +857,8 @@ DimPlot <- function( #' @param ncol Number of columns to combine multiple feature plots to, ignored if \code{split.by} is not \code{NULL} #' @param coord.fixed Plot cartesian coordinates with fixed aspect ratio #' @param by.col If splitting by a factor, plot the splits per column with the features as rows; ignored if \code{blend = TRUE} -#' @param sort.cell If \code{TRUE}, the positive cells will overlap the negative cells +#' @param sort.cell Redundant with \code{order}. This argument is being +#' deprecated. Please use \code{order} instead. #' @param combine Combine plots into a single \code{\link[patchwork]{patchwork}ed} #' ggplot object. If \code{FALSE}, return a list of ggplot objects #' @@ -897,9 +911,21 @@ FeaturePlot <- function( ncol = NULL, coord.fixed = FALSE, by.col = TRUE, - sort.cell = FALSE, + sort.cell = NULL, combine = TRUE ) { + # TODO: deprecate fully on 3.2.0 + if (!is.null(x = sort.cell)) { + warning( + "The sort.cell parameter is being deprecated. Please use the order ", + "parameter instead for equivalent functionality.", + call. = FALSE, + immediate. 
@@ -844,7 +857,8 @@ DimPlot <- function(
 #' @param ncol Number of columns to combine multiple feature plots to, ignored if \code{split.by} is not \code{NULL}
 #' @param coord.fixed Plot cartesian coordinates with fixed aspect ratio
 #' @param by.col If splitting by a factor, plot the splits per column with the features as rows; ignored if \code{blend = TRUE}
-#' @param sort.cell If \code{TRUE}, the positive cells will overlap the negative cells
+#' @param sort.cell Redundant with \code{order}. This argument is being
+#' deprecated. Please use \code{order} instead.
 #' @param combine Combine plots into a single \code{\link[patchwork]{patchwork}ed}
 #' ggplot object. If \code{FALSE}, return a list of ggplot objects
 #'
@@ -897,9 +911,21 @@ FeaturePlot <- function(
   ncol = NULL,
   coord.fixed = FALSE,
   by.col = TRUE,
-  sort.cell = FALSE,
+  sort.cell = NULL,
   combine = TRUE
 ) {
+  # TODO: deprecate fully on 3.2.0
+  if (!is.null(x = sort.cell)) {
+    warning(
+      "The sort.cell parameter is being deprecated. Please use the order ",
+      "parameter instead for equivalent functionality.",
+      call. = FALSE,
+      immediate. = TRUE
+    )
+    if (isTRUE(x = sort.cell)) {
+      order <- sort.cell
+    }
+  }
   # Set a theme to remove right-hand Y axis lines
   # Also sets right-hand Y axis text label formatting
   no.right <- theme(
@@ -1125,9 +1151,6 @@ FeaturePlot <- function(
     cols.use <- NULL
   }
   data.single <- data.plot[, c(dims, 'ident', feature, shape.by)]
-  if (sort.cell) {
-    data.single <- data.single[order(data.single[, feature]),]
-  }
   # Make the plot
   plot <- SingleDimPlot(
     data = data.single,
@@ -1327,8 +1350,8 @@ FeaturePlot <- function(
         theme(plot.title = element_text(hjust = 0.5))
       idx <- idx + 1
     }
-    ncol <- nsplits
-    nrow <- 1
+    ncol <- 1
+    nrow <- nsplits
   } else {
     nrow <- split.by %iff% length(x = levels(x = data$split))
   }
@@ -1336,7 +1359,8 @@ FeaturePlot <- function(
     what = rbind,
     args = split(x = 1:length(x = plots), f = ceiling(x = seq_along(along.with = 1:length(x = plots)) / length(x = features)))
   ))]
-  plots <- wrap_plots(plots, ncol = ncol, nrow = nrow)
+  # Set ncol to number of splits (nrow) and nrow to number of features (ncol)
+  plots <- wrap_plots(plots, ncol = nrow, nrow = ncol)
   if (!is.null(x = legend) && legend == 'none') {
     plots <- plots & NoLegend()
   }
@@ -1419,10 +1443,13 @@ CellScatter <- function(
 #' @param span Spline span in loess function call, if \code{NULL}, no spline added
 #' @param smooth Smooth the graph (similar to smoothScatter)
 #' @param slot Slot to pull data from, should be one of 'counts', 'data', or 'scale.data'
+#' @param combine Combine plots into a single \code{\link[patchwork]{patchwork}ed} ggplot object
 #'
 #' @return A ggplot object
 #'
 #' @importFrom ggplot2 geom_smooth aes_string
+#' @importFrom patchwork wrap_plots
+#'
 #' @export
 #'
 #' @aliases GenePlot
@@ -1441,28 +1468,51 @@ FeatureScatter <- function(
   shape.by = NULL,
   span = NULL,
   smooth = FALSE,
+  combine = TRUE,
   slot = 'data'
 ) {
   cells <- cells %||% colnames(x = object)
-  group.by <- group.by %||% Idents(object = object)[cells]
-  if (length(x = group.by) == 1) {
-    group.by <- object[[]][, group.by]
+  object[['ident']] <- Idents(object = object)
+  group.by <- group.by %||% 'ident'
+  data <- FetchData(
+    object = object,
+    vars = c(feature1, feature2, group.by),
+    cells = cells,
+    slot = slot
+  )
+  if (isFALSE(x = feature1 %in% colnames(x = data))) {
+    stop("Feature 1 (", feature1, ") not found.", call. = FALSE)
   }
-  plot <- SingleCorPlot(
-    data = FetchData(
-      object = object,
-      vars = c(feature1, feature2),
-      cells = cells,
-      slot = slot
-    ),
-    col.by = group.by,
-    cols = cols,
-    pt.size = pt.size,
-    smooth = smooth,
-    legend.title = 'Identity',
-    span = span
+  if (isFALSE(x = feature2 %in% colnames(x = data))) {
+    stop("Feature 2 (", feature2, ") not found.", call. = FALSE)
+  }
+  data <- as.data.frame(x = data)
+  for (group in group.by) {
+    if (!is.factor(x = data[, group])) {
+      data[, group] <- factor(x = data[, group])
+    }
+  }
+  plots <- lapply(
+    X = group.by,
+    FUN = function(x) {
+      SingleCorPlot(
+        data = data[,c(feature1, feature2)],
+        col.by = data[, x],
+        cols = cols,
+        pt.size = pt.size,
+        smooth = smooth,
+        legend.title = 'Identity',
+        span = span
+      )
+    }
  )
-  return(plot)
+  if (isTRUE(x = length(x = plots) == 1)) {
+    return(plots[[1]])
+  }
+  if (isTRUE(x = combine)) {
+    plots <- wrap_plots(plots, ncol = length(x = group.by))
+  }
+  return(plots)
 }
 
 #' View variable features
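The reworked FeatureScatter above now fetches all requested variables in one FetchData call and accepts multiple grouping variables; a sketch with the bundled pbmc_small object (grouping and feature names are illustrative):

    library(Seurat)
    # Colour points by the active identity (the internal 'ident' column)
    FeatureScatter(pbmc_small, feature1 = "PC_1", feature2 = "PC_2")
    # One panel per grouping variable, combined with patchwork::wrap_plots()
    FeatureScatter(
      pbmc_small,
      feature1 = "PC_1", feature2 = "PC_2",
      group.by = c("ident", "groups")
    )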
@@ -1860,6 +1910,7 @@ BarcodeInflectionsPlot <- function(object)
 #' @param group.by Factor to group the cells by
 #' @param split.by Factor to split the groups by (replicates the functionality of the old SplitDotPlotGG);
 #' see \code{\link{FetchData}} for more details
+#' @param scale Determine whether the data is scaled (TRUE by default)
 #' @param scale.by Scale the size of the points by 'size' or by 'radius'
 #' @param scale.min Set lower limit for scaling, use NA for default
 #' @param scale.max Set upper limit for scaling, use NA for default
@@ -1892,6 +1943,7 @@ DotPlot <- function(
   dot.scale = 6,
   group.by = NULL,
   split.by = NULL,
+  scale = TRUE,
   scale.by = 'radius',
   scale.min = NA,
   scale.max = NA
@@ -1955,15 +2007,26 @@ DotPlot <- function(
   if (!is.null(x = id.levels)) {
     data.plot$id <- factor(x = data.plot$id, levels = id.levels)
   }
-  avg.exp.scaled <- sapply(
-    X = unique(x = data.plot$features.plot),
-    FUN = function(x) {
-      data.use <- data.plot[data.plot$features.plot == x, 'avg.exp']
-      data.use <- scale(x = data.use)
-      data.use <- MinMax(data = data.use, min = col.min, max = col.max)
-      return(data.use)
-    }
-  )
+  if (length(x = levels(x = data.plot$id)) == 1) {
+    scale <- FALSE
+    warning("Only one identity present, the expression values will not be scaled.")
+  }
+  avg.exp.scaled <- sapply(
+    X = unique(x = data.plot$features.plot),
+    FUN = function(x) {
+      data.use <- data.plot[data.plot$features.plot == x, 'avg.exp']
+      if (scale) {
+        data.use <- scale(x = data.use)
+        data.use <- MinMax(data = data.use, min = col.min, max = col.max)
+      } else {
+        data.use <- log(x = data.use)
+      }
+      return(data.use)
+    }
+  )
+
+
   avg.exp.scaled <- as.vector(x = t(x = avg.exp.scaled))
   if (!is.null(x = split.by)) {
     avg.exp.scaled <- as.numeric(x = cut(x = avg.exp.scaled, breaks = 20))
@@ -1977,10 +2040,16 @@ DotPlot <- function(
   data.plot$pct.exp <- data.plot$pct.exp * 100
   if (!is.null(x = split.by)) {
     splits.use <- vapply(
-      X = strsplit(x = as.character(x = data.plot$id), split = '_'),
-      FUN = '[[',
+      X = as.character(x = data.plot$id),
+      FUN = gsub,
       FUN.VALUE = character(length = 1L),
-      2
+      pattern = paste0(
+        '^((',
+        paste(sort(x = levels(x = object), decreasing = TRUE), collapse = '|'),
+        ')_)'
+      ),
+      replacement = '',
+      USE.NAMES = FALSE
     )
     data.plot$colors <- mapply(
      FUN = function(color, value) {
@@ -3448,7 +3517,7 @@ BlendMap <- function(color.matrix) {
 #
 # @return An n x n matrix of blended colors
 #
-#' @importFrom grDevices rgb colorRamp
+#' @importFrom grDevices col2rgb
 #
 BlendMatrix <- function(
   n = 10,
@@ -3459,10 +3528,13 @@ BlendMatrix <- function(
   if (0 > col.threshold || col.threshold > 1) {
     stop("col.threshold must be between 0 and 1")
   }
-  C0 <- colorRamp(colors = negative.color)(1)
-  ramp <- colorRamp(colors = two.colors)
-  C1 <- ramp(x = 0)
-  C2 <- ramp(x = 1)
+  C0 <- as.vector(col2rgb(negative.color, alpha = TRUE))
+  C1 <- as.vector(col2rgb(two.colors[1], alpha =
TRUE)) + C2 <- as.vector(col2rgb(two.colors[2], alpha = TRUE)) + blend_alpha <- (C1[4] + C2[4])/2 + C0 <- C0[-4] + C1 <- C1[-4] + C2 <- C2[-4] merge.weight <- min(255 / (C1 + C2 + C0 + 0.01)) sigmoid <- function(x) { return(1 / (1 + exp(-x))) @@ -3475,6 +3547,7 @@ BlendMatrix <- function( C0, C1, C2, + alpha, merge.weight ) { c.min <- sigmoid(5 * (1 / n - col.threshold)) @@ -3495,10 +3568,10 @@ BlendMatrix <- function( C_blend[C_blend > 255] <- 255 C_blend[C_blend < 0] <- 0 return(rgb( - red = C_blend[, 1], - green = C_blend[, 2], - blue = C_blend[, 3], - alpha = 255, + red = C_blend[1], + green = C_blend[2], + blue = C_blend[3], + alpha = alpha, maxColorValue = 255 )) } @@ -3513,6 +3586,7 @@ BlendMatrix <- function( C0 = C0, C1 = C1, C2 = C2, + alpha = blend_alpha, merge.weight = merge.weight ) } @@ -3610,7 +3684,7 @@ DefaultDimReduc <- function(object, assay = NULL) { # Basically combines the codebase for VlnPlot and RidgePlot # # @param object Seurat object -# @param type Plot type, choose from 'ridge', 'violin', or 'multiViolin' +# @param type Plot type, choose from 'ridge', 'violin', or 'splitViolin' # @param features Features to plot (gene expression, metrics, PC scores, # anything that can be retreived by FetchData) # @param idents Which classes to include in the plot (default is all) @@ -3698,6 +3772,10 @@ ExIPlot <- function( } cols <- rep_len(x = cols, length.out = length(x = levels(x = split))) names(x = cols) <- sort(x = levels(x = split)) + if ((length(x = cols) > 2) & (type == "splitViolin")) { + warning("Split violin is only supported for <3 groups, using multi-violin.") + type <- "violin" + } } if (same.y.lims && is.null(x = y.max)) { y.max <- max(data) @@ -3722,7 +3800,7 @@ ExIPlot <- function( label.fxn <- switch( EXPR = type, 'violin' = ylab, - "multiViolin" = ylab, + "splitViolin" = ylab, 'ridge' = xlab, stop("Unknown ExIPlot type ", type, call. 
= FALSE) ) @@ -4562,7 +4640,7 @@ SingleDimPlot <- function( #' @importFrom stats rnorm #' @importFrom utils globalVariables #' @importFrom ggridges geom_density_ridges theme_ridges -#' @importFrom ggplot2 ggplot aes_string theme labs geom_violin geom_jitter ylim +#' @importFrom ggplot2 ggplot aes_string theme labs geom_violin geom_jitter ylim position_jitterdodge #' scale_fill_manual scale_y_log10 scale_x_log10 scale_y_discrete scale_x_continuous waiver #' @importFrom cowplot theme_cowplot #' @@ -4612,14 +4690,14 @@ SingleExIPlot <- function( data[, feature] <- data[, feature] + noise } axis.label <- 'Expression Level' - y.max <- y.max %||% max(data[, feature]) + y.max <- y.max %||% max(data[, feature][is.finite(x = data[, feature])]) if (type == 'violin' && !is.null(x = split)) { data$split <- split - vln.geom <- geom_split_violin + vln.geom <- geom_violin fill <- 'split' - } else if (type == 'multiViolin' && !is.null(x = split )) { + } else if (type == 'splitViolin' && !is.null(x = split )) { data$split <- split - vln.geom <- geom_violin + vln.geom <- geom_split_violin fill <- 'split' type <- 'violin' } else { @@ -4637,7 +4715,14 @@ SingleExIPlot <- function( vln.geom(scale = 'width', adjust = adjust, trim = TRUE), theme(axis.text.x = element_text(angle = 45, hjust = 1)) ) - jitter <- geom_jitter(height = 0, size = pt.size) + if (is.null(x = split)) { + jitter <- geom_jitter(height = 0, size = pt.size) + } else { + jitter <- geom_jitter( + position = position_jitterdodge(jitter.width = 0.4, dodge.width = 0.9), + size = pt.size + ) + } log.scale <- scale_y_log10() axis.scale <- ylim }, diff --git a/R/zzz.R b/R/zzz.R index d89d7c2a6..5b2de9a8a 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -1,3 +1,7 @@ +#' Seurat package +#' +#' Tools for single-cell genomics +#' #' Tools for single-cell genomics #' #' @section Package options: @@ -15,6 +19,10 @@ #' \item{\code{Seurat.checkdots}}{For functions that have ... as a parameter, #' this controls the behavior when an item isn't used. 
Can be one of warn,
 #' stop, or silent.}
+#' \item{\code{Seurat.limma.wilcox.msg}}{Show message about more efficient
+#' Wilcoxon Rank Sum test available via the limma package}
+#' \item{\code{Seurat.warn.vlnplot.split}}{Show message about changes to
+#' default behavior of split/multi violin plots}
 #' }
 #'
 #' @docType package
@@ -26,7 +34,9 @@ NULL
 seurat_default_options <- list(
   Seurat.memsafe = FALSE,
   Seurat.warn.umap.uwot = TRUE,
-  Seurat.checkdots = "warn"
+  Seurat.checkdots = "warn",
+  Seurat.limma.wilcox.msg = TRUE,
+  Seurat.warn.vlnplot.split = TRUE
 )
 
 .onLoad <- function(libname, pkgname) {
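For users who want to opt out of the two new messages up front, the corresponding session options can be set directly (a sketch; option names as registered in seurat_default_options above):

    # Silence the limma suggestion from FindMarkers and the one-time
    # VlnPlot split.by behaviour message for the current session
    options(
      Seurat.limma.wilcox.msg = FALSE,
      Seurat.warn.vlnplot.split = FALSE
    )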
diff --git a/README.md b/README.md
index a1f9c8a25..3be4fd4de 100644
--- a/README.md
+++ b/README.md
@@ -1,17 +1,19 @@
-[![Build Status](https://travis-ci.com/satijalab/seurat.svg)](https://travis-ci.com/satijalab/seurat)
-[![AppVeyor build status](https://ci.appveyor.com/api/projects/status/github/satijalab/seurat?svg=true)](https://ci.appveyor.com/project/satijalab/seurat)
+[![Build Status](https://travis-ci.com/satijalab/seurat.svg?branch=master)](https://travis-ci.com/satijalab/seurat)
+[![AppVeyor build status](https://ci.appveyor.com/api/projects/status/github/satijalab/seurat?branch=master&svg=true)](https://ci.appveyor.com/project/satijalab/seurat)
 [![CRAN Version](https://www.r-pkg.org/badges/version/Seurat)](https://cran.r-project.org/package=Seurat)
 [![CRAN Downloads](https://cranlogs.r-pkg.org/badges/Seurat)](https://cran.r-project.org/package=Seurat)
 
-# Seurat v3.1.4
+# Seurat v3.1.5
 
 Seurat is an R toolkit for single cell genomics, developed and maintained by the Satija Lab at NYGC.
 
 Instructions, documentation, and tutorials can be found at:
+
 * https://satijalab.org/seurat
 
 Seurat is also hosted on GitHub, you can view and clone the repository at
+
 * https://github.com/satijalab/seurat
 
 Seurat has been successfully installed on Mac OS X, Linux, and Windows, using the devtools package to install directly from GitHub
@@ -21,12 +23,14 @@ Improvements and new features will be added on a regular basis, please contact s
 Version History
 
 August 20, 2019
+
 * Version 3.1
 * Changes:
 * Support for SCTransform integration workflows
 * Integration speed ups: reference-based integration + reciprocal PCA
 
 April 12, 2019
+
 * Version 3.0
 * Changes:
 * Preprint published describing new methods for identifying anchors across single-cell datasets
@@ -34,29 +38,34 @@ April 12, 2019
 * Parallelization support via future
 
 July 20, 2018
+
 * Version 2.4
 * Changes:
 * Java dependency removed and functionality rewritten in Rcpp
 
 March 22, 2018
+
 * Version 2.3
 * Changes:
 * New utility functions
 * Speed and efficiency improvements
 
 January 10, 2018
+
 * Version 2.2
 * Changes:
 * Support for multiple-dataset alignment with RunMultiCCA and AlignSubspace
 * New methods for evaluating alignment performance
 
 October 12, 2017
+
 * Version 2.1
 * Changes:
 * Support for using MAST and DESeq2 packages for differential expression testing in FindMarkers
 * Support for multi-modal single-cell data via \@assay slot
 
 July 26, 2017
+
 * Version 2.0
 * Changes:
 * Preprint released for integrated analysis of scRNA-seq across conditions, technologies and species
@@ -64,12 +73,14 @@ July 26, 2017
 * Methods for scoring gene expression and cell-cycle phase
 
 October 4, 2016
+
 * Version 1.4 released
 * Changes:
 * Improved tools for cluster evaluation/visualizations
 * Methods for combining and adding to datasets
 
 August 22, 2016:
+
 * Version 1.3 released
 * Changes :
 * Improved clustering approach - see FAQ for details
@@ -79,6 +90,7 @@ August 22, 2016:
 * Updated visualizations
 
 May 21, 2015:
+
 * Drop-Seq manuscript published. Version 1.2 released
 * Changes :
 * Added support for spectral t-SNE and density clustering
@@ -88,4 +100,5 @@ May 21, 2015:
 * Small bug fixes
 
 April 13, 2015:
+
 * Spatial mapping manuscript published. Version 1.1 released (initial release)
diff --git a/cran-comments.md b/cran-comments.md
index 81a9a0ba4..1bbd87b9c 100644
--- a/cran-comments.md
+++ b/cran-comments.md
@@ -1,8 +1,8 @@
-# Seurat v3.1.4
+# Seurat v3.1.5
 
 ## Test environments
 * local Ubuntu 16.04.6 and 18.04.2 installs, R 3.6.1
-* local Windows 10 install, R 3.5.3
+* local Windows 10 install, R 3.5.3, R-devel (4.1.0)
 * Ubuntu 16.04.6 (on travis-ci), R 3.6.1
 * macOS 10.13.3 (on travis-ci), R 3.6.1
 * Windows Server 2012 R2 (on AppVeyor), R 3.6.1 Patched
@@ -40,6 +40,6 @@
 
 ## Downstream dependencies
 
-There is one pacakge that imports Seurat: multicross; this update does not impact its functionality
+There are three packages that import Seurat: multicross, scMappR, and Signac; this update does not impact their functionality
 
-There are three packages that suggest Seurat: BisqueRNA, clustree, and Rmagic; this update does not impact their functionality.
+There are four packages that suggest Seurat: BisqueRNA, clustree, Rmagic, and treefit; this update does not impact their functionality.
diff --git a/inst/CITATION b/inst/CITATION
index 0f8c616f4..1a56568f9 100644
--- a/inst/CITATION
+++ b/inst/CITATION
@@ -7,13 +7,16 @@ citEntry(entry = "article",
                     as.person("Christoph Hafemeister"),
                     as.person("Efthymia Papalexi"),
                     as.person("William M Mauck III"),
+                    as.person("Yuhan Hao"),
                     as.person("Marlon Stoeckius"),
                     as.person("Peter Smibert"),
                     as.person("Rahul Satija")),
-         title = "Comprehensive integration of single cell data",
-         journal = "bioRxiv",
-         year = "2018",
-         doi = "10.1101/460147",
-         url = "https://www.biorxiv.org/content/10.1101/460147v1",
-         textVersion = "Stuart and Butler et al. Comprehensive integration of single cell data. bioRxiv (2018)."
+         title = "Comprehensive Integration of Single-Cell Data",
+         journal = "Cell",
+         year = "2019",
+         volume = "177",
+         pages = "1888-1902",
+         doi = "10.1016/j.cell.2019.05.031",
+         url = "https://doi.org/10.1016/j.cell.2019.05.031",
+         textVersion = "Stuart and Butler et al. Comprehensive Integration of Single-Cell Data. Cell (2019)."
)
diff --git a/man/CreateGeneActivityMatrix.Rd b/man/CreateGeneActivityMatrix.Rd
index d69bc808f..825452336 100644
--- a/man/CreateGeneActivityMatrix.Rd
+++ b/man/CreateGeneActivityMatrix.Rd
@@ -11,6 +11,7 @@ CreateGeneActivityMatrix(
   include.body = TRUE,
   upstream = 2000,
   downstream = 0,
+  keep.sparse = FALSE,
   verbose = TRUE
 )
 }
@@ -27,6 +28,11 @@ CreateGeneActivityMatrix(
 
 \item{downstream}{Number of bases downstream to consider}
 
+\item{keep.sparse}{Leave the matrix as a sparse matrix. Setting this option to
+TRUE will take much longer but will use less memory. This can be useful if
+you have a very large matrix that cannot fit into memory when converted to
+a dense form.}
+
 \item{verbose}{Print progress/messages}
 }
 \description{
diff --git a/man/DotPlot.Rd b/man/DotPlot.Rd
index eec9138ad..a7cd18516 100644
--- a/man/DotPlot.Rd
+++ b/man/DotPlot.Rd
@@ -16,6 +16,7 @@ DotPlot(
   dot.scale = 6,
   group.by = NULL,
   split.by = NULL,
+  scale = TRUE,
   scale.by = "radius",
   scale.min = NA,
   scale.max = NA
@@ -48,6 +49,8 @@ gene will have no dot drawn.}
 \item{split.by}{Factor to split the groups by (replicates the functionality of the old SplitDotPlotGG);
 see \code{\link{FetchData}} for more details}
 
+\item{scale}{Determine whether the data is scaled (TRUE by default)}
+
 \item{scale.by}{Scale the size of the points by 'size' or by 'radius'}
 
 \item{scale.min}{Set lower limit for scaling, use NA for default}
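A sketch of the new DotPlot argument documented above (the object and feature names are illustrative, not from this changeset):

    library(Seurat)
    # Default: per-feature average expression is scaled across identities
    DotPlot(pbmc_small, features = c("MS4A1", "CD3E"))
    # With scale = FALSE, log average expression is plotted instead
    DotPlot(pbmc_small, features = c("MS4A1", "CD3E"), scale = FALSE)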
diff --git a/man/FeaturePlot.Rd b/man/FeaturePlot.Rd
index c67e85e8b..b81f43986 100644
--- a/man/FeaturePlot.Rd
+++ b/man/FeaturePlot.Rd
@@ -28,7 +28,7 @@ FeaturePlot(
   ncol = NULL,
   coord.fixed = FALSE,
   by.col = TRUE,
-  sort.cell = FALSE,
+  sort.cell = NULL,
   combine = TRUE
 )
 }
@@ -92,7 +92,8 @@ different colors and different shapes on cells}
 \item{by.col}{If splitting by a factor, plot the splits per column with the
 features as rows; ignored if \code{blend = TRUE}}
 
-\item{sort.cell}{If \code{TRUE}, the positive cells will overlap the negative cells}
+\item{sort.cell}{Redundant with \code{order}. This argument is being
+deprecated. Please use \code{order} instead.}
 
 \item{combine}{Combine plots into a single \code{\link[patchwork]{patchwork}ed}
 ggplot object. If \code{FALSE}, return a list of ggplot objects}
diff --git a/man/FeatureScatter.Rd b/man/FeatureScatter.Rd
index 599fcaffb..cbba72eed 100644
--- a/man/FeatureScatter.Rd
+++ b/man/FeatureScatter.Rd
@@ -16,6 +16,7 @@ FeatureScatter(
   shape.by = NULL,
   span = NULL,
   smooth = FALSE,
+  combine = TRUE,
   slot = "data"
 )
 }
@@ -42,6 +43,8 @@ be metrics, PC scores, etc. - anything that can be retreived with FetchData}
 
 \item{smooth}{Smooth the graph (similar to smoothScatter)}
 
+\item{combine}{Combine plots into a single \code{\link[patchwork]{patchwork}ed} ggplot object; if FALSE, return a list of ggplot objects}
+
 \item{slot}{Slot to pull data from, should be one of 'counts', 'data', or 'scale.data'}
 }
 \value{
diff --git a/man/FindConservedMarkers.Rd b/man/FindConservedMarkers.Rd
index 2500454e6..0e29098c5 100644
--- a/man/FindConservedMarkers.Rd
+++ b/man/FindConservedMarkers.Rd
@@ -11,7 +11,7 @@ FindConservedMarkers(
   grouping.var,
   assay = "RNA",
   slot = "data",
-  meta.method = minimump,
+  meta.method = metap::minimump,
   verbose = TRUE,
   ...
 )
diff --git a/man/FindIntegrationAnchors.Rd b/man/FindIntegrationAnchors.Rd
index 1d3187985..c2d0e3c94 100644
--- a/man/FindIntegrationAnchors.Rd
+++ b/man/FindIntegrationAnchors.Rd
@@ -25,8 +25,8 @@ FindIntegrationAnchors(
 )
 }
 \arguments{
-\item{object.list}{A list of objects between which to find anchors for
-downstream integration.}
+\item{object.list}{A list of \code{\link{Seurat}} objects between which to
+find anchors for downstream integration.}
 
 \item{assay}{A vector of assay names specifying which assay to use when
 constructing anchors. If NULL, the current default assay for each object is
@@ -87,8 +87,74 @@ annoy}
 
 \item{verbose}{Print progress bars and output}
 }
 \value{
-Returns an AnchorSet object
+Returns an \code{\link{AnchorSet}} object that can be used as input to
+\code{\link{IntegrateData}}.
 }
 \description{
-Finds the integration anchors
+Find a set of anchors between a list of \code{\link{Seurat}} objects.
+These anchors can later be used to integrate the objects using the +\code{\link{IntegrateData}} function. +} +\details{ +The main steps of this procedure are outlined below. For a more detailed +description of the methodology, please see Stuart, Butler, et al Cell 2019: +\url{https://doi.org/10.1016/j.cell.2019.05.031}; +\url{https://doi.org/10.1101/460147} + +First, determine anchor.features if not explicitly specified using +\code{\link{SelectIntegrationFeatures}}. Then for all pairwise combinations +of reference and query datasets: + +\itemize{ + \item{Perform dimensional reduction on the dataset pair as specified via + the \code{reduction} parameter. If \code{l2.norm} is set to \code{TRUE}, + perform L2 normalization of the embedding vectors.} + \item{Identify anchors - pairs of cells from each dataset + that are contained within each other's neighborhoods (also known as mutual + nearest neighbors).} + \item{Filter low confidence anchors to ensure anchors in the low dimension + space are in broad agreement with the high dimensional measurements. This + is done by looking at the neighbors of each query cell in the reference + dataset using \code{max.features} to define this space. If the reference + cell isn't found within the first \code{k.filter} neighbors, remove the + anchor.} + \item{Assign each remaining anchor a score. For each anchor cell, determine + the nearest \code{k.score} anchors within its own dataset and within its + pair's dataset. Based on these neighborhoods, construct an overall neighbor + graph and then compute the shared neighbor overlap between anchor and query + cells (analogous to an SNN graph). We use the 0.01 and 0.90 quantiles on + these scores to dampen outlier effects and rescale to range between 0-1.} +} +} +\examples{ +\dontrun{ +# to install the SeuratData package see https://github.com/satijalab/seurat-data +library(SeuratData) +data("panc8") + +# panc8 is a merged Seurat object containing 8 separate pancreas datasets +# split the object by dataset +pancreas.list <- SplitObject(panc8, split.by = "tech") + +# perform standard preprocessing on each object +for (i in 1:length(pancreas.list)) { + pancreas.list[[i]] <- NormalizeData(pancreas.list[[i]], verbose = FALSE) + pancreas.list[[i]] <- FindVariableFeatures( + pancreas.list[[i]], selection.method = "vst", + nfeatures = 2000, verbose = FALSE + ) +} + +# find anchors +anchors <- FindIntegrationAnchors(object.list = pancreas.list) + +# integrate data +integrated <- IntegrateData(anchorset = anchors) +} + +} +\references{ +Stuart T, Butler A, et al. Comprehensive Integration of +Single-Cell Data. Cell. 
2019;177:1888-1902 \url{https://doi.org/10.1016/ +j.cell.2019.05.031} } diff --git a/man/FindTransferAnchors.Rd b/man/FindTransferAnchors.Rd index 7fd5c04af..940143b81 100644 --- a/man/FindTransferAnchors.Rd +++ b/man/FindTransferAnchors.Rd @@ -27,57 +27,138 @@ FindTransferAnchors( ) } \arguments{ -\item{reference}{Seurat object to use as the reference} +\item{reference}{\code{\link{Seurat}} object to use as the reference} -\item{query}{Seurat object to use as the query} +\item{query}{\code{\link{Seurat}} object to use as the query} -\item{normalization.method}{Name of normalization method used: LogNormalize or SCT} +\item{normalization.method}{Name of normalization method used: LogNormalize +or SCT} -\item{reference.assay}{Assay to use from reference} +\item{reference.assay}{Name of the Assay to use from reference} -\item{query.assay}{Assay to use from query} +\item{query.assay}{Name of the Assay to use from query} -\item{reduction}{Dimensional reduction to perform when finding anchors. Options are: +\item{reduction}{Dimensional reduction to perform when finding anchors. +Options are: \itemize{ - \item{pcaproject: Project the PCA from the reference onto the query. We recommend using PCA - when reference and query datasets are from scRNA-seq} + \item{pcaproject: Project the PCA from the reference onto the query. We + recommend using PCA when reference and query datasets are from scRNA-seq} \item{cca: Run a CCA on the reference and query } }} -\item{project.query}{Project the PCA from the query dataset onto the reference. Use only in rare -cases where the query dataset has a much larger cell number, but the reference dataset has a -unique assay for transfer.} +\item{project.query}{Project the PCA from the query dataset onto the +reference. Use only in rare cases where the query dataset has a much larger +cell number, but the reference dataset has a unique assay for transfer.} -\item{features}{Features to use for dimensional reduction} +\item{features}{Features to use for dimensional reduction. If not specified, +set as variable features of the reference object which are also present in +the query.} -\item{npcs}{Number of PCs to compute on reference. If null, then use an existing PCA structure in -the reference object} +\item{npcs}{Number of PCs to compute on reference. If null, then use an +existing PCA structure in the reference object} -\item{l2.norm}{Perform L2 normalization on the cell embeddings after dimensional reduction} +\item{l2.norm}{Perform L2 normalization on the cell embeddings after +dimensional reduction} -\item{dims}{Which dimensions to use from the reduction to specify the neighbor search space} +\item{dims}{Which dimensions to use from the reduction to specify the +neighbor search space} -\item{k.anchor}{How many neighbors (k) to use when picking anchors} +\item{k.anchor}{How many neighbors (k) to use when finding anchors} \item{k.filter}{How many neighbors (k) to use when filtering anchors} \item{k.score}{How many neighbors (k) to use when scoring anchors} -\item{max.features}{The maximum number of features to use when specifying the neighborhood search -space in the anchor filtering} +\item{max.features}{The maximum number of features to use when specifying the +neighborhood search space in the anchor filtering} \item{nn.method}{Method for nearest neighbor finding. 
Options include: rann, annoy} -\item{eps}{Error bound on the neighbor finding algorithm (from RANN)} +\item{eps}{Error bound on the neighbor finding algorithm (from +\code{\link{RANN}})} -\item{approx.pca}{Use truncated singular value decomposition to approximate PCA} +\item{approx.pca}{Use truncated singular value decomposition to approximate +PCA} \item{verbose}{Print progress bars and output} } \value{ -Returns an AnchorSet object +Returns an \code{AnchorSet} object that can be used as input to +\code{\link{TransferData}} } \description{ -Finds the transfer anchors +Find a set of anchors between a reference and query object. These +anchors can later be used to transfer data from the reference to +query object using the \code{\link{TransferData}} object. +} +\details{ +The main steps of this procedure are outlined below. For a more detailed +description of the methodology, please see Stuart, Butler, et al Cell 2019. +\url{https://doi.org/10.1016/j.cell.2019.05.031}; +\url{https://doi.org/10.1101/460147} + +\itemize{ + + \item{Perform dimensional reduction. Exactly what is done here depends on + the values set for the \code{reduction} and \code{project.query} + parameters. If \code{reduction = "pcaproject"}, a PCA is performed on + either the reference (if \code{project.query = FALSE}) or the query (if + \code{project.query = TRUE}), using the \code{features} specified. The data + from the other dataset is then projected onto this learned PCA structure. + If \code{reduction = "cca"}, then CCA is performed on the reference and + query for this dimensional reduction step. If \code{l2.norm} is set to + \code{TRUE}, perform L2 normalization of the embedding vectors.} + \item{Identify anchors between the reference and query - pairs of cells + from each dataset that are contained within each other's neighborhoods + (also known as mutual nearest neighbors).} + \item{Filter low confidence anchors to ensure anchors in the low dimension + space are in broad agreement with the high dimensional measurements. This + is done by looking at the neighbors of each query cell in the reference + dataset using \code{max.features} to define this space. If the reference + cell isn't found within the first \code{k.filter} neighbors, remove the + anchor.} + \item{Assign each remaining anchor a score. For each anchor cell, determine + the nearest \code{k.score} anchors within its own dataset and within its + pair's dataset. Based on these neighborhoods, construct an overall neighbor + graph and then compute the shared neighbor overlap between anchor and query + cells (analogous to an SNN graph). 
We use the 0.01 and 0.90 quantiles on + these scores to dampen outlier effects and rescale to range between 0-1.} +} +} +\examples{ +\dontrun{ +# to install the SeuratData package see https://github.com/satijalab/seurat-data +library(SeuratData) +data("pbmc3k") + +# for demonstration, split the object into reference and query +pbmc.reference <- pbmc3k[, 1:1350] +pbmc.query <- pbmc3k[, 1351:2700] + +# perform standard preprocessing on each object +pbmc.reference <- NormalizeData(pbmc.reference) +pbmc.reference <- FindVariableFeatures(pbmc.reference) +pbmc.reference <- ScaleData(pbmc.reference) + +pbmc.query <- NormalizeData(pbmc.query) +pbmc.query <- FindVariableFeatures(pbmc.query) +pbmc.query <- ScaleData(pbmc.query) + +# find anchors +anchors <- FindTransferAnchors(reference = pbmc.reference, query = pbmc.query) + +# transfer labels +predictions <- TransferData( + anchorset = anchors, + refdata = pbmc.reference$seurat_annotations +) +pbmc.query <- AddMetaData(object = pbmc.query, metadata = predictions) +} + +} +\references{ +Stuart T, Butler A, et al. Comprehensive Integration of +Single-Cell Data. Cell. 2019;177:1888-1902 \url{https://doi.org/10.1016/ +j.cell.2019.05.031}; } diff --git a/man/IntegrateData.Rd b/man/IntegrateData.Rd index f05fd4885..d01b8a1ff 100644 --- a/man/IntegrateData.Rd +++ b/man/IntegrateData.Rd @@ -22,50 +22,132 @@ IntegrateData( ) } \arguments{ -\item{anchorset}{Results from FindIntegrationAnchors} +\item{anchorset}{An \code{\link{AnchorSet}} object generated by +\code{\link{FindIntegrationAnchors}}} \item{new.assay.name}{Name for the new assay containing the integrated data} \item{normalization.method}{Name of normalization method used: LogNormalize or SCT} -\item{features}{Vector of features to use when computing the PCA to determine the weights. Only set -if you want a different set from those used in the anchor finding process} +\item{features}{Vector of features to use when computing the PCA to determine +the weights. Only set if you want a different set from those used in the +anchor finding process} -\item{features.to.integrate}{Vector of features to integrate. By default, will use the features -used in anchor finding.} +\item{features.to.integrate}{Vector of features to integrate. By default, +will use the features used in anchor finding.} -\item{dims}{Number of PCs to use in the weighting procedure} +\item{dims}{Number of dimensions to use in the anchor weighting procedure} -\item{k.weight}{Number of neighbors to consider when weighting} +\item{k.weight}{Number of neighbors to consider when weighting anchors} -\item{weight.reduction}{Dimension reduction to use when calculating anchor weights. -This can be either: +\item{weight.reduction}{Dimension reduction to use when calculating anchor +weights. 
This can be one of: \itemize{ - \item{A string, specifying the name of a dimension reduction present in all objects to be integrated} - \item{A vector of strings, specifying the name of a dimension reduction to use for each object to be integrated} - \item{A vector of Dimreduc objects, specifying the object to use for each object in the integration} - \item{NULL, in which case a new PCA will be calculated and used to calculate anchor weights} + \item{A string, specifying the name of a dimension reduction present in + all objects to be integrated} + \item{A vector of strings, specifying the name of a dimension reduction to + use for each object to be integrated} + \item{A vector of \code{\link{DimReduc}} objects, specifying the object to + use for each object in the integration} + \item{NULL, in which case a new PCA will be calculated and used to + calculate anchor weights} } -Note that, if specified, the requested dimension reduction will only be used for calculating anchor weights in the -first merge between reference and query, as the merged object will subsequently contain more cells than was in +Note that, if specified, the requested dimension reduction will only be used +for calculating anchor weights in the first merge between reference and +query, as the merged object will subsequently contain more cells than was in query, and weights will need to be calculated for all cells in the object.} \item{sd.weight}{Controls the bandwidth of the Gaussian kernel for weighting} -\item{sample.tree}{Specify the order of integration. If NULL, will compute automatically.} +\item{sample.tree}{Specify the order of integration. If NULL, will compute +automatically.} -\item{preserve.order}{Do not reorder objects based on size for each pairwise integration.} +\item{preserve.order}{Do not reorder objects based on size for each pairwise +integration.} \item{do.cpp}{Run cpp code where applicable} -\item{eps}{Error bound on the neighbor finding algorithm (from \code{\link{RANN}})} +\item{eps}{Error bound on the neighbor finding algorithm (from +\code{\link{RANN}})} \item{verbose}{Print progress bars and output} } \value{ -Returns a Seurat object with a new integrated Assay +Returns a \code{\link{Seurat}} object with a new integrated +\code{\link{Assay}}. If \code{normalization.method = "LogNormalize"}, the +integrated data is returned to the \code{data} slot and can be treated as +log-normalized, corrected data. If \code{normalization.method = "SCT"}, the +integrated data is returned to the \code{scale.data} slot and can be treated +as centered, corrected Pearson residuals. } \description{ -Perform dataset integration using a pre-computed anchorset +Perform dataset integration using a pre-computed \code{\link{AnchorSet}}. +} +\details{ +The main steps of this procedure are outlined below. For a more detailed +description of the methodology, please see Stuart, Butler, et al Cell 2019. +\url{https://doi.org/10.1016/j.cell.2019.05.031}; +\url{https://doi.org/10.1101/460147} + +For pairwise integration: + +\itemize{ + \item{Construct a weights matrix that defines the association between each + query cell and each anchor. These weights are computed as 1 - the distance + between the query cell and the anchor divided by the distance of the query + cell to the \code{k.weight}th anchor multiplied by the anchor score + computed in \code{\link{FindIntegrationAnchors}}. 
We then apply a Gaussian
+  kernel with a bandwidth defined by \code{sd.weight} and normalize across
+  all \code{k.weight} anchors.}
+  \item{Compute the anchor integration matrix as the difference between the
+  two expression matrices for every pair of anchor cells}
+  \item{Compute the transformation matrix as the product of the integration
+  matrix and the weights matrix.}
+  \item{Subtract the transformation matrix from the original expression
+  matrix.}
+}
+
+For multiple dataset integration, we perform iterative pairwise integration.
+To determine the order of integration (if not specified via
+\code{sample.tree}), we
+\itemize{
+  \item{Define a distance between datasets as the total number of cells in
+  the smaller dataset divided by the total number of anchors between the two
+  datasets.}
+  \item{Compute all pairwise distances between datasets}
+  \item{Cluster this distance matrix to determine a guide tree}
+}
+}
+\examples{
+\dontrun{
+# to install the SeuratData package see https://github.com/satijalab/seurat-data
+library(SeuratData)
+data("panc8")
+
+# panc8 is a merged Seurat object containing 8 separate pancreas datasets
+# split the object by dataset
+pancreas.list <- SplitObject(panc8, split.by = "tech")
+
+# perform standard preprocessing on each object
+for (i in 1:length(pancreas.list)) {
+  pancreas.list[[i]] <- NormalizeData(pancreas.list[[i]], verbose = FALSE)
+  pancreas.list[[i]] <- FindVariableFeatures(
+    pancreas.list[[i]], selection.method = "vst",
+    nfeatures = 2000, verbose = FALSE
+  )
+}
+
+# find anchors
+anchors <- FindIntegrationAnchors(object.list = pancreas.list)
+
+# integrate data
+integrated <- IntegrateData(anchorset = anchors)
+}
+
+}
+\references{
+Stuart T, Butler A, et al. Comprehensive Integration of
+Single-Cell Data. Cell. 2019;177:1888-1902 \url{https://doi.org/10.1016/
+j.cell.2019.05.031}
 }
diff --git a/man/Misc.Rd b/man/Misc.Rd
index 153f5944d..ccf6d3076 100644
--- a/man/Misc.Rd
+++ b/man/Misc.Rd
@@ -4,8 +4,10 @@
 \alias{Misc}
 \alias{Misc<-}
 \alias{Misc.Assay}
+\alias{Misc.DimReduc}
 \alias{Misc.Seurat}
 \alias{Misc<-.Assay}
+\alias{Misc<-.DimReduc}
 \alias{Misc<-.Seurat}
 \title{Access miscellaneous data}
 \usage{
@@ -15,10 +17,14 @@ Misc(object, ...) <- value
 
 \method{Misc}{Assay}(object, slot = NULL, ...)
 
+\method{Misc}{DimReduc}(object, slot = NULL, ...)
+
 \method{Misc}{Seurat}(object, slot = NULL, ...)
 
 \method{Misc}{Assay}(object, slot, ...) <- value
 
+\method{Misc}{DimReduc}(object, slot, ...) <- value
+
 \method{Misc}{Seurat}(object, slot, ...) <- value
 }
 \arguments{
diff --git a/man/MixingMetric.Rd b/man/MixingMetric.Rd
index a4b3a90f4..4eea94961 100644
--- a/man/MixingMetric.Rd
+++ b/man/MixingMetric.Rd
@@ -33,8 +33,7 @@ MixingMetric(
 \item{verbose}{Displays progress bar}
 }
 \value{
-Returns a vector of values representing the entropy metric from each
-bootstrapped iteration.
+Returns a vector of values of the mixing metric for each cell
 }
 \description{
 Here we compute a measure of how well mixed a composite dataset is.
To diff --git a/man/PrepSCTIntegration.Rd b/man/PrepSCTIntegration.Rd index a0545766a..c4d5653c5 100644 --- a/man/PrepSCTIntegration.Rd +++ b/man/PrepSCTIntegration.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/integration.R \name{PrepSCTIntegration} \alias{PrepSCTIntegration} -\title{Prepare an object list that has been run through SCTransform for integration} +\title{Prepare an object list normalized with sctransform for integration.} \usage{ PrepSCTIntegration( object.list, @@ -13,11 +13,13 @@ PrepSCTIntegration( ) } \arguments{ -\item{object.list}{A list of objects to prep for integration} +\item{object.list}{A list of \code{\link{Seurat}} objects to prepare for integration} -\item{assay}{Name or vector of assay names (one for each object) that correspond -to the assay that SCTransform has been run on. If NULL, the current default -assay for each object is used.} +\item{assay}{The name of the \code{\link{Assay}} to use for integration. This can be a +single name if all the assays to be integrated have the same name, or a character vector +containing the name of each \code{\link{Assay}} in each object to be integrated. The +specified assays must have been normalized using \code{\link{SCTransform}}. +If NULL (default), the current default assay for each object is used.} \item{anchor.features}{Can be either: \itemize{ @@ -33,9 +35,52 @@ the Pearson residual will be clipped to} \item{verbose}{Display output/messages} } \value{ -An object list with the \code{scale.data} slots set to the anchor -features +A list of \code{\link{Seurat}} objects with the appropriate \code{scale.data} slots +containing only the required \code{anchor.features}. } \description{ -Prepare an object list that has been run through SCTransform for integration +This function takes in a list of objects that have been normalized with the +\code{\link{SCTransform}} method and performs the following steps: +\itemize{ + \item{If anchor.features is a numeric value, calls \code{\link{SelectIntegrationFeatures}} + to determine the features to use in the downstream integration procedure.} + \item{Ensures that the sctransform residuals for the features specified + to anchor.features are present in each object in the list. This is + necessary because the default behavior of \code{\link{SCTransform}} is to + only store the residuals for the features determined to be variable. + Residuals are recomputed for missing features using the stored model + parameters via the \code{\link{GetResidual}} function.} + \item{Subsets the \code{scale.data} slot to only contain the residuals for + anchor.features for efficiency in downstream processing. 
} +} +} +\examples{ +\dontrun{ +# to install the SeuratData package see https://github.com/satijalab/seurat-data +library(SeuratData) +data("panc8") + +# panc8 is a merged Seurat object containing 8 separate pancreas datasets +# split the object by dataset and take the first 2 to integrate +pancreas.list <- SplitObject(panc8, split.by = "tech")[1:2] + +# perform SCTransform normalization +pancreas.list <- lapply(X = pancreas.list, FUN = SCTransform) + +# select integration features and prep step +features <- SelectIntegrationFeatures(pancreas.list) +pancreas.list <- PrepSCTIntegration( + pancreas.list, + anchor.features = features +) + +# downstream integration steps +anchors <- FindIntegrationAnchors( + pancreas.list, + normalization.method = "SCT", + anchor.features = features +) +pancreas.integrated <- IntegrateData(anchors) +} + } diff --git a/man/Read10X.Rd b/man/Read10X.Rd index fdf87d001..23e935ad0 100644 --- a/man/Read10X.Rd +++ b/man/Read10X.Rd @@ -4,7 +4,12 @@ \alias{Read10X} \title{Load in data from 10X} \usage{ -Read10X(data.dir = NULL, gene.column = 2, unique.features = TRUE) +Read10X( + data.dir = NULL, + gene.column = 2, + unique.features = TRUE, + strip.suffix = FALSE +) } \arguments{ \item{data.dir}{Directory containing the matrix.mtx, genes.tsv (or features.tsv), and barcodes.tsv @@ -15,6 +20,8 @@ will be prefixed with the name.} \item{gene.column}{Specify which column of genes.tsv or features.tsv to use for gene names; default is 2} \item{unique.features}{Make feature names unique (default TRUE)} + +\item{strip.suffix}{Remove trailing "-1" if present in all cell barcodes.} } \value{ If features.csv indicates the data has multiple data types, a list diff --git a/man/RunUMAP.Rd b/man/RunUMAP.Rd index bab8bdc64..5d5d3c8bb 100644 --- a/man/RunUMAP.Rd +++ b/man/RunUMAP.Rd @@ -11,6 +11,7 @@ RunUMAP(object, ...) \method{RunUMAP}{default}( object, + reduction.model = NULL, assay = NULL, umap.method = "uwot", n.neighbors = 30L, @@ -59,6 +60,7 @@ RunUMAP(object, ...) \method{RunUMAP}{Seurat}( object, + reduction.model = NULL, dims = NULL, reduction = "pca", features = NULL, @@ -93,12 +95,15 @@ RunUMAP(object, ...) \item{...}{Arguments passed to other methods and UMAP} +\item{reduction.model}{\code{DimReduc} object that contains the umap model} + \item{assay}{Assay to pull data for when using \code{features}, or assay used to construct Graph if running UMAP on a Graph} \item{umap.method}{UMAP implementation to run. Can be \describe{ \item{\code{uwot}:}{Runs umap via the uwot R package} + \item{\code{uwot-learn}:}{Runs umap via the uwot R package and return the learned umap model} \item{\code{umap-learn}:}{Run the Seurat wrapper of the python umap-learn package} }} diff --git a/man/SelectIntegrationFeatures.Rd b/man/SelectIntegrationFeatures.Rd index 1ecaf2f2b..b9c639ba7 100644 --- a/man/SelectIntegrationFeatures.Rd +++ b/man/SelectIntegrationFeatures.Rd @@ -18,12 +18,14 @@ SelectIntegrationFeatures( \item{nfeatures}{Number of features to return} -\item{assay}{Name of assay from which to pull the variable features.} +\item{assay}{Name or vector of assay names (one for each object) from which +to pull the variable features.} \item{verbose}{Print messages} -\item{fvf.nfeatures}{nfeatures for FindVariableFeatures. Used if -VariableFeatures have not been set for any object in object.list.} +\item{fvf.nfeatures}{nfeatures for \code{\link{FindVariableFeatures}}. 
Used +if \code{VariableFeatures} have not been set for any object in +\code{object.list}.} \item{...}{Additional parameters to \code{\link{FindVariableFeatures}}} } @@ -32,6 +34,30 @@ A vector of selected features } \description{ Choose the features to use when integrating multiple datasets. This function -ranks features by the number of datasets they appear in, breaking ties by the -median rank across datasets. It returns the highest features by this ranking. +ranks features by the number of datasets they are deemed variable in, +breaking ties by the median variable feature rank across datasets. It returns +the top scoring features by this ranking. +} +\details{ +If for any assay in the list, \code{\link{FindVariableFeatures}} hasn't been +run, this method will try to run it using the \code{fvf.nfeatures} parameter +and any additional ones specified through the \dots. +} +\examples{ +\dontrun{ +# to install the SeuratData package see https://github.com/satijalab/seurat-data +library(SeuratData) +data("panc8") + +# panc8 is a merged Seurat object containing 8 separate pancreas datasets +# split the object by dataset and take the first 2 +pancreas.list <- SplitObject(panc8, split.by = "tech")[1:2] + +# perform SCTransform normalization +pancreas.list <- lapply(X = pancreas.list, FUN = SCTransform) + +# select integration features +features <- SelectIntegrationFeatures(pancreas.list) +} + } diff --git a/man/Seurat-package.Rd b/man/Seurat-package.Rd index f268daee9..8fe0af6ab 100644 --- a/man/Seurat-package.Rd +++ b/man/Seurat-package.Rd @@ -3,10 +3,13 @@ \docType{package} \name{Seurat-package} \alias{Seurat-package} -\title{Tools for single-cell genomics} +\title{Seurat package} \description{ Tools for single-cell genomics } +\details{ +Tools for single-cell genomics +} \section{Package options}{ @@ -23,6 +26,10 @@ Seurat uses the following [options()] to configure behaviour: \item{\code{Seurat.checkdots}}{For functions that have ... as a parameter, this controls the behavior when an item isn't used. Can be one of warn, stop, or silent.} + \item{\code{Seurat.limma.wilcox.msg}}{Show message about more efficient + Wilcoxon Rank Sum test available via the limma package} + \item{\code{Seurat.warn.vlnplot.split}}{Show message about changes to + default behavior of split/multi violin plots} } } diff --git a/man/TransferData.Rd b/man/TransferData.Rd index 1388696f9..705ac8638 100644 --- a/man/TransferData.Rd +++ b/man/TransferData.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/integration.R \name{TransferData} \alias{TransferData} -\title{Transfer Labels} +\title{Transfer data} \usage{ TransferData( anchorset, @@ -19,44 +19,124 @@ TransferData( ) } \arguments{ -\item{anchorset}{Results from FindTransferAnchors} +\item{anchorset}{An \code{\link{AnchorSet}} object generated by +\code{\link{FindTransferAnchors}}} \item{refdata}{Data to transfer. Should be either a vector where the names correspond to reference cells, or a matrix, where the column names correspond to the reference cells.} -\item{weight.reduction}{Dimensional reduction to use for the weighting. -Options are: +\item{weight.reduction}{Dimensional reduction to use for the weighting +anchors. 
Options are:
 \itemize{
    \item{pcaproject: Use the projected PCA used for anchor building}
    \item{pca: Use an internal PCA on the query only}
    \item{cca: Use the CCA used for anchor building}
-   \item{custom DimReduc: User provided DimReduc object computed on the query
-   cells}
+   \item{custom DimReduc: User provided \code{\link{DimReduc}} object
+   computed on the query cells}
 }}
 
 \item{l2.norm}{Perform L2 normalization on the cell embeddings after
 dimensional reduction}
 
-\item{dims}{Number of PCs to use in the weighting procedure}
+\item{dims}{Number of dimensions to use in the anchor weighting procedure}
 
-\item{k.weight}{Number of neighbors to consider when weighting}
+\item{k.weight}{Number of neighbors to consider when weighting anchors}
 
 \item{sd.weight}{Controls the bandwidth of the Gaussian kernel for weighting}
 
-\item{eps}{Error bound on the neighbor finding algorithm (from RANN)}
+\item{eps}{Error bound on the neighbor finding algorithm (from
+\code{\link{RANN}})}
 
 \item{do.cpp}{Run cpp code where applicable}
 
 \item{verbose}{Print progress bars and output}
 
-\item{slot}{Slot to store the imputed data}
+\item{slot}{Slot to store the imputed data. Must be either "data" (default)
+or "counts"}
 }
 \value{
-If refdata is a vector, returns a dataframe with label predictions.
-If refdata is a matrix, returns an Assay object where the imputed data has
-been stored in the provided slot.
+If \code{refdata} is a vector, returns a data.frame with label
+predictions. If \code{refdata} is a matrix, returns an Assay object where the
+imputed data has been stored in the provided slot.
 }
 \description{
-Transfers the labels
+Transfer categorical or continuous data across single-cell datasets. For
+transferring categorical information, pass a vector from the reference
+dataset (e.g. \code{refdata = reference$celltype}). For transferring
+continuous information, pass a matrix from the reference dataset (e.g.
+\code{refdata = GetAssayData(reference[['RNA']])}).
+}
+\details{
+The main steps of this procedure are outlined below. For a more detailed
+description of the methodology, please see Stuart, Butler, et al Cell 2019.
+\url{https://doi.org/10.1016/j.cell.2019.05.031};
+\url{https://doi.org/10.1101/460147}
+
+For both transferring discrete labels and also feature imputation, we first
+compute the weights matrix.
+
+\itemize{
+  \item{Construct a weights matrix that defines the association between each
+  query cell and each anchor. These weights are computed as 1 - the distance
+  between the query cell and the anchor divided by the distance of the query
+  cell to the \code{k.weight}th anchor multiplied by the anchor score
+  computed in \code{\link{FindIntegrationAnchors}}. We then apply a Gaussian
+  kernel with a bandwidth defined by \code{sd.weight} and normalize across
+  all \code{k.weight} anchors.}
+}
+
+The main difference between label transfer (classification) and feature
+imputation is what gets multiplied by the weights matrix. For label transfer,
+we perform the following steps:
+
+\itemize{
+  \item{Create a binary classification matrix, the rows corresponding to each
+  possible class and the columns corresponding to the anchors. If the
+  reference cell in the anchor pair is a member of a certain class, that
+  matrix entry is filled with a 1, otherwise 0.}
+  \item{Multiply this classification matrix by the transpose of weights
+  matrix to compute a prediction score for each class for each cell in the
+  query dataset.}
+}
+
+For feature imputation, we perform the following step:
+\itemize{
+  \item{Multiply the expression matrix for the reference anchor cells by the
+  weights matrix. This returns a predicted expression matrix for the
+  specified features for each cell in the query dataset.}
+}
+}
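An illustrative toy of the label-transfer arithmetic described above (hand-built matrices for intuition, not Seurat internals):

    # Binary class x anchor matrix: 1 if the anchor's reference cell has that label
    class.mat <- matrix(
      c(1, 0, 1,
        0, 1, 0),
      nrow = 2, byrow = TRUE,
      dimnames = list(c("B cell", "T cell"), paste0("anchor", 1:3))
    )
    # Query cell x anchor weights, normalized across anchors for each cell
    weights <- matrix(runif(4 * 3), nrow = 4,
                      dimnames = list(paste0("cell", 1:4), paste0("anchor", 1:3)))
    weights <- weights / rowSums(weights)
    # Class x query cell prediction scores
    scores <- class.mat %*% t(weights)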
+\examples{
+\dontrun{
+# to install the SeuratData package see https://github.com/satijalab/seurat-data
+library(SeuratData)
+data("pbmc3k")
+
+# for demonstration, split the object into reference and query
+pbmc.reference <- pbmc3k[, 1:1350]
+pbmc.query <- pbmc3k[, 1351:2700]
+
+# perform standard preprocessing on each object
+pbmc.reference <- NormalizeData(pbmc.reference)
+pbmc.reference <- FindVariableFeatures(pbmc.reference)
+pbmc.reference <- ScaleData(pbmc.reference)
+
+pbmc.query <- NormalizeData(pbmc.query)
+pbmc.query <- FindVariableFeatures(pbmc.query)
+pbmc.query <- ScaleData(pbmc.query)
+
+# find anchors
+anchors <- FindTransferAnchors(reference = pbmc.reference, query = pbmc.query)
+
+# transfer labels
+predictions <- TransferData(anchorset = anchors, refdata = pbmc.reference$seurat_annotations)
+pbmc.query <- AddMetaData(object = pbmc.query, metadata = predictions)
+}
+
+}
+\references{
+Stuart T, Butler A, et al. Comprehensive Integration of
+Single-Cell Data. Cell. 2019;177:1888-1902 \url{https://doi.org/10.1016/
+j.cell.2019.05.031}
 }
diff --git a/man/VlnPlot.Rd b/man/VlnPlot.Rd
index 56f9848ab..81d45c731 100644
--- a/man/VlnPlot.Rd
+++ b/man/VlnPlot.Rd
@@ -20,7 +20,7 @@ VlnPlot(
   log = FALSE,
   ncol = NULL,
   slot = "data",
-  multi.group = FALSE,
+  split.plot = FALSE,
   combine = TRUE
 )
 }
@@ -57,8 +57,8 @@ expression of the attribute being potted, can also pass 'increasing' or 'decreas
 
 \item{slot}{Use non-normalized counts data for plotting}
 
-\item{multi.group}{plot each group of the split violin plots by multiple or single violin shapes
-see \code{\link{FetchData}} for more details}
+\item{split.plot}{plot each group of the split violin plots as half of a
+single split violin instead of as separate side-by-side violins.}
 
 \item{combine}{Combine plots into a single \code{\link[patchwork]{patchwork}ed}
 ggplot object. If \code{FALSE}, return a list of ggplot objects}
diff --git a/man/cc.genes.Rd b/man/cc.genes.Rd
index 1b07701e8..995b97dbc 100644
--- a/man/cc.genes.Rd
+++ b/man/cc.genes.Rd
@@ -4,11 +4,13 @@
 \name{cc.genes}
 \alias{cc.genes}
 \title{Cell cycle genes}
-\format{A list of two vectors
+\format{
+A list of two vectors
 \describe{
   \item{s.genes}{Genes associated with S-phase}
   \item{g2m.genes}{Genes associated with G2M-phase}
-}}
+}
+}
 \source{
 \url{http://science.sciencemag.org/content/352/6282/189}
 }
diff --git a/man/cc.genes.updated.2019.Rd b/man/cc.genes.updated.2019.Rd
index def9478ad..377a91e6c 100644
--- a/man/cc.genes.updated.2019.Rd
+++ b/man/cc.genes.updated.2019.Rd
@@ -4,11 +4,13 @@
 \name{cc.genes.updated.2019}
 \alias{cc.genes.updated.2019}
 \title{Cell cycle genes: 2019 update}
-\format{A list of two vectors
+\format{
+A list of two vectors
 \describe{
   \item{s.genes}{Genes associated with S-phase}
   \item{g2m.genes}{Genes associated with G2M-phase}
-}}
+}
+}
 \source{
 \url{http://science.sciencemag.org/content/352/6282/189}
 }
diff --git a/man/pbmc_small.Rd b/man/pbmc_small.Rd
index 6721bdfed..9b375dd2f 100644
--- a/man/pbmc_small.Rd
+++ b/man/pbmc_small.Rd
@@ -4,7 +4,8 @@
 \name{pbmc_small}
 \alias{pbmc_small}
 \title{A small example version of the PBMC dataset}
-\format{A Seurat object with the following slots filled
+\format{
+A Seurat object with the following slots filled
 \describe{
   \item{assays}{
   \itemize{Currently only contains one assay ("RNA" - scRNA-seq expression data)
@@ -21,7 +22,8 @@
   \item{reductions}{Dimensional reductions: currently PCA and tSNE}
   \item{version}{Seurat version used to create the object}
   \item{commands}{Command history}
-}}
+}
+}
 \source{
 \url{https://support.10xgenomics.com/single-cell-gene-expression/datasets/1.1.0/pbmc3k}
 }
diff --git a/src/integration.cpp b/src/integration.cpp
index bb61b1303..afb16c289 100644
--- a/src/integration.cpp
+++ b/src/integration.cpp
@@ -96,17 +96,30 @@ Eigen::SparseMatrix<double> IntegrateDataC(
 }
 
+int getCoeff (Eigen::SparseMatrix<double>& mat, size_t i, size_t j){
+  int score{0};
+  if (i == j) {
+    for (Eigen::SparseMatrix<double>::InnerIterator it(mat, i); it; ++it){
+      score++;
+    }
+  } else {
+    for(int k=0; k < mat.outerSize(); ++k) {
+      if (mat.coeff(i, k) and mat.coeff(j, k)) { score++; }
+    }
+  }
+
+  return score;
+}
+
+
 //[[Rcpp::export]]
 Eigen::SparseMatrix<double> SNNAnchor(
   Eigen::SparseMatrix<double> k_matrix,
   Eigen::SparseMatrix<double> anchor_only
 ) {
-  typedef Eigen::SparseMatrix<double> SpMat;
-  SpMat mat2 = k_matrix;
-  SpMat mat3 = mat2 * mat2.transpose();
   for (int k=0; k<anchor_only.outerSize(); ++k){
     for (Eigen::SparseMatrix<double>::InnerIterator it(anchor_only,k); it; ++it){
-      it.valueRef() = mat3.coeff(it.row(), it.col());
+      it.valueRef() = getCoeff(k_matrix, it.row(), it.col());
     }
   }
   return(anchor_only);
 }
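The rewritten SNNAnchor above avoids materializing the full k_matrix crossproduct by computing each required entry on demand; getCoeff counts neighbors shared by cells i and j. A rough R analogue of the idea (for intuition only, not a line-by-line translation of the C++):

    # k.mat: cells x cells neighbor matrix (nonzero = neighbor)
    shared.neighbors <- function(k.mat, i, j) {
      if (i == j) {
        return(sum(k.mat[i, ] != 0))  # stored neighbors of cell i
      }
      sum(k.mat[i, ] != 0 & k.mat[j, ] != 0)  # neighbors common to i and j
    }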
diff --git a/tests/testthat/test_differential_expression.R b/tests/testthat/test_differential_expression.R
index eb0d4cba9..8dce1582a 100644
--- a/tests/testthat/test_differential_expression.R
+++ b/tests/testthat/test_differential_expression.R
@@ -222,53 +222,56 @@ test_that("LR test works", {
 })
 
 # Tests for FindConservedMarkers
-# --------------------------------------------------------------------------------
-context("FindConservedMarkers")
-pbmc_small$groups
-
-markers <- suppressWarnings(FindConservedMarkers(object = pbmc_small, ident.1 = 0, grouping.var = "groups", verbose = FALSE))
-
-standard.names <- c("p_val", "avg_logFC", "pct.1", "pct.2", "p_val_adj")
-
-test_that("FindConservedMarkers works", {
-  expect_equal(colnames(x = markers), c(paste0("g2_", standard.names), paste0("g1_", standard.names), "max_pval",
"minimump_p_val")) - expect_equal(markers[1, "g2_p_val"], 4.983576e-05) - expect_equal(markers[1, "g2_avg_logFC"], -4.125279, tolerance = 1e-6) - # expect_equal(markers[1, "g2_pct.1"], 0.062) - expect_equal(markers[1, "g2_pct.2"], 0.75) - expect_equal(markers[1, "g2_p_val_adj"], 0.0114622238) - expect_equal(markers[1, "g1_p_val"], 3.946643e-08) - expect_equal(markers[1, "g1_avg_logFC"], -3.589384, tolerance = 1e-6) - expect_equal(markers[1, "g1_pct.1"], 0.10) - expect_equal(markers[1, "g1_pct.2"], 0.958) - expect_equal(markers[1, "g1_p_val_adj"], 9.077279e-06) - expect_equal(markers[1, "max_pval"], 4.983576e-05) - expect_equal(markers[1, "minimump_p_val"], 7.893286e-08) - expect_equal(nrow(markers), 162) - expect_equal(rownames(markers)[1], "HLA-DRB1") - expect_equal(markers[, "max_pval"], unname(obj = apply(X = markers, MARGIN = 1, FUN = function(x) max(x[c("g1_p_val", "g2_p_val")])))) -}) - -test_that("FindConservedMarkers errors when expected", { - expect_error(FindConservedMarkers(pbmc_small)) - expect_error(FindConservedMarkers(pbmc_small, ident.1 = 0)) - expect_error(FindConservedMarkers(pbmc_small, ident.1 = 0, grouping.var = "groups", meta.method = "minimump")) -}) - -pbmc.test <- pbmc_small -Idents(object = pbmc.test) <- "RNA_snn_res.1" -pbmc.test$id.group <- paste0(pbmc.test$RNA_snn_res.1, "_", pbmc.test$groups) -pbmc.test <- subset(x = pbmc.test, id.group == "0_g1", invert = TRUE) -markers.missing <- suppressWarnings(FindConservedMarkers(object = pbmc.test, ident.1 = 0, grouping.var = "groups", test.use = "t", verbose = FALSE)) - -test_that("FindConservedMarkers handles missing idents in certain groups", { - expect_warning(FindConservedMarkers(object = pbmc.test, ident.1 = 0, grouping.var = "groups", test.use = "t")) - expect_equal(colnames(x = markers.missing), paste0("g2_", standard.names)) - expect_equal(markers.missing[1, "g2_p_val"], 1.672911e-13) - expect_equal(markers.missing[1, "g2_avg_logFC"], -4.527888, tolerance = 1e-6) - # expect_equal(markers.missing[1, "g2_pct.1"], 0.062) - expect_equal(markers.missing[1, "g2_pct.2"], 0.95) - expect_equal(markers.missing[1, "g2_p_val_adj"], 3.847695e-11) - expect_equal(nrow(markers.missing), 190) - expect_equal(rownames(markers.missing)[1], "HLA-DPB1") -}) +# ------------------------------------------------------------------------------- + +if (requireNamespace('metap', quietly = TRUE)) { + context("FindConservedMarkers") + pbmc_small$groups + + markers <- suppressWarnings(FindConservedMarkers(object = pbmc_small, ident.1 = 0, grouping.var = "groups", verbose = FALSE)) + + standard.names <- c("p_val", "avg_logFC", "pct.1", "pct.2", "p_val_adj") + + test_that("FindConservedMarkers works", { + expect_equal(colnames(x = markers), c(paste0("g2_", standard.names), paste0("g1_", standard.names), "max_pval", "minimump_p_val")) + expect_equal(markers[1, "g2_p_val"], 4.983576e-05) + expect_equal(markers[1, "g2_avg_logFC"], -4.125279, tolerance = 1e-6) + # expect_equal(markers[1, "g2_pct.1"], 0.062) + expect_equal(markers[1, "g2_pct.2"], 0.75) + expect_equal(markers[1, "g2_p_val_adj"], 0.0114622238) + expect_equal(markers[1, "g1_p_val"], 3.946643e-08) + expect_equal(markers[1, "g1_avg_logFC"], -3.589384, tolerance = 1e-6) + expect_equal(markers[1, "g1_pct.1"], 0.10) + expect_equal(markers[1, "g1_pct.2"], 0.958) + expect_equal(markers[1, "g1_p_val_adj"], 9.077279e-06) + expect_equal(markers[1, "max_pval"], 4.983576e-05) + expect_equal(markers[1, "minimump_p_val"], 7.893286e-08) + expect_equal(nrow(markers), 162) + 
diff --git a/tests/testthat/test_load_10X.R b/tests/testthat/test_load_10X.R
index 25bd216f0..1ab15b134 100644
--- a/tests/testthat/test_load_10X.R
+++ b/tests/testthat/test_load_10X.R
@@ -9,7 +9,7 @@ test_that("Cell Ranger 3.0 Data Parsing", {
   expect_is(test.data, "list")
   expect_equal(ncol(test.data$`Gene Expression`), .5 * ncol(test.data2$`Gene Expression`))
   expect_equal(ncol(test.data$`Antibody Capture`), .5 * ncol(test.data2$`Antibody Capture`))
-  expect_equal(colnames(test.data2[[1]])[6], "2_AAAGTAGCACAGTCGC")
+  expect_equal(colnames(test.data2[[1]])[6], "2_AAAGTAGCACAGTCGC-1")
   expect_equal(test.data$`Gene Expression`[2,2], 1000)
 })
 
@@ -17,7 +17,7 @@ test_that("Cell Ranger 3.0 Data Parsing", {
 test.data3 <- Read10X("../testdata/")
 test_that("Read10X creates sparse matrix", {
   expect_is(test.data3, "dgCMatrix")
-  expect_equal(colnames(test.data3)[1], "ATGCCAGAACGACT")
+  expect_equal(colnames(test.data3)[1], "ATGCCAGAACGACT-1")
   expect_equal(rownames(test.data3)[1], "MS4A1")
 })
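The two updated barcode expectations reflect that Read10X now keeps Cell Ranger's trailing "-1" suffix instead of dropping it. A short usage sketch, assuming the strip.suffix argument this release adds to Read10X restores the old bare barcodes:

library(Seurat)

# Default: barcodes keep the "-1" suffix.
counts <- Read10X("../testdata/")
head(colnames(counts), 1)        # "ATGCCAGAACGACT-1"

# Assumed opt-out (verify against ?Read10X): strip the suffix again.
counts.bare <- Read10X("../testdata/", strip.suffix = TRUE)
head(colnames(counts.bare), 1)   # "ATGCCAGAACGACT"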