From 4e91581052b3da4b08693809129f9c32e21c07a3 Mon Sep 17 00:00:00 2001 From: Stephen Williams Date: Wed, 11 Dec 2024 12:42:51 -0800 Subject: [PATCH] remove unnecessary processing --- R/preprocessing.R | 392 ++++++++++++++++++++++++++-------------------- 1 file changed, 220 insertions(+), 172 deletions(-) diff --git a/R/preprocessing.R b/R/preprocessing.R index 82fc24858..1b4609146 100644 --- a/R/preprocessing.R +++ b/R/preprocessing.R @@ -372,14 +372,14 @@ HTODemux <- function( #' } #' GetResidual <- function( - object, - features, - assay = NULL, - umi.assay = "RNA", - clip.range = NULL, - replace.value = FALSE, - na.rm = TRUE, - verbose = TRUE + object, + features, + assay = NULL, + umi.assay = "RNA", + clip.range = NULL, + replace.value = FALSE, + na.rm = TRUE, + verbose = TRUE ) { assay <- assay %||% DefaultAssay(object = object) if (IsSCT(assay = object[[assay]])) { @@ -458,7 +458,7 @@ GetResidual <- function( } existing.data <- GetAssayData(object = object, slot = 'scale.data', assay = assay) all.features <- union(x = rownames(x = existing.data), y = features) - new.scale <- matrix( + new.scale <- matrix( data = NA, nrow = length(x = all.features), ncol = ncol(x = object), @@ -525,15 +525,15 @@ GetResidual <- function( #' } #' Load10X_Spatial <- function ( - data.dir, - filename = "filtered_feature_bc_matrix.h5", - assay = "Spatial", - slice = "slice1", - bin.size = NULL, - filter.matrix = TRUE, - to.upper = FALSE, - image = NULL, - ... + data.dir, + filename = "filtered_feature_bc_matrix.h5", + assay = "Spatial", + slice = "slice1", + bin.size = NULL, + filter.matrix = TRUE, + to.upper = FALSE, + image = NULL, + ... ) { # if more than one directory is passed in if (length(x = data.dir) > 1) { @@ -621,13 +621,13 @@ Load10X_Spatial <- function ( # associate each counts matrix with its corresponding image object.list <- mapply( function( - .object, - .image, - .assay, - .slice + .object, + .image, + .assay, + .slice ) { - # align the image's identifiers with the object's - .image <- .image[Cells(.object)] + # align the image's identifiers with the object's + .image <- .image[Cells(.object)] # add the image to the corresponding Seurat instance .object[[.slice]] <- .image return (.object) @@ -646,7 +646,132 @@ Load10X_Spatial <- function ( return(object) } +#' Add 10X Cell Types to a Seurat Object +#' +#' This function reads cell type annotations from a CSV file and adds them to the metadata of a Seurat object. +#' If the cell type file does not exist, the original Seurat object is returned unchanged. +#' +#' @param data.dir A string specifying the directory containing the "cell_types" folder with the "cell_types.csv" file. +#' @param object A Seurat object to which the cell type annotations will be added. +#' +#' @return A Seurat object with updated metadata including cell type annotations if the file is found. +#' +#' @details +#' The function searches for a CSV file named "cell_types.csv" in the "cell_types" subdirectory within `data.dir`. +#' The CSV file should contain at least a "barcode" column that matches the cell barcodes in the Seurat object. +#' Additional columns in the CSV file will be merged into the Seurat object's metadata. +#' +#' @importFrom utils read.csv +#' @importFrom tibble rownames_to_column column_to_rownames +#' @importFrom base file.path file.exists merge +#' +#' @examples +#' \dontrun{ +#' # Specify the data directory containing the "cell_types" folder +#' data.dir <- "/path/to/data" +#' +#' # Create a Seurat object (example) +#' seurat_obj <- CreateSeuratObject(counts = some_counts_matrix) +#' +#' # Add cell type annotations to the Seurat object +#' seurat_obj <- Add_10X_CellTypes(data.dir, seurat_obj) +#' } +Add_10X_CellTypes <- function(data.dir, object) { + cell_types_path <- file.path(data.dir, "cell_types", "cell_types.csv") + if (file.exists(cell_types_path)) { + cell.types <- read.csv(cell_types_path) + object@meta.data <- dplyr::left_join(tibble::rownames_to_column(object@meta.data, "barcode"), + cell.types, by = "barcode") %>% + tibble::column_to_rownames("barcode") + return(object) + } else { + return(object) + } +} + +#' Load a 10x Genomics Single Cell Experiment into a \code{Seurat} object +#' +#' @inheritParams Read10X +#' @inheritParams SeuratObject::CreateSeuratObject +#' @param data.dir Directory containing the H5 file specified by \code{filename} +#' and the image data in a subdirectory called \code{spatial} +#' @param filename Name of H5 file containing the feature barcode matrix +#' @param to.upper Converts all feature names to upper case. This can provide an +#' approximate conversion of mouse to human gene names which can be useful in an +#' explorative analysis. For cross-species comparisons, orthologous genes should +#' be identified across species and used instead. +#' @param ... Arguments passed to \code{\link{Read10X_h5}} +#' +#' @return A \code{Seurat} object +#' +#' +#' @export +#' @concept preprocessing +#' +#' @examples +#' \dontrun{ +#' data_dir <- 'path/to/data/directory' +#' list.files(data_dir) # Should show filtered_feature_bc_matrix.h5 +#' Load10X(data.dir = data_dir) +#' } +#' +Load10X <- function(data.dir, filename = "filtered_feature_bc_matrix.h5", + assay = "RNA", to.upper = FALSE, ...) { + + if (length(data.dir) > 1) { + stop("`data.dir` expects a single directory path but received multiple values.") + } + if (!file.exists(data.dir)) { + stop(paste0("No such file or directory: '", data.dir, "'")) + } + + filename <- list.files(data.dir, filename, full.names = FALSE, recursive = FALSE) + counts.path <- file.path(data.dir, filename) + if (!file.exists(counts.path)) { + stop(paste0("File not found: '", counts.path, "'")) + } + + counts <- Read10X_h5(counts.path, ...) + + if (to.upper) { + counts <- imap(counts, ~{ + rownames(.x) <- toupper(rownames(.x)) + .x + }) + } + + if (is.list(counts)) { + seurat.list <- lapply(names(counts), function(name) { + CreateSeuratObject( + counts = counts[[name]], + assay = name, + project = name + ) + }) + + for (i in 1:seq_along(seurat.list)) { + if (Assays(seurat.list[[i]]) %in% c("Gene Expression", "RNA")) { + seurat.list[[i]] <- Add_10X_CellTypes(data.dir, seurat.list[[i]]) + } + } + + merged.object <- merge( + x = seurat.list[[1]], + y = seurat.list[-1], + add.cell.ids = names(counts), + merge.data = FALSE + ) + return(merged.object) + + } else { + object <- CreateSeuratObject(counts, assay = assay) + if (Assays(object) %in% c("Gene Expression", "RNA")) { + object <- Add_10X_CellTypes(data.dir, object) + } + return(object) + } +} #' Read10x Probe Metadata #' #' This function reads the probe metadata from a 10x Genomics probe barcode matrix file in HDF5 format. @@ -660,8 +785,8 @@ Load10X_Spatial <- function ( #' @concept preprocessing #' Read10X_probe_metadata <- function( - data.dir, - filename = 'raw_probe_bc_matrix.h5' + data.dir, + filename = 'raw_probe_bc_matrix.h5' ) { if (!requireNamespace('hdf5r', quietly = TRUE)) { stop("Please install hdf5r to read HDF5 files") @@ -1222,11 +1347,11 @@ Read10X_h5 <- function(filename, use.names = TRUE, unique.features = TRUE) { #' @concept preprocessing #' Read10X_Image <- function( - image.dir, - image.name = "tissue_lowres_image.png", - assay = "Spatial", - slice = "slice1", - filter.matrix = TRUE + image.dir, + image.name = "tissue_lowres_image.png", + assay = "Spatial", + slice = "slice1", + filter.matrix = TRUE ) { image <- png::readPNG( source = file.path( @@ -1316,18 +1441,18 @@ Read10X_Coordinates <- function(filename, filter.matrix) { } else { # the coordinate mappings must be in a CSV - read it in coordinates <- read.csv( - file = filename, - col.names = col.names, - header = ifelse( - # assume files calles "tissue_positions.csv" have headers, otherwise - # assume they do not (i.e. "tissue_positions_list.csv") - test = basename(filename) == "tissue_positions.csv", - yes = TRUE, - no = FALSE - ), - as.is = TRUE, - row.names = 1 - ) + file = filename, + col.names = col.names, + header = ifelse( + # assume files calles "tissue_positions.csv" have headers, otherwise + # assume they do not (i.e. "tissue_positions_list.csv") + test = basename(filename) == "tissue_positions.csv", + yes = TRUE, + no = FALSE + ), + as.is = TRUE, + row.names = 1 + ) } # the `tissue` column should contain a boolean indicating whether or not a @@ -2312,11 +2437,11 @@ ReadNanostring <- function( #' @concept preprocessing #' ReadXenium <- function( - data.dir, - outs = c("segmentation_method", "matrix", "microns"), - type = "centroids", - mols.qv.threshold = 20, - flip.xy = F + data.dir, + outs = c("segmentation_method", "matrix", "microns"), + type = "centroids", + mols.qv.threshold = 20, + flip.xy = F ) { # Argument checking type <- match.arg( @@ -2349,83 +2474,6 @@ ReadXenium <- function( } } - data <- sapply(outs, function(otype) { - switch( - EXPR = otype, - 'matrix' = { - pmtx <- progressor() - pmtx(message = 'Reading counts matrix', class = 'sticky', amount = 0) - - for(option in Filter(function(x) x$req, list( - list(filename = "cell_feature_matrix.h5", fn = Read10X_h5, req = has_hdf5r), - list(filename = "cell_feature_matrix", fn = Read10X, req = TRUE) - ))) { - matrix <- try(suppressWarnings(option$fn(file.path(data.dir, option$filename)))) - if(!inherits(matrix, "try-error")) { break } - } - - if(!exists('matrix') || inherits(matrix, "try-error")) { - stop("Xenium outputs were incomplete: missing cell_feature_matrix") - } - - pmtx(type = "finish") - matrix - }, - 'segmentation_method' = { - psegs <- progressor() - psegs( - message = 'Loading cell metadata', - class = 'sticky', - amount = 0 - ) - - col.use <- c( - cell_id = 'cell', - segmentation_method = 'segmentation_method' - ) - - for(option in Filter(function(x) x$req, list( - list( - filename = "cells.parquet", - fn = function(x) as.data.frame(arrow::read_parquet(x, col_select = names(col.use))), - req = has_arrow - ), - list( - filename = "cells.csv.gz", - fn = function(x) data.table::fread(x, data.table = FALSE, stringsAsFactors = FALSE, select = names(col.use)), - req = has_dt - ), - list(filename = "cells.csv.gz", fn = function(x) read.csv(x, stringsAsFactors = FALSE), req = TRUE) - ))) { - cell_seg <- try(suppressWarnings(option$fn(file.path(data.dir, option$filename))), silent = TRUE) - if(!inherits(cell_seg, "try-error")) { break } - } - - if(!exists('cell_seg') || inherits(cell_seg, "try-error") || length(intersect(names(col.use), colnames(cell_seg))) != 2) { - warning('cells did not contain a segmentation_method column. Skipping...', call. = FALSE, immediate. = TRUE) - NULL - } else { - cell_seg <- cell_seg[, names(col.use)] - colnames(cell_seg) <- col.use - - cell_seg$cell <- binary_to_string(cell_seg$cell) - - has_dt <- requireNamespace("data.table", quietly = TRUE) && requireNamespace("R.utils", quietly = TRUE) - has_arrow <- requireNamespace("arrow", quietly = TRUE) - has_hdf5r <- requireNamespace("hdf5r", quietly = TRUE) - - binary_to_string <- function(arrow_binary) { - if(typeof(arrow_binary) == 'list') { - unlist( - lapply( - arrow_binary, function(x) rawToChar(as.raw(strtoi(x, 16L))) - ) - ) - } else { - arrow_binary - } - } - data <- sapply(outs, function(otype) { switch( EXPR = otype, @@ -3625,25 +3673,25 @@ SampleUMI <- function( #' @export #' SCTransform.default <- function( - object, - cell.attr, - reference.SCT.model = NULL, - do.correct.umi = TRUE, - ncells = 5000, - residual.features = NULL, - variable.features.n = 3000, - variable.features.rv.th = 1.3, - vars.to.regress = NULL, - latent.data = NULL, - do.scale = FALSE, - do.center = TRUE, - clip.range = c(-sqrt(x = ncol(x = umi) / 30), sqrt(x = ncol(x = umi) / 30)), - vst.flavor = 'v2', - conserve.memory = FALSE, - return.only.var.genes = TRUE, - seed.use = 1448145, - verbose = TRUE, - ... + object, + cell.attr, + reference.SCT.model = NULL, + do.correct.umi = TRUE, + ncells = 5000, + residual.features = NULL, + variable.features.n = 3000, + variable.features.rv.th = 1.3, + vars.to.regress = NULL, + latent.data = NULL, + do.scale = FALSE, + do.center = TRUE, + clip.range = c(-sqrt(x = ncol(x = umi) / 30), sqrt(x = ncol(x = umi) / 30)), + vst.flavor = 'v2', + conserve.memory = FALSE, + return.only.var.genes = TRUE, + seed.use = 1448145, + verbose = TRUE, + ... ) { if (!is.null(x = seed.use)) { set.seed(seed = seed.use) @@ -4148,16 +4196,16 @@ SubsetByBarcodeInflections <- function(object) { #' @export #' FindVariableFeatures.V3Matrix <- function( - object, - selection.method = "vst", - loess.span = 0.3, - clip.max = 'auto', - mean.function = FastExpMean, - dispersion.function = FastLogVMR, - num.bin = 20, - binning.method = "equal_width", - verbose = TRUE, - ... + object, + selection.method = "vst", + loess.span = 0.3, + clip.max = 'auto', + mean.function = FastExpMean, + dispersion.function = FastLogVMR, + num.bin = 20, + binning.method = "equal_width", + verbose = TRUE, + ... ) { CheckDots(...) if (!inherits(x = object, 'Matrix')) { @@ -4575,11 +4623,11 @@ FindSpatiallyVariableFeatures.Seurat <- function( #' @export #' LogNormalize.data.frame <- function( - data, - scale.factor = 1e4, - margin = 2L, - verbose = TRUE, - ... + data, + scale.factor = 1e4, + margin = 2L, + verbose = TRUE, + ... ) { return(LogNormalize( data = as.matrix(x = data), @@ -4594,11 +4642,11 @@ LogNormalize.data.frame <- function( #' @export #' LogNormalize.V3Matrix <- function( - data, - scale.factor = 1e4, - margin = 2L, - verbose = TRUE, - ... + data, + scale.factor = 1e4, + margin = 2L, + verbose = TRUE, + ... ) { # if (is.data.frame(x = data)) { # data <- as.matrix(x = data) @@ -4641,13 +4689,13 @@ LogNormalize.V3Matrix <- function( #' @export #' NormalizeData.V3Matrix <- function( - object, - normalization.method = "LogNormalize", - scale.factor = 1e4, - margin = 1, - block.size = NULL, - verbose = TRUE, - ... + object, + normalization.method = "LogNormalize", + scale.factor = 1e4, + margin = 1, + block.size = NULL, + verbose = TRUE, + ... ) { CheckDots(...) if (is.null(x = normalization.method)) { @@ -5110,7 +5158,7 @@ ScaleData.IterableMatrix <- function( object <- BPCells::min_by_row(mat = object, vals = scale.max * features.sd + features.mean) } scaled.data <- (object - features.mean) / features.sd -return(scaled.data) + return(scaled.data) }