From 4e91581052b3da4b08693809129f9c32e21c07a3 Mon Sep 17 00:00:00 2001
From: Stephen Williams <stephen.williams@10xgenomics.com>
Date: Wed, 11 Dec 2024 12:42:51 -0800
Subject: [PATCH] remove unnecessary processing

---
 R/preprocessing.R | 392 ++++++++++++++++++++++++++--------------------
 1 file changed, 220 insertions(+), 172 deletions(-)

diff --git a/R/preprocessing.R b/R/preprocessing.R
index 82fc24858..1b4609146 100644
--- a/R/preprocessing.R
+++ b/R/preprocessing.R
@@ -372,14 +372,14 @@ HTODemux <- function(
 #' }
 #'
 GetResidual <- function(
-  object,
-  features,
-  assay = NULL,
-  umi.assay = "RNA",
-  clip.range = NULL,
-  replace.value = FALSE,
-  na.rm = TRUE,
-  verbose = TRUE
+    object,
+    features,
+    assay = NULL,
+    umi.assay = "RNA",
+    clip.range = NULL,
+    replace.value = FALSE,
+    na.rm = TRUE,
+    verbose = TRUE
 ) {
   assay <- assay %||% DefaultAssay(object = object)
   if (IsSCT(assay = object[[assay]])) {
@@ -458,7 +458,7 @@ GetResidual <- function(
   }
   existing.data <- GetAssayData(object = object, slot = 'scale.data', assay = assay)
   all.features <- union(x = rownames(x = existing.data), y = features)
-   new.scale <- matrix(
+  new.scale <- matrix(
     data = NA,
     nrow = length(x = all.features),
     ncol = ncol(x = object),
@@ -525,15 +525,15 @@ GetResidual <- function(
 #' }
 #'
 Load10X_Spatial <- function (
-  data.dir,
-  filename = "filtered_feature_bc_matrix.h5",
-  assay = "Spatial",
-  slice = "slice1",
-  bin.size = NULL,
-  filter.matrix = TRUE,
-  to.upper = FALSE,
-  image = NULL,
-  ...
+    data.dir,
+    filename = "filtered_feature_bc_matrix.h5",
+    assay = "Spatial",
+    slice = "slice1",
+    bin.size = NULL,
+    filter.matrix = TRUE,
+    to.upper = FALSE,
+    image = NULL,
+    ...
 ) {
   # if more than one directory is passed in
   if (length(x = data.dir) > 1) {
@@ -621,13 +621,13 @@ Load10X_Spatial <- function (
   # associate each counts matrix with its corresponding image
   object.list <- mapply(
     function(
-      .object,
-      .image,
-      .assay,
-      .slice
+    .object,
+    .image,
+    .assay,
+    .slice
     ) {
-	    # align the image's identifiers with the object's
-	    .image <- .image[Cells(.object)]
+      # align the image's identifiers with the object's
+      .image <- .image[Cells(.object)]
       # add the image to the corresponding Seurat instance
       .object[[.slice]] <- .image
       return (.object)
@@ -646,7 +646,132 @@ Load10X_Spatial <- function (
   return(object)
 }
 
+#' Add 10X Cell Types to a Seurat Object
+#'
+#' This function reads cell type annotations from a CSV file and adds them to the metadata of a Seurat object.
+#' If the cell type file does not exist, the original Seurat object is returned unchanged.
+#'
+#' @param data.dir A string specifying the directory containing the "cell_types" folder with the "cell_types.csv" file.
+#' @param object A Seurat object to which the cell type annotations will be added.
+#'
+#' @return A Seurat object with updated metadata including cell type annotations if the file is found.
+#'
+#' @details
+#' The function searches for a CSV file named "cell_types.csv" in the "cell_types" subdirectory within `data.dir`.
+#' The CSV file should contain at least a "barcode" column that matches the cell barcodes in the Seurat object.
+#' Additional columns in the CSV file will be merged into the Seurat object's metadata.
+#'
+#' @importFrom utils read.csv
+#' @importFrom tibble rownames_to_column column_to_rownames
+#' @importFrom base file.path file.exists merge
+#'
+#' @examples
+#' \dontrun{
+#' # Specify the data directory containing the "cell_types" folder
+#' data.dir <- "/path/to/data"
+#'
+#' # Create a Seurat object (example)
+#' seurat_obj <- CreateSeuratObject(counts = some_counts_matrix)
+#'
+#' # Add cell type annotations to the Seurat object
+#' seurat_obj <- Add_10X_CellTypes(data.dir, seurat_obj)
+#' }
+Add_10X_CellTypes <- function(data.dir, object) {
+  cell_types_path <- file.path(data.dir, "cell_types", "cell_types.csv")
+  if (file.exists(cell_types_path)) {
+    cell.types <- read.csv(cell_types_path)
+    object@meta.data <- dplyr::left_join(tibble::rownames_to_column(object@meta.data, "barcode"),
+                                         cell.types, by = "barcode") %>%
+      tibble::column_to_rownames("barcode")
+    return(object)
+  } else {
+    return(object)
+  }
+}
+
+#' Load a 10x Genomics Single Cell Experiment into a \code{Seurat} object
+#'
+#' @inheritParams Read10X
+#' @inheritParams SeuratObject::CreateSeuratObject
+#' @param data.dir Directory containing the H5 file specified by \code{filename}
+#' and the image data in a subdirectory called \code{spatial}
+#' @param filename Name of H5 file containing the feature barcode matrix
+#' @param to.upper Converts all feature names to upper case. This can provide an
+#' approximate conversion of mouse to human gene names which can be useful in an
+#' explorative analysis. For cross-species comparisons, orthologous genes should
+#' be identified across species and used instead.
+#' @param ... Arguments passed to \code{\link{Read10X_h5}}
+#'
+#' @return A \code{Seurat} object
+#'
+#'
+#' @export
+#' @concept preprocessing
+#'
+#' @examples
+#' \dontrun{
+#' data_dir <- 'path/to/data/directory'
+#' list.files(data_dir) # Should show filtered_feature_bc_matrix.h5
+#' Load10X(data.dir = data_dir)
+#' }
+#'
+Load10X <- function(data.dir, filename = "filtered_feature_bc_matrix.h5",
+                    assay = "RNA", to.upper = FALSE, ...) {
+
+  if (length(data.dir) > 1) {
+    stop("`data.dir` expects a single directory path but received multiple values.")
+  }
+  if (!file.exists(data.dir)) {
+    stop(paste0("No such file or directory: '", data.dir, "'"))
+  }
+
 
+  filename <- list.files(data.dir, filename, full.names = FALSE, recursive = FALSE)
+  counts.path <- file.path(data.dir, filename)
+  if (!file.exists(counts.path)) {
+    stop(paste0("File not found: '", counts.path, "'"))
+  }
+
+  counts <- Read10X_h5(counts.path, ...)
+
+  if (to.upper) {
+    counts <- imap(counts, ~{
+      rownames(.x) <- toupper(rownames(.x))
+      .x
+    })
+  }
+
+  if (is.list(counts)) {
+    seurat.list <- lapply(names(counts), function(name) {
+      CreateSeuratObject(
+        counts = counts[[name]],
+        assay = name,
+        project = name
+      )
+    })
+
+    for (i in 1:seq_along(seurat.list)) {
+      if (Assays(seurat.list[[i]]) %in% c("Gene Expression", "RNA")) {
+        seurat.list[[i]] <- Add_10X_CellTypes(data.dir, seurat.list[[i]])
+      }
+    }
+
+    merged.object <- merge(
+      x = seurat.list[[1]],
+      y = seurat.list[-1],
+      add.cell.ids = names(counts),
+      merge.data = FALSE
+    )
+    return(merged.object)
+
+  } else {
+    object <- CreateSeuratObject(counts, assay = assay)
+    if (Assays(object)  %in%  c("Gene Expression", "RNA")) {
+      object <- Add_10X_CellTypes(data.dir, object)
+    }
+    return(object)
+  }
+}
 #' Read10x Probe Metadata
 #'
 #' This function reads the probe metadata from a 10x Genomics probe barcode matrix file in HDF5 format.
@@ -660,8 +785,8 @@ Load10X_Spatial <- function (
 #' @concept preprocessing
 #'
 Read10X_probe_metadata <- function(
-  data.dir,
-  filename = 'raw_probe_bc_matrix.h5'
+    data.dir,
+    filename = 'raw_probe_bc_matrix.h5'
 ) {
   if (!requireNamespace('hdf5r', quietly = TRUE)) {
     stop("Please install hdf5r to read HDF5 files")
@@ -1222,11 +1347,11 @@ Read10X_h5 <- function(filename, use.names = TRUE, unique.features = TRUE) {
 #' @concept preprocessing
 #'
 Read10X_Image <- function(
-  image.dir,
-  image.name = "tissue_lowres_image.png",
-  assay = "Spatial",
-  slice = "slice1",
-  filter.matrix = TRUE
+    image.dir,
+    image.name = "tissue_lowres_image.png",
+    assay = "Spatial",
+    slice = "slice1",
+    filter.matrix = TRUE
 ) {
   image <- png::readPNG(
     source = file.path(
@@ -1316,18 +1441,18 @@ Read10X_Coordinates <- function(filename, filter.matrix) {
   } else {
     # the coordinate mappings must be in a CSV - read it in
     coordinates <- read.csv(
-        file = filename,
-        col.names = col.names,
-        header = ifelse(
-          # assume files calles "tissue_positions.csv" have headers, otherwise
-          # assume they do not (i.e. "tissue_positions_list.csv")
-          test = basename(filename) == "tissue_positions.csv",
-          yes = TRUE,
-          no = FALSE
-        ),
-        as.is = TRUE,
-        row.names = 1
-      )
+      file = filename,
+      col.names = col.names,
+      header = ifelse(
+        # assume files calles "tissue_positions.csv" have headers, otherwise
+        # assume they do not (i.e. "tissue_positions_list.csv")
+        test = basename(filename) == "tissue_positions.csv",
+        yes = TRUE,
+        no = FALSE
+      ),
+      as.is = TRUE,
+      row.names = 1
+    )
   }
 
   # the `tissue` column should contain a boolean indicating whether or not a
@@ -2312,11 +2437,11 @@ ReadNanostring <- function(
 #' @concept preprocessing
 #'
 ReadXenium <- function(
-  data.dir,
-  outs = c("segmentation_method", "matrix", "microns"),
-  type = "centroids",
-  mols.qv.threshold = 20,
-  flip.xy = F
+    data.dir,
+    outs = c("segmentation_method", "matrix", "microns"),
+    type = "centroids",
+    mols.qv.threshold = 20,
+    flip.xy = F
 ) {
   # Argument checking
   type <- match.arg(
@@ -2349,83 +2474,6 @@ ReadXenium <- function(
     }
   }
 
-  data <- sapply(outs, function(otype) {
-    switch(
-      EXPR = otype,
-      'matrix' = {
-        pmtx <- progressor()
-        pmtx(message = 'Reading counts matrix', class = 'sticky', amount = 0)
-
-        for(option in Filter(function(x) x$req, list(
-          list(filename = "cell_feature_matrix.h5", fn = Read10X_h5, req = has_hdf5r),
-          list(filename = "cell_feature_matrix", fn = Read10X, req = TRUE)
-        ))) {
-          matrix <- try(suppressWarnings(option$fn(file.path(data.dir, option$filename))))
-          if(!inherits(matrix, "try-error")) { break }
-        }
-
-        if(!exists('matrix') || inherits(matrix, "try-error")) {
-          stop("Xenium outputs were incomplete: missing cell_feature_matrix")
-        }
-
-        pmtx(type = "finish")
-        matrix
-      },
-      'segmentation_method' = {
-        psegs <- progressor()
-        psegs(
-          message = 'Loading cell metadata',
-          class = 'sticky',
-          amount = 0
-        )
-
-        col.use <- c(
-          cell_id = 'cell',
-          segmentation_method = 'segmentation_method'
-        )
-
-        for(option in Filter(function(x) x$req, list(
-          list(
-            filename = "cells.parquet",
-            fn = function(x) as.data.frame(arrow::read_parquet(x, col_select = names(col.use))),
-            req = has_arrow
-          ),
-          list(
-            filename = "cells.csv.gz",
-            fn = function(x) data.table::fread(x, data.table = FALSE, stringsAsFactors = FALSE, select = names(col.use)),
-            req = has_dt
-          ),
-          list(filename = "cells.csv.gz", fn = function(x) read.csv(x, stringsAsFactors = FALSE), req = TRUE)
-        ))) {
-          cell_seg <- try(suppressWarnings(option$fn(file.path(data.dir, option$filename))), silent = TRUE)
-          if(!inherits(cell_seg, "try-error")) { break }
-        }
-
-        if(!exists('cell_seg') || inherits(cell_seg, "try-error") || length(intersect(names(col.use), colnames(cell_seg))) != 2) {
-          warning('cells did not contain a segmentation_method column. Skipping...', call. = FALSE, immediate. = TRUE)
-          NULL
-        } else {
-          cell_seg <- cell_seg[, names(col.use)]
-          colnames(cell_seg) <- col.use
-
-          cell_seg$cell <- binary_to_string(cell_seg$cell)
-
-  has_dt <- requireNamespace("data.table", quietly = TRUE) && requireNamespace("R.utils", quietly = TRUE)
-  has_arrow <- requireNamespace("arrow", quietly = TRUE)
-  has_hdf5r <- requireNamespace("hdf5r", quietly = TRUE)
-
-  binary_to_string <- function(arrow_binary) {
-    if(typeof(arrow_binary) == 'list') {
-      unlist(
-        lapply(
-          arrow_binary, function(x) rawToChar(as.raw(strtoi(x, 16L)))
-        )
-      )
-    } else {
-      arrow_binary
-    }
-  }
-
   data <- sapply(outs, function(otype) {
     switch(
       EXPR = otype,
@@ -3625,25 +3673,25 @@ SampleUMI <- function(
 #' @export
 #'
 SCTransform.default <- function(
-  object,
-  cell.attr,
-  reference.SCT.model = NULL,
-  do.correct.umi = TRUE,
-  ncells = 5000,
-  residual.features = NULL,
-  variable.features.n = 3000,
-  variable.features.rv.th = 1.3,
-  vars.to.regress = NULL,
-  latent.data = NULL,
-  do.scale = FALSE,
-  do.center = TRUE,
-  clip.range = c(-sqrt(x = ncol(x = umi) / 30), sqrt(x = ncol(x = umi) / 30)),
-  vst.flavor = 'v2',
-  conserve.memory = FALSE,
-  return.only.var.genes = TRUE,
-  seed.use = 1448145,
-  verbose = TRUE,
-  ...
+    object,
+    cell.attr,
+    reference.SCT.model = NULL,
+    do.correct.umi = TRUE,
+    ncells = 5000,
+    residual.features = NULL,
+    variable.features.n = 3000,
+    variable.features.rv.th = 1.3,
+    vars.to.regress = NULL,
+    latent.data = NULL,
+    do.scale = FALSE,
+    do.center = TRUE,
+    clip.range = c(-sqrt(x = ncol(x = umi) / 30), sqrt(x = ncol(x = umi) / 30)),
+    vst.flavor = 'v2',
+    conserve.memory = FALSE,
+    return.only.var.genes = TRUE,
+    seed.use = 1448145,
+    verbose = TRUE,
+    ...
 ) {
   if (!is.null(x = seed.use)) {
     set.seed(seed = seed.use)
@@ -4148,16 +4196,16 @@ SubsetByBarcodeInflections <- function(object) {
 #' @export
 #'
 FindVariableFeatures.V3Matrix <- function(
-  object,
-  selection.method = "vst",
-  loess.span = 0.3,
-  clip.max = 'auto',
-  mean.function = FastExpMean,
-  dispersion.function = FastLogVMR,
-  num.bin = 20,
-  binning.method = "equal_width",
-  verbose = TRUE,
-  ...
+    object,
+    selection.method = "vst",
+    loess.span = 0.3,
+    clip.max = 'auto',
+    mean.function = FastExpMean,
+    dispersion.function = FastLogVMR,
+    num.bin = 20,
+    binning.method = "equal_width",
+    verbose = TRUE,
+    ...
 ) {
   CheckDots(...)
   if (!inherits(x = object, 'Matrix')) {
@@ -4575,11 +4623,11 @@ FindSpatiallyVariableFeatures.Seurat <- function(
 #' @export
 #'
 LogNormalize.data.frame <- function(
-  data,
-  scale.factor = 1e4,
-  margin = 2L,
-  verbose = TRUE,
-  ...
+    data,
+    scale.factor = 1e4,
+    margin = 2L,
+    verbose = TRUE,
+    ...
 ) {
   return(LogNormalize(
     data = as.matrix(x = data),
@@ -4594,11 +4642,11 @@ LogNormalize.data.frame <- function(
 #' @export
 #'
 LogNormalize.V3Matrix <- function(
-  data,
-  scale.factor = 1e4,
-  margin = 2L,
-  verbose = TRUE,
-  ...
+    data,
+    scale.factor = 1e4,
+    margin = 2L,
+    verbose = TRUE,
+    ...
 ) {
   # if (is.data.frame(x = data)) {
   #   data <- as.matrix(x = data)
@@ -4641,13 +4689,13 @@ LogNormalize.V3Matrix <- function(
 #' @export
 #'
 NormalizeData.V3Matrix <- function(
-  object,
-  normalization.method = "LogNormalize",
-  scale.factor = 1e4,
-  margin = 1,
-  block.size = NULL,
-  verbose = TRUE,
-  ...
+    object,
+    normalization.method = "LogNormalize",
+    scale.factor = 1e4,
+    margin = 1,
+    block.size = NULL,
+    verbose = TRUE,
+    ...
 ) {
   CheckDots(...)
   if (is.null(x = normalization.method)) {
@@ -5110,7 +5158,7 @@ ScaleData.IterableMatrix <- function(
     object <- BPCells::min_by_row(mat = object, vals = scale.max * features.sd + features.mean)
   }
   scaled.data <- (object - features.mean) / features.sd
-return(scaled.data)
+  return(scaled.data)
 }