# ENHANCE denoising algorithm for single-cell RNA-Seq data
# Version: 0.1
#
# Authors: Dalia Barkley <dalia.barkley@nyulangone.org>, Florian Wagner <florian.wagner@nyu.edu>
# Copyright (c) 2019 New York University
#
# Notes
# =====
# - R implementation, depends on the "rsvd" CRAN package.
# - The Seurat wrapper (enhance_seurat_wrapper) additionally requires the "Seurat" and "Matrix" packages.
normalize_median = function(
D,
med = NULL
){
# Normalizes the expression profiles in an expression matrix.
#
# Args:
# D: The count expression matrix (rows=genes, columns=cells).
# med (optional): The desired transcript count to normalize to.
# By default, the median transcript count across all cells is used.
#
# Returns:
# The normalized expression matrix.
tpercell = colSums(D)
if (is.null(med)){
med = median(tpercell)
}
D = med * t(t(D) / tpercell)
return(D)
}
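# Illustrative sketch (toy data, not from the original script): after normalization,
# every cell's transcript counts sum to the same value (the median of the input).
#   counts = matrix(rpois(200, lambda = 5), nrow = 20, ncol = 10)  # 20 genes x 10 cells
#   norm = normalize_median(counts)
#   colSums(norm)  # each column now sums to median(colSums(counts))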
transform_ft = function(
D
){
# Performs Freeman-Tukey transformation of a normalized expression matrix.
#
# Args:
# D: The normalized expression matrix (rows=genes, columns=cells).
#
# Returns:
# The transformed expression matrix.
D = sqrt(D) + sqrt(1+D)
return(D)
}
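# The Freeman-Tukey transform y = sqrt(x) + sqrt(x + 1) stabilizes the variance of
# Poisson-distributed counts; enhance() later inverts it via x = (y^2 - 1)^2 / (4 * y^2).
# Illustrative sketch (toy values, not from the original script); sqrt() is elementwise,
# so the function also works on plain vectors:
#   x = c(0, 1, 5, 100)
#   y = transform_ft(x)
#   (y^2 - 1)^2 / (4 * y^2)  # recovers x (up to floating-point error)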
simulate_noise_matrix = function(
D
){
# Simulates a matrix in which all variance is noise.
# First, the mean expression level across the original matrix is calculated.
# Then, a new matrix is generated by Poisson sampling from these mean values.
#
#
# Args:
# D: The normalized expression matrix (rows=genes, columns=cells).
#
# Returns:
# A simulated pure-noise matrix with the same dimensions as the input matrix
ncells = dim(D)[2]
means = rowMeans(D)
D = t(sapply(means, function(m){
rpois(ncells, lambda = m)
}))
colnames(D) = 1:ncells
return(D)
}
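# The simulated matrix keeps the per-gene means of the input but contains only Poisson
# sampling noise, so its PCA spectrum serves as a null when selecting significant
# components in perform_pca(). Illustrative sketch (toy data, not from the original script):
#   norm = normalize_median(matrix(rpois(500, lambda = 3), nrow = 50, ncol = 10))
#   noise = simulate_noise_matrix(norm)
#   dim(noise)       # same 50 x 10 dimensions as the input
#   rowMeans(noise)  # close to rowMeans(norm); the differences are sampling noise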
perform_pca = function(
D,
med = NULL,
return_pcs = TRUE,
ratio_pcs = 2
){
# Performs PCA
#
# Args:
# D: The count expression matrix (rows=genes, columns=cells).
# med (optional): The desired transcript count to normalize to.
# By default, the median transcript count across all cells is used.
# return_pcs (optional): Whether to determine the number of significant principal components
# using a simulated expression matrix.
# Default: TRUE
# ratio_pcs (optional): If determining the number of significant principal components
# using a simulated expression matrix, the variance ratio between the simulated and real
# matrices used to determine the threshold.
# Default: 2
#
# Returns:
# A PCA object containing rotation (gene loadings), x (cell scores) and, if return_pcs is TRUE, pcs (indices of the significant principal components).
if (is.null(med)){
tpercell = colSums(D)
med = median(tpercell)
}
PCA = rpca(t(transform_ft(normalize_median(D, med = med))), k = 50, center = TRUE, scale = FALSE, retx = TRUE, p = 10, q = 7)
if (return_pcs){
D_sim = simulate_noise_matrix(normalize_median(D, med=med))
PCA_sim = rpca(t(transform_ft(normalize_median(D_sim, med = med))), k = 50, center = TRUE, scale = FALSE, retx = TRUE, p = 10, q = 7)
pcs = which(PCA$sdev^2 > ratio_pcs*PCA_sim$sdev[1]^2)
if (length(pcs) < 2){
pcs = c(1,2)
}
return(c(PCA, list('pcs' = pcs)))
} else {
return(PCA)
}
}
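# perform_pca() calls rpca() from the rsvd package, so rsvd must be attached when the
# function is used on its own (enhance() loads it internally). Hypothetical usage sketch,
# not from the original script; the toy matrix is merely large enough for k = 50 components:
#   library(rsvd)
#   counts = matrix(rpois(200 * 100, lambda = 2), nrow = 200, ncol = 100)  # 200 genes x 100 cells
#   pca = perform_pca(counts, return_pcs = TRUE, ratio_pcs = 2)
#   pca$pcs     # indices of the principal components retained as signal
#   dim(pca$x)  # cells x 50 matrix of component scores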
find_nearest_neighbors = function(
D,
k_nn
){
# Finds the nearest neighbors of each cell in a matrix
# First, calculates the distance between each pair of cells.
# Then, for each cell, sorts these distances and returns its k_nn nearest neighbors (the cell itself is included, since its distance to itself is zero).
#
#
# Args:
# D: The matrix from which to calculate the distance (rows=variables, columns=cells).
# This can be an expression matrix (rows=genes) or a matrix of principal component scores (rows=components)
# k_nn: Number of neighbors to return
#
# Returns:
# A list containing, for each cell, the indices of its k_nn nearest neighbors
ncells = dim(D)[2]
distances = as.matrix(dist(t(D), method = 'euclidean'))
nn = lapply(1:ncells, function(c){
order(distances[,c])[1:k_nn]
})
return(nn)
}
aggregate_nearest_neighbors = function(
D,
nn
){
# For each cell, aggregates the expression values of its nearest neighbors
#
#
# Args:
# D: The count expression matrix (rows=genes, columns=cells).
# nn: The list containing, for each cell, the indices of its k_nn nearest neighbors
#
# Returns:
# The aggregated expression matrix
D_agg = sapply(nn, function(indices){
rowSums(D[, indices])
})
return(D_agg)
}
estimate_sizes = function(
D,
nn
){
# For each cell, estimates its size by taking the median size of its k_nn nearest neighbors
#
#
# Args:
# D: The count expression matrix (rows=genes, columns=cells).
# nn: The list containing, for each cell, the indices of its k_nn nearest neighbors
#
# Returns:
# The vector of estimated cell sizes
sizes = sapply(nn, function(indices){
median(colSums(D[, indices]))
})
return(sizes)
}
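# Combined illustration of the three neighbor-based helpers above (toy data, not from the
# original script): neighbors are found in a low-dimensional representation, raw counts are
# then pooled over each cell's neighborhood, and a per-cell size is estimated from the same
# neighborhood. prcomp() stands in here for the randomized PCA used by enhance().
#   counts = matrix(rpois(50 * 30, lambda = 2), nrow = 50, ncol = 30)  # 50 genes x 30 cells
#   scores = prcomp(t(counts))$x[, 1:5]
#   nn = find_nearest_neighbors(t(scores), k_nn = 3)
#   agg = aggregate_nearest_neighbors(counts, nn)  # 50 x 30; each column is a pooled profile
#   sizes = estimate_sizes(counts, nn)             # median transcript count per neighborhood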
enhance = function(
data_raw = NULL,
ratio_pcs = 2,
k_nn = NULL,
target_transcripts = 2*10^5,
percent_cells_max = 2
){
# Main function, performs all the steps to return a denoised expression matrix
#
#
# Args:
# data_raw: The raw count expression matrix (rows=genes, columns=cells).
# ratio_pcs (optional): Variance ratio between simulated and real matrix
# to use to determine the threshold at which to keep principal components
# Default: 2
# k_nn (optional): Number of neighbors to aggregate
# Default: Calculated so that aggregation yields ~ target_transcripts per cell
# target_transcripts (optional): Number of transcripts per cell to aim for in aggregation
# Default: 2*10^5
# percent_cells_max (optional): Maximum percentage of cells to aggregate
# Default: 2
#
# Returns:
# The denoised expression matrix
library(rsvd)
# Determining k_nn
med_raw = median(colSums(data_raw))
if (is.null(k_nn)){
ncells = dim(data_raw)[2]
k_nn_max = ceiling(percent_cells_max/100 * ncells)
k_nn = ceiling(target_transcripts / med_raw)
print(paste('Calculating number of neighbors to aggregate to aim for', target_transcripts, 'transcripts'))
if (k_nn > k_nn_max){
k_nn = k_nn_max
print(paste('Calculated number of neighbors to aggregate was too high, reducing to', percent_cells_max, 'percent of cells'))
}
}
print(paste('Number of neighbors to aggregate:', k_nn))
# PCA on the raw data
pca_raw = perform_pca(D = data_raw, med = med_raw, return_pcs = TRUE, ratio_pcs = ratio_pcs)
print(paste('Number of principal components to use:', max(pca_raw$pcs)))
data_raw_pca_raw = t(pca_raw$x)[pca_raw$pcs, ]
# Find nearest neighbors based on PCA, then aggregate raw data
nn_1 = find_nearest_neighbors(D = data_raw_pca_raw, k_nn = k_nn)
data_agg1 = aggregate_nearest_neighbors(D = data_raw, nn = nn_1)
# PCA on the aggregated data, then projection of the raw data onto these PCs
pca_agg1 = perform_pca(D = data_agg1, med = med_raw, return_pcs = FALSE)
data_raw_pca_agg1 = t(t(transform_ft(normalize_median(data_raw, med = med_raw)) - pca_agg1$center) %*% pca_agg1$rotation)[pca_raw$pcs, ]
# Find nearest neighbors based on the projected PCA, then aggregate raw data
nn_2 = find_nearest_neighbors(D = data_raw_pca_agg1, k_nn = k_nn)
data_agg2 = aggregate_nearest_neighbors(D = data_raw, nn = nn_2)
sizes_agg2 = estimate_sizes(D = data_raw, nn = nn_2)
med_agg2 = median(colSums(data_agg2))
# PCA on the aggregated data
pca_agg2 = perform_pca(D = data_agg2, med = med_agg2, return_pcs = FALSE)
data_agg2_pca_agg2 = t(pca_agg2$x)[pca_raw$pcs, ]
# Reversing PCA, rescaling to estimated cell sizes, and reversing the transform
data_denoise = pca_agg2$rotation[, pca_raw$pcs] %*% data_agg2_pca_agg2 + pca_agg2$center
# Values below 1 are clamped to 1, the Freeman-Tukey transform of 0
data_denoise[data_denoise < 1] = 1
# Inverse Freeman-Tukey transform: x = (y^2 - 1)^2 / (4 * y^2)
data_denoise = (data_denoise^2 - 1)^2 / (4*data_denoise^2)
data_denoise = t(t(data_denoise) * sizes_agg2/colSums(data_denoise))
colnames(data_denoise) = colnames(data_raw)
return(data_denoise)
}
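# Hypothetical end-to-end usage (the file name "counts.tsv" is a placeholder, not part of
# the original script); data_raw should be a genes x cells matrix of raw UMI counts:
#   source("enhance.R")
#   data_raw = as.matrix(read.table("counts.tsv", header = TRUE, row.names = 1))
#   data_denoised = enhance(data_raw = data_raw)
#   dim(data_denoised)  # same genes x cells dimensions as the input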
enhance_seurat_wrapper = function(
object,
setDefaultAssay = TRUE,
assay = 'RNA',
ratio_pcs = 2,
k_nn = NULL,
target_transcripts = 2*10^5,
percent_cells_max = 2
){
# Seurat wrapper for enhance
#
# Args:
# object: A Seurat object
# setDefaultAssay: Whether to set enhance as the default assay
# Default: TRUE
# assay: Which assay to run enhance on
# Default: "RNA"
# ratio_pcs (optional): Variance ratio between simulated and real matrix
# to use to determine the threshold at which to keep principal components
# Default: 2
# k_nn (optional): Number of neighbors to aggregate
# Default: Calculated so that aggregation yields ~ target_transcripts per cell
# target_transcripts (optional): Number of transcripts per cell to aim for in aggregation
# Default: 2*10^5
# percent_cells_max (optional): Maximum percentage of cells to aggregate
# Default: 2
#
# Returns:
# A Seurat object with a new assay called "enhance", in which the "counts" slot is the denoised counts matrix
# Downstream analysis may include NormalizeData, ScaleData, FindVariableFeatures, RunPCA, etc
library(Seurat)
library(Matrix)
counts = GetAssayData(object, assay = assay, slot = "counts")
counts_enhance = enhance(data_raw = counts, ratio_pcs = ratio_pcs, k_nn = k_nn, target_transcripts = target_transcripts, percent_cells_max = percent_cells_max)
counts_enhance = Matrix(data = counts_enhance, sparse = TRUE)
colnames(counts_enhance) = colnames(counts)
rownames(counts_enhance) = rownames(counts)
object[["enhance"]] = CreateAssayObject(counts = counts_enhance)
if (setDefaultAssay) {
message("Setting default assay as enhance")
DefaultAssay(object) = "enhance"
}
return(object)
}
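# Hypothetical Seurat workflow (the object counts_matrix is a placeholder, not part of the
# original script); the denoised values end up in the "counts" slot of the "enhance" assay,
# and the downstream steps named in the docstring above can then be run as usual:
#   library(Seurat)
#   seurat_obj = CreateSeuratObject(counts = counts_matrix)
#   seurat_obj = enhance_seurat_wrapper(seurat_obj, setDefaultAssay = TRUE, assay = 'RNA')
#   seurat_obj = NormalizeData(seurat_obj)
#   seurat_obj = FindVariableFeatures(seurat_obj)
#   seurat_obj = ScaleData(seurat_obj)
#   seurat_obj = RunPCA(seurat_obj)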