# ENHANCE denoising algorithm for single-cell RNA-Seq data
# Version: 0.1
#
# Authors: Dalia Barkley <dalia.barkley@nyulangone.org>, Florian Wagner <florian.wagner@nyu.edu>
# Copyright (c) 2019 New York University
#
# Notes
# =====
# - R implementation, depends on the "rsvd" CRAN package.
# - The Seurat wrapper (enhance_seurat_wrapper) additionally requires the "Seurat" and "Matrix" packages.
normalize_median = function(
D,
med = NULL
){
# Normalizes the expression profiles in an expression matrix.
#
# Args:
# D: The count expression matrix (rows=genes, columns=cells).
# med (optional): The desired transcript count to normalize to.
# By default, the median transcript count across all cells is used.
#
# Returns:
# The normalized expression matrix.
tpercell = colSums(D)
if (is.null(med)){
med = median(tpercell)
}
D = med * t(t(D) / tpercell)
return(D)
}
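# Illustrative sketch (toy data, not from the original script): after normalization,
# every cell's transcript counts sum to the same value (the median of the input).
#   counts = matrix(rpois(200, lambda = 5), nrow = 20, ncol = 10)  # 20 genes x 10 cells
#   norm = normalize_median(counts)
#   colSums(norm)  # each column now sums to median(colSums(counts))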
transform_ft = function(
D
){
# Performs Freeman-Tukey transformation of a normalized expression matrix.
#
# Args:
# D: The normalized expression matrix (rows=genes, columns=cells).
#
# Returns:
# The transformed expression matrix.
D = sqrt(D) + sqrt(1+D)
return(D)
}
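# The Freeman-Tukey transform y = sqrt(x) + sqrt(x + 1) stabilizes the variance of
# Poisson-distributed counts; enhance() later inverts it via x = (y^2 - 1)^2 / (4 * y^2).
# Illustrative sketch (toy values, not from the original script); sqrt() is elementwise,
# so the function also works on plain vectors:
#   x = c(0, 1, 5, 100)
#   y = transform_ft(x)
#   (y^2 - 1)^2 / (4 * y^2)  # recovers x (up to floating-point error)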
simulate_noise_matrix = function(
D
){
# Simulates a matrix in which all variance is noise.
# First, the mean expression level across the original matrix is calculated.
# Then, a new matrix is generated by Poisson sampling from these mean values.
#
#
# Args:
# D: The normalized expression matrix (rows=genes, columns=cells).
#
# Returns:
# A simulated pure-noise matrix with the same dimensions as the input matrix
ncells = dim(D)[2]
means = rowMeans(D)
D = t(sapply(means, function(m){
rpois(ncells, lambda = m)
}))
colnames(D) = 1:ncells
return(D)
}
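# The simulated matrix keeps the per-gene means of the input but contains only Poisson
# sampling noise, so its PCA spectrum serves as a null when selecting significant
# components in perform_pca(). Illustrative sketch (toy data, not from the original script):
#   norm = normalize_median(matrix(rpois(500, lambda = 3), nrow = 50, ncol = 10))
#   noise = simulate_noise_matrix(norm)
#   dim(noise)       # same 50 x 10 dimensions as the input
#   rowMeans(noise)  # close to rowMeans(norm); the differences are sampling noise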
perform_pca = function(
D,
med = NULL,
return_pcs = TRUE,
ratio_pcs = 2
){
# Performs PCA
#
# Args:
# D: The count expression matrix (rows=genes, columns=cells).
# med (optional): The desired transcript count to normalize to.
# By default, the median transcript count across all cells is used.
# return_pcs (optional): Whether to determine the number of significant principal components
# using a simulated expression matrix.
# Default: TRUE
# ratio_pcs (optional): If determining the number of significant principal components
# using a simulated expression matrix, the variance ratio between the simulated and real
# matrices used to determine the threshold.
# Default: 2
#
# Returns:
# A PCA object containing rotation (gene loadings), x (cell scores) and, if return_pcs is TRUE, pcs (indices of the significant principal components).
if (is.null(med)){
tpercell = colSums(D)
med = median(tpercell)
}
PCA = rpca(t(transform_ft(normalize_median(D, med = med))), k = 50, center = TRUE, scale = FALSE, retx = TRUE, p = 10, q = 7)
if (return_pcs){
D_sim = simulate_noise_matrix(normalize_median(D, med=med))
PCA_sim = rpca(t(transform_ft(normalize_median(D_sim, med = med))), k = 50, center = TRUE, scale = FALSE, retx = TRUE, p = 10, q = 7)
pcs = which(PCA$sdev^2 > ratio_pcs*PCA_sim$sdev[1]^2)
if (length(pcs) < 2){
pcs = c(1,2)
}
return(c(PCA, list('pcs' = pcs)))
} else {
return(PCA)
}
}
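# perform_pca() calls rpca() from the rsvd package, so rsvd must be attached when the
# function is used on its own (enhance() loads it internally). Hypothetical usage sketch,
# not from the original script; the toy matrix is merely large enough for k = 50 components:
#   library(rsvd)
#   counts = matrix(rpois(200 * 100, lambda = 2), nrow = 200, ncol = 100)  # 200 genes x 100 cells
#   pca = perform_pca(counts, return_pcs = TRUE, ratio_pcs = 2)
#   pca$pcs     # indices of the principal components retained as signal
#   dim(pca$x)  # cells x 50 matrix of component scores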
find_nearest_neighbors = function(
D,
k_nn
){
# Finds the nearest neighbors of each cell in a matrix
# First, calculates the distance between each pair of cells.
# Then, for each cell, sorts these distances and returns its k_nn nearest neighbors (the cell itself is included, since its distance to itself is zero).
#
#
# Args:
# D: The matrix from which to calculate the distance (rows=variables, columns=cells).
# This can be an expression matrix (rows=genes) or a matrix of principal component scores (rows=components)
# k_nn: Number of neighbors to return
#
# Returns:
# A list containing, for each cell, the indices of its k_nn nearest neighbors
ncells = dim(D)[2]
distances = as.matrix(dist(t(D), method = 'euclidean'))
nn = lapply(1:ncells, function(c){
order(distances[,c])[1:k_nn]
})
return(nn)
}
aggregate_nearest_neighbors = function(
D,
nn
){
# For each cell, aggregates the expression values of its nearest neighbors
#
#
# Args:
# D: The count expression matrix (rows=genes, columns=cells).
# nn: The list containing, for each cell, the indices of its k_nn nearest neighbors
#
# Returns:
# The aggregated expression matrix
D_agg = sapply(nn, function(indices){
rowSums(D[, indices])
})
return(D_agg)
}
estimate_sizes = function(
D,
nn
){
# For each cell, estimates its size by taking the median size of its k_nn nearest neighbors
#
#
# Args:
# D: The count expression matrix (rows=genes, columns=cells).
# nn: The list containing, for each cell, the indices of its k_nn nearest neighbors
#
# Returns:
# The vector of estimated cell sizes
sizes = sapply(nn, function(indices){
median(colSums(D[, indices]))
})
return(sizes)
}
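# Combined illustration of the three neighbor-based helpers above (toy data, not from the
# original script): neighbors are found in a low-dimensional representation, raw counts are
# then pooled over each cell's neighborhood, and a per-cell size is estimated from the same
# neighborhood. prcomp() stands in here for the randomized PCA used by enhance().
#   counts = matrix(rpois(50 * 30, lambda = 2), nrow = 50, ncol = 30)  # 50 genes x 30 cells
#   scores = prcomp(t(counts))$x[, 1:5]
#   nn = find_nearest_neighbors(t(scores), k_nn = 3)
#   agg = aggregate_nearest_neighbors(counts, nn)  # 50 x 30; each column is a pooled profile
#   sizes = estimate_sizes(counts, nn)             # median transcript count per neighborhood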
enhance = function(
data_raw = NULL,
ratio_pcs = 2,
k_nn = NULL,
target_transcripts = 2*10^5,
percent_cells_max = 2
){
# Main function, performs all the steps to return a denoised expression matrix
#
#
# Args:
# data_raw: The raw count expression matrix (rows=genes, columns=cells).
# ratio_pcs (optional): Variance ratio between simulated and real matrix
# to use to determine the threshold at which to keep principal components
# Default: 2
# k_nn (optional): Number of neighbors to aggregate
# Default: Calculated so that aggregation yields ~ target_transcripts per cell
# target_transcripts (optional): Number of transcripts per cell to aim for in aggregation
# Default: 2*10^5
# percent_cells_max (optional): Maximum percentage of cells to aggregate
# Default: 2
#
# Returns:
# The denoised expression matrix
library(rsvd)
# Determining k_nn
med_raw = median(colSums(data_raw))
if (is.null(k_nn)){
ncells = dim(data_raw)[2]
k_nn_max = ceiling(percent_cells_max/100 * ncells)
k_nn = ceiling(target_transcripts / med_raw)
print(paste('Calculating number of neighbors to aggregate to aim for', target_transcripts, 'transcripts'))
if (k_nn > k_nn_max){
k_nn = k_nn_max
print(paste('Calculated number of neighbors to aggregate was too high, reducing to', percent_cells_max, 'percent of cells'))
}
}
print(paste('Number of neighbors to aggregate:', k_nn))
# PCA on the raw data
pca_raw = perform_pca(D = data_raw, med = med_raw, return_pcs = TRUE, ratio_pcs = ratio_pcs)
print(paste('Number of principal components to use:', max(pca_raw$pcs)))
data_raw_pca_raw = t(pca_raw$x)[pca_raw$pcs, ]
# Find nearest neighbors based on PCA, then aggregate raw data
nn_1 = find_nearest_neighbors(D = data_raw_pca_raw, k_nn = k_nn)
data_agg1 = aggregate_nearest_neighbors(D = data_raw, nn = nn_1)
# PCA on the aggregated data, then projection of the raw data onto these PCs
pca_agg1 = perform_pca(D = data_agg1, med = med_raw, return_pcs = FALSE)
data_raw_pca_agg1 = t(t(transform_ft(normalize_median(data_raw, med = med_raw)) - pca_agg1$center) %*% pca_agg1$rotation)[pca_raw$pcs, ]
# Find nearest neighbors based on the projected PCA, then aggregate raw data
nn_2 = find_nearest_neighbors(D = data_raw_pca_agg1, k_nn = k_nn)
data_agg2 = aggregate_nearest_neighbors(D = data_raw, nn = nn_2)
sizes_agg2 = estimate_sizes(D = data_raw, nn = nn_2)
med_agg2 = median(colSums(data_agg2))
# PCA on the aggregated data
pca_agg2 = perform_pca(D = data_agg2, med = med_agg2, return_pcs = FALSE)
data_agg2_pca_agg2 = t(pca_agg2$x)[pca_raw$pcs, ]
# Reversing PCA, rescaling to estimated cell sizes, and reversing the transform
data_denoise = pca_agg2$rotation[, pca_raw$pcs] %*% data_agg2_pca_agg2 + pca_agg2$center
# Values below 1 are clamped to 1, the Freeman-Tukey transform of 0
data_denoise[data_denoise < 1] = 1
# Inverse Freeman-Tukey transform: x = (y^2 - 1)^2 / (4 * y^2)
data_denoise = (data_denoise^2 - 1)^2 / (4*data_denoise^2)
data_denoise = t(t(data_denoise) * sizes_agg2/colSums(data_denoise))
colnames(data_denoise) = colnames(data_raw)
return(data_denoise)
}
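# Hypothetical end-to-end usage (the file name "counts.tsv" is a placeholder, not part of
# the original script); data_raw should be a genes x cells matrix of raw UMI counts:
#   source("enhance.R")
#   data_raw = as.matrix(read.table("counts.tsv", header = TRUE, row.names = 1))
#   data_denoised = enhance(data_raw = data_raw)
#   dim(data_denoised)  # same genes x cells dimensions as the input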
enhance_seurat_wrapper = function(
object,
setDefaultAssay = TRUE,
assay = 'RNA',
ratio_pcs = 2,
k_nn = NULL,
target_transcripts = 2*10^5,
percent_cells_max = 2
){
# Seurat wrapper for enhance
#
# Args:
# object: A Seurat object
# setDefaultAssay: Whether to set enhance as the default assay
# Default: TRUE
# assay: Which assay to run enhance on
# Default: "RNA"
# ratio_pcs (optional): Variance ratio between simulated and real matrix
# to use to determine the threshold at which to keep principal components
# Default: 2
# k_nn (optional): Number of neighbors to aggregate
# Default: Calculated so that aggregation yields ~ target_transcripts per cell
# target_transcripts (optional): Number of transcripts per cell to aim for in aggregation
# Default: 2*10^5
# percent_cells_max (optional): Maximum percentage of cells to aggregate
# Default: 2
#
# Returns:
# A Seurat object with a new assay called "enhance", in which the "counts" slot is the denoised counts matrix
# Downstream analysis may include NormalizeData, ScaleData, FindVariableFeatures, RunPCA, etc
library(Seurat)
library(Matrix)
counts = GetAssayData(object, assay = assay, slot = "counts")
counts_enhance = enhance(data_raw = counts, ratio_pcs = ratio_pcs, k_nn = k_nn, target_transcripts = target_transcripts, percent_cells_max = percent_cells_max)
counts_enhance = Matrix(data = counts_enhance, sparse = TRUE)
colnames(counts_enhance) = colnames(counts)
rownames(counts_enhance) = rownames(counts)
object[["enhance"]] = CreateAssayObject(counts = counts_enhance)
if (setDefaultAssay) {
message("Setting default assay as enhance")
DefaultAssay(object) = "enhance"
}
return(object)
}
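# Hypothetical Seurat workflow (the object counts_matrix is a placeholder, not part of the
# original script); the denoised values end up in the "counts" slot of the "enhance" assay,
# and the downstream steps named in the docstring above can then be run as usual:
#   library(Seurat)
#   seurat_obj = CreateSeuratObject(counts = counts_matrix)
#   seurat_obj = enhance_seurat_wrapper(seurat_obj, setDefaultAssay = TRUE, assay = 'RNA')
#   seurat_obj = NormalizeData(seurat_obj)
#   seurat_obj = FindVariableFeatures(seurat_obj)
#   seurat_obj = ScaleData(seurat_obj)
#   seurat_obj = RunPCA(seurat_obj)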