-
Notifications
You must be signed in to change notification settings - Fork 43
/
Copy pathTCGA_DEGs.Rmd
444 lines (376 loc) · 18.9 KB
/
TCGA_DEGs.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
---
title: Differential expression analysis of TCGA data between groups with high/low
expression of selected genes
author: "Mikhail Dozmorov"
date: "`r Sys.Date()`"
output:
pdf_document:
toc: no
html_document:
theme: united
toc: no
csl: styles.ref/genomebiology.csl
bibliography: data.TCGA/TCGA.bib
---
```{r setup, echo=FALSE, message=FALSE, warning=FALSE}
# Set up the environment
library(knitr)
opts_chunk$set(cache.path='cache/', fig.path='img/', cache=F, tidy=T, fig.keep='high', echo=F, dpi=100, warnings=F, message=F, comment=NA, warning=F, results='as.is') #out.width=700,
library(pander)
panderOptions('table.split.table', Inf)
set.seed(1)
library(dplyr)
options(stringsAsFactors = FALSE)
```
```{r results='hide'}
library(TCGA2STAT)
library(dplyr)
library(knitr)
library(ggplot2)
library(reshape2)
library(gridExtra)
library(limma)
library(openxlsx)
library(MDmisc)
library(org.Hs.eg.db)
library(enrichR)
```
```{r}
# A function to load TCGA data, from remote repository, or a local R object
load_data <- function(disease = cancer, data.type = data.type, type = type, data_dir = data_dir, force_reload = FALSE) {
FILE = paste0(data_dir, "/mtx_", disease, "_", data.type, "_", type, ".rda") # R object with data
if (all(file.exists(FILE), !(force_reload))) {
# If the data has been previously saved, load it
load(file = FILE)
} else {
# If no saved data exists, get it from the remote source
mtx <- getTCGA(disease = disease, data.type = data.type, type = type, clinical = TRUE)
save(file = FILE, list = c("mtx")) # Save it
}
return(mtx)
}
# A function to get data overview
summarize_data <- function(mtx = mtx) {
print(paste0("Dimensions of expression matrix, genex X patients: ", paste(dim(mtx$dat), collapse = " ")))
print(paste0("Dimensions of clinical matrix, patients X parameters: ", paste(dim(mtx$clinical), collapse = " ")))
print(paste0("Dimensions of merged matrix, patients X parameters + genes: ", paste(dim(mtx$merged.dat), collapse = " ")))
print("Head of the merged matrix")
print(mtx$merged.dat[1:5, 1:10])
print("Head of the clinical matrix")
print(mtx$clinical[1:5, 1:7])
print("List of clinical values, and frequency of each variable: ")
clin_vars <- apply(mtx$clinical, 2, function(x) length(table(x[ !(is.na(x) & x != "" )]))) %>% as.data.frame()
# Filter clinical variables to have at least 2, but no more than 10 categories,
# And they are not dates
clin_vars <- clin_vars[ as.numeric(clin_vars$.) > 1 & as.numeric(clin_vars$.) < 10 & !grepl("years|days|date|vital", rownames(clin_vars), perl = TRUE) , , drop = FALSE]
print(kable(clin_vars))
return(rownames(clin_vars))
}
# A wrapper function to perform all functional enrichment analyses.
# Helper function to save non-empty results
save_res <- function(res, fileName = fileName, wb = wb, sheetName = "KEGG") {
if (nrow(res) > 0) {
openxlsx::addWorksheet(wb = wb, sheetName = sheetName)
openxlsx::writeData(wb, res, sheet = sheetName)
openxlsx::saveWorkbook(wb, fileName, overwrite = TRUE)
}
}
```
# Settings
```{r echo=TRUE}
# Cancer type
cancer = "LUSC"
# Gene(s) of interest
selected_genes = c("LIN52") # Can be multiple
# Define quantile qutoffs
quantile_up <- 0.75 # 0.75
quantile_lo <- 0.25 # 0.25
# Which pathway enrichment analysis to run
run_gsea <- FALSE # If TRUE, GSEA pathway enrichment analysis is run, otherwise, standard hypergeometric-based enrichment
```
## Differential expression analysis
Samples in the selected cancer cohort were sorted by expression of the selected genes. Differentially expressed genes were detected between samples in the upper `r quantile_up` percentile of the expression gradient and samples in the lower `r quantile_lo` percentile using `limma` v `r packageVersion("limma")` R package [@Ritchie:2015aa; @Smyth:2004aa]. P-values were corrected for multiple testing using False Discovery Rate (FDR) method [@Benjamini:1995aa]. Genes differentially expressed at FDR < 0.01 were selected for further analysis.
```{r}
# Path where the downloaded data is stored
data_dir = "/Users/mdozmorov/Documents/Data/GenomeRunner/TCGAsurvival/data" # Mac
# General settings
useTPM = FALSE # Whether or not to convert expression counts to TPM
data.type = "RNASeq2"
type = ""
# Differential expression cutoff
p_val_cutoff <- 0.05 # Regular p-value cutoff
p_adj_cutoff <- 0.05 # FDR cutoff
nplot <- 50 # How many genes to plot on a heatmap
nbox <- 9 # How many genes to plot on a boxplot
ntable <- 15 # Number of genes to output in a DEG table
nkegg <- 35 # Number of genes to output in a KEGG table
min_kegg_genes <- 20 # Minimum number of genes to run enrichment analysis on
max_kegg_genes <- 2000 # Maximum number of genes to run enrichment analysis on
up_dn_separate <- FALSE # Whether to run KEGG separately on up- and downregulated genes. FALSE - do not distinguish directionality
# Filename to same the results
fileNameRes <- paste0("results/TCGA_DEGs_", selected_genes, "_", cancer, ".xlsx")
```
```{r results='hide'}
mtx <- load_data(disease = cancer, data.type = data.type, type = type, data_dir = data_dir, force_reload = FALSE)
clinical_annotations <- summarize_data(mtx = mtx)
# source("Supplemental_R_script_1.R")
# Prepare expression data
expr <- mtx$merged.dat[ , 4:ncol(mtx$merged.dat)] %>% as.matrix
# Filter out low expressed genes
# Should be more than 90% of non-zero values
# ff <- genefilter::pOverA(p = 0.9, A = 0, na.rm = TRUE)
# expr <- expr[, apply(expr, 2, ff)]
expr <- data.frame(AffyID = mtx$merged.dat$bcr, expr, stringsAsFactors = FALSE)
# Prepare clinical data
clin <- mtx$merged.dat[, 1:3]
colnames(clin)[1] <- "AffyID"
```
```{r}
# Do TPM conversion, if needed
if (useTPM) {
# Check if the TPMs have already been precalculated for a given cancer
fileNameTPM <- paste0(data_dir, "/", cancer, "_TPM.Rda")
if (!file.exists(fileNameTPM)) {
source("calcTPM.R")
load(file = "data/feature_length.Rda")
common_genes <- intersect(colnames(expr), feature_length$Symbol) # Common gene symbols
expr <- expr[, c("AffyID", common_genes)] # Subset expression matrix
feature_length <- feature_length[feature_length$Symbol %in% common_genes, ] # Subset feature length
feature_length <- feature_length[match(colnames(expr)[ -1 ], feature_length$Symbol), ] # Match order
all.equal(colnames(expr), c("AffyID", feature_length$Symbol)) # Should be true
expr_tpm <- calcTPM(t(expr[, -1]), feature_length = feature_length) # Convert to TPM, takes time
expr_tpm <- data.frame(AffyID = expr[, "AffyID"], t(expr_tpm), stringsAsFactors = FALSE)
expr <- expr_tpm
save(list = c("expr"), file = fileNameTPM) # Save the object
} else {
load(file = fileNameTPM)
}
}
```
# Expression distribution for `r selected_genes` gene
Informational plot only. Expected log2-transformed RSEM expression range distribution: $\sim 0 - 16$. Median close to $0$ indicate overall low expression, should be avoided.
```{r fig.height=3}
selected_genes_expression <- melt(expr[, colnames(expr) %in% selected_genes, drop = FALSE])
p1 <- ggplot(selected_genes_expression, aes(x = variable, y = log2(value))) + geom_boxplot()
p2 <- ggplot(selected_genes_expression, aes(x = log2(value))) + geom_density()
grid.arrange(p1, p2, ncol = 2)
```
```{r}
# View and subset by expression and quantiles of the selected genes
selected_genes_quantiles_up <- with(selected_genes_expression, tapply(value, variable, quantile, probs = quantile_up)) # Select upper quantile!
selected_genes_quantiles_lo <- with(selected_genes_expression, tapply(value, variable, quantile, probs = quantile_lo)) # Select lower quantile!
# Subset exprs by top expression
selected_index_up <- list() # Collect boolean indexes for each gene
selected_index_lo <- list() # Collect boolean indexes for each gene
for (gene in selected_genes) {
ind_up <- expr[, gene] > selected_genes_quantiles_up[ gene ] # True, if expressed above the selected upper quantile
ind_lo <- expr[, gene] < selected_genes_quantiles_lo[ gene ] # True, if expressed below the selected lower quantile
selected_index_up <- c(selected_index_up, list(ind_up))
selected_index_lo <- c(selected_index_lo, list(ind_lo))
}
ind_up <- apply(as.data.frame(selected_index_up), 1, all) # Collapse indexes from multiple selected genes, all selected genes should be expressed in the upper quantile
ind_lo <- apply(as.data.frame(selected_index_lo), 1, all) # Collapse indexes from multiple selected genes, all selected genes should be expressed in the lower quantile
# For differential analysis, create group labels
group <- vector(mode = "numeric", length = nrow(expr)) # Empty bector
group[ind_up] <- 1 # Assign numeric groups
group[ind_lo] <- 2
# table(group) # How many patients we have
expr <- expr[group != 0, ] # Remove those that are not in quantiles
group <- group[ group != 0 ]
```
# Differential expression analysis
```{r}
# Prerequisites, prepared by the survival.R script
# expr - expression matrix separated by the high/low expression of the selected genes
# group - labeling of samples having high/low expression of the selected genes
# Reshape expression matrix
expr <- (t(expr))
colnames(expr) <- expr[1, ]
expr <- expr[-1, ]
class(expr) <- "numeric"
# expr <- log2(expr + 1)
expr <- voom(expr)$E
# boxplot(expr)
# Limma
design <- model.matrix(~0 + factor(group))
colnames(design) <- c("up", "lo")
fit <- lmFit(expr, design)
contrast.matrix <- makeContrasts(up-lo, levels = design)
fit2 <- contrasts.fit(fit, contrast.matrix)
fit2 <- eBayes(fit2)
degs <- topTable(fit2, coef = 1, number = Inf, p.value = p_val_cutoff)
```
We split `r cancer` cohort into `r length(group[group == 1])` x `r length(group[group == 2])` patients having `r paste(selected_genes, collapse = ", ") ` in lower `r quantile_up` and upper `r quantile_lo` expression quantiles. We have a total of `r nrow(degs)` differentially expressed genes at FDR corrected p-value `r p_val_cutoff`. `r nrow(degs[ degs$logFC > 0, ])` are upregulated, `r nrow(degs[ degs$logFC < 0, ])` are downregulated.
Top 50 the most differentially expressed genes are shown
```{r fig.height=8.5}
index.to.plot <- order(group)
matrix.to.plot <- expr[rownames(degs)[1:50], index.to.plot]
genes.to.plot <- rownames(degs)[1:50]
group.to.plot <- group[index.to.plot]
group.to.plot <- ifelse(group.to.plot == 1, "UP", "LO")
NMF::aheatmap(matrix.to.plot, color=colorRampPalette(c('blue', 'gray', 'yellow'))(20), Colv = NA, Rowv = FALSE, hclust = "ward", scale = "row", annCol = group.to.plot, annColors = list(c("blue", "red")), labRow = genes.to.plot, fontsize = 10, cexRow = 10) # color="-RdYlBu"
```
```{r}
# Save results
# Create (or, load) Excel file
unlink(fileNameRes)
wb <- openxlsx::createWorkbook(fileNameRes) # loadWorkbook(fileNameRes) #
save_res(data.frame(Gene = rownames(degs), degs), fileName = fileNameRes, wb = wb, sheetName = "DEGS")
```
Results are stored in the Excel file `r fileNameRes`
- Legend for gene lists: "Gene" - gene annotations; "logFC" - log fold change; "AveExpr" - average expression, log2; "t" - t-statistics; "P.Val"/"adj.P.Val" - non-/FDR-adjusted p-value, "B" - another statistics.
```{r}
# DT::datatable(degs)
pander(degs[1:min(ntable, nrow(degs)), ])
```
# Functional enrichment analysis
Up- and downregulated genes are tested for functional enrichment `r paste(ifelse(up_dn_separate, "separately", "jointly"))`. `r paste(ifelse(up_dn_separate, "Each table has enrichment results for both up-/downregulated genes. The \"direction\" column indicate which pathways are enriched in \"UP\"- or \"DN\"-regulated genes.", ""))`. FDR cutoff of the significant enrichments - `r p_adj_cutoff`. Top `r ntable` genes shown.
## KEGG pathway enrichment analysis
**Legend:** "database" - source of functional annotations, "category" - name of functional annotation, "pval" - unadjusted enrichment p-value, "qval" - FDR-adjusted p-value, "genes" - comma-separated differentially expressed genes enriched in a corresponding functional category. `r paste(ifelse(up_dn_separate, "\"direction\" - UP/DN, an indicator whether genes are up- or downregulated.", ""))`
```{r}
if( run_gsea == FALSE) {
# Subset the number of DEGs for KEGG analysis to the maximum
if (nrow(degs) > max_kegg_genes) {
degs_subset <- degs[1:max_kegg_genes, ]
} else {
degs_subset <- degs
}
# Get list of up- and downregulated genes
up.genes <- sort(unique(rownames(degs_subset)[ degs_subset$t > 0 ]))
dn.genes <- sort(unique(rownames(degs_subset)[ degs_subset$t < 0 ]))
# Run KEGG
if (up_dn_separate) {
# Analyze up- and downregulated genes separately
print(paste0("KEGG pathway run on ", length(up.genes), " upregulated and ", length(dn.genes), " downregulated genes."))
res.kegg <- save_enrichr(up.genes = up.genes, dn.genes = dn.genes, databases = "KEGG_2016", fdr.cutoff = p_adj_cutoff, fileName = fileNameRes, wb = wb)
} else {
# Analyze up- and downregulated genes together
print(paste0("KEGG pathway run on ", length(unique(c(up.genes, dn.genes))), " genes without distinguishing them by directionality."))
res.kegg <- MDmisc::save_enrichr(up.genes = unique(c(up.genes, dn.genes)), databases = "KEGG_2016", fdr.cutoff = p_adj_cutoff, fileName = fileNameRes, wb = wb)
}
}
```
## KEGG pathway GSEA analysis
**Legend:** "ID", "Description" - KEGG pathway ID/description, respectively; "NES" - [normalized enrichment score](http://software.broadinstitute.org/gsea/doc/GSEAUserGuideFrame.html); "pvalue", "p.adjust" - raw and FDR-adjusted p-values, respectively; "core_enrichment" - genes enriched in the corresponding pathway.
```{r}
if (run_gsea == TRUE) {
library(clusterProfiler)
library(DOSE)
## GSEA using clusterProfiler
# All DEGs
degs.all <- topTable(fit2, coef = 1, number = Inf)
# Convert symbols to entrezids
eid <- bitr(rownames(degs.all), fromType="SYMBOL", toType="ENTREZID", OrgDb="org.Hs.eg.db")
# Attach converted entrezids
degs.all <- left_join(data.frame(SYMBOL = rownames(degs.all), degs.all), eid, by = "SYMBOL")
degs.all <- degs.all[ degs.all$ENTREZID != "", ]
# List of t-statistics
geneList <- degs.all$t
# Make it named
names(geneList) <- degs.all$ENTREZID
# And decreasing sorted
geneList <- sort(geneList, decreasing = TRUE)
# Actual GSEA
set.seed(1)
ego3 <- gseKEGG(geneList = geneList,
organism = "hsa",
nPerm = 1000,
minGSSize = 10,
pvalueCutoff = 0.1,
verbose = FALSE)
# Get summary
ego3 <- setReadable(ego3, OrgDb = org.Hs.eg.db, keytype = "ENTREZID")
res.kegg <- as.data.frame(ego3)
# Save the full results
save_res(res.kegg, fileName = fileNameRes, wb = wb, sheetName = "KEGG_GSEA")
# Prepare for table output
res.kegg <- res.kegg[, c("ID", "Description", "NES", "pvalue", "p.adjust", "core_enrichment")]
res.kegg <- res.kegg[order(res.kegg$NES, decreasing = TRUE), ]
res.kegg <- res.kegg[res.kegg$p.adjust < p_adj_cutoff, ]
res.kegg$NES <- round(res.kegg$NES, digits = 2)
res.kegg$pvalue <- formatC(res.kegg$pvalue, format = "e", digits = 2)
res.kegg$p.adjust <- formatC(res.kegg$p.adjust, format = "e", digits = 2)
rownames(res.kegg) <- NULL
}
```
A total of `r nrow(res.kegg)` KEGG pathways were detected as significantly affected at FDR `r p_adj_cutoff`. Top `r ntable` shown.
```{r}
# Display the results
# DT::datatable(res.kegg)
if (nrow(res.kegg) > 0 ) {
kable(res.kegg[1:min(ntable, nrow(res.kegg)), ])
}
```
## Selected pathway
Red/Green - up/downregulated genes in upper vs. lower `r selected_genes` expressing samples. Gray - marginal fold change, yet significant. White - gene is not differentially expressed
```{r eval=FALSE}
library(pathview)
library(openxlsx)
degs <- read.xlsx(fileNameRes, cols = c(1, 2))
degs.genes <- degs$logFC
names(degs.genes) <- degs$Gene
# Adjust as needed
pv.out <- pathview(gene.data = degs.genes, pathway.id = "04972", species = "hsa", gene.idtype = "SYMBOL", gene.annotpkg = "org.Hs.eg.db", out.suffix = paste(selected_genes, collapse = "-"))
```
```{r echo=FALSE, out.height='300px', eval=FALSE}
knitr::include_graphics('hsa04972.LIN52.png')
```
```{r eval = FALSE}
# ![](hsa04210.CTBP2.png)
# KEGG canonical pathway enrichment analysis
# - Legend for GO/KEGG functional enrichment results: "ID" - unique identifier of functional category; "Pvalue" - non-adjusted p-value; "OddsRatio" - enrichment odds ratio; "ExpCount" - number of genes expected to be selected in a category; "Count" - number of genes observed in the current list; "Size" - total number of genes in a category; "Term" - category description; "p.adj" - false discovery rate; "SYMBOL", "ENTREZ" - genes observed in the current list as annotated with a category
#
# Pathways for **differentially expressed genes** identified above
# # Gene ontology, molecular function
# res <- gene_enrichment(selected = rownames(degs), all.universe = rownames(expr), id="symbol", organism = "Hs", use="GO", ont="MF")
# save_res(res, fileName, wb = wb, sheetName = "GOMF")
# # Gene ontology, biological process
# res <- gene_enrichment(selected = rownames(degs), all.universe = rownames(expr), id="symbol", organism = "Hs", use="GO", ont="BP")
# save_res(res, fileName, wb = wb, sheetName = "GOBP")
# # Gene ontology, cellular component
# res <- gene_enrichment(selected = rownames(degs), all.universe = rownames(expr), id="symbol", organism = "Hs", use="GO", ont="CC")
# save_res(res, fileName, wb = wb, sheetName = "GOCC")
# KEGG canonical pathways
res <- gene_enrichment(selected = rownames(degs), all.universe = rownames(expr), id="symbol", organism = "Hs", use="KEGG")
DT::datatable(res)
```
```{r eval=FALSE}
# GSEA analysis
## KEGGUP
# **Upregulated** pathways
library(gage)
data("kegg.gs")
# Convert Symbols to EntrezIDs
ids <- clusterProfiler::bitr(rownames(expr), fromType="SYMBOL", toType="ENTREZID", OrgDb = "org.Hs.eg.db")
expr <- left_join(data.frame(SYMBOL = rownames(expr), expr), ids, by = "SYMBOL") # Append converted EntrezIDs
expr$SYMBOL <- NULL # Remove symbols
expr <- expr[!is.na(expr$ENTREZID), ] # Remove unmapped rows
rownames(expr) <- expr$ENTREZID # Add EntrezIDs as rows
expr$ENTREZID <- NULL # Remove them from the data frame
# Analysis for pathways overrepresented by up/downregulated genes
kegg.p <- gage(expr,
gsets = kegg.gs,
ref = which(group == 2, arr.ind = TRUE),
samp = which(group == 1, arr.ind = TRUE),
same.dir = TRUE,
rank.test = TRUE,
saaTest = gs.KSTest,
compare = "unpairedd")
```
```{r eval=FALSE}
DT::datatable(data.frame(pathway = rownames(kegg.p$greater), kegg.p$greater[, c("p.val", "q.val", "set.size")]))
```
```{r eval=FALSE}
## KEGGDN
# **Downregulated** pathways
DT::datatable(data.frame(pathway = rownames(kegg.p$less), kegg.p$less[, c("p.val", "q.val", "set.size")]))
```
```{r session_info, eval = FALSE}
diagnostics <- devtools::session_info()
platform <- data.frame(diagnostics$platform %>% unlist, stringsAsFactors = FALSE)
colnames(platform) <- c("description")
pander(platform)
packages <- as.data.frame(diagnostics$packages)
pander(packages[ packages$`*` == "*", ])
```
# References