-
Notifications
You must be signed in to change notification settings - Fork 43
/
Copy pathTCGA_CNV.Rmd
419 lines (355 loc) · 16.7 KB
/
TCGA_CNV.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
---
title: CNV analysis
author: "Mikhail Dozmorov"
date: "`r Sys.Date()`"
output:
pdf_document:
toc: no
html_document:
theme: united
toc: no
csl: styles.ref/genomebiology.csl
bibliography: data.TCGA/TCGA.bib
---
```{r setup, echo=FALSE, message=FALSE, warning=FALSE}
# Set up the environment
library(knitr)
opts_chunk$set(cache.path='cache/', fig.path='img/', cache=F, tidy=T, fig.keep='high', echo=F, dpi=100, warnings=F, message=F, comment=NA, warning=F, results='as.is') #out.width=700,
library(pander)
panderOptions('table.split.table', Inf)
set.seed(1)
library(dplyr)
options(stringsAsFactors = FALSE)
```
```{r results='hide'}
library(TCGA2STAT)
library(dplyr)
library(knitr)
library(ggplot2)
library(reshape2)
library(gridExtra)
library(limma)
library(openxlsx)
library(MDmisc)
library(org.Hs.eg.db)
devtools::install_github("mdozmorov/enrichR")
library(enrichR)
library(pheatmap)
library(gplots) # install.packages("gplot")
library(RColorBrewer) # of source("http:/bioconductor.org/biocLite.R") biocLite("RColorBrewer")
library(survival)
library(survminer)
```
```{r}
# A function to load TCGA data, from remote repository, or a local R object
load_data <- function(disease = cancer, data.type = data.type, type = type, data_dir = data_dir, force_reload = FALSE) {
FILE = paste0(data_dir, "/mtx_", disease, "_", data.type, "_", type, ".rda") # R object with data
if (all(file.exists(FILE), !(force_reload))) {
# If the data has been previously saved, load it
load(file = FILE)
} else {
# If no saved data exists, get it from the remote source
mtx <- getTCGA(disease = disease, data.type = data.type, type = type, clinical = TRUE)
save(file = FILE, list = c("mtx")) # Save it
}
return(mtx)
}
# A function to get data overview
summarize_data <- function(mtx = mtx) {
print(paste0("Dimensions of expression matrix, genex X patients: ", paste(dim(mtx$dat), collapse = " ")))
print(paste0("Dimensions of clinical matrix, patients X parameters: ", paste(dim(mtx$clinical), collapse = " ")))
print(paste0("Dimensions of merged matrix, patients X parameters + genes: ", paste(dim(mtx$merged.dat), collapse = " ")))
print("Head of the merged matrix")
print(mtx$merged.dat[1:5, 1:10])
print("Head of the clinical matrix")
print(mtx$clinical[1:5, 1:7])
print("List of clinical values, and frequency of each variable: ")
clin_vars <- apply(mtx$clinical, 2, function(x) length(table(x[ !(is.na(x) & x != "" )]))) %>% as.data.frame()
# Filter clinical variables to have at least 2, but no more than 10 categories,
# And they are not dates
clin_vars <- clin_vars[ as.numeric(clin_vars$.) > 1 & as.numeric(clin_vars$.) < 10 & !grepl("years|days|date|vital", rownames(clin_vars), perl = TRUE) , , drop = FALSE]
print(kable(clin_vars))
return(rownames(clin_vars))
}
```
```{r echo=TRUE}
# Cancer type
cancer = "LIHC"
# Gene(s) of interest
selected_genes = c("MTDH", "SDCBP") # Can be multiple
```
```{r}
# Differential expression cutoff
p_val_cutoff <- 0.01 # P-value cutoff
p_adj_cutoff <- 0.05 # FDR cutoff
nplot <- 50 # How many genes to plot on a heatmap
nbox <- 9 # How many genes to plot on a boxplot
ntable <- 15 # Number of genes to output in a DEG table
nkegg <- 15 # Number of genes to output in a KEGG table
min_kegg_genes <- 20 # Minimum number of genes to run enrichment analysis on
max_kegg_genes <- 2000 # Maximum number of genes to run enrichment analysis on
up_dn_separate <- FALSE # Whether to run KEGG separately on up- and downregulated genes. FALSE - do not distinguish directionality
# Which pathway enrichment analysis to run
run_gsea <- FALSE # If TRUE, GSEA pathway enrichment analysis is run, otherwise, standard hypergeometric-based enrichment
# Filename to same the results
fileNameRes <- paste0("results/", cancer, "_", paste(selected_genes, collapse = "-"), "_DEGs_", p_val_cutoff, ".xlsx")
```
```{r}
# Path where the downloaded data is stored
data_dir = "/Users/mdozmorov/Documents/Data/GenomeRunner/TCGAsurvival/data" # Mac
# General settings
useTPM = FALSE # Whether or not to convert expression counts to TPM
data.type = "CNA_SNP"
type = ""
# data.type = "CNA_CGH"
# type = "415K"
```
```{r results='hide'}
mtx <- load_data(disease = cancer, data.type = data.type, type = type, data_dir = data_dir, force_reload = FALSE)
clinical_annotations <- summarize_data(mtx = mtx)
# source("Supplemental_R_script_1.R")
# Prepare expression data
expr <- mtx$merged.dat[ , 4:ncol(mtx$merged.dat)] %>% as.matrix
# Filter out low expressed genes
# Should be more than 90% of non-zero values
# ff <- genefilter::pOverA(p = 0.9, A = 0, na.rm = TRUE)
# expr <- expr[, apply(expr, 2, ff)]
expr <- data.frame(AffyID = mtx$merged.dat$bcr, expr, stringsAsFactors = FALSE)
# Prepare clinical data
clin <- mtx$merged.dat[, 1:3]
colnames(clin)[1] <- "AffyID"
```
Density plots of GISTIC scores (discretized copy number variation to values -2, -1, 0, 1, 2), see [FAQ](http://www.cbioportal.org/faq#what-is-gistic-what-is-rae). The majority of the samples should be normal (high peak around zero).
```{r fig.height=3, results='hide'}
expr_selected <- expr[, colnames(expr) %in% selected_genes | colnames(expr) == "AffyID"]
for (i in 1:length(selected_genes)) {
density(expr_selected[, colnames(expr_selected) == selected_genes[i]]) %>% plot(main = selected_genes[i], xlab = "GISTIC scores") %>% print
}
rownames(expr_selected) <- expr_selected$AffyID
expr_selected$AffyID <- NULL
```
\pagebreak
Heatmap of GISTIC scores across samples (X axis) in the corresponding gene(s). Blue/red insicate deletion/amplification, respectively.
```{r fig.show='hide'}
# Initial heatmap building
col3 <- colorRampPalette(c("blue", "white", "red"))
dist.method <- "euclidean" # "euclidean", "maximum", "manhattan", "canberra", "binary" or "minkowski"
hclust.method <- "ward.D" # "ward", "single", "complete", "average", "mcquitty", "median" or "centroid"
# Get the data
expr_selected_for_heatmap <- t(expr_selected) # Rotate, so samples are on X axis
expr_selected_for_heatmap <- as.matrix(expr_selected_for_heatmap)
# If just one gene, add a row of zeros to generate a heatmap
if (nrow(expr_selected_for_heatmap) == 1) {
expr_selected_for_heatmap <- rbind(expr_selected_for_heatmap, 0)
}
# Plot the heatmap
h <- heatmap.2(expr_selected_for_heatmap, Rowv = FALSE, dendrogram = "column", distfun=function(x){dist(x,method=dist.method)}, hclustfun=function(x){hclust(x,method=hclust.method)}, col = col3, trace = "none", density.info = "none", cexRow = 1, labCol = NA)
# And the dendrogram
h$colDendrogram %>% plot
```
```{r }
# Cut the dendrogram into four clusters
h_clust <- cutree(as.hclust(h$colDendrogram), k = 4)
# Redo cluster numbers into colors
h_clust_color <- h_clust
h_clust_color[h_clust_color == 1] <- "red"
h_clust_color[h_clust_color == 2] <- "blue"
h_clust_color[h_clust_color == 3] <- "green"
h_clust_color[h_clust_color == 4] <- "yellow"
# Plot the heatmap with colored clusters
h <- heatmap.2(expr_selected_for_heatmap, dendrogram = "column", distfun=function(x){dist(x,method=dist.method)}, hclustfun=function(x){hclust(x,method=hclust.method)}, col = col3, trace = "none", density.info = "none", ColSideColors = h_clust_color, cexRow = 1, labCol = NA)
```
!!! Critical. Define which clusters to compare.
```{r echo=TRUE}
# Red cluster vs. other
# ind_up <- h_clust_color == "red"
# ind_lo <- !(ind_up)
# Red cluster vs. blue
# ind_up <- h_clust_color == "red"
# ind_lo <- h_clust_color == "blue"
# Red and Yellow clusters vs. blue
ind_up <- names(h_clust_color)[h_clust_color == "red" | h_clust_color == "yellow"]
ind_lo <- names(h_clust_color)[h_clust_color == "blue"]
```
\pagebreak
```{r}
data.type = "RNASeq2"
type = ""
```
```{r results='hide'}
mtx <- load_data(disease = cancer, data.type = data.type, type = type, data_dir = data_dir, force_reload = FALSE)
clinical_annotations <- summarize_data(mtx = mtx)
# source("Supplemental_R_script_1.R")
# Prepare expression data
expr <- mtx$merged.dat[ , 4:ncol(mtx$merged.dat)] %>% as.matrix
# Filter out low expressed genes
# Should be more than 90% of non-zero values
# ff <- genefilter::pOverA(p = 0.9, A = 0, na.rm = TRUE)
# expr <- expr[, apply(expr, 2, ff)]
expr <- data.frame(AffyID = mtx$merged.dat$bcr, expr, stringsAsFactors = FALSE)
# Prepare clinical data
clin <- mtx$merged.dat[, 1:3]
colnames(clin)[1] <- "AffyID"
```
```{r}
# Subset expression and clinical data to samples in clusters
expr <- rbind(expr[expr$AffyID %in% ind_up, ],
expr[expr$AffyID %in% ind_lo, ])
clin <- rbind(clin[clin$AffyID %in% ind_up, ],
clin[clin$AffyID %in% ind_lo, ])
# Define group assignment
group <- c(rep(1, length(expr$AffyID[expr$AffyID %in% ind_up])),
rep(2, length(expr$AffyID[expr$AffyID %in% ind_lo])))
```
# Survival analysis
Survival comparison between the selected clusters. Cluster 1 corresponds to samples defined by "ind_up", cluster 2 - to "ind_dn".
```{r}
# Survival data
clin_surv <- data.frame(OS = clin$OS, status = clin$status, cluster = group)
# https://stats.stackexchange.com/questions/359185/survival-analysis-censoring-question
clin_surv <- clin_surv[!(clin_surv$OS < 5 & clin_surv == 0), ] # Filter out those surviving 5 days and labeled as 0 alive
fit <- survfit(Surv(time = OS, event = status, type = "right") ~ cluster, data = clin_surv)
ggsurvplot(fit, pval = TRUE)
```
\pagebreak
# Differential expression analysis
```{r}
# Prerequisites, prepared by the survival.R script
# expr_for_deg - expr_for_degession matrix separated by the high/low expr_for_degession of the selected genes
# group - labeling of samples having high/low expr_for_degession of the selected genes
# Reshape expr_for_degession matrix
expr_for_deg <- (t(expr))
colnames(expr_for_deg) <- expr_for_deg[1, ]
expr_for_deg <- expr_for_deg[-1, ]
class(expr_for_deg) <- "numeric"
expr_for_deg <- log2(expr_for_deg + 1)
# expr_for_deg <- voom(expr_for_deg)$E
# boxplot(expr_for_deg)
# Limma
design <- model.matrix(~0 + factor(group))
colnames(design) <- c("up", "lo")
fit <- lmFit(expr_for_deg, design)
contrast.matrix <- makeContrasts(up-lo, levels = design)
fit2 <- contrasts.fit(fit, contrast.matrix)
fit2 <- eBayes(fit2)
degs <- topTable(fit2, coef = 1, number = Inf, p.value = p_adj_cutoff)
```
We split `r cancer` cohort into `r length(group[group == 1])` x `r length(group[group == 2])` groups manually separated previously. We have a total of `r nrow(degs)` differentially expressed genes at FDR corrected p-value `r p_adj_cutoff`. `r nrow(degs[ degs$logFC > 0, ])` are upregulated, `r nrow(degs[ degs$logFC < 0, ])` are downregulated.
Top 50 the most differentially expressed genes are shown
```{r fig.height=7}
matrix.to.plot <- expr_for_deg[rownames(expr_for_deg) %in% rownames(degs)[1:min(nrow(degs), nplot)], ]
colnames(matrix.to.plot) <- make.unique(colnames(matrix.to.plot))
genes.to.plot <- rownames(degs)[1:min(nrow(degs), nplot)]
group.to.plot <- group
group.to.plot <- ifelse(group.to.plot == 1, "UP", "LO")
group.to.plot <- data.frame(Group = group.to.plot)
rownames(group.to.plot) <- colnames(matrix.to.plot)
pheatmap(matrix.to.plot, color=colorRampPalette(c('blue', 'gray', 'yellow'))(20), clustering_method = "average", scale = "row", annotation_col = group.to.plot, treeheight_row = 0, treeheight_col = 0, show_colnames = FALSE)
```
```{r}
# Save results
# Create (or, load) Excel file
unlink(fileNameRes)
wb <- openxlsx::createWorkbook(fileNameRes) # loadWorkbook(fileNameRes) #
save_res(data.frame(Gene = rownames(degs), degs), fileName = fileNameRes, wb = wb, sheetName = "DEGS")
```
Results are stored in the Excel file `r fileNameRes`
- Legend for gene lists: "Gene" - gene annotations; "logFC" - log fold change; "AveExpr" - average expression, log2; "t" - t-statistics; "P.Val"/"adj.P.Val" - non-/FDR-adjusted p-value, "B" - another statistics.
```{r}
# DT::datatable(degs)
pander(degs[1:min(ntable, nrow(degs)), ])
```
# Functional enrichment analysis
Up- and downregulated genes are tested for functional enrichment `r paste(ifelse(up_dn_separate, "separately", "jointly"))`. `r paste(ifelse(up_dn_separate, "Each table has enrichment results for both up-/downregulated genes. The \"direction\" column indicate which pathways are enriched in \"UP\"- or \"DN\"-regulated genes.", ""))`. FDR cutoff of the significant enrichments - `r p_adj_cutoff`. Top `r ntable` genes shown.
## KEGG pathway enrichment analysis
**Legend:** "database" - source of functional annotations, "category" - name of functional annotation, "pval" - unadjusted enrichment p-value, "qval" - FDR-adjusted p-value, "genes" - comma-separated differentially expressed genes enriched in a corresponding functional category. `r paste(ifelse(up_dn_separate, "\"direction\" - UP/DN, an indicator whether genes are up- or downregulated.", ""))`
```{r}
if( run_gsea == FALSE) {
# Subset the number of DEGs for KEGG analysis to the maximum
if (nrow(degs) > max_kegg_genes) {
degs_subset <- degs[1:max_kegg_genes, ]
} else {
degs_subset <- degs
}
# Get list of up- and downregulated genes
up.genes <- sort(unique(rownames(degs_subset)[ degs_subset$t > 0 ]))
dn.genes <- sort(unique(rownames(degs_subset)[ degs_subset$t < 0 ]))
# Run KEGG
if (up_dn_separate) {
# Analyze up- and downregulated genes separately
print(paste0("KEGG pathway run on ", length(up.genes), " upregulated and ", length(dn.genes), " downregulated genes."))
res.kegg <- save_enrichr(up.genes = up.genes, dn.genes = dn.genes, databases = "KEGG_2016", p_adj_cutoff = p_adj_cutoff, fileName = fileNameRes, wb = wb)
} else {
# Analyze up- and downregulated genes together
print(paste0("KEGG pathway run on ", length(unique(c(up.genes, dn.genes))), " genes without distinguishing them by directionality."))
res.kegg <- save_enrichr(up.genes = unique(c(up.genes, dn.genes)), databases = "KEGG_2016", fdr.cutoff = p_adj_cutoff, fileName = fileNameRes, wb = wb)
}
}
```
## KEGG pathway GSEA analysis
**Legend:** "ID", "Description" - KEGG pathway ID/description, respectively; "NES" - [normalized enrichment score](http://software.broadinstitute.org/gsea/doc/GSEAUserGuideFrame.html); "pvalue", "p.adjust" - raw and FDR-adjusted p-values, respectively; "core_enrichment" - genes enriched in the corresponding pathway.
```{r}
if (run_gsea == TRUE) {
library(clusterProfiler)
## GSEA using clusterProfiler
# All DEGs
degs.all <- topTable(fit2, coef = 1, number = Inf, p.value = 1)
# Convert symbols to entrezids
eid <- bitr(rownames(degs.all), fromType="SYMBOL", toType="ENTREZID", OrgDb="org.Hs.eg.db")
# Attach converted entrezids
degs.all <- left_join(data.frame(SYMBOL = rownames(degs.all), degs.all), eid, by = "SYMBOL")
degs.all <- degs.all[ degs.all$ENTREZID != "", ]
degs.all <- degs.all[ complete.cases(degs.all), ]
# List of t-statistics
geneList <- degs.all$B
# Make it named
names(geneList) <- degs.all$ENTREZID
# And decreasing sorted
geneList <- sort(geneList, decreasing = TRUE)
# Actual GSEA
set.seed(1)
ego3 <- gseKEGG(geneList = geneList,
organism = "hsa",
nPerm = 1000,
minGSSize = 10,
pvalueCutoff = 0.1,
verbose = FALSE)
# Get summary
ego3 <- setReadable(ego3, OrgDb = org.Hs.eg.db, keytype = "ENTREZID")
res.kegg <- as.data.frame(ego3)
if (nrow(res.kegg) > 0) {
# Save the full results
save_res(res.kegg, fileName = fileNameRes, wb = wb, sheetName = "KEGG_GSEA")
# Prepare for table output
res.kegg <- res.kegg[, c("ID", "Description", "NES", "pvalue", "p.adjust", "core_enrichment")]
res.kegg <- res.kegg[order(res.kegg$NES, decreasing = TRUE), ]
res.kegg <- res.kegg[res.kegg$p.adjust < p_adj_cutoff, ]
res.kegg$NES <- round(res.kegg$NES, digits = 2)
res.kegg$pvalue <- formatC(res.kegg$pvalue, format = "e", digits = 2)
res.kegg$p.adjust <- formatC(res.kegg$p.adjust, format = "e", digits = 2)
rownames(res.kegg) <- NULL
}
}
```
A total of `r nrow(res.kegg)` KEGG pathways were detected as significantly affected at FDR `r p_adj_cutoff`. Top `r ntable` shown.
```{r}
# Display the results
# DT::datatable(res.kegg)
if (nrow(res.kegg) > 0 ) {
kable(res.kegg[1:min(ntable, nrow(res.kegg)), ])
}
```
## Selected pathway
Red/Green - up/downregulated genes in upper vs. lower `r selected_genes` expressing samples. Gray - marginal fold change, yet significant. White - gene is not differentially expressed
```{r eval=FALSE}
library(pathview)
library(openxlsx)
degs <- read.xlsx(fileNameRes, cols = c(1, 2))
degs.genes <- degs$logFC
names(degs.genes) <- degs$Gene
# Adjust as needed
pv.out <- pathview(gene.data = degs.genes, pathway.id = "03010", species = "hsa", gene.idtype = "SYMBOL", gene.annotpkg = "org.Hs.eg.db", out.suffix = paste(selected_genes, collapse = "-"))
```
```{r echo=FALSE, out.height='300px', eval=TRUE}
knitr::include_graphics('hsa03010.MTDH-SDCBP.png')
```