-
Notifications
You must be signed in to change notification settings - Fork 43
/
Copy pathCELLX_analysis.Rmd
234 lines (186 loc) · 10.5 KB
/
CELLX_analysis.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
---
title: "CELLX analysis"
author: "Mikhail Dozmorov"
date: "`r Sys.Date()`"
always_allow_html: yes
output:
pdf_document:
toc: no
html_document:
theme: united
toc: yes
editor_options:
chunk_output_type: console
---
```{r setup, echo=FALSE, message=FALSE, warning=FALSE}
# Set up the environment
library(knitr)
opts_chunk$set(cache.path='cache/', fig.path='img/', cache=F, tidy=T, fig.keep='high', echo=F, dpi=100, warnings=F, message=F, comment=NA, warning=F, results='as.is', fig.width = 10, fig.height = 6) #out.width=700,
library(pander)
panderOptions('table.split.table', Inf)
set.seed(1)
library(dplyr)
options(stringsAsFactors = FALSE)
```
```{r libraries, include=FALSE}
library(knitr)
library(ggplot2)
library(plotly)
library(writexl)
library(readr)
library(cowplot)
```
# Data from CELLX
- Go to [http://54.149.52.246/cgi-bin/RPPA/cellx.cgi](http://54.149.52.246/cgi-bin/RPPA/cellx.cgi)
- Select "Expression" tab out of "CNV"/"Expression"/"Mutation"/"Other" tabs
- Select the "RSEM-barplot" option on the sidebar. Read more about RSEM selected_gene expression measures at [https://deweylab.github.io/RSEM/](https://deweylab.github.io/RSEM/)
- Select any cancer-associated selected_gene from [http://cancer.sanger.ac.uk/census](http://cancer.sanger.ac.uk/census), e.g., "ERBB2". Alternatively, use any selected_gene name you may find biologically interesting
- Enter lower-case selected_gene name into the "HUGO" textbox at the bottom of the page
- Click "Submit" - the page will refresh in ~20 sec
- Save the tab-separated data using "Download table" link into `data/RSEM_expression_GENESYMBOL.data.tsv`
- Import the downloaded data into R
```{r settings}
selected_gene <- "TMEM219" # Change
min_number_of_normal <- 12 # Minimum number of samples to conduct a test
n_top <- 15 # How many top results to use
fileNameIn1 <- paste0("data/RSEM_expression_", selected_gene, ".data.tsv") # Input data
fileNameOut1 <- paste0("results/CELLX_analysis_", selected_gene, ".xlsx") # Output results
```
# Gene `r selected_gene` analysis
```{r data.setup, include=F}
# Read annotations
cancers <- openxlsx::read.xlsx("data.TCGA/TCGA_cancers.xlsx")
cells <- read_tsv("data.TCGA/CCLE_Cell_lines_annotations_20181226.txt")
cells <- mutate(cells, Cell.ID = as.character(sapply(cells$CCLE_ID, function(x) strsplit(x, "_")[[1]][1])) )# Add cell ID column
# Read expression data
expr <- read.table(fileNameIn1, header = TRUE, stringsAsFactors = FALSE)
samples <- unique(expr$affy_source)
cancer <- sort(samples[substr(samples, (nchar(samples) - 1), nchar(samples)) != "_N"])
normal <- sort(samples[substr(samples, (nchar(samples) - 1), nchar(samples)) == "_N"])
expr$tissue <- factor(NA, levels = c("Cancer", "Normal"))
expr$tissue[expr$affy_source %in% cancer == T] <- "Cancer"
expr$tissue[expr$affy_source %in% cancer == F] <- "Normal"
# Separate CCLE and TCGA expression
expr_ccle <- expr[ grep("CCLE", expr$affy_source, invert = FALSE), ]
expr_tcga <- expr[ grep("CCLE", expr$affy_source, invert = TRUE), ]
expr_tcga$affy_source <- as.character(sapply(expr_tcga$affy_source, function(x) strsplit(x, "-")[[1]][2]))
expr_tcga$affy_source <- paste(expr_tcga$affy_source, expr_tcga$tissue, sep = "_")
# Attach annotations
# intersect(expr_ccle$cell, cells$Cell.ID) %>% length
expr_ccle <- left_join(expr_ccle, cells, by = c("cell" = "Cell.ID")) # ToDo: Better merging, currently many missing
# Sort
expr_ccle <- expr_ccle[order(expr_ccle[, colnames(expr_ccle) == selected_gene], decreasing = TRUE), ]
expr_tcga <- expr_tcga[order(expr_tcga[, colnames(expr_tcga) == selected_gene], decreasing = TRUE), ]
expr_tcga_levels <- aggregate(expr_tcga[, colnames(expr_tcga) == selected_gene], by = list(expr_tcga$affy_source), mean)
expr_tcga$affy_source <- factor(expr_tcga$affy_source, levels = expr_tcga_levels[order(expr_tcga_levels$x, decreasing = TRUE), "Group.1"])
```
# CCLE expression
## Top `r n_top` cell lines with the _highest_ log2(RSEM) expression of `r selected_gene`
```{r}
expr_ccle_top <- expr_ccle[1:n_top, colnames(expr_ccle) %in% c("cell", selected_gene, "CCLE_ID")]
rownames(expr_ccle_top) <- NULL
pander(expr_ccle_top)
```
## Top `r n_top` cell lines with the _lowest_ log2(RSEM) expression of `r selected_gene`
```{r}
expr_ccle_top <- expr_ccle[(nrow(expr_ccle) - n_top):nrow(expr_ccle), colnames(expr_ccle) %in% c("cell", selected_gene, "CCLE_ID")]
rownames(expr_ccle_top) <- NULL
pander(expr_ccle_top)
```
# Cancer vs. normal `r selected_gene` log2(RSEM) expression boxplots, all cancers
```{r expression.plot, fig.height=5}
rsem <- ggplot(expr_tcga, aes(affy_source, eval(parse(text = selected_gene)))) +
geom_boxplot(aes(fill = tissue), outlier.shape = 1) +
scale_fill_manual(name = "", values = c("red", "green")) +
theme_cowplot() +
theme(axis.text.x = element_text(angle = 90, hjust = 1), panel.grid.major = element_blank()) +
labs(title = paste("Tumor-Normal Expression, ", selected_gene), x = NULL) +
ylab("log2(RSEM) gene expression") +
theme(legend.position="bottom")
rsem
```
\pagebreak
# Differential expression of selected_gene `r selected_gene` between tumor and normal tissues
The table below displays differential expression statistics comparing the expression levels of `r selected_gene` in normal versus cancer tissues (where at least `r min_number_of_normal` normal samples are available) by Welch two-sample t-test.
The table is sorted by the absolute "Fold_change" level, from largest to smallest. Positive/negative log2 Fold change indicates the gene is up/down in cancer samples, respectively, and "p-value" indicates the level of differential expression.
```{r expression.t.table}
# # Check if all normal tissues have a cancer tissue counterpart and vice versa
# table(substr(normal,1,(nchar(normal)-2)) %in% cancer)
# table(cancer %in% substr(normal,1,(nchar(normal)-2)))
cancer <- unique(sapply(as.character(expr_tcga$affy_source), function(x) strsplit(x, "_")[[1]][1]) %>% as.character) %>% sort
expr_tcga$affy_source <- as.character(expr_tcga$affy_source)
# Set up object for storing t-test results
t.table <- data.frame(Cancer_name = NA,
Normal = NA,
Cancer = NA,
Mean_expression_normal = NA,
Mean_expression_cancer = NA,
Fold_change = NA,
p_value = NA)
# Loop over all cancer tissue types to extract information
for (i in 1:length(cancer)) {
if(sum(expr_tcga$affy_source == paste0(cancer[i], "_Normal")) >= min_number_of_normal) {
t.table[i, "Cancer_name"] <- cancer[i]
t.table[i, c("Normal", "Cancer")] <- c(length(expr_tcga$affy_source == paste0(cancer[i], "_Normal")),
length(expr_tcga$affy_source == paste0(cancer[i], "_Cancer")))
t.table[i, c("Mean_expression_normal", "Mean_expression_cancer")] <-
c(expr_tcga_levels$x[expr_tcga_levels$Group.1 == paste0(cancer[i], "_Normal")],
expr_tcga_levels$x[expr_tcga_levels$Group.1 == paste0(cancer[i], "_Cancer")])
t.table[i, "Fold_change"] <- signif( (t.table$Mean_expression_cancer[i] - t.table$Mean_expression_normal[i]) )
res <- t.test(expr_tcga[expr_tcga$affy_source == paste0(cancer[i], "_Normal"), selected_gene],
expr_tcga[expr_tcga$affy_source == paste0(cancer[i], "_Cancer"), selected_gene], alternative = "two.sided")
t.table[i, "p_value"] <- res$p.value
}
}
t.table <- t.table[complete.cases(t.table), ]
t.table <- t.table[ order(abs(as.numeric(t.table$Fold_change)), decreasing = TRUE), ]
t.table <- left_join(t.table, cancers, by = c("Cancer_name" = "Acronym"))
t.table <- t.table[, c("Cancer_name", "Cancer.Name", "Fold_change", "p_value", "Mean_expression_normal", "Mean_expression_cancer")]
t.table$p_value <- formatC(as.numeric(t.table$p_value), format = "e", digits = 2)
colnames(t.table) <- c("TCGA ID", "Description", "log2 Fold change", "p-value", "Mean log2(RSEM) normal", "Mean log2(RSEM) cancer")
rownames(t.table) <- NULL # Drop rownames
# Visualize the most important
pander(t.table)
```
\pagebreak
# Cancer vs. normal `r selected_gene` log2(RSEM) expression boxplots, individual cancers
The figure below is a plot that highlights the differences in `r selected_gene` expression distribution in normal and cancer tissues for each cancer type (where at least `r min_number_of_normal` normal samples are available).
```{r interactive.plot, warning=F, fig.height=12}
# Add t test info to dataset to display means and test results in the graph
expr_tcga$Cancer <- sapply(expr_tcga$affy_source, function(x) strsplit(x, "_")[[1]][1])
expr2 <- c()
for (i in cancer) {
if(sum(expr_tcga$affy_source == paste0(i, "_Normal")) > min_number_of_normal &
sum(expr_tcga$affy_source == paste0(i, "_Cancer")) > min_number_of_normal) {
expr2 <- rbind(expr2, expr_tcga[expr_tcga$affy_source == paste0(i, "_Normal") | expr_tcga$affy_source == paste0(i, "_Cancer"), ])
}
}
# expr2 <- expr2[!is.na(expr2$Mean_expression_normal), ]
expr2$tissue <- relevel(expr2$tissue, ref = "Normal") # Make normal boxplot plotted first
## Boxplot version: looks nice but the hover text doesn't work for displaying t.test statistics
rsem_int <- ggplot(expr2, aes(tissue, eval(parse(text = selected_gene)))) +
geom_boxplot(aes(fill = tissue), outlier.shape = 1) +
scale_fill_manual(name = "", values = c("green", "red")) +
theme_cowplot() +
theme(axis.text.x = element_text(angle = 90, hjust = 1), panel.grid.major = element_blank()) +
labs(title = paste("Tumor-Normal Expression, ", selected_gene, "\n"), x = NULL) +
facet_wrap(~Cancer, ncol = 7) + guides(fill = F) +
ylab("log2(RSEM) gene expression")
# ggplotly(rsem_int, tooltip = c("colour", "text"))
plot(rsem_int)
ggsave(paste0("CELLX_", selected_gene, ".pdf"), width = 6, height = 5)
```
```{r}
# Save all results
x <- list(TCGA = t.table, CCLE = expr_ccle)
write_xlsx(x, fileNameOut1)
```
```{r interactive.coord.plot, eval=FALSE}
# Cancer vs. normal expression~significance plot
# The figure below is an interactive plot that highlights the relationship between *`r names(expr)[3]`* expression in normal and cancerous tissues for each type of cancer, colored by the significance of the differences.
# Coordinate plot version
t.int <- ggplot(t.table[complete.cases(t.table), ], aes(Mean_expression_normal, Mean_expression_cancer)) +
geom_point(aes(text = paste("Name:", Acronym, ", t:", t_statistic), color = p_value)) + labs(title = paste("Expression of", selected_gene, "Across Cancer Types"), x = "\nExpression Level in Normal Tissue", y = "Expression Level in Cancer Tissue\n") +
scale_color_gradient(low = "red", high = "black")
ggplotly(t.int)
```