-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdiff_expr.Martin.txt
343 lines (248 loc) · 10.4 KB
/
diff_expr.Martin.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
---
title: "Differential Expression by Martin Nwadiugwu"
output:
html_document:
toc: TRUE
toc_depth: 2
toc_float: TRUE
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
## Perform differential gene expression analysis in R.
## Installation of bioconductor packages
```{r eval=F}
BiocManager::install(c("limma","affy","hgu133plus2.db"))
```
### Loading the needed libraries
```{r eval=T}
library(affy)
library(limma)
library(hgu133plus2.db)
```
## Part1: Raw data (tar file) obtained from https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE82208
### Setting up the working directory
```{r eval = F}
setwd("C:/Users/mnwadiugwu/Desktop")
```
#Checking for list of files in working directory
```{r eval=T}
list.files()
```
### Untarring the 'tar files' and reading it into R
```{r eval=T}
# untarring the GSE82208 "tar" file
untar("GSE82208_RAW.tar")
# reading and assigning the untarred files (CEL files) into R
mcn <- ReadAffy(celfile.path="/Users/mnwadiugwu/Desktop/project",compress=TRUE)
```
## Part2: Normalizing and creating expression matrix for visualization
```{r eval=T}
# performing RMA normalization
celnorm <-rma(mcn)
# converting to expression matrix
expr<-exprs(celnorm)
#print first six rows and columns
print(expr[1:6,1:6])
#printing expression matrix dimension
print(dim(expr))
#printing sample names of expression matrix
print(colnames(expr))
#removing ".CEL.gz" extension from sample names using gsub
colnames(expr) <- gsub(".CEL.gz","",colnames(expr))
#confirming the column extension change
print(colnames(expr))
```
### boxplot of expression values distribution
```{r eval=T}
boxplot(log2(expr+1), use.cols=TRUE, las=2,cex.axis=0.8, outline = F)
```
## Part3: Fiting Linear Model using limma ($Y = B0+ B1*X$) sample group($X$), gene expression($Y$) -for comparing groups--
```{r eval=T}
#creating a factor data
group <- factor(c(1,1,1,0,1,1,1,1,1,1,0,1,0,1,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,0,0,0,0,0,0))
#group <- factor(c(rep(0,27),rep(1,25)))
#creating design matrix to indicate the different RNA or miRNA samples
design <- model.matrix(~ 0 + group)
# note that in this dataset first 27 samples are FTC samples and next 25 samples are FTA samples
colnames(design)<-c("FTC","FTA")
print(design)
#creating contrast matrix to compare the different samples. The sample with the value +1 will be sample of interest
contrast.matrix<-makeContrasts(FTC-FTA, levels=design)
print(contrast.matrix)
# fitting the expression data into a linear model
fit <- lmFit(celnorm, design)
fit <- contrasts.fit(fit, contrast.matrix)
# printing sample coeffeicients
print(head(fit$coefficients))
# Empirical Bayes to adjust variances, computes t-stat, std. error, log Fold Change, log odd
fit2 <- eBayes(fit)
```
### summary of the results using topTable()
## Adj. p.value using the BH method
## log-odds ((B-statistic)
```{r eval=T}
# extracting top 1000 differentially expressed genes
results<-topTable(fit2, adjust="BH", number=1000, sort.by="B")
# printing few results, the rownames contains the affy-id
print(head(results))
```
## Part 4: Mapping Probe Ids to Gene Names
```{r eval=T}
#using hgu133plus2.db package earlier installed
mapped_probes <- mappedkeys(hgu133plus2SYMBOL)
#prints first five affy-ids for which there is some information
print(mapped_probes[1:5])
# get mapping of affy-id to symbol for mapped probes
mapped_list <- as.list(hgu133plus2SYMBOL[mapped_probes])
#prints symbol information for first five affy-ids
print(mapped_list[1:5])
# match gives vector of positions of matches of 1st arg. in its 2nd.
index <- match(rownames(results),mapped_probes)
# get the corresponding symbol from these indices
symbol <- as.character(mapped_list[index])
# some symbols are labelled NULL, substitute NULL to NA
symbol <- gsub("NULL", NA, symbol)
# add a column to the results table and save to result_final
result_final<-cbind(results, symbol)
print(head(result_final))
```
## Part 5: Saving the output into "csv" files
```{r eval=T}
write.csv(result_final,file="DE_Final_FTC_FTA.csv", quote=F)
# get a subset of result that is up regulated in cancer
result_final_pos <- subset(result_final, result_final$logFC>0)
print(head(result_final_pos))
write.csv(result_final_pos,file="DE_Final_FTC_FTA_up.csv", quote=F)
# get a subset of result that is down regulated in cancer
result_final_neg <- subset(result_final, result_final$logFC<0)
print(head(result_final_pos))
write.csv(result_final_neg,file="DE_Final_FTC_FTA-down.csv", quote=F)
```
#Deliverable 2
#Reading miRNA target genes
```{r eval=T}
#Installing R packages for data manipulation
install.packages("dplyr")
install.packages("tidyverse")
#loading libraries
library(dplyr)
library(tidyverse)
#Reading the target site csv file in R
MiR <- read.csv(file ="/Users/mnwadiugwu/Desktop/project/MicroRNA_Target_Sites.csv")
# converting it to a dplyr file format
MiR <- tbl_df(MiR)
#filter file to contain only the miRNAs of interest
f=filter(MiR, miRNA == 'hsa-miR-21-3p'| miRNA == 'hsa-miR-21-5p')
#create unique gene list
GeneList <- data.frame(unique(f$Target.Gene))
#editing column names of unique gene list
names(GeneList) <- gsub("unique.f.", "", names(GeneList))
print(GeneList)
#saving unique gene list to a csv file
write.csv(GeneList, "UniqueGeneList.csv")
```
##Deliverable 3
##Finding down-regulated miRNA target genes in FTC
```{r eval=T}
#reading down regulated FTC file
DownFTC <- read.csv("DE_Final_FTA_FTC-down.csv")
#Finding miRNA(miR 21) target genes via intersection with down regulated FTC file
td<- data.frame(intersect(DownFTC$symbol, GeneList$Target.Gene.))
print(td)
#saving miR 21 target genes in FTA to a csv file
write.csv(td, "FTcGene_of_interest.csv")
#Finding down-regulated miRNA target genes in FTA
DownFTA <- read.csv("DE_FTA_FTC_DOWN.csv")
#Finding miRNA(miR 21) target genes via intersection with down regulated FTA file
fk <- data.frame(intersect(DownFTA$symbol, GeneList$Target.Gene.))
print(fk)
#saving miR 21 target genes in FTA to a csv file
write.csv(fk, "Deliverable_3_FTAGene_of_interest.csv")
```
##Deliverable 4
##Functional enrichment
#Installing packages
BiocManager::install(c("GOFunction","org.Hs.eg.db","pathview","ReactomePA"))
## Loading needed libraries
```{r eval=T}
library(GOFunction)
library(org.Hs.eg.db)
library(pathview)
library(ReactomePA)
library(hgu133plus2.db)
```
##Functional Enrichment using Gene Ontology, Outputs GO plot and list of enriched terms
## Part 4: Mapping Probe Ids to Gene Names
````{r eval=T}
#Loading diff. analysis object
load("fit2.bin")
# reading gene differential analysis results
result_final <- read.csv(file="DE_Final_FTC_FTA-down.csv")
#Mapping Probe Ids to Gene Names
library("hgu133plus2.db")
#using hgu133plus2.db package earlier installed
mapped_probes <- mappedkeys(hgu133plus2SYMBOL)
# get mapping of affy-id to symbol for mapped probes
mapped_list <- as.list(hgu133plus2SYMBOL[mapped_probes])
all.probes <- rownames(fit2$coefficients)
# match gives vector of positions of matches of 1st arg. in its 2nd.
index <- match(all.probes,mapped_probes)
# get the corresponding symbol from these indices
all.symbol <- as.character(mapped_list[index])
# some symbols are labelled NULL, substitute NULL to NA
all.symbol <- gsub("NULL", NA, all.symbol)
#Converting the gene names to Entrez Gene ID using the **pathview** library.
library(pathview)
#converting the gene symbol from DE_FTC_FTA_up.csv file to Entrez Gene
allsymbol2eg <- id2eg(as.character(all.symbol),category="symbol",org="Hs")
# saving the second column of the converted Entrez Gene ID
entrez_all <- allsymbol2eg[,2]
entrez_all <- as.numeric(unique(entrez_all[!is.na(entrez_all)]))
deg2eg <- id2eg(as.character(result_final$symbol),category = "symbol", org = "Hs")
entrez_deg <- deg2eg[,2]
#converting the results to unique numerical variables and removing NAs
entrez_deg <- as.numeric(unique(entrez_deg[!is.na(entrez_deg)]))
library(GOFunction)
# Perform gene ontology biological process enrichment for up-regulated genes
sigUpTermBP <- GOFunction(entrez_deg,entrez_all,organism="org.Hs.eg.db",
ontology="MF",fdrmethod="BH",filename="fin_FTA_FTC_DSigTermMF",fdrth=0.1)
# Perform gene ontology Molecular function enrichment for up-regulated geens
sigUpTermBP <- GOFunction(entrez_deg,entrez_all,organism="org.Hs.eg.db",
ontology="BP",fdrmethod="BH",filename="fin_FTA_FTC_DSigTermBP",fdrth=0.1)
```
#Visualization of pathway and Diff. Expressed Genes in KEGG pathway
# Step 1: Convert the gene names to Entrez Gene ID
### Using **pathview** library.
* In this step, we will convert the gene symbols from the up regulated list to Entrez Gene Ids.
```{r eval = T}
up <- read.csv(file="DE_Final_FTC_FTA-down.csv")
#convert the eigth column(symbol) to Entrez Gene
upsymbol2eg <- id2eg(as.character(up[,8]),category="symbol",org="Hs")
print(head(upsymbol2eg))
# save the second column of upsymbol2eg map into a vector called entrez_up
entrez_up <- upsymbol2eg[,2]
entrez_up <- as.numeric(unique(entrez_up[!is.na(entrez_up)]))
```
## Visualization of pathway and Diff. Expressed Genes in KEGG pathway
* Use the down-regulated gene list to identify which proteins are involved in **Thyriod cancer**.
* The pathway id used was 05216 for thyriod cancer
* Output are png files with mapping of genes represented as boxes filled with green color.
```{r eval=T}
p.values <- up$P.Value
#assign entrez ids to pvalues
names(p.values) <-upsymbol2eg[,2]
#remove entries with no entrez ids
p.values <- p.values[!is.na(names(p.values))]
pv.out <- pathview(gene.data = -log10(p.values), pathway.id = "05216", species = "hsa", out.suffix = "kegg_pathway")
#Look for png files with the names **kegg_pathway** as output of this command.
```
## Implementing Functional Enrichment with Reactome Pathways
```{r eval=T}
x <- enrichPathway(gene = entrez_deg, pvalueCutoff = 0.2,qvalueCutoff = 0.2, readable=T)
head(data.frame(x))
# save the result into a file **reactome.up.csv**
write.table(data.frame(x), file="reactome.FTC_up.csv", quote=F, row.names=F, col.names=T)
# plot the top 10 pathways enriched (Click on Zoom to see everything in the figure)
barplot(x, showCategory = 10)
```