forked from mariestrader/S.purp_MeDIP
-
Notifications
You must be signed in to change notification settings - Fork 0
/
DESeq2_ProCounts_Wald.R
306 lines (239 loc) · 13.1 KB
/
DESeq2_ProCounts_Wald.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
# This script uses DESeq2 to analyze methylation counts of promoters
# This script also outputs files for GOMWU analysis and DAPC
# Strader et al 2019, Journal of Experimental Marine Biology and Ecology
# Parental environments alter DNA methylation in offspring of the purple sea urchin, Strongylocentrotus purpuratus
# note on naming scheme. Samples were initially labeled using (B) Blue: non-upwelling conditions (650 uatm, 17C); (R) Red: upwelling conditions (1330 uatm, 13C), with the first letter denoting the maternal environment and the second letter denoting the developmental environment (B: 450 uatm, R: 1050 uatm). The naming scheme was updated for the manuscript for clarity. BB=NL, BR=NH, RB=UL, RR=UH
#---------------- load libraries ------------------
library('DESeq2') #DESeq2_1.18.1
library('arrayQualityMetrics') #arrayQualityMetrics_3.34.0
library('vegan') #vegan_2.5-3
library('rgl') #rgl_0.99.16
library('ape') #ape_5.2
library('pheatmap') #pheatmap_1.0.10
library('dplyr') #dplyr_0.7.7
library('adegenet') #adegenet_2.1.1
library('Rmisc') #Rmisc_1.5
library('rlang') #rlang_0.3.0.1
library('ggplot2') #ggplot2_3.1.0
#---------------- set directory ------------------
#setwd()
#---------------- read in counts data ------------------
pcountsALL=read.table("MEDIP_proCounts_10July2018.txt", header=T) #version mapped only to CDS regions
pcounts=as.data.frame(pcountsALL[-c(2:6) ]) #removing top rows with general count numbers
row.names(pcounts)<-pcounts$Geneid
pcounts$Geneid=NULL
length(pcounts[,1]) #29923
colnames(pcounts)=c("BB1_BL","BB1_G","BB1_PR","BB2_BL","BB2_G","BB2_PR","BB3_BL","BB3_G","BB3_PR","BR1_BL","BR1_G","BR1_PR","BR2_BL","BR2_PR","BR3_BL","BR3_G","BR3_PR","RB1_BL","RB1_G","RB1_PR","RB2_BL","RB2_G","RB2_PR","RB3_BL","RB3_G","RB3_PR","RR1_BL","RR1_G","RR1_PR","RR2_BL","RR2_G","RR2_PR","RR3_BL","RR3_G","RR3_PR")
summary(colSums(pcounts))
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 126003 379926 460429 473915 519338 898705
totalCounts=colSums(pcounts)
totalCounts
#BB1_BL BB1_G BB1_PR BB2_BL BB2_G BB2_PR BB3_BL BB3_G BB3_PR BR1_BL BR1_G BR1_PR BR2_BL BR2_PR BR3_BL
#432234 877839 596985 521128 437977 487299 476244 440651 692467 602793 620167 283123 381500 368240 378351
# BR3_G BR3_PR RB1_BL RB1_G RB1_PR RB2_BL RB2_G RB2_PR RB3_BL RB3_G RB3_PR RR1_BL RR1_G RR1_PR RR2_BL
#543294 359608 898705 545381 482068 460429 126003 466068 437693 497304 362945 461329 363598 517547 515494
# RR2_G RR2_PR RR3_BL RR3_G RR3_PR
#297790 364724 440137 407782 442143
min(totalCounts) #126003
mean(totalCounts) #473915.4
max(totalCounts) #898705
#---------------- Get promoter count means and remove low promoters ------------------
mns = apply(pcounts, 1, mean)
pcounts=pcounts[mns>3,] #get rid of promoters that show little or no methylation
table(mns > 3)
#FALSE TRUE
# 15370 14553
totalCountsFilt=colSums(pcounts)
totalCountsFilt
#BB1_BL BB1_G BB1_PR BB2_BL BB2_G BB2_PR BB3_BL BB3_G BB3_PR BR1_BL BR1_G BR1_PR BR2_BL BR2_PR BR3_BL
#423108 842261 569480 507685 431376 476580 464580 417819 663109 567419 599532 276070 367798 351993 366986
# BR3_G BR3_PR RB1_BL RB1_G RB1_PR RB2_BL RB2_G RB2_PR RB3_BL RB3_G RB3_PR RR1_BL RR1_G RR1_PR RR2_BL
#526279 349806 859717 527721 470280 443084 122030 455322 423668 478303 349115 446730 351146 489546 500716
# RR2_G RR2_PR RR3_BL RR3_G RR3_PR
#290622 353000 424130 394200 424065
min(totalCountsFilt) #122030
max(totalCountsFilt) #859717
mean(totalCountsFilt) #457293.6
#---------------- set up sample metadata ------------------
colData <- data.frame(row.names= colnames(pcounts), bucket= c("BB1","BB1","BB1","BB2","BB2","BB2","BB3","BB3","BB3","BR1","BR1","BR1","BR2","BR2","BR3","BR3","BR3","RB1","RB1","RB1","RB2","RB2","RB2","RB3","RB3","RB3","RR1","RR1","RR1","RR2","RR2","RR2","RR3","RR3","RR3"), stage=c("BL","G","PR","BL","G","PR","BL","G","PR","BL","G","PR","BL","PR","BL","G","PR","BL","G","PR","BL","G","PR","BL","G","PR","BL","G","PR","BL","G","PR","BL","G","PR"))
colData$treat = substr(colData$bucket, 1,2)
colData$treat_maternal = substr(colData$bucket, 1,1)
colData$treat_dev = substr(colData$bucket, 2,2)
#---------------- identify sample outliers ------------------
#dds<-DESeqDataSetFromMatrix(countData=pcounts, colData=colData, design= ~ treat+stage)
#vsd=varianceStabilizingTransformation(dds, blind=T)
#e=ExpressionSet(assay(vsd), AnnotatedDataFrame(as.data.frame(colData(vsd))))
#arrayQualityMetrics(e, intgroup=c("stage"), force=T, outdir= "report_for_PRO_vsd_MEDIP_07102018")
#Was determined there are 4 ouliers: BB1_BL, BB2_G, RB2_G, RR2_G
#---------------- remove outliers from both gcounts and colData ------------------
#remove outliers from both gcounts and colData
pcounts$BB1_BL<-NULL
pcounts$BB2_G<-NULL
pcounts$RB2_G<-NULL
pcounts$RR2_G<-NULL
colData=as.data.frame(colData[-c(1,5,22,31), ])
#---------------- Wald Tests ------------------
dds<-DESeqDataSetFromMatrix(pcounts,
colData = colData,
design = formula(~ treat_maternal + treat_dev + stage ))
meth.rld=rlog(dds)
meth.coldata = colData
#### Wald test for maternal treatment
ddsM_RB<-DESeq(dds, minReplicatesForReplace=Inf)
resM_RB<-results(ddsM_RB, contrast=c('treat_maternal', 'R', 'B'))
mcols(resM_RB,use.names=TRUE)
summary(resM_RB)
#out of 14553 with nonzero total read count
#adjusted p-value < 0.1
#LFC > 0 (up) : 185, 1.3%
#LFC < 0 (down) : 200, 1.4%
#outliers [1] : 5, 0.034%
#low counts [2] : 0, 0%
#(mean count < 2)
#### Wald test for developmental treatment
ddsD_RB<-DESeq(dds, minReplicatesForReplace=Inf)
resD_RB<-results(ddsD_RB, contrast=c('treat_dev', 'R', 'B'))
mcols(resD_RB,use.names=TRUE)
summary(resD_RB)
#out of 14553 with nonzero total read count
#adjusted p-value < 0.1
#LFC > 0 (up) : 2, 0.014%
#LFC < 0 (down) : 2, 0.014%
#outliers [1] : 5, 0.034%
#low counts [2] : 0, 0%
#(mean count < 2)
####Wald test for stages
dds_BL_G<-DESeq(dds, minReplicatesForReplace=Inf)
res_BL_G<-results(dds_BL_G, contrast=c('stage', 'BL', 'G'))
mcols(res_BL_G,use.names=TRUE)
summary(res_BL_G)
#out of 14553 with nonzero total read count
#adjusted p-value < 0.1
#LFC > 0 (up) : 0, 0%
#LFC < 0 (down) : 0, 0%
#outliers [1] : 5, 0.034%
#low counts [2] : 0, 0%
#(mean count < 2)
dds_BL_PR<-DESeq(dds, minReplicatesForReplace=Inf)
res_BL_PR<-results(dds_BL_PR, contrast=c('stage', 'BL', 'PR'))
mcols(res_BL_PR,use.names=TRUE)
summary(res_BL_PR)
#out of 14553 with nonzero total read count
#adjusted p-value < 0.1
#LFC > 0 (up) : 18, 0.12%
#LFC < 0 (down) : 4, 0.027%
#outliers [1] : 5, 0.034%
#low counts [2] : 1129, 7.8%
#(mean count < 4)
dds_G_PR<-DESeq(dds, minReplicatesForReplace=Inf)
res_G_PR<-results(dds_G_PR, contrast=c('stage', 'G', 'PR'))
mcols(res_G_PR,use.names=TRUE)
summary(res_G_PR)
#out of 14553 with nonzero total read count
#adjusted p-value < 0.1
#LFC > 0 (up) : 0, 0%
#LFC < 0 (down) : 0, 0%
#outliers [1] : 5, 0.034%
#low counts [2] : 0, 0%
#(mean count < 2)
save(dds,resM_RB,resD_RB,res_BL_G,res_BL_PR,res_G_PR,meth.rld, meth.coldata,pcounts, file="MeDIP_DESeq2_PRO_Wald.Rdata")
#---------------- Make output files for rld and p values; get #DMGs for 0.05 cutoff ------------------
ll=load("MeDIP_DESeq2_PRO_Wald.Rdata")
meth.rld.df = assay(meth.rld)
colnames(meth.rld.df) = colnames(pcounts)
vals=(cbind(resM_RB$stat, resM_RB$pvalue, resM_RB$padj, resD_RB$stat, resD_RB$pvalue, resD_RB$padj, res_G_PR$stat, res_G_PR$pvalue, res_G_PR$padj, res_BL_G$stat, res_BL_G$pvalue, res_BL_G$padj, res_BL_PR$stat, res_BL_PR$pvalue, res_BL_PR$padj)) # collect pvalues for each comparison
colnames(vals)=c("stat_resM_RB","pval_resM_RB", "padj_resM_RB","stat_resD_RB","pval_resD_RB", "padj_resD_RB","stat_res_G_PR","pval_res_G_PR", "padj_res_G_PR","stat_res_BL_G","pval_res_BL_G", "padj_res_BL_G","stat_res_BL_PR","pval_res_BL_PR", "padj_res_BL_PR") # name columns
rldpvals=as.data.frame(cbind(meth.rld.df,vals)) # combine RLD with pvals
#get number of DMGs with padj < 0.01
#maternal condition
length((rldpvals[rldpvals$padj_resM_RB<= 0.01 & !is.na(rldpvals$padj_resM_RB), ])[,1]) #156
#developmental condition
length((rldpvals[rldpvals$padj_resD_RB<= 0.01 & !is.na(rldpvals$padj_resD_RB), ])[,1]) #1
#Gastrula v. Prism
length((rldpvals[rldpvals$padj_res_G_PR<= 0.01 & !is.na(rldpvals$padj_res_G_PR), ])[,1]) #0
#Blastula v. Prism
length((rldpvals[rldpvals$padj_res_BL_PR<= 0.01 & !is.na(rldpvals$padj_res_BL_PR), ])[,1]) #4
#Blastula v. Gastrula
length((rldpvals[rldpvals$padj_res_BL_G<= 0.01 & !is.na(rldpvals$padj_res_BL_G), ])[,1]) #0
write.csv(rldpvals,file="MeDIP_Spurp_PRO_rldpvals.csv",quote=F, row.names=T)
#---------------- principal coordinate analysis ------------------
ll=load("MeDIP_DESeq2_PRO_Wald.Rdata")
meth.rld.df = assay(meth.rld)
colnames(meth.rld.df) = colnames(pcounts)
dd.veg=vegdist(t(meth.rld.df), "manhattan")
div.dd.veg=dd.veg/1000
head(div.dd.veg)
dd.pcoa=pcoa(div.dd.veg)
head(dd.pcoa)
scores=dd.pcoa$vectors
conditions=meth.coldata
conditions$treat_maternal=as.factor(conditions$treat_maternal)
conditions$treat_dev=as.factor(conditions$treat_dev)
dd.pcoa
#$values
# Eigenvalues Relative_eig Broken_stick Cumul_eig Cumul_br_stick
#1 50.688293 0.14745346 0.133166238 0.1474535 0.1331662
#2 29.573446 0.08602986 0.099832904 0.2334833 0.2329991
#3 18.300417 0.05323635 0.083166238 0.2867197 0.3161654
#4 17.210215 0.05006493 0.072055127 0.3367846 0.3882205
#---------------- plotting principal coordinate analysis ------------------
### definte maternal conditions for plotting
colorsMat <- c("coral", "turquoise4")
conditions$treat_maternal_c <- colorsMat[as.factor(conditions$treat_maternal)]
conditions$treat_maternal=as.factor(conditions$treat_maternal)
conditions$treat_maternal = factor(conditions$treat_maternal, levels = c('B', 'R'), labels = c('Non-Upwelling', 'Upwelling'))
plot(scores[,1], scores[,2], col=conditions$treat_maternal_c,pch=19, xlab="PCo1", ylab="PCo2", main="PCoA by Maternal Condition")
ordispider(scores,conditions$treat_maternal,label=F)
ordiellipse(scores,conditions$treat_maternal,label=F )
legend("topleft", legend = levels(conditions$treat_maternal), col=colorsMat, pch = 19)
### definte developmental conditions for plotting
colorsMat <- c("coral", "turquoise4")
conditions$treat_dev_c <- colorsMat[as.factor(conditions$treat_dev)]
conditions$treat_dev=as.factor(conditions$treat_dev)
conditions$treat_dev = factor(conditions$treat_dev, levels = c('B', 'R'), labels = c('Non-Upwelling', 'Upwelling'))
plot(scores[,1], scores[,2], col=conditions$treat_dev_c,pch=19, xlab="PCo1", ylab="PCo2", main="PCoA by Larval Condition")
ordispider(scores,conditions$treat_dev,label=F)
ordiellipse(scores,conditions$treat_dev,label=F )
legend("topleft", legend = levels(conditions$treat_dev), col=colorsMat, pch = 19)
### definte developmental stage for plotting
colorsStage <- c("coral", "turquoise4", "gold")
conditions$stage_c <- colorsStage[as.factor(conditions$stage)]
conditions$stage=as.factor(conditions$stage)
conditions$stage = factor(conditions$stage, levels = c('BL', 'G', "PR"), labels = c('Blastula', 'Gastrula', 'Prism'))
plot(scores[,1], scores[,2], col=conditions$stage_c,pch=19, xlab="PCo1", ylab="PCo2", main="PCoA by Developmental Stage")
ordispider(scores,conditions$stage,label=F)
ordiellipse(scores,conditions$stage,label=F )
legend("topleft", legend = levels(conditions$stage), col=colorsStage, pch = 19)
#---------------- multivariate statistics ------------------
ll=load("MeDIP_DESeq2_PRO_Wald.Rdata")
rld=assay(meth.rld)
ad=adonis(t(rld)~treat_maternal+treat_dev+stage,data=meth.coldata,method="manhattan")
ad
#adonis(formula = t(rld) ~ treat_maternal + treat_dev + stage, data = meth.coldata, method = "manhattan")
#
#Permutation: free
#Number of permutations: 999
#
#Terms added sequentially (first to last)
#
# Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
#treat_maternal 1 19875603 19875603 1.7932 0.05782 0.004 **
#treat_dev 1 11563178 11563178 1.0432 0.03364 0.320
#stage 2 24137313 12068656 1.0888 0.07022 0.204
#Residuals 26 288181820 11083916 0.83833
#Total 30 343757913 1.00000
#---
#Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
labs=c("Maternal_condition","Developmental_condition","Stage","Residuals")
cols=c("skyblue","green2","coral","grey80")
labs2 = paste(labs, round(ad$aov.tab$R2[1:4]*100, digits=1))
pie(ad$aov.tab$R2[1:4],labels=labs2,col=cols,main="GBM")
#---------------- volcano plots ------------------
ll=load("MeDIP_DESeq2_PRO_Wald.Rdata")
with(resM_RB, plot(log2FoldChange, -log10(pvalue), pch=20, main="Maternal Condition Padj<0.01", xlim=c(-5,5), ylim=c(1,45)))
with(subset(resM_RB, padj<0.01 ), points(log2FoldChange, -log10(pvalue), pch=20, col="firebrick"))
with(resD_RB, plot(log2FoldChange, -log10(pvalue), pch=20, main="Developmental Condition Padj<0.01", xlim=c(-5,5),ylim=c(1,45)))
with(subset(resD_RB, padj<0.01 ), points(log2FoldChange, -log10(pvalue), pch=20, col="firebrick"))
with(res_BL_PR, plot(log2FoldChange, -log10(pvalue), pch=20, main="BL v. PR Padj<0.01", xlim=c(-5,5),ylim=c(1,45)))
with(subset(res_BL_PR, padj<0.01 ), points(log2FoldChange, -log10(pvalue), pch=20, col="firebrick"))