forked from mariestrader/S.purp_MeDIP
-
Notifications
You must be signed in to change notification settings - Fork 0
/
DESeq2_geneCounts_MeDIP.R
678 lines (530 loc) · 26.6 KB
/
DESeq2_geneCounts_MeDIP.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
# This script uses DESeq2 to analyze methylation counts of genes
# This script also outputs files for GOMWU analysis and DAPC
# Strader et al 2019, Journal of Experimental Marine Biology and Ecology
# Parental environments alter DNA methylation in offspring of the purple sea urchin, Strongylocentrotus purpuratus
# note on naming scheme. Samples were initially labeled using (B) Blue: non-upwelling conditions (650 uatm, 17C); (R) Red: upwelling conditions (1330 uatm, 13C), with the first letter denoting the maternal environment and the second letter denoting the developmental environment (B: 450 uatm, R: 1050 uatm). The naming scheme was updated for the manuscript for clarity. BB=NL, BR=NH, RB=UL, RR=UH
#---------------- load libraries ------------------
library('DESeq2') #DESeq2_1.18.1
library('arrayQualityMetrics') #arrayQualityMetrics_3.34.0
library('vegan') #vegan_2.5-3
library('rgl') #rgl_0.99.16
library('ape') #ape_5.2
library('pheatmap') #pheatmap_1.0.10
library('dplyr') #dplyr_0.7.7
library('adegenet') #adegenet_2.1.1
library('Rmisc') #Rmisc_1.5
library('rlang') #rlang_0.3.0.1
library('ggplot2') #ggplot2_3.1.0
#---------------- set directory ------------------
#setwd()
#---------------- read in counts data ------------------
gcountsALL=read.table("MEDIP_geneCounts_6July2018.txt", header=T)
#Mapped to Strongylocentrotus_purpuratus.Spur_3.1.dna.toplevel using hisat2, sam files cleaned, sorted, dups removed with picard tools, counted with featurecounts -t gene -g gene_id counted multimapped reads and overlaps
gcounts=as.data.frame(gcountsALL[-c(2:6) ]) #removing top rows with general count numbers
row.names(gcounts)<-gcounts$Geneid
gcounts$Geneid=NULL
length(gcounts[,1]) #30240
colnames(gcounts)=c("BB1_BL","BB1_G","BB1_PR","BB2_BL","BB2_G","BB2_PR","BB3_BL","BB3_G","BB3_PR","BR1_BL","BR1_G","BR1_PR","BR2_BL","BR2_PR","BR3_BL","BR3_G","BR3_PR","RB1_BL","RB1_G","RB1_PR","RB2_BL","RB2_G","RB2_PR","RB3_BL","RB3_G","RB3_PR","RR1_BL","RR1_G","RR1_PR","RR2_BL","RR2_G","RR2_PR","RR3_BL","RR3_G","RR3_PR")
summary(colSums(gcounts))
# Min. 1st Qu. Median Mean 3rd Qu. Max.
#1173097 3535692 4140541 4274867 4794318 8110571
totalCounts=colSums(gcounts)
totalCounts
# BB1_BL BB1_G BB1_PR BB2_BL BB2_G BB2_PR BB3_BL BB3_G BB3_PR BR1_BL BR1_G BR1_PR BR2_BL
#3542891 7138590 5373064 4663930 3528492 4256346 4140541 4165290 5966408 5543711 5483689 2753554 3587477
# BR2_PR BR3_BL BR3_G BR3_PR RB1_BL RB1_G RB1_PR RB2_BL RB2_G RB2_PR RB3_BL RB3_G RB3_PR
#2899708 3429055 4968640 3616494 8110571 4852457 4736180 4069137 1173097 4591530 4049210 4870062 3741686
# RR1_BL RR1_G RR1_PR RR2_BL RR2_G RR2_PR RR3_BL RR3_G RR3_PR
#4270730 3400495 4529580 4714959 2466288 3499958 4081959 3414140 3990436
min(totalCounts) #1173097
mean(totalCounts) #4274867
max(totalCounts) #8110571
#---------------- Get density of methylation across genes (are there 2 peaks?) yes! ------------------
glength=as.data.frame(gcountsALL[-c(2:5,7:41) ])
colnames(glength)=c("annot", "length")
mns = as.data.frame(apply(gcounts, 1, mean))
row.names(mns)->mns$annot
colnames(mns)=c("means", "annot")
m=merge(mns, glength, by='annot',all.x=T)
m$means<-m$means+0.00001
m$class=-log(m$means/m$length)
dim(m)#30240 4
hist(m$class, breaks=100)
gbm_class=data.frame(cbind("gene"=m$annot,"class"=m$class))
write.csv(gbm_class,file=".../gbm_class.csv",quote=F, row.names=F)
#---------------- Get gene count means and remove low genes ------------------
mns = apply(gcounts, 1, mean)
gcounts=gcounts[mns>3,] #get rid of genes that show little or no methylation
table(mns > 3)
#FALSE TRUE
# 5893 24347
dim(gcounts) #24347 35
totalCountsFilt=colSums(gcounts)
totalCountsFilt
# BB1_BL BB1_G BB1_PR BB2_BL BB2_G BB2_PR BB3_BL BB3_G BB3_PR BR1_BL BR1_G BR1_PR BR2_BL
#3539981 7124707 5363262 4659516 3526018 4251950 4136327 4157350 5954841 5531505 5476589 2751190 3582640
# BR2_PR BR3_BL BR3_G BR3_PR RB1_BL RB1_G RB1_PR RB2_BL RB2_G RB2_PR RB3_BL RB3_G RB3_PR
#2893091 3425375 4962700 3612910 8094731 4846625 4732099 4063090 1171630 4587433 4044546 4863194 3736563
# RR1_BL RR1_G RR1_PR RR2_BL RR2_G RR2_PR RR3_BL RR3_G RR3_PR
#4265121 3396052 4518217 4710323 2462965 3495980 4076445 3408342 3982961
min(totalCountsFilt) #1171630
max(totalCountsFilt) #8094731
mean(totalCountsFilt) #4268751
#---------------- set up sample metadata ------------------
colData <- data.frame(row.names= colnames(gcounts), bucket= c("BB1","BB1","BB1","BB2","BB2","BB2","BB3","BB3","BB3","BR1","BR1","BR1","BR2","BR2","BR3","BR3","BR3","RB1","RB1","RB1","RB2","RB2","RB2","RB3","RB3","RB3","RR1","RR1","RR1","RR2","RR2","RR2","RR3","RR3","RR3"), stage=c("BL","G","PR","BL","G","PR","BL","G","PR","BL","G","PR","BL","PR","BL","G","PR","BL","G","PR","BL","G","PR","BL","G","PR","BL","G","PR","BL","G","PR","BL","G","PR"))
colData$treat = substr(colData$bucket, 1,2)
colData$treat_maternal = substr(colData$bucket, 1,1)
colData$treat_dev = substr(colData$bucket, 2,2)
#---------------- identify sample outliers ------------------
#dds<-DESeqDataSetFromMatrix(countData=gcounts, colData=colData, design= ~ treat+stage)
#vsd=varianceStabilizingTransformation(dds, blind=T)
#e=ExpressionSet(assay(vsd), AnnotatedDataFrame(as.data.frame(colData(vsd))))
#arrayQualityMetrics(e, intgroup=c("stage"), force=T, outdir= "report_for_genes_vsd_MEDIP_07082018")
#Was determined there are 3 ouliers: BB1_BL, BB2_G, RR2_G
#---------------- remove outliers from both gcounts and colData ------------------
gcounts$BB1_BL<-NULL
gcounts$BB2_G<-NULL
gcounts$RR2_G<-NULL
colData=as.data.frame(colData[-c(1,5,31), ])
#---------------- Wald Tests ------------------
dds<-DESeqDataSetFromMatrix(gcounts,
colData = colData,
design = formula(~ treat_maternal + treat_dev + stage ))
meth.rld=rlog(dds)
meth.coldata = colData
#### Wald test for maternal treatment
ddsM_RB<-DESeq(dds, minReplicatesForReplace=Inf)
resM_RB<-results(ddsM_RB, contrast=c('treat_maternal', 'R', 'B'))
mcols(resM_RB,use.names=TRUE)
summary(resM_RB)
#out of 24347 with nonzero total read count
#adjusted p-value < 0.1
#LFC > 0 (up) : 589, 2.4%
#LFC < 0 (down) : 642, 2.6%
#outliers [1] : 5, 0.021%
#low counts [2] : 0, 0%
#(mean count < 2)
#### Wald test for developmental treatment
ddsD_RB<-DESeq(dds, minReplicatesForReplace=Inf)
resD_RB<-results(ddsD_RB, contrast=c('treat_dev', 'R', 'B'))
mcols(resD_RB,use.names=TRUE)
summary(resD_RB)
#out of 24347 with nonzero total read count
#adjusted p-value < 0.1
#LFC > 0 (up) : 0, 0%
#LFC < 0 (down) : 0, 0%
#outliers [1] : 5, 0.021%
#low counts [2] : 0, 0%
#(mean count < 2)
#### Wald test for stages
dds_BL_G<-DESeq(dds, minReplicatesForReplace=Inf)
res_BL_G<-results(dds_BL_G, contrast=c('stage', 'BL', 'G'))
mcols(res_BL_G,use.names=TRUE)
summary(res_BL_G)
#out of 24347 with nonzero total read count
#adjusted p-value < 0.1
#LFC > 0 (up) : 7, 0.029%
#LFC < 0 (down) : 7, 0.029%
#outliers [1] : 5, 0.021%
#low counts [2] : 13685, 56%
#(mean count < 68)
dds_BL_PR<-DESeq(dds, minReplicatesForReplace=Inf)
res_BL_PR<-results(dds_BL_PR, contrast=c('stage', 'BL', 'PR'))
mcols(res_BL_PR,use.names=TRUE)
summary(res_BL_PR)
#out of 24347 with nonzero total read count
#adjusted p-value < 0.1
#LFC > 0 (up) : 45, 0.18%
#LFC < 0 (down) : 105, 0.43%
#outliers [1] : 5, 0.021%
#low counts [2] : 9909, 41%
#(mean count < 35)
dds_G_PR<-DESeq(dds, minReplicatesForReplace=Inf)
res_G_PR<-results(dds_G_PR, contrast=c('stage', 'G', 'PR'))
mcols(res_G_PR,use.names=TRUE)
summary(res_G_PR)
#out of 24347 with nonzero total read count
#adjusted p-value < 0.1
#LFC > 0 (up) : 1, 0.0041%
#LFC < 0 (down) : 0, 0%
#outliers [1] : 5, 0.021%
#low counts [2] : 0, 0%
#(mean count < 2)
save(dds,resM_RB,resD_RB,res_BL_G,res_BL_PR,res_G_PR,meth.rld, meth.coldata,gcounts, file="MeDIP_DESeq2_full_Wald_genes.Rdata")
#---------------- Look at treatment differneces in methylation at each dev stage ------------------
#---------------- Blastula ------------------
gcountsB=as.data.frame(select(gcounts, ends_with("BL")))
colDataB=subset(meth.coldata, stage=="BL")
dds<-DESeqDataSetFromMatrix(gcountsB,
colData = colDataB,
design = formula(~ treat_maternal + treat_dev))
######### test for maternal
dds.mb<-DESeq(dds, minReplicatesForReplace=Inf)
BresM_RB=results(dds.mb, contrast=c('treat_maternal', 'R', 'B'))
mcols(BresM_RB,use.names=TRUE)
summary(BresM_RB)
#out of 24347 with nonzero total read count
#adjusted p-value < 0.1
#LFC > 0 (up) : 138, 0.57%
#LFC < 0 (down) : 146, 0.6%
#outliers [1] : 15, 0.062%
#low counts [2] : 0, 0%
#(mean count < 0)
######### test for developmental
dds.db<-DESeq(dds, minReplicatesForReplace=Inf)
BresD_RB=results(dds.db, contrast=c('treat_dev', 'R', 'B'))
mcols(BresD_RB,use.names=TRUE)
summary(BresD_RB)
#out of 24347 with nonzero total read count
#adjusted p-value < 0.1
#LFC > 0 (up) : 4, 0.016%
#LFC < 0 (down) : 10, 0.041%
#outliers [1] : 15, 0.062%
#low counts [2] : 3774, 16%
#(mean count < 9)
#---------------- Gastrula ------------------
gcountsG=as.data.frame(select(gcounts, ends_with("G")))
colDataG=subset(meth.coldata, stage=="G")
dds<-DESeqDataSetFromMatrix(gcountsG,
colData = colDataG,
design = formula(~ treat_maternal + treat_dev))
######### test for maternal
dds.mg<-DESeq(dds, minReplicatesForReplace=Inf)
GresM_RB=results(dds.mg, contrast=c('treat_maternal', 'R', 'B'))
mcols(GresM_RB,use.names=TRUE)
summary(GresM_RB)
#out of 24347 with nonzero total read count
#adjusted p-value < 0.1
#LFC > 0 (up) : 128, 0.53%
#LFC < 0 (down) : 104, 0.43%
#outliers [1] : 8, 0.033%
#low counts [2] : 0, 0%
#(mean count < 1)
######### test for developmental
dds.dg<-DESeq(dds, minReplicatesForReplace=Inf)
GresD_RB=results(dds.dg, contrast = c('treat_dev', 'R', 'B'))
mcols(GresD_RB,use.names=TRUE)
summary(GresD_RB)
#out of 24347 with nonzero total read count
#adjusted p-value < 0.1
#LFC > 0 (up) : 10, 0.041%
#LFC < 0 (down) : 4, 0.016%
#outliers [1] : 8, 0.033%
#low counts [2] : 5193, 21%
#(mean count < 12)
#---------------- Prism ------------------
gcountsP=as.data.frame(select(gcounts, ends_with("PR")))
colDataP=subset(meth.coldata, stage=="PR")
dds<-DESeqDataSetFromMatrix(gcountsP,
colData = colDataP,
design = formula(~ treat_maternal + treat_dev))
######### test for maternal
dds.mp<-DESeq(dds, minReplicatesForReplace=Inf)
PresM_RB=results(dds.dg, contrast = c('treat_maternal', 'R', 'B'))
mcols(PresM_RB,use.names=TRUE)
summary(PresM_RB)
#out of 24347 with nonzero total read count
#adjusted p-value < 0.1
#LFC > 0 (up) : 128, 0.53%
#LFC < 0 (down) : 104, 0.43%
#outliers [1] : 8, 0.033%
#low counts [2] : 0, 0%
#(mean count < 1)
######### test for developmental
dds.dp<-DESeq(dds, minReplicatesForReplace=Inf)
PresD_RB=results(dds.dp, contrast = c('treat_dev', 'R', 'B'))
summary(PresD_RB)
#out of 24347 with nonzero total read count
#adjusted p-value < 0.1
#LFC > 0 (up) : 2, 0.0082%
#LFC < 0 (down) : 1, 0.0041%
#outliers [1] : 8, 0.033%
#low counts [2] : 0, 0%
#(mean count < 2)
#---------------- Look at differences in developmental treatment between each maternal treatment
# subset samples that come from upwelling maternal condition
gcountsU=as.data.frame(select(gcounts, starts_with("R")))
colDataU=subset(meth.coldata, treat_maternal=="R")
dds<-DESeqDataSetFromMatrix(gcountsU,
colData = colDataU,
design = formula(~ treat_dev))
######### test for developmental
dds.u<-DESeq(dds, minReplicatesForReplace=Inf)
UresD_RB=results(dds.u, contrast=c('treat_dev', 'R', 'B'))
mcols(UresD_RB,use.names=TRUE)
summary(UresD_RB)
#out of 24347 with nonzero total read count
#adjusted p-value < 0.1
#LFC > 0 (up) : 0, 0%
#LFC < 0 (down) : 0, 0%
#outliers [1] : 9, 0.037%
#low counts [2] : 0, 0%
#(mean count < 0)
# subset samples that come from non-upwelling maternal condition
gcountsN=as.data.frame(select(gcounts, starts_with("B")))
colDataN=subset(meth.coldata, treat_maternal=="B")
dds<-DESeqDataSetFromMatrix(gcountsN,
colData = colDataN,
design = formula(~ treat_dev))
######### test for developmental
dds.n<-DESeq(dds, minReplicatesForReplace=Inf)
NresD_RB=results(dds.n, contrast=c('treat_dev', 'R', 'B'))
mcols(NresD_RB,use.names=TRUE)
summary(NresD_RB)
#out of 24347 with nonzero total read count
#adjusted p-value < 0.1
#LFC > 0 (up) : 0, 0%
#LFC < 0 (down) : 1, 0.0041%
#outliers [1] : 10, 0.041%
#low counts [2] : 0, 0%
#(mean count < 1)
#---------------- Make output files for rld and p values; get #DMGs for 0.05 cutoff ------------------
ll=load("MeDIP_DESeq2_full_Wald_genes.Rdata")
meth.rld.df = assay(meth.rld)
colnames(meth.rld.df) = colnames(gcounts)
vals=(cbind(resM_RB$stat, resM_RB$pvalue, resM_RB$padj, resD_RB$stat, resD_RB$pvalue, resD_RB$padj, res_G_PR$stat, res_G_PR$pvalue, res_G_PR$padj, res_BL_G$stat, res_BL_G$pvalue, res_BL_G$padj, res_BL_PR$stat, res_BL_PR$pvalue, res_BL_PR$padj)) # collect pvalues for each comparison
colnames(vals)=c("stat_resM_RB","pval_resM_RB", "padj_resM_RB","stat_resD_RB","pval_resD_RB", "padj_resD_RB","stat_res_G_PR","pval_res_G_PR", "padj_res_G_PR","stat_res_BL_G","pval_res_BL_G", "padj_res_BL_G","stat_res_BL_PR","pval_res_BL_PR", "padj_res_BL_PR") # name columns
rldpvals=as.data.frame(cbind(meth.rld.df,vals)) # combine RLD with pvals
#get number of DMGs with padj < 0.01
#maternal condition
length((rldpvals[rldpvals$padj_resM_RB<= 0.01 & !is.na(rldpvals$padj_resM_RB), ])[,1]) #605
#developmental condition
length((rldpvals[rldpvals$padj_resD_RB<= 0.01 & !is.na(rldpvals$padj_resD_RB), ])[,1]) #0
#Gastrula v. Prism
length((rldpvals[rldpvals$padj_res_G_PR<= 0.01 & !is.na(rldpvals$padj_res_G_PR), ])[,1]) #1
#Blastula v. Prism
length((rldpvals[rldpvals$padj_res_BL_PR<= 0.01 & !is.na(rldpvals$padj_res_BL_PR), ])[,1]) #22
#Blastula v. Gastrula
length((rldpvals[rldpvals$padj_res_BL_G<= 0.01 & !is.na(rldpvals$padj_res_BL_G), ])[,1]) #0
write.csv(rldpvals,file="MeDIP_Spurp_rldpvals.csv",quote=F, row.names=T)
ll=read.csv("MeDIP_Spurp_rldpvals.csv")
llsig = ll[order(ll$pval_resM_RB),]
llsig=llsig[llsig$pval_resM_RB<= 0.05 & !is.na(llsig$pval_resM_RB), ]
row.names(llsig)=llsig$X
llsig=as.data.frame(llsig[-c(1,34:48) ],) #collect wanted data columns for output
dim(llsig) #3123 32
write.csv(llsig,file="SigGenes_Maternal0.05.csv",quote=F, row.names=T)
#---------------- principal coordinate analysis ------------------
ll=load("MeDIP_DESeq2_full_Wald_genes.Rdata")
meth.rld.df = assay(meth.rld)
colnames(meth.rld.df) = colnames(gcounts)
dd.veg=vegdist(t(meth.rld.df), "manhattan")
div.dd.veg=dd.veg/1000
dd.pcoa=pcoa(div.dd.veg)
head(dd.pcoa)
scores=dd.pcoa$vectors
conditions=meth.coldata
conditions$treat_maternal=as.factor(conditions$treat_maternal)
conditions$treat_dev=as.factor(conditions$treat_dev)
dd.pcoa
#$values
# Eigenvalues Relative_eig Broken_stick Cumul_eig Cumul_br_stick
#1 94.636783 0.22620636 0.129911135 0.2262064 0.1299111
#2 54.482743 0.13022783 0.097653071 0.3564342 0.2275642
#3 25.012229 0.05978569 0.081524039 0.4162199 0.3090882
#4 20.507393 0.04901797 0.070771350 0.4652378 0.3798596
#5 17.006008 0.04064875 0.062706834 0.5058866 0.4425664
#---------------- plotting principal coordinate analysis ------------------
### definte maternal conditions for plotting
colorsMat <- c("coral", "turquoise4")
conditions$treat_maternal_c <- colorsMat[as.factor(conditions$treat_maternal)]
conditions$treat_maternal=as.factor(conditions$treat_maternal)
conditions$treat_maternal = factor(conditions$treat_maternal, levels = c('B', 'R'), labels = c('Non-Upwelling', 'Upwelling'))
plot(scores[,1], scores[,2], col=conditions$treat_maternal_c,pch=19, xlab="PCo1", ylab="PCo2", main="PCoA by Maternal Condition")
ordispider(scores,conditions$treat_maternal,label=F)
ordiellipse(scores,conditions$treat_maternal,label=F )
legend("topleft", legend = levels(conditions$treat_maternal), col=colorsMat, pch = 19)
### definte developmental conditions for plotting
colorsMat <- c("coral", "turquoise4")
conditions$treat_dev_c <- colorsMat[as.factor(conditions$treat_dev)]
conditions$treat_dev=as.factor(conditions$treat_dev)
conditions$treat_dev = factor(conditions$treat_dev, levels = c('B', 'R'), labels = c('Non-Upwelling', 'Upwelling'))
plot(scores[,1], scores[,2], col=conditions$treat_dev_c,pch=19, xlab="PCo1", ylab="PCo2", main="PCoA by Larval Condition")
ordispider(scores,conditions$treat_dev,label=F)
ordiellipse(scores,conditions$treat_dev,label=F )
legend("topleft", legend = levels(conditions$treat_dev), col=colorsMat, pch = 19)
### definte developmental stage for plotting
colorsStage <- c("coral", "turquoise4", "gold")
conditions$stage_c <- colorsStage[as.factor(conditions$stage)]
conditions$stage=as.factor(conditions$stage)
conditions$stage = factor(conditions$stage, levels = c('BL', 'G', "PR"), labels = c('Blastula', 'Gastrula', 'Prism'))
plot(scores[,1], scores[,2], col=conditions$stage_c,pch=19, xlab="PCo1", ylab="PCo2", main="PCoA by Developmental Stage")
ordispider(scores,conditions$stage,label=F)
ordiellipse(scores,conditions$stage,label=F )
legend("topleft", legend = levels(conditions$stage), col=colorsStage, pch = 19)
#---------------- multivariate statistics ------------------
rld=assay(meth.rld)
ad=adonis(t(rld)~treat_maternal+treat_dev+stage,data=meth.coldata,method="manhattan")
ad
#adonis(formula = t(rld) ~ treat_maternal + treat_dev + stage, data = meth.coldata, method = "manhattan")
#
#Permutation: free
#Number of permutations: 999
#
#Terms added sequentially (first to last)
#
# Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
#treat_maternal 1 27427677 27427677 2.09978 0.06556 0.015 *
#treat_dev 1 10961329 10961329 0.83917 0.02620 0.631
#stage 2 27297537 13648768 1.04491 0.06525 0.348
#Residuals 27 352678282 13062159 0.84299
#Total 31 418364825 1.00000
#---
#Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
labs=c("Maternal_condition","Developmental_condition","Stage","Residuals")
cols=c("skyblue","green2","coral","grey80")
labs2 = paste(labs, round(ad$aov.tab$R2[1:4]*100, digits=1))
pie(ad$aov.tab$R2[1:4],labels=labs2,col=cols,main="GBM")
#---------------- OUTPUT FOR GO-MWU TESTS ------------------
### for maternal condition
resM_RB$lp=-log(resM_RB$pvalue,10)
resM_RB$lp[resM_RB$log2FoldChange<0]=-resM_RB$lp[resM_RB$log2FoldChange<0]
maternal_lpv=data.frame(cbind("gene"=row.names(resM_RB),"lp"=resM_RB$lp))
maternal_fc=data.frame(cbind("gene"=row.names(resM_RB),"fc"=resM_RB$log2FoldChange))
maternal_stat=data.frame(cbind("gene"=row.names(resM_RB),"stat"=resM_RB$stat))
write.csv(maternal_lpv,file="ReanalysisJuly2018/GOMWU/maternal_wald_lpv_mrna.csv",quote=F, row.names=F)
write.csv(maternal_stat,file="ReanalysisJuly2018/GOMWU/maternal_wald_stat_mrna.csv",quote=F, row.names=F)
write.csv(maternal_fc,file="ReanalysisJuly2018/GOMWU/maternal_wald_fc_mrna.csv",quote=F, row.names=F)
### output for fishers test for maternal condition
resM_RB$padj_go[resM_RB$padj<=0.01]<-1
resM_RB$padj_go[resM_RB$padj>0.01]<-0
resM_RB$padj_go=as.factor(resM_RB$padj_go)
summary(resM_RB$padj_go)
# 0 1
#23737 605
resM_RB=resM_RB[complete.cases(resM_RB),]
resM_RB$padj_go=as.character(resM_RB$padj_go)
maternal_fishers=data.frame(cbind("gene"=row.names(resM_RB),"padj_GO"=resM_RB$padj_go))
write.csv(maternal_fishers,file="ReanalysisJuly2018/GOMWU/maternal_wald_lpv_mrna.csv",quote=F, row.names=F)
# for larval condition
resD_RB$lp=-log(resD_RB$pvalue,10)
resD_RB$lp[resD_RB$log2FoldChange<0]=-resD_RB$lp[resD_RB$log2FoldChange<0]
dev_lpv=data.frame(cbind("gene"=row.names(resD_RB),"lp"=resD_RB$lp))
dev_fc=data.frame(cbind("gene"=row.names(resD_RB),"fc"=resD_RB$log2FoldChange))
dev_stat=data.frame(cbind("gene"=row.names(resD_RB),"stat"=resD_RB$stat))
write.csv(dev_lpv,file="ReanalysisJuly2018/GOMWU/larval_wald_lpv_mrna.csv",quote=F, row.names=F)
write.csv(dev_stat,file="ReanalysisJuly2018/GOMWU/larval_wald_stat_mrna.csv",quote=F, row.names=F)
write.csv(dev_fc,file="ReanalysisJuly2018/GOMWU/larval_wald_fc_mrna.csv",quote=F, row.names=F)
#for comparision between BL and G
res_BL_G$lp=-log(res_BL_G$pvalue,10)
res_BL_G$lp[res_BL_G$log2FoldChange<0]=-res_BL_G$lp[res_BL_G$log2FoldChange<0]
BL_G_lpv=data.frame(cbind("gene"=row.names(res_BL_G),"lp"=res_BL_G$lp))
BL_G_fc=data.frame(cbind("gene"=row.names(res_BL_G),"fc"=res_BL_G$log2FoldChange))
BL_G_stat=data.frame(cbind("gene"=row.names(res_BL_G),"stat"=res_BL_G$stat))
write.csv(BL_G_lpv,file="ReanalysisJuly2018/GOMWU/BL_G_wald_lpv_mrna.csv",quote=F, row.names=F)
write.csv(BL_G_stat,file="ReanalysisJuly2018/GOMWU/BL_G_wald_stat_mrna.csv",quote=F, row.names=F)
write.csv(BL_G_fc,file="ReanalysisJuly2018/GOMWU/BL_G_wald_fc_mrna.csv",quote=F, row.names=F)
#for comparision between G and PR
res_G_PR$lp=-log(res_G_PR$pvalue,10)
res_G_PR$lp[res_G_PR$log2FoldChange<0]=-res_G_PR$lp[res_G_PR$log2FoldChange<0]
G_PR_lpv=data.frame(cbind("gene"=row.names(res_G_PR),"lp"=res_G_PR$lp))
G_PR_fc=data.frame(cbind("gene"=row.names(res_G_PR),"fc"=res_G_PR$log2FoldChange))
G_PR_stat=data.frame(cbind("gene"=row.names(res_G_PR),"stat"=res_G_PR$stat))
write.csv(G_PR_lpv,file="ReanalysisJuly2018/GOMWU/G_PR_wald_lpv_mrna.csv",quote=F, row.names=F)
write.csv(G_PR_stat,file="ReanalysisJuly2018/GOMWU/G_PR_wald_stat_mrna.csv",quote=F, row.names=F)
write.csv(G_PR_fc,file="ReanalysisJuly2018/GOMWU/G_PR_wald_fc_mrna.csv",quote=F, row.names=F)
#for comparision between BL and PR
res_BL_PR$lp=-log(res_BL_PR$pvalue,10)
res_BL_PR$lp[res_BL_PR$log2FoldChange<0]=-res_BL_PR$lp[res_BL_PR$log2FoldChange<0]
BL_PR_lpv=data.frame(cbind("gene"=row.names(res_BL_PR),"lp"=res_BL_PR$lp))
BL_PR_fc=data.frame(cbind("gene"=row.names(res_BL_PR),"fc"=res_BL_PR$log2FoldChange))
BL_PR_stat=data.frame(cbind("gene"=row.names(res_BL_PR),"stat"=res_BL_PR$stat))
write.csv(BL_PR_lpv,file="ReanalysisJuly2018/GOMWU/BL_PR_wald_lpv_mrna.csv",quote=F, row.names=F)
write.csv(BL_PR_stat,file="ReanalysisJuly2018/GOMWU/BL_PR_wald_stat_mrna.csv",quote=F, row.names=F)
write.csv(BL_PR_fc,file="ReanalysisJuly2018/GOMWU/BL_PR_wald_fc_mrna.csv",quote=F, row.names=F)
#---------------- volcano plots ------------------
lnames=load("MeDIP_DESeq2_full_Wald_genes.Rdata")
with(resM_RB, plot(log2FoldChange, -log10(pvalue), pch=20, main="Maternal Condition Padj<0.01", xlim=c(-5,5), ylim=c(1,45)))
with(subset(resM_RB, padj<0.01 ), points(log2FoldChange, -log10(pvalue), pch=20, col="firebrick"))
with(resD_RB, plot(log2FoldChange, -log10(pvalue), pch=20, main="Developmental Condition Padj<0.01", xlim=c(-5,5),ylim=c(1,45)))
with(subset(resD_RB, padj<0.01 ), points(log2FoldChange, -log10(pvalue), pch=20, col="red"))
with(res_BL_PR, plot(log2FoldChange, -log10(pvalue), pch=20, main="Blastula v. Prism Condition Padj<0.01", xlim=c(-5,5),ylim=c(1,45)))
with(subset(res_BL_PR, padj<0.01 ), points(log2FoldChange, -log10(pvalue), pch=20, col="firebrick"))
#---------------- DAPC ------------------
meth.rld.df=assay(meth.rld)
degs<-rownames(meth.rld.df)
a.rld<-rld[,c(grep("BB",colnames(rld)), grep("RR",colnames(rld)))]
a.rld.supp<-rld[,c(grep("BR",colnames(rld)),grep("RB",colnames(rld)))]
##Calling PCs
pcp=prcomp(t(a.rld[degs,]), retx=TRUE, center=TRUE, scale=TRUE)
scores=pcp$x
screeplot(pcp,bstick=T)
clus=find.clusters(t(a.rld[degs,]),max.n.clus=15, n.clust=2, n.pca=4)
colnames(rld)
clus$grp=c(substr(colnames(a.rld), start=1,stop=2))
#Discriminate between maternal conditions (B or R), keep 1 DF, use PCs to account for 80% of var, sets up axis for upwelling mom-like GBM and nonupwelling mom-like GBM
dp=dapc(t(a.rld[degs,]),clus$grp, n.da=1, perc.pca=80)
scatter(dp,bg="white",scree.da=FALSE,legend=TRUE,solid=.4)
#cluster based on dev treatment
pred.sup<-predict.dapc(dp,newdata=(t(a.rld.supp[degs,])))
names(pred.sup)
pred.sup$assign
colnames(a.rld.supp)
test<-dp
test$ind.coord<-pred.sup$ind.scores
test$posterior<-pred.sup$posterior
test$assign<-pred.sup$assign
#test$grp<-as.factor(test$assign)
test$grp<-as.factor(substr(colnames(a.rld.supp), start =1, stop=2))
scatter(test,bg="white",scree.da=FALSE,legend=TRUE,solid=.4,ylim=c(0,0.5),xlim=c(-6,5))
dpc=data.frame(rbind(dp$ind.coord,pred.sup$ind.scores))
write.csv(dpc,"MeDIP_DFA.csv",quote=F)
#---------------- prepare morphometric data ------------------
#setwd("/Users/mariestrader/Dropbox/MEDIP/PPhoxMorphometrics/morph") #change to where your files are stored
gas=read.csv("PurplepHox_Gastrula_edited.csv") #load in raw file
hb=read.csv("PurplepHox_HatchedBlastula_corrected_edited.csv") #load in raw file
pr=read.csv("PurplepHox_Prism_corrected_edited.csv") #load in raw file
#### Gastrula
gas$Diameter=as.numeric(gas$Diameter)
all.G=summarySE(gas,measurevar="Diameter",groupvars=c("tubeLabel"), na.rm=T)
all.G
all.G$tubeLabel=sub("G_","", all.G$tubeLabel)
all.G$tubeLabel=paste0(all.G$tubeLabel, "_G")
colnames(all.G)=c("bucket","N","length","sd","se","ci")
#### Hatched blastula
hb$Diameter=as.numeric(hb$length)
all.BL=summarySE(hb,measurevar="length",groupvars=c("tube_label"), na.rm=T)
all.BL
all.BL$tube_label=paste0(all.BL$tube_label, "_BL")
colnames(all.margBL)=c("bucket","N","length","sd","se","ci")
#### Prism
pr$length=as.numeric(pr$length)
all.PR=summarySE(pr,measurevar="length",groupvars=c("Bucket"), na.rm=T)
all.PR
all.PR$Bucket=paste0(all.PR$Bucket, "_PR")
colnames(all.PR)=c("bucket","N","length","sd","se","ci")
#Combine length data for all stages
length=rbind(all.BL,all.G,all.PR)
length$stage<-as.factor(substr(length$bucket, start =5, stop=6))
write.csv(length,"morphData_dapc.csv",quote=F)
#---------------- correlate morphometric data with DAPC ------------------
dpc=read.csv("MeDIP_DFA.csv")
length=read.csv("/Users/mariestrader/Dropbox/MEDIP/morphData_dapc.csv")
colnames(dpc)=c("bucket","LD1")
m=merge(length, dpc, by='bucket', all.x=T)
m$treat<-as.factor(substr(m$bucket, start =1, stop=2))
m=m[complete.cases(m),]
m$treat_maternal = factor(substr(m$treat, 1,1))
m$treat_larval = factor(substr(m$treat, 2,2))
treat_colors <- c("blue", "turquoise4", "coral","red")
m$treat_c <- treat_colors[as.factor(m$treat)]
plot(m$length~m$LD1, pch=20, col=m$treat_c, xlim=c(-6,5),xlab="Discriminant Function", ylab="Embryo Length in mm")
lm1=lm(m$length~m$LD1)
summary(lm1)
#lm(formula = m$length ~ m$LD1)
#
#Residuals:
# Min 1Q Median 3Q Max
#-0.011570 -0.006462 -0.001613 0.006509 0.014196
#
#Coefficients:
# Estimate Std. Error t value Pr(>|t|)
#(Intercept) 0.1137131 0.0014845 76.603 <2e-16 ***
#m$LD1 0.0018398 0.0007783 2.364 0.025 *
#---
#Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#Residual standard error: 0.008257 on 29 degrees of freedom
#Multiple R-squared: 0.1616, Adjusted R-squared: 0.1327
#F-statistic: 5.588 on 1 and 29 DF, p-value: 0.02499
abline(lm1)
legend("topleft", legend = levels(m$treat), col=treat_colors, pch = 19)