-
Notifications
You must be signed in to change notification settings - Fork 0
/
1_ELGANcordmetals_singlemetals_DESeq2.R
976 lines (809 loc) · 46.9 KB
/
1_ELGANcordmetals_singlemetals_DESeq2.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
#################################################################################################
#################################################################################################
#### ELGAN cord metals gene expression: evaluating placental transcriptomic responses to cord
#### metal levels
#### Part 1: Single metals DeSeq2 analysis, adjusted models
####
#### Using DESeq2 for sample normalization and statistical analysis to identify mRNA associated with metals
#### Code includes: data organization,PCA, and other steps prior to DESeq2
####
####
#### Code drafted by Lauren Eaves, with reference to code generated by Lauren Koval, and Julia Rager
#### Lasted updated: October 13th 2021
#################################################################################################
#################################################################################################
#################################################################################################
#################################################################################################
#### Installing appropriate packages in R (if already have these installed, SKIP THIS STEP)
#################################################################################################
#################################################################################################
sessionInfo()
rm(list=ls())
# Installing DESeq2:
# Note that BiocManager allows you to access the Bioconductor repo in which DeSeq2 package is. DeSeq2 pacakge has a "tibble" "RcppArmadillo" and "rlang" dependency.
# to install RcppArmadillo, you must install a Fortan compiler (I used gfortran-6.1.pkg, https://cran.r-project.org/bin/macosx/tools/).
#if (!requireNamespace("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
#install.packages("tibble")
#install.packages("rlang")
#install.packages("RcppArmadillo")
# Install DESeq and SVA:
#BiocManager::install("DESeq2", version = "3.10")
#BiocManager::install("sva", version ="3.10")
#BiocManager::install("BiocManager")
# Install plot packages
#install.packages("ggbeeswarm") # note that I had to manually install this package using tools --> install packages
#install.packages("gridExtra")
# Install tidyverse packages
#install.packages("tidyverse")
# Install gplots packages
#install.packages("gplots")
# Install RColorBrewer package
#install.packages("RColorBrewer")
# Install scales (needed for tidyverse activation):
#install.packages("scales")
# Install data.tables
#install.packages("data.table")
#################################################################################################
#################################################################################################
#### Activating appropriate packages in R
#################################################################################################
#################################################################################################
# Activating appropriate libraries
library(data.table)
library(rlang)
library(DESeq2)
library(limma)
library(sva)
library(ggplot2)
library(stats)
library(scales)
library(ggbeeswarm)
library(gridExtra)
library(tidyverse)
library(dplyr)
library(gplots)
library(RColorBrewer)
library(data.table)
library(ggpubr)
library(EnhancedVolcano)
#################################################################################################
#################################################################################################
#### Set working directory, output folder path, etc
#################################################################################################
#################################################################################################
# Set working directory
setwd("#yourwd")
getwd()
# Create an output folder (make sure to make the folder first, and then point to it here)
Output_Folder <- ("#youroutputfolder")
# Create a current date variable to name outputfiles
cur_date <- str_replace_all(Sys.Date(),"-","")
#################################################################################################
#################################################################################################
#### Loading count data and metadata (subject information)
#### Organizing and filtering to keep subjects present in both files
#################################################################################################
#################################################################################################
# Read in count data
countdata <- read.table(file = '#countdata', sep = '\t', header = TRUE)
dim(countdata)
#404 samples (plus Gene ID column), 37268 genes
# visualize this data quickly by viewing top left corner:
countdata[1:3,1:6]
# Check for duplicate gene IDs in the countdata
Dups <- duplicated(countdata[,1])
summary(Dups)
# Not an issue for this ELGAN dataset
# Read in metadata (subject info/covariates and metals data)
subjectinfo <- read.csv("#cohortdata", check.names=FALSE)
dim(subjectinfo)
# Visualize this data quickly by viewing top left corner:
subjectinfo[1:3,1:10]
subjectinfo[1]<-NULL
colnames(subjectinfo)
#################################################################################################
#################################################################################################
#### Applying first round of various filters to these data files, including:
#### (1) First to first keep subjects present in both files
#### (2) Background filter, to filter out genes that were universally lowly expressed
#### (3) Check subject filter, to check that we have filtered out subjects that had samples present all values of zero on RNAseq (and thus not detected)
#### (4) Re-filter to include subjects meeting the above filters in (2) and (3)
#################################################################################################
#################################################################################################
##########
##########
# (1) Filtering to first keep subjects present in both files
##########
##########
#import ids dataset and merge mRNAseq with cohort
ids <- read.csv(file="#idsfile") %>%
select(Elgan_ID, mRNA.seq_datafile_ID)
subjectinfo <- left_join(subjectinfo, ids, by="Elgan_ID") %>%
dplyr::rename(ID=mRNA.seq_datafile_ID)
# Pull all mRNAseq IDs that overlap between countdata & subjectinfo
keep_samples <- colnames(countdata[, colnames(countdata) %in% subjectinfo$ID])
# Note that in this case, this represents 253 IDs - note that in the data prep R script, we filtered the cohort data to only include observations with mRNA data
# Filter the countdata dataframe for these IDs
# Because the first column is the gene identifier information, this will get lost if we don't also grab that separately (hence, a few steps here)
countdata_temp <- countdata[, colnames(countdata) %in% keep_samples] # pulling all countdata, without gene identifier column
countdata <- cbind(countdata$Gene, countdata_temp) # adding the gene identifier column back in
colnames(countdata)[1] <- "Gene" # renaming the gene identifer column
countdata_temp <- NULL # removing temporary dataframe
# Viewing merged, filtered count data
countdata[1:3,1:10]
# Filter the subjectinfo dataframe for these IDs, just to check
subjectinfo <- subjectinfo[subjectinfo$ID %in% keep_samples, ]
#N=226
##########
##########
# (2) Background filter, to filter out genes that were universally lowly expressed
##########
##########
# Create variable for number of samples, assuming that one variable is Gene:
numberofsamples= (ncol(countdata)-1)
# Create a variable that is the median of all expression levels across all samples and genes
totalmedian=median(as.matrix(countdata[,2:ncol(countdata)]))
# Create a list of genes that pass the background filter:
Genes_backgroundfiltered <- countdata %>%
#gather so that there is one row for each gene-sample pairing
gather(key="sampleID", value="expression", 2:ncol(countdata)) %>%
#add a variable that takes the value 1 is the expression value is above the median
mutate(abovemedian=ifelse(expression>totalmedian,1,0)) %>%
#sum the abovemedian variable created above by gene so that totalabovemedian calcualtes the number of samples for each gene
#that have expression levels above the median
group_by(Gene) %>%
summarise(totalabovemedian=sum(abovemedian)) %>%
#remove all genes that have a totalabovemedian equal to less than a specified percentage of the total number of samples
filter(totalabovemedian > (0.20*numberofsamples)) %>%
#select only gene so that you essentially have a list of the genes to include
select(Gene)
# Select genes that passed background filter only to proceed with analysis
countdata <- left_join(Genes_backgroundfiltered, countdata, by="Gene")
# This resulted in filtering the original list of 37268 genes down to 11,292 genes
##########
##########
#### (3) Check subject filter, to check that we have filtered out subjects that had samples present all values of zero (and thus not detected)
##########
##########
# Note that this code was originally drafted using a transposed version of the countdata dataframe
# So here, rather than modifying, we will transpose and then transpose back at the end
#transpose count data
countdata_t <- countdata %>%
gather(key="sampleID", value="expression", 2:ncol(countdata)) %>%
spread(Gene, expression)
# view transposed count data
countdata_t[1:4,1:10]
countdata_t <- countdata_t %>%
#create variable that is the sum of count values across all genes, for each sample
mutate(rowsum= rowSums(countdata_t[,2:nrow(countdata_t)],na.rm = FALSE, dims=1L))
#identify the samples that will be removed, for records
samples_zerocounts <- countdata_t %>%
filter(rowsum ==0) %>%
select("sampleID")
countdata_t <- countdata_t %>%
#remove samples that have a total sum of all counts of zero
filter(rowsum !=0) %>%
select(-rowsum)
# SAMPLES REMOVED: No samples were removed (ie. no smaples had only zero counts across all)
# Transposing the countdata back such that the genes are the rownames and the samples are the columnnames
countdata <- countdata_t %>%
gather(Gene, value, 2:ncol(countdata_t)) %>%
spread(sampleID, value)
countdata[1:4,1:10]
##########
##########
# (4) Re-filter to include subjects meeting the above filters in (2) and (3)
##########
##########
# Pull all mRNAseq IDs that overlap between countdata & subjectinfo
keep_samples <- colnames(countdata[, colnames(countdata) %in% subjectinfo$ID])
# Note that in this case, this represents 226 IDs
# Filter the countdata dataframe for these IDs
# Because the first column is the gene identifier information, this will get lost if we don't also grab that separately (hence, a few steps here)
countdata_temp <- countdata[, colnames(countdata) %in% keep_samples] # pulling all countdata, without gene identifier column
countdata <- cbind(countdata$Gene, countdata_temp) # adding the gene identifier column back in
colnames(countdata)[1] <- "Gene" # renaming the gene identifer column
# Viewing merged, filtered count data
countdata[1:3,1:10]
# Filter the subjectinfo dataframe for these IDs
subjectinfo <- subjectinfo[subjectinfo$ID %in% keep_samples, ]
#################################################################################################
#################################################################################################
#### Create dataframes that are formatted for proceeding code, as well as DESeq2 functions
#### countdata and coldata
#################################################################################################
#################################################################################################
# Make the gene variable the rowname
countdata <- countdata %>%
column_to_rownames(var="Gene")
# Creating the coldata object, based on information in the subjectinfo file
coldata <- subjectinfo
# Set the rownames of coldata and column names of countdata to be in the same order
countdata <- setcolorder(countdata, as.character(coldata$ID))
# Double checking that the same variables appear between the two dataframes
setequal(as.character(coldata$ID), colnames(countdata))
# Additionally checking that not only the sets of variables are the same, but that they are in the same order
identical(as.character(coldata$ID), colnames(countdata))
#################################################################################################
#################################################################################################
#### RNASeq QA/QC on raw count data to identify potential outlier samples
#### This may or may not result in another filter step
#################################################################################################
#################################################################################################
# One way to evaluate sequencing data quality / identify potential outliers is through Principal Component Analysis (PCA):
# PCA helps in identifying outlying samples for quality control, and gives a feeling for the principal causes of variation in a dataset
# Calculate principal components using transposed count data
pca <- prcomp(t(countdata))
screeplot(pca)
#this scree plot indicates that nearly all variation is explained in PC1,2 and 3,
#thus PCA is a useful exercise
# Make dataframe for PCA plot generation using first three components
pca_df <- data.frame(PC1genes = pca$x[,1], PC2genes = pca$x[,2], PC3genes = pca$x[,3], ID=colnames(countdata))
# Add attributes (covariates from coldata) to pca df
pca_df <- merge(pca_df, coldata, by="ID")
# Calculating percent of the variation that is captured by each principal component
pca_percent <- round(100*pca$sdev^2/sum(pca$sdev^2),1)
# Generating PCA plot annotated by sex
pca_df$sex <- as.factor(pca_df$sex)
ggplot(pca_df, aes(PC1genes,PC2genes, color = sex))+
geom_point(size=6) +
labs(x=paste0("PC1 (",pca_percent[1],"%)"), y=paste0("PC2 (",pca_percent[2],"%)"))
# Generating PCA plot annotated by maternal bmi
pca_df$bmicat <- as.factor(pca_df$bmicat)
ggplot(pca_df, aes(PC1genes,PC2genes, color = bmicat))+
geom_point(size=6) +
labs(x=paste0("PC1 (",pca_percent[1],"%)"), y=paste0("PC2 (",pca_percent[2],"%)"))
# Generating PCA plot annotated by maternal smoking
pca_df$smoke <- as.factor(pca_df$smoke)
ggplot(pca_df, aes(PC1genes,PC2genes, color = smoke))+
geom_point(size=6) +
labs(x=paste0("PC1 (",pca_percent[1],"%)"), y=paste0("PC2 (",pca_percent[2],"%)"))
# Generating PCA plot annotated by ses score
pca_df$score <- as.factor(pca_df$score)
ggplot(pca_df, aes(PC1genes,PC2genes, color = score))+
geom_point(size=6) +
labs(x=paste0("PC1 (",pca_percent[1],"%)"), y=paste0("PC2 (",pca_percent[2],"%)"))
# Lets identify the outliers
ggplot(pca_df, aes(PC1genes,PC2genes))+
geom_text(aes(label=ID, size=3)) +
labs(x=paste0("PC1 (",pca_percent[1],"%)"), y=paste0("PC2 (",pca_percent[2],"%)"))
# This would suggest that LS18028 and LS18076 are potential outliers,
# We will conduct heirachical clustering to investigate further and to determine if we should also remove these samples
# create a dataframe with samples as rows and genes as columns to input into hclust
countdata_forclustering <- t(countdata)
countdata_forclustering[1:5,1:10]
# run heirarchical clustering
hc <-hclust(dist(countdata_forclustering))
jpeg(filename= paste0(Output_Folder,"/",cur_date, "_heirachicalclusteringforoutliers.jpeg"),width = 4500, height = 350)
plot(hc)
dev.off()
# Here, the aforementioned samples were clearly separate from the rest, so should be removed
#################################################################################################
#################################################################################################
#### Removing sample(s) with potential QA/QC issues
#################################################################################################
#################################################################################################
# Remove these subject IDs from our "keep_samples" vector
keep_samples <- keep_samples[! keep_samples %in% c("LS18028", "LS18076")]
# Note that in this case, this represents 244 IDs
# Filter the countdata dataframe for these IDs
countdata <- countdata[, colnames(countdata) %in% keep_samples]
countdata[1:5, 1:10]
# Filter the coldata dataframe for these IDs
coldata <- coldata[coldata$ID %in% keep_samples, ]
# Export the raw data for just the included samples to potentially generate plots outside of R
write.csv(countdata, paste0(Output_Folder,"/", cur_date, "_RawCounts_SubjectsIncludedinModel.csv"), row.names= TRUE)
write.csv(coldata, paste0(Output_Folder,"/", cur_date, "_SubjectInfo_SubjectsIncludedinModel.csv"), row.names= FALSE)
# Note that if, for future studies, we exclude subjects with missing covariate data, that step will have to incorporated prior to this export
# But for this analysis, we're imputing missing covariate data (using random forest modeling), so went ahead and exported raw data here
#Now N=224
#################################################################################################
#################################################################################################
#### Rerunning PCA analysis with outlier removed
#################################################################################################
#################################################################################################
# Calculate principal components using transposed count data
pca <- prcomp(t(countdata))
screeplot(pca)
#this scree plot indicates that nearly all variation is explained in PC1,2 and 3,
#thus PCA is a useful exercise
# Make dataframe for PCA plot generation using first three components
pca_df <- data.frame(PC1genes = pca$x[,1], PC2genes = pca$x[,2], PC3genes = pca$x[,3], ID=colnames(countdata))
# Add attributes (covariates from coldata) to pca df
pca_df <- merge(pca_df, coldata, by="ID")
# Calculating percent of the variation that is captured by each principal component
pca_percent <- round(100*pca$sdev^2/sum(pca$sdev^2),1)
# Generating PCA plot annotated by sex
pca_df$sex <- as.factor(pca_df$sex)
ggplot(pca_df, aes(PC1genes,PC2genes, color = sex))+
geom_point(size=6) +
labs(x=paste0("PC1 (",pca_percent[1],"%)"), y=paste0("PC2 (",pca_percent[2],"%)"))
# Generating PCA plot annotated by maternal bmi
pca_df$bmicat <- as.factor(pca_df$bmicat)
ggplot(pca_df, aes(PC1genes,PC2genes, color = bmicat))+
geom_point(size=6) +
labs(x=paste0("PC1 (",pca_percent[1],"%)"), y=paste0("PC2 (",pca_percent[2],"%)"))
# Generating PCA plot annotated by maternal smoking
pca_df$smoke <- as.factor(pca_df$smoke)
ggplot(pca_df, aes(PC1genes,PC2genes, color = smoke))+
geom_point(size=6) +
labs(x=paste0("PC1 (",pca_percent[1],"%)"), y=paste0("PC2 (",pca_percent[2],"%)"))
# Generating PCA plot annotated by ses score
pca_df$score <- as.factor(pca_df$score)
ggplot(pca_df, aes(PC1genes,PC2genes, color = score))+
geom_point(size=6) +
labs(x=paste0("PC1 (",pca_percent[1],"%)"), y=paste0("PC2 (",pca_percent[2],"%)"))
# Lets identify the outliers
ggplot(pca_df, aes(PC1genes,PC2genes))+
geom_text(aes(label=ID, size=3)) +
labs(x=paste0("PC1 (",pca_percent[1],"%)"), y=paste0("PC2 (",pca_percent[2],"%)"))
#################################################################################################
#################################################################################################
#### QA/QC and further formatting of covariate (subject information) data to adjust for missing data
#### Note that covariates have already been imputed in the Data Prep R script
#### So here we just need to set them to the appropriate variable types (e.g., factor vs numeric)
#################################################################################################
#################################################################################################
##########
##########
# (1) First filtering for covariates we're interested in including
##########
##########
# For the this analysis, we are interesting in keeping all metals data, as well as the following covariates:
# score (SES cumulative index), bmicat (maternal prepreg BMI), smoke (maternal smoking during pregnancy) and SVs
colnames(coldata)
metalvars<-colnames(coldata)[26:47]
print(metalvars)
coldata <- coldata %>% select(ID, score, bmicat, smoke, sex, SV1, SV2, SV3, all_of(metalvars))
##########
##########
# (2) Evaluating number of subjects with missing data for each covariate
##########
##########
# Look for missing covariate data in coldata
sapply(coldata, function(x) sum(is.na(x)))
##########
##########
# (3) Setting appropriate variable types to be recognized by R (e.g., factor vs numeric)
##########
##########
colnames(coldata)
cont_metalvars<- colnames(coldata)[9:19]
cat_metalvars<- colnames(coldata)[20:30]
# Designate the specific covariates as either factors or numeric variables
coldata$score <- as.numeric(coldata$score)
coldata$bmicat <- as.factor(coldata$bmicat)
coldata$smoke <- as.factor(coldata$smoke)
coldata$sex <- as.factor(coldata$sex)
coldata[cont_metalvars] <- lapply(coldata[cont_metalvars], as.numeric)
coldata[cat_metalvars] <- lapply(coldata[cat_metalvars], factor)
#################################################################################################
#################################################################################################
#### Setting up the actual DESeq2 function and associated algorithm ("experiment")
#################################################################################################
#################################################################################################
# More information can be found about the DeSeq2 package at https://genomebiology.biomedcentral.com/articles/10.1186/s13059-014-0550-8, as well as many other resources, as this package is well documented
# At this point, we need to make sure that the coldata is in the following format:
# rownames = the subject ID that matches the countdata
# all other columns indicate the covariate data that we may/may not include in the final experiment
coldata[1:5, 1:7] # viewing data
# At this point, we need to make sure that the countdata is in the following format:
# rownames = the gene identifiers
# all other columns indicate individual samples that match (and are in the same order as) the coldata
countdata[1:5, 1:10] # viewing data
# all values should be integers (DESeq2 requires integer count values), need to convert values here if haven't already
sapply(countdata, class) # checking the class of the values
countdata_2 <- as.data.frame(sapply(countdata, as.integer)) # converting into integers if showing numeric
sapply(countdata_2, class) # re-checking the class of the values
row.names(countdata_2) <- row.names(countdata) # this removed the row names (gene identifiers), so had to merge these back
countdata <- countdata_2
# Create the final DESeq2 experiment, with appropriate experimental design:
# Note in the design: last factor indicates the factor we want to test the effect of; the first factor(s) = factors we want to control for
# Note that in this particular experiment, it gives us a warning that we are using integer data as numeric variables - this is what we want for some variables
#Create the dds objects for all the continuous variables
Mn_ugg_log <- DESeqDataSetFromMatrix(countData = countdata,
colData = coldata,
design = ~SV1 + SV2+ score + bmicat + smoke + sex + Mn_ugg_log)
Cu_ugg_log <- DESeqDataSetFromMatrix(countData = countdata,
colData = coldata,
design = ~SV1 + SV2+ score + bmicat + smoke + sex + Cu_ugg_log)
Zn_ugg_log <- DESeqDataSetFromMatrix(countData = countdata,
colData = coldata,
design = ~SV1 + SV2+ score + bmicat + smoke + sex + Zn_ugg_log)
As_ngg_log <- DESeqDataSetFromMatrix(countData = countdata,
colData = coldata,
design = ~SV1 + SV2+ score + bmicat + smoke + sex + As_ngg_log)
Se_ugg_log <- DESeqDataSetFromMatrix(countData = countdata,
colData = coldata,
design = ~SV1 + SV2+ score + bmicat + smoke + sex + Se_ugg_log)
Sr_ugg_log <- DESeqDataSetFromMatrix(countData = countdata,
colData = coldata,
design = ~SV1 + SV2+ score + bmicat + smoke + sex + Sr_ugg_log)
Cd_ngg_log <- DESeqDataSetFromMatrix(countData = countdata,
colData = coldata,
design = ~SV1 + SV2+ score + bmicat + smoke + sex + Cd_ngg_log)
Sb_ngg_log <- DESeqDataSetFromMatrix(countData = countdata,
colData = coldata,
design = ~SV1 + SV2+ score + bmicat + smoke + sex + Sb_ngg_log)
Ba_ngg_log <- DESeqDataSetFromMatrix(countData = countdata,
colData = coldata,
design = ~SV1 + SV2+ score + bmicat + smoke + sex + Ba_ngg_log)
Hg_ngg_log <- DESeqDataSetFromMatrix(countData = countdata,
colData = coldata,
design = ~SV1 + SV2+ score + bmicat + smoke + sex + Hg_ngg_log)
Pb_ngg_log <- DESeqDataSetFromMatrix(countData = countdata,
colData = coldata,
design = ~SV1 + SV2+ score + bmicat + smoke + sex + Pb_ngg_log)
#Create the dds objects for the categorical variables
print(cat_metalvars)
As_cat <- DESeqDataSetFromMatrix(countData = countdata,
colData = coldata,
design = ~SV1 + SV2+ score + bmicat + smoke + sex + As_cat)
Ba_cat <- DESeqDataSetFromMatrix(countData = countdata,
colData = coldata,
design = ~SV1 + SV2+ score + bmicat + smoke + sex + Ba_cat)
Cd_cat <- DESeqDataSetFromMatrix(countData = countdata,
colData = coldata,
design = ~SV1 + SV2+ score + bmicat + smoke + sex + Cd_cat)
Cu_cat <- DESeqDataSetFromMatrix(countData = countdata,
colData = coldata,
design = ~SV1 + SV2+ score + bmicat + smoke + sex + Cu_cat)
Hg_cat <- DESeqDataSetFromMatrix(countData = countdata,
colData = coldata,
design = ~SV1 + SV2+ score + bmicat + smoke + sex + Hg_cat)
Mn_cat <- DESeqDataSetFromMatrix(countData = countdata,
colData = coldata,
design = ~SV1 + SV2+ score + bmicat + smoke + sex + Mn_cat)
Pb_cat <- DESeqDataSetFromMatrix(countData = countdata,
colData = coldata,
design = ~SV1 + SV2+ score + bmicat + smoke + sex + Pb_cat)
Sb_cat <- DESeqDataSetFromMatrix(countData = countdata,
colData = coldata,
design = ~SV1 + SV2+ score + bmicat + smoke + sex + Sb_cat)
Se_cat <- DESeqDataSetFromMatrix(countData = countdata,
colData = coldata,
design = ~SV1 + SV2+ score + bmicat + smoke + sex + Se_cat)
Sr_cat <- DESeqDataSetFromMatrix(countData = countdata,
colData = coldata,
design = ~SV1 + SV2+ score + bmicat + smoke + sex + Sr_cat)
Zn_cat <- DESeqDataSetFromMatrix(countData = countdata,
colData = coldata,
design = ~SV1 + SV2+ score + bmicat + smoke + sex + Zn_cat)
# View what the experiment contains
Zn_cat
# Let's also make sure that we have the main contrast (last term) in the order we want to calculate appropriate fold change values
# In this case, we want metal=0 as denominators (first level, aka, the "non-exposed" level), and metal=1, as numerators
As_cat$As_cat <- relevel (As_cat$As_cat, "0")
Ba_cat$Ba_cat <- relevel (Ba_cat$Ba_cat, "0")
Cd_cat$Cd_cat <- relevel (Cd_cat$Cd_cat, "0")
Cu_cat$Cu_cat <- relevel (Cu_cat$Cu_cat, "0")
Hg_cat$Hg_cat <- relevel (Hg_cat$Hg_cat, "0")
Mn_cat$Mn_cat <- relevel (Mn_cat$Mn_cat, "0")
Pb_cat$Pb_cat <- relevel (Pb_cat$Pb_cat, "0")
Sb_cat$Sb_cat <- relevel (Sb_cat$Sb_cat, "0")
Se_cat$Se_cat <- relevel (Se_cat$Se_cat, "0")
Sr_cat$Sr_cat <- relevel (Sr_cat$Sr_cat, "0")
Zn_cat$Zn_cat <- relevel (Zn_cat$Zn_cat, "0")
# Need to next, estimate the size factors, since size factors are used to normalize the counts (next step)
# The "iterate" estimator iterates between estimating the dispersion with a design of ~1, and finding a size factor vector by numerically optimizing the likelihood of the ~1 model.
for (i in 1:length(cat_metalvars)){
dds <- estimateSizeFactors(eval(parse(text = paste0(cat_metalvars[[i]]))))
sizeFactors(dds)
assign(paste0(cat_metalvars[[i]]),dds)
print(cat_metalvars[[i]])
}
for (i in 1:length(cont_metalvars)){
dds <- estimateSizeFactors(eval(parse(text = paste0(cont_metalvars[[i]]))))
sizeFactors(dds)
assign(paste0(cont_metalvars[[i]]),dds)
print(cont_metalvars[[i]])
}
#################################################################################################
#################################################################################################
#### Pulling and exporting normalized counts, for internal records and for potential future plots/figures
#################################################################################################
#################################################################################################
# Note that code is drafted to output several forms of normalized data - so users can choose which applies to their external purposes the best
# Also note that variance stabilized (vsd) values are commonly used for figures, which requires the experiment to run and adjust for covariates below, so will export these in the code below
# normalized counts:
normcounts<- counts(dds, normalized=TRUE)
write.csv(normcounts, paste0(Output_Folder,"/", cur_date, "_NormCounts_IncludedSubjects.csv"), row.names=TRUE)
# pseudocounts to make plots easier:
ps_normcounts <- normcounts + 1
write.csv(ps_normcounts, paste0(Output_Folder,"/",cur_date, "_NormCounts_ps_IncludedSubjects.csv"),row.names=TRUE)
# log2 pseudocounts (y=log2(n+1))
log2normcounts <- log2(normcounts+1)
write.csv(log2normcounts, paste0(Output_Folder,"/", cur_date, "_NormCounts_pslog2_IncludedSubjects.csv"), row.names=TRUE)
# Export variance stabilized counts, which will be influenced by the new design above
# vsd will be useful for plots and future figure generation
# CAUTION: THIS STEP TAKES SOME TIME!
#continuous variables
for (i in 1:length(cont_metalvars)){
vsd <- varianceStabilizingTransformation(eval(parse(text = paste0(cont_metalvars[[i]]))), blind=FALSE)
vsd_matrix <-as.matrix(assay(vsd))
write.csv(vsd_matrix, paste0(Output_Folder,"/", cur_date,"_", cont_metalvars[[i]], "_VSDCounts_IncludedSubjects.csv"), row.names=TRUE)
assign(paste0("vsd_matrix.",cont_metalvars[[i]]),vsd_matrix)
print(cont_metalvars[[i]])
}
#categorical variables
for (i in 1:length(cat_metalvars)){
vsd <- varianceStabilizingTransformation(eval(parse(text = paste0(cat_metalvars[[i]]))), blind=FALSE)
vsd_matrix <-as.matrix(assay(vsd))
write.csv(vsd_matrix, paste0(Output_Folder,"/", cur_date,"_", cat_metalvars[[i]], "_VSDCounts_IncludedSubjects.csv"), row.names=TRUE)
assign(paste0("vsd_matrix.",cat_metalvars[[i]]),vsd_matrix)
print(cat_metalvars[[i]])
}
#################
#################
#### Evaluating variance stabilized values to guage whether SVs accounted for unwanted sources of variation between samples
#################
#################
#check Zn_cat
plotPCA(vsd, intgroup="Zn_cat", returnData=FALSE)
plotPCA(vsd, intgroup="SV1", returnData=FALSE)
#SV1 accounts for a lot of variation along PC1 as shown by the steady gradient in color across the PC1 axis
plotPCA(vsd, intgroup="SV2", returnData=FALSE)
#In sum, it looks like the SVs are accounting for a lot of the unwanted variation which is great!
#################################################################################################
#################################################################################################
#### Statistical analysis to detect significantly differentially expressed genes
#################################################################################################
#################################################################################################
# Running the differential expression statistical pipeline
# CAUTION: THIS STEP TAKES A LOT OF TIME!
for (i in 1:length(cont_metalvars)){
dds <- DESeq(eval(parse(text = paste0(cont_metalvars[[i]]))), betaPrior=FALSE) # because we used a user-defined model matrix, need to set betaPrior=FALSE
res <- results(dds, pAdjustMethod = "BH") #with multiple test correction by the default, BH (aka FDR)
write.csv(as.data.frame(res)[order(res$padj),], paste0(Output_Folder,"/", cur_date,"_", cont_metalvars[[i]],"_AllStatResults.csv"))# Exporting statistical results
assign(paste0("res.",cont_metalvars[[i]]),res)
print(cont_metalvars[[i]])
}
for (i in 1:length(cat_metalvars)){
dds <- DESeq(eval(parse(text = paste0(cat_metalvars[[i]]))), betaPrior=FALSE) # because we used a user-defined model matrix, need to set betaPrior=FALSE
res <- results(dds, pAdjustMethod = "BH") #with multiple test correction by the default, BH (aka FDR)
write.csv(as.data.frame(res)[order(res$padj),], paste0(Output_Folder,"/", cur_date,"_", cat_metalvars[[i]],"_AllStatResults.csv"))# Exporting statistical results
assign(paste0("res.",cat_metalvars[[i]]),res)
print(cat_metalvars[[i]])
}
#note receiving some messages: 156 rows did not converge in beta, labelled in mcols(object)$betaConv. Use larger maxit argument with nbinomWaldTest
#################################################################################################
#################################################################################################
#### MA plots
#################################################################################################
#################################################################################################
#Continuous variables
for (i in 1:length(cont_metalvars)){
MA <- as.data.frame(eval(parse(text = paste0("res.",cont_metalvars[[i]]))))[order(eval(parse(text = paste0("res.",cont_metalvars[[i]])))$padj),]
MA[is.na(MA)] <- 1
MA1 <- MA[ which(MA$padj>=0.1),] # all the rest
MA2 <- MA[ which(MA$padj<0.1 & MA$log2FoldChange > log2(1.5)),] # sig up-regulated w/ FC filter-FIREBRICK
MA3 <- MA[ which(MA$padj<0.1 & MA$log2FoldChange < -log2(1.5)),] # sig down-regulated w/ FC filter-dodgerblue4
MA4 <- MA[ which(MA$padj<0.1 & MA$log2FoldChange > 0 & MA$log2FoldChange <= log2(1.5)),] # sig up-regulated w/o FC filter- darkorchid3
MA5 <- MA[ which(MA$padj<0.1 & MA$log2FoldChange < 0 & MA$log2FoldChange >= -log2(1.5)), ] # sig down-regulated w/o FC filter- chartreuse4
plot <-
ggplot(subset(MA1, baseMean>=0), aes(x = baseMean, y = log2FoldChange)) + #can subset to exclude the very low counts (<1 average count) if we want
geom_point(color="gray56", size = 1, alpha=0.5) +
geom_point(data = MA2, color="firebrick", size=1.5, show.legend = TRUE) + # colors used: firebrick, dodgerblue4, chocolate1, darkorchid3, chartreuse4
geom_point(data = MA3, color="dodgerblue4", size=1.5, show.legend = TRUE) +
geom_point(data = MA4, color="darkorchid3", size=1.5, show.legend = TRUE) +
geom_point(data = MA5, color="chartreuse4", size=1.5, show.legend = TRUE) +
theme_bw()+ theme(axis.text.x = element_text(size = 10), axis.text.y = element_text(size=10)) +
scale_x_continuous(trans = "log10", breaks=c(1,10,100, 1000, 10000, 100000, 1000000), labels=c("1","10","100", "1000", "10000", "100000", "1000000")) +
xlab("Expression (Normalized Count)") +
ylab("(log2)FC, each log unit increase") +
geom_hline(yintercept=0)+
ggtitle(paste0(cont_metalvars[[i]])) +
theme(plot.title = element_text(hjust = 0.5))
assign(paste0("plot.",cont_metalvars[[i]]),plot)
}
print(cont_metalvars)
all_cont <- ggarrange(plot.As_ngg_log, plot.Ba_ngg_log, plot.Cd_ngg_log, plot.Cu_ugg_log, plot.Hg_ngg_log, plot.Mn_ugg_log,
plot.Pb_ngg_log, plot.Sb_ngg_log, plot.Se_ugg_log, plot.Sr_ugg_log, plot.Zn_ugg_log,
nrow=4, ncol=3)
all_cont
pdf(paste0(Output_Folder, "/", cur_date, "_","continuousvariables", "_MAplot.pdf"),
width = 10, height = 12)
all_cont
dev.off()
pdf(paste0(Output_Folder, "/", cur_date, "_","continuousvariables", "_MAplot.pdf"),
width = 10, height = 12)
all_cont
dev.off()
pdf(paste0(Output_Folder, "/", cur_date, "_","continuousvariables", "_MAplot_As.pdf"),
width = 8, height = 6)
plot.As_ngg_log
dev.off()
pdf(paste0(Output_Folder, "/", cur_date, "_","continuousvariables", "_MAplot_Ba.pdf"),
width = 8, height = 6)
plot.Ba_ngg_log
dev.off()
pdf(paste0(Output_Folder, "/", cur_date, "_","continuousvariables", "_MAplot_Cd.pdf"),
width = 8, height = 6)
plot.Cd_ngg_log
dev.off()
pdf(paste0(Output_Folder, "/", cur_date, "_","continuousvariables", "_MAplot_Cu.pdf"),
width = 8, height = 6)
plot.Cu_ugg_log
dev.off()
pdf(paste0(Output_Folder, "/", cur_date, "_","continuousvariables", "_MAplot_Hg.pdf"),
width = 8, height = 6)
plot.Hg_ngg_log
dev.off()
pdf(paste0(Output_Folder, "/", cur_date, "_","continuousvariables", "_MAplot_Mn.pdf"),
width = 8, height = 6)
plot.Mn_ugg_log
dev.off()
pdf(paste0(Output_Folder, "/", cur_date, "_","continuousvariables", "_MAplot_Pb.pdf"),
width = 8, height = 6)
plot.Pb_ngg_log
dev.off()
pdf(paste0(Output_Folder, "/", cur_date, "_","continuousvariables", "_MAplot_Sb.pdf"),
width = 8, height = 6)
plot.Sb_ngg_log
dev.off()
pdf(paste0(Output_Folder, "/", cur_date, "_","continuousvariables", "_MAplot_Se.pdf"),
width = 8, height = 6)
plot.Se_ugg_log
dev.off()
pdf(paste0(Output_Folder, "/", cur_date, "_","continuousvariables", "_MAplot_Sr.pdf"),
width = 8, height = 6)
plot.Sr_ugg_log
dev.off()
pdf(paste0(Output_Folder, "/", cur_date, "_","continuousvariables", "_MAplot_Zn.pdf"),
width = 8, height = 6)
plot.Zn_ugg_log
dev.off()
#Categorical variables
for (i in 1:length(cat_metalvars)){
MA <- as.data.frame(eval(parse(text = paste0("res.",cat_metalvars[[i]]))))[order(eval(parse(text = paste0("res.",cat_metalvars[[i]])))$padj),]
MA[is.na(MA)] <- 1
MA1 <- MA[ which(MA$padj>=0.1),] # all the rest
MA2 <- MA[ which(MA$padj<0.1 & MA$log2FoldChange > log2(1.5)),] # sig up-regulated w/ FC filter-FIREBRICK
MA3 <- MA[ which(MA$padj<0.1 & MA$log2FoldChange < -log2(1.5)),] # sig down-regulated w/ FC filter-dodgerblue4
MA4 <- MA[ which(MA$padj<0.1 & MA$log2FoldChange > 0 & MA$log2FoldChange <= log2(1.5)),] # sig up-regulated w/o FC filter- darkorchid3
MA5 <- MA[ which(MA$padj<0.1 & MA$log2FoldChange < 0 & MA$log2FoldChange >= -log2(1.5)), ] # sig down-regulated w/o FC filter- chartreuse4
plot <-
ggplot(subset(MA1, baseMean>=0), aes(x = baseMean, y = log2FoldChange)) + #can subset to exclude the very low counts (<1 average count) if we want
geom_point(color="gray56", size = 1, alpha=0.5) +
geom_point(data = MA2, color="firebrick", size=1.5, show.legend = TRUE) + # colors used: firebrick, dodgerblue4, chocolate1, darkorchid3, chartreuse4
geom_point(data = MA3, color="dodgerblue4", size=1.5, show.legend = TRUE) +
geom_point(data = MA4, color="darkorchid3", size=1.5, show.legend = TRUE) +
geom_point(data = MA5, color="chartreuse4", size=1.5, show.legend = TRUE) +
theme_bw()+ theme(axis.text.x = element_text(size = 10), axis.text.y = element_text(size=10)) +
scale_x_continuous(trans = "log10", breaks=c(1,10,100, 1000, 10000, 100000, 1000000), labels=c("1","10","100", "1000", "10000", "100000", "1000000")) +
xlab("Expression (Normalized Count)") +
ylab("(log2)FC, >= vs. < median") +
geom_hline(yintercept=0)+
ggtitle(paste0(cat_metalvars[[i]])) +
theme(plot.title = element_text(hjust = 0.5))
assign(paste0("plot.",cat_metalvars[[i]]),plot)
}
#export plots
print(cat_metalvars)
all_cat <- ggarrange(plot.As_cat, plot.Ba_cat, plot.Cd_cat, plot.Cu_cat, plot.Hg_cat, plot.Mn_cat,
plot.Pb_cat, plot.Sb_cat, plot.Se_cat, plot.Sr_cat, plot.Zn_cat,
nrow=4, ncol=3)
all_cat
pdf(paste0(Output_Folder, "/", cur_date, "_","categoricalvariables", "_MAplot.pdf"),
width = 8, height = 10)
all_cat
dev.off()
#export individual plots with DE genes
pdf(paste0(Output_Folder, "/", cur_date, "_","categoricalvariables", "_MAplot_Ba.pdf"),
width = 8, height = 6)
plot.Ba_cat
dev.off()
pdf(paste0(Output_Folder, "/", cur_date, "_","categoricalvariables", "_MAplot_Cd.pdf"),
width = 8, height = 6)
plot.Cd_cat
dev.off()
pdf(paste0(Output_Folder, "/", cur_date, "_","categoricalvariables", "_MAplot_Cu.pdf"),
width = 8, height = 6)
plot.Cu_cat
dev.off()
pdf(paste0(Output_Folder, "/", cur_date, "_","categoricalvariables", "_MAplot_Hg.pdf"),
width = 8, height = 6)
plot.Hg_cat
dev.off()
pdf(paste0(Output_Folder, "/", cur_date, "_","categoricalvariables", "_MAplot_Mn.pdf"),
width = 8, height = 6)
plot.Mn_cat
dev.off()
pdf(paste0(Output_Folder, "/", cur_date, "_","categoricalvariables", "_MAplot_Pb.pdf"),
width = 8, height = 6)
plot.Pb_cat
dev.off()
#################################################################################################
#################################################################################################
#### Volcano plots
#################################################################################################
#################################################################################################
#categorical variables
for (i in 1:length(cat_metalvars)){
siggenes <- as.data.frame(eval(parse(text = paste0("res.",cat_metalvars[[i]]))))[order(eval(parse(text = paste0("res.",cat_metalvars[[i]])))$padj),]
siggenes <- as.data.frame(siggenes) %>%
rownames_to_column(var="gene") %>%
filter(padj <0.1) %>%
mutate(gene=as.character(gene))
siggenes<-as.vector(siggenes$gene)
FC_pvalue <- EnhancedVolcano(eval(parse(text = paste0("res.",cat_metalvars[[i]]))),
lab = rownames(eval(parse(text = paste0("res.",cat_metalvars[[i]])))),
selectLab = c(siggenes),
x = 'log2FoldChange',
y = 'padj',
pCutoff= 0.1,
FCcutoff = log2(1.5),
cutoffLineWidth = 0.6,
title="",
ylim = c(0,5),
subtitle = "",
caption = "",
gridlines.minor = FALSE,
legendPosition = 'bottom',
legendLabSize = 12,
legendIconSize = 3.0,
axisLabSize = 12,
legend = c("Not sig.", "Log2 FC>1.5","adj p <0.1","Log2 FC>1.5 & adj p <0.1"))
FC_pvalue <- FC_pvalue + ggtitle(paste0(cat_metalvars[[i]])) +theme(title = element_text(size=5))
assign(paste0("plotvolcano.",cat_metalvars[[i]]),FC_pvalue)
}
all_cat <- ggarrange(plotvolcano.As_cat, plotvolcano.Ba_cat, plotvolcano.Cd_cat, plotvolcano.Cu_cat, plotvolcano.Hg_cat, plotvolcano.Mn_cat,
plotvolcano.Pb_cat, plotvolcano.Sb_cat, plotvolcano.Se_cat, plotvolcano.Sr_cat, plotvolcano.Zn_cat,
nrow=4, ncol=3, common.legend=TRUE)
all_cat
pdf(paste0(Output_Folder, "/", cur_date, "_","categoricalvariables", "_volcanoplot.pdf"),
width = 10, height = 20)
all_cat
dev.off()
#continuous variables
for (i in 1:length(cont_metalvars)){
siggenes <- as.data.frame(eval(parse(text = paste0("res.",cont_metalvars[[i]]))))[order(eval(parse(text = paste0("res.",cont_metalvars[[i]])))$padj),]
siggenes <- as.data.frame(siggenes) %>%
rownames_to_column(var="gene") %>%
filter(padj <0.1) %>%
mutate(gene=as.character(gene))
siggenes<-as.vector(siggenes$gene)
FC_pvalue <- EnhancedVolcano(eval(parse(text = paste0("res.",cont_metalvars[[i]]))),
lab = rownames(eval(parse(text = paste0("res.",cont_metalvars[[i]])))),
selectLab = c(siggenes),
x = 'log2FoldChange',
y = 'padj',
pCutoff= 0.1,
FCcutoff = log2(1.5),
cutoffLineWidth = 0.6,
title="",
ylim = c(0,5),
subtitle = "",
caption = "",
gridlines.minor = FALSE,
legendPosition = 'bottom',
legendLabSize = 12,
legendIconSize = 3.0,
axisLabSize = 12,
legend = c("Not sig.", "Log2 FC>1.5","adj p <0.1","Log2 FC>1.5 & adj p <0.1"))
FC_pvalue <- FC_pvalue + ggtitle(paste0(cont_metalvars[[i]])) +theme(title = element_text(size=5))
assign(paste0("plotvolcano.",cont_metalvars[[i]]),FC_pvalue)
}
print(cont_metalvars)
all_cont <- ggarrange(plotvolcano.As_ngg_log, plotvolcano.Ba_ngg_log, plotvolcano.Cd_ngg_log, plotvolcano.Cu_ugg_log, plotvolcano.Hg_ngg_log, plotvolcano.Mn_ugg_log,
plotvolcano.Pb_ngg_log, plotvolcano.Sb_ngg_log, plotvolcano.Se_ugg_log, plotvolcano.Sr_ugg_log, plotvolcano.Zn_ugg_log,
nrow=4, ncol=3, common.legend=TRUE)
all_cont
pdf(paste0(Output_Folder, "/", cur_date, "_","continuousvariables", "_volcanoplot.pdf"),
width = 10, height = 20)
all_cont
dev.off()
#################################################################################################
#################################################################################################
#### Save everything in environment that might be useful in future analyses
#################################################################################################
#################################################################################################
allres <- mget(ls(pattern = "res."))
allsvseqs <- mget(ls(pattern = "svseq."))
allvsd <- mget(ls(pattern = "vsd_matrix."))
deseqobj_cat <- mget(cat_metalvars)
deseqobj_cont <- mget(cont_metalvars)
plots <-mget(ls(pattern = "plot"))
save(allres, allsvseqs, allvsd,deseqobj_cat,deseqobj_cont,plots,
file = file.path(paste0(Output_Folder,"/",cur_date,"_DESeq2_Analysis_Data.RData")))