1_ELGANcordmetals_singlemetals_DESeq2.R

#################################################################################################
#################################################################################################
#### ELGAN cord metals gene expression: evaluating placental transcriptomic responses to cord 
#### metal levels 
#### Part 1: Single metals DeSeq2 analysis, adjusted models 
####
#### Using DESeq2 for sample normalization and statistical analysis to identify mRNA associated with metals
#### Code includes: data organization,PCA, and other steps prior to DESeq2
#### 
#### 
#### Code drafted by Lauren Eaves, with reference to code generated by Lauren Koval, and Julia Rager
#### Lasted updated: October 13th 2021
#################################################################################################
#################################################################################################

#################################################################################################
#################################################################################################
#### Installing appropriate packages in R (if already have these installed, SKIP THIS STEP)
#################################################################################################
#################################################################################################
sessionInfo()
rm(list=ls())

# Installing DESeq2:
# Note that BiocManager allows you to access the Bioconductor repo in which DeSeq2 package is. DeSeq2 pacakge has a "tibble" "RcppArmadillo" and "rlang" dependency.
# to install RcppArmadillo, you must install a Fortan compiler (I used gfortran-6.1.pkg, https://cran.r-project.org/bin/macosx/tools/).

#if (!requireNamespace("BiocManager", quietly = TRUE))
#  install.packages("BiocManager")

#install.packages("tibble")
#install.packages("rlang")
#install.packages("RcppArmadillo")

# Install DESeq and SVA:
#BiocManager::install("DESeq2", version = "3.10")
#BiocManager::install("sva", version ="3.10")
#BiocManager::install("BiocManager")

# Install plot packages
#install.packages("ggbeeswarm") # note that I had to manually install this package using tools --> install packages
#install.packages("gridExtra")

# Install tidyverse packages 
#install.packages("tidyverse")

# Install gplots packages
#install.packages("gplots")

# Install RColorBrewer package
#install.packages("RColorBrewer")

# Install scales (needed for tidyverse activation):
#install.packages("scales")

# Install data.tables
#install.packages("data.table")


#################################################################################################
#################################################################################################
#### Activating appropriate packages in R
#################################################################################################
#################################################################################################


# Activating appropriate libraries
library(data.table)
library(rlang)
library(DESeq2)
library(limma)
library(sva)
library(ggplot2)
library(stats)
library(scales)
library(ggbeeswarm)
library(gridExtra)
library(tidyverse)
library(dplyr)
library(gplots)
library(RColorBrewer)
library(data.table)
library(ggpubr)
library(EnhancedVolcano)

#################################################################################################
#################################################################################################
#### Set working directory, output folder path, etc
#################################################################################################
#################################################################################################

# Set working directory
setwd("#yourwd")
getwd()

# Create an output folder (make sure to make the folder first, and then point to it here)
Output_Folder <- ("#youroutputfolder")

# Create a current date variable to name outputfiles
cur_date <- str_replace_all(Sys.Date(),"-","")


#################################################################################################
#################################################################################################
#### Loading count data and metadata (subject information)
#### Organizing and filtering to keep subjects present in both files
#################################################################################################
#################################################################################################

# Read in count data
countdata <- read.table(file = '#countdata', sep = '\t', header = TRUE)
dim(countdata)
#404 samples (plus Gene ID column), 37268 genes
# visualize this data quickly by viewing top left corner:
countdata[1:3,1:6]

# Check for duplicate gene IDs in the countdata 
Dups <- duplicated(countdata[,1])
summary(Dups)
# Not an issue for this ELGAN dataset

# Read in metadata (subject info/covariates and metals data)
subjectinfo <- read.csv("#cohortdata", check.names=FALSE)
dim(subjectinfo)
# Visualize this data quickly by viewing top left corner:
subjectinfo[1:3,1:10]
subjectinfo[1]<-NULL
colnames(subjectinfo)

#################################################################################################
#################################################################################################
#### Applying first round of various filters to these data files, including:
#### (1) First to first keep subjects present in both files
#### (2) Background filter, to filter out genes that were universally lowly expressed
#### (3) Check subject filter, to check that we have filtered out subjects that had samples present all values of zero on RNAseq (and thus not detected)
#### (4) Re-filter to include subjects meeting the above filters in (2) and (3)
#################################################################################################
#################################################################################################

##########
##########
# (1) Filtering to first keep subjects present in both files
##########
##########


#import ids dataset and merge mRNAseq with cohort 
ids <- read.csv(file="#idsfile") %>% 
  select(Elgan_ID, mRNA.seq_datafile_ID)
subjectinfo <- left_join(subjectinfo, ids, by="Elgan_ID") %>% 
  dplyr::rename(ID=mRNA.seq_datafile_ID)

# Pull all mRNAseq IDs that overlap between countdata & subjectinfo
keep_samples <- colnames(countdata[, colnames(countdata) %in% subjectinfo$ID]) 
# Note that in this case, this represents 253 IDs - note that in the data prep R script, we filtered the cohort data to only include observations with mRNA data

# Filter the countdata dataframe for these IDs
# Because the first column is the gene identifier information, this will get lost if we don't also grab that separately (hence, a few steps here)
countdata_temp <- countdata[, colnames(countdata) %in% keep_samples]   # pulling all countdata, without gene identifier column
countdata <- cbind(countdata$Gene, countdata_temp)    # adding the gene identifier column back in
colnames(countdata)[1] <- "Gene"   # renaming the gene identifer column
countdata_temp <- NULL    # removing temporary dataframe

# Viewing merged, filtered count data
countdata[1:3,1:10]

# Filter the subjectinfo dataframe for these IDs, just to check
subjectinfo <- subjectinfo[subjectinfo$ID %in% keep_samples, ]

#N=226

##########
##########
# (2) Background filter, to filter out genes that were universally lowly expressed
##########
##########

# Create variable for number of samples, assuming that one variable is Gene: 
numberofsamples= (ncol(countdata)-1)

# Create a variable that is the median of all expression levels across all samples and genes
totalmedian=median(as.matrix(countdata[,2:ncol(countdata)]))
# Create a list of genes that pass the background filter:
Genes_backgroundfiltered <- countdata %>% 
  #gather so that there is one row for each gene-sample pairing
  gather(key="sampleID", value="expression", 2:ncol(countdata)) %>% 
  #add a variable that takes the value 1 is the expression value is above the median 
  mutate(abovemedian=ifelse(expression>totalmedian,1,0)) %>% 
  #sum the abovemedian variable created above by gene so that totalabovemedian calcualtes the number of samples for each gene
  #that have expression levels above the median
  group_by(Gene) %>% 
  summarise(totalabovemedian=sum(abovemedian)) %>% 
  #remove all genes that have a totalabovemedian equal to less than a specified percentage of the total number of samples 
  filter(totalabovemedian > (0.20*numberofsamples)) %>% 
  #select only gene so that you essentially have a list of the genes to include 
  select(Gene) 
# Select genes that passed background filter only to proceed with analysis
countdata <- left_join(Genes_backgroundfiltered, countdata, by="Gene")
# This resulted in filtering the original list of 37268 genes down to 11,292 genes


##########
##########
#### (3) Check subject filter, to check that we have filtered out subjects that had samples present all values of zero (and thus not detected)
##########
##########

# Note that this code was originally drafted using a transposed version of the countdata dataframe
# So here, rather than modifying, we will transpose and then transpose back at the end

#transpose count data 
countdata_t <- countdata %>% 
  gather(key="sampleID", value="expression", 2:ncol(countdata)) %>% 
  spread(Gene, expression)  
# view transposed count data
countdata_t[1:4,1:10]


countdata_t <- countdata_t %>% 
  #create variable that is the sum of count values across all genes, for each sample
  mutate(rowsum= rowSums(countdata_t[,2:nrow(countdata_t)],na.rm = FALSE, dims=1L)) 

#identify the samples that will be removed, for records 
samples_zerocounts <- countdata_t %>% 
  filter(rowsum ==0) %>% 
  select("sampleID") 

countdata_t <- countdata_t %>% 
  #remove samples that have a total sum of all counts of zero 
  filter(rowsum !=0) %>% 
  select(-rowsum)

# SAMPLES REMOVED: No samples were removed (ie. no smaples had only zero counts across all)


# Transposing the countdata back such that the genes are the rownames and the samples are the columnnames
countdata <- countdata_t %>% 
  gather(Gene, value, 2:ncol(countdata_t)) %>% 
  spread(sampleID, value) 
countdata[1:4,1:10]


##########
##########
# (4) Re-filter to include subjects meeting the above filters in (2) and (3)
##########
##########

# Pull all mRNAseq IDs that overlap between countdata & subjectinfo
keep_samples <- colnames(countdata[, colnames(countdata) %in% subjectinfo$ID]) 
# Note that in this case, this represents 226 IDs

# Filter the countdata dataframe for these IDs
# Because the first column is the gene identifier information, this will get lost if we don't also grab that separately (hence, a few steps here)
countdata_temp <- countdata[, colnames(countdata) %in% keep_samples]   # pulling all countdata, without gene identifier column
countdata <- cbind(countdata$Gene, countdata_temp)    # adding the gene identifier column back in
colnames(countdata)[1] <- "Gene"   # renaming the gene identifer column
# Viewing merged, filtered count data
countdata[1:3,1:10]

# Filter the subjectinfo dataframe for these IDs
subjectinfo <- subjectinfo[subjectinfo$ID %in% keep_samples, ]

#################################################################################################
#################################################################################################
#### Create dataframes that are formatted for proceeding code, as well as DESeq2 functions
#### countdata and coldata
#################################################################################################
#################################################################################################

# Make the gene variable the rowname 
countdata <- countdata %>% 
  column_to_rownames(var="Gene") 

# Creating the coldata object, based on information in the subjectinfo file
coldata <- subjectinfo

# Set the rownames of coldata and column names of countdata to be in the same order 
countdata <- setcolorder(countdata, as.character(coldata$ID))

# Double checking that the same variables appear between the two dataframes
setequal(as.character(coldata$ID), colnames(countdata))

# Additionally checking that not only the sets of variables are the same, but that they are in the same order
identical(as.character(coldata$ID), colnames(countdata))

#################################################################################################
#################################################################################################
#### RNASeq QA/QC on raw count data to identify potential outlier samples
#### This may or may not result in another filter step
#################################################################################################
#################################################################################################
# One way to evaluate sequencing data quality / identify potential outliers is through Principal Component Analysis (PCA):
# PCA helps in identifying outlying samples for quality control, and gives a feeling for the principal causes of variation in a dataset

# Calculate principal components using transposed count data
pca <- prcomp(t(countdata))
screeplot(pca)
#this scree plot indicates that nearly all variation is explained in PC1,2 and 3, 
#thus PCA is a useful exercise 

# Make dataframe for PCA plot generation using first three components
pca_df <- data.frame(PC1genes = pca$x[,1], PC2genes = pca$x[,2], PC3genes = pca$x[,3], ID=colnames(countdata))
# Add attributes (covariates from coldata) to pca df
pca_df <- merge(pca_df, coldata, by="ID")

# Calculating percent of the variation that is captured by each principal component
pca_percent <- round(100*pca$sdev^2/sum(pca$sdev^2),1)

# Generating PCA plot annotated by sex
pca_df$sex <- as.factor(pca_df$sex)
ggplot(pca_df, aes(PC1genes,PC2genes, color = sex))+
  geom_point(size=6) +
  labs(x=paste0("PC1 (",pca_percent[1],"%)"), y=paste0("PC2 (",pca_percent[2],"%)"))

# Generating PCA plot annotated by maternal bmi 
pca_df$bmicat <- as.factor(pca_df$bmicat)
ggplot(pca_df, aes(PC1genes,PC2genes, color = bmicat))+
  geom_point(size=6) +
  labs(x=paste0("PC1 (",pca_percent[1],"%)"), y=paste0("PC2 (",pca_percent[2],"%)"))

# Generating PCA plot annotated by maternal smoking
pca_df$smoke <- as.factor(pca_df$smoke)
ggplot(pca_df, aes(PC1genes,PC2genes, color = smoke))+
  geom_point(size=6) +
  labs(x=paste0("PC1 (",pca_percent[1],"%)"), y=paste0("PC2 (",pca_percent[2],"%)"))

# Generating PCA plot annotated by ses score 
pca_df$score <- as.factor(pca_df$score)
ggplot(pca_df, aes(PC1genes,PC2genes, color = score))+
  geom_point(size=6) +
  labs(x=paste0("PC1 (",pca_percent[1],"%)"), y=paste0("PC2 (",pca_percent[2],"%)"))

# Lets identify the outliers
ggplot(pca_df, aes(PC1genes,PC2genes))+
  geom_text(aes(label=ID, size=3)) +
  labs(x=paste0("PC1 (",pca_percent[1],"%)"), y=paste0("PC2 (",pca_percent[2],"%)"))


# This would suggest that LS18028 and LS18076 are potential outliers,
# We will conduct heirachical clustering to investigate further and to determine if we should also remove these samples


# create a dataframe with samples as rows and genes as columns to input into hclust
countdata_forclustering <- t(countdata)
countdata_forclustering[1:5,1:10]


# run heirarchical clustering
hc <-hclust(dist(countdata_forclustering))
jpeg(filename= paste0(Output_Folder,"/",cur_date, "_heirachicalclusteringforoutliers.jpeg"),width = 4500, height = 350)
plot(hc)
dev.off()

# Here, the aforementioned samples were clearly separate from the rest, so should be removed


#################################################################################################
#################################################################################################
#### Removing sample(s) with potential QA/QC issues
#################################################################################################
#################################################################################################

# Remove these subject IDs from our "keep_samples" vector
keep_samples <- keep_samples[! keep_samples %in% c("LS18028", "LS18076")]
# Note that in this case, this represents 244 IDs

# Filter the countdata dataframe for these IDs
countdata <- countdata[, colnames(countdata) %in% keep_samples]
countdata[1:5, 1:10]

# Filter the coldata dataframe for these IDs
coldata <- coldata[coldata$ID %in% keep_samples, ]

# Export the raw data for just the included samples to potentially generate plots outside of R
write.csv(countdata, paste0(Output_Folder,"/", cur_date, "_RawCounts_SubjectsIncludedinModel.csv"), row.names= TRUE)
write.csv(coldata, paste0(Output_Folder,"/", cur_date, "_SubjectInfo_SubjectsIncludedinModel.csv"), row.names= FALSE)
# Note that if, for future studies, we exclude subjects with missing covariate data, that step will have to incorporated prior to this export
# But for this analysis, we're imputing missing covariate data (using random forest modeling), so went ahead and exported raw data here

#Now N=224

#################################################################################################
#################################################################################################
#### Rerunning PCA analysis with outlier removed
#################################################################################################
#################################################################################################

# Calculate principal components using transposed count data
pca <- prcomp(t(countdata))
screeplot(pca)
#this scree plot indicates that nearly all variation is explained in PC1,2 and 3, 
#thus PCA is a useful exercise 

# Make dataframe for PCA plot generation using first three components
pca_df <- data.frame(PC1genes = pca$x[,1], PC2genes = pca$x[,2], PC3genes = pca$x[,3], ID=colnames(countdata))
# Add attributes (covariates from coldata) to pca df
pca_df <- merge(pca_df, coldata, by="ID")

# Calculating percent of the variation that is captured by each principal component
pca_percent <- round(100*pca$sdev^2/sum(pca$sdev^2),1)

# Generating PCA plot annotated by sex
pca_df$sex <- as.factor(pca_df$sex)
ggplot(pca_df, aes(PC1genes,PC2genes, color = sex))+
  geom_point(size=6) +
  labs(x=paste0("PC1 (",pca_percent[1],"%)"), y=paste0("PC2 (",pca_percent[2],"%)"))

# Generating PCA plot annotated by maternal bmi 
pca_df$bmicat <- as.factor(pca_df$bmicat)
ggplot(pca_df, aes(PC1genes,PC2genes, color = bmicat))+
  geom_point(size=6) +
  labs(x=paste0("PC1 (",pca_percent[1],"%)"), y=paste0("PC2 (",pca_percent[2],"%)"))

# Generating PCA plot annotated by maternal smoking
pca_df$smoke <- as.factor(pca_df$smoke)
ggplot(pca_df, aes(PC1genes,PC2genes, color = smoke))+
  geom_point(size=6) +
  labs(x=paste0("PC1 (",pca_percent[1],"%)"), y=paste0("PC2 (",pca_percent[2],"%)"))

# Generating PCA plot annotated by ses score 
pca_df$score <- as.factor(pca_df$score)
ggplot(pca_df, aes(PC1genes,PC2genes, color = score))+
  geom_point(size=6) +
  labs(x=paste0("PC1 (",pca_percent[1],"%)"), y=paste0("PC2 (",pca_percent[2],"%)"))

# Lets identify the outliers
ggplot(pca_df, aes(PC1genes,PC2genes))+
  geom_text(aes(label=ID, size=3)) +
  labs(x=paste0("PC1 (",pca_percent[1],"%)"), y=paste0("PC2 (",pca_percent[2],"%)"))


#################################################################################################
#################################################################################################
#### QA/QC and further formatting of covariate (subject information) data to adjust for missing data
#### Note that covariates have already been imputed in the Data Prep R script  
#### So here we just need to set them to the appropriate variable types (e.g., factor vs numeric)
#################################################################################################
#################################################################################################

##########
##########
# (1) First filtering for covariates we're interested in including
##########
##########

# For the this analysis, we are interesting in keeping all metals data, as well as the following covariates:
# score (SES cumulative index), bmicat (maternal prepreg BMI), smoke (maternal smoking during pregnancy) and SVs
colnames(coldata)
metalvars<-colnames(coldata)[26:47]
print(metalvars)
coldata <- coldata %>% select(ID, score, bmicat, smoke, sex, SV1, SV2, SV3, all_of(metalvars))


##########
##########
# (2) Evaluating number of subjects with missing data for each covariate
##########
##########

# Look for missing covariate data in coldata
sapply(coldata, function(x) sum(is.na(x)))


##########
##########
# (3) Setting appropriate variable types to be recognized by R (e.g., factor vs numeric)
##########
##########
colnames(coldata)
cont_metalvars<- colnames(coldata)[9:19]
cat_metalvars<- colnames(coldata)[20:30]

# Designate the specific covariates as either factors or numeric variables
coldata$score <- as.numeric(coldata$score)
coldata$bmicat <- as.factor(coldata$bmicat)
coldata$smoke <- as.factor(coldata$smoke)
coldata$sex <- as.factor(coldata$sex)
coldata[cont_metalvars] <- lapply(coldata[cont_metalvars], as.numeric)
coldata[cat_metalvars] <- lapply(coldata[cat_metalvars], factor)

#################################################################################################
#################################################################################################
#### Setting up the actual DESeq2 function and associated algorithm ("experiment")
#################################################################################################
#################################################################################################
# More information can be found about the DeSeq2 package at https://genomebiology.biomedcentral.com/articles/10.1186/s13059-014-0550-8, as well as many other resources, as this package is well documented


# At this point, we need to make sure that the coldata is in the following format:
# rownames = the subject ID that matches the countdata
# all other columns indicate the covariate data that we may/may not include in the final experiment
coldata[1:5, 1:7]   # viewing data


# At this point, we need to make sure that the countdata is in the following format:
# rownames = the gene identifiers
# all other columns indicate individual samples that match (and are in the same order as) the coldata
countdata[1:5, 1:10]   # viewing data


# all values should be integers (DESeq2 requires integer count values), need to convert values here if haven't already
sapply(countdata, class)   # checking the class of the values
countdata_2 <- as.data.frame(sapply(countdata, as.integer))   # converting into integers if showing numeric
sapply(countdata_2, class)   # re-checking the class of the values
row.names(countdata_2) <- row.names(countdata)  # this removed the row names (gene identifiers), so had to merge these back
countdata <- countdata_2

# Create the final DESeq2 experiment, with appropriate experimental design:
# Note in the design: last factor indicates the factor we want to test the effect of; the first factor(s) = factors we want to control for
# Note that in this particular experiment, it gives us a warning that we are using integer data as numeric variables - this is what we want for some variables

#Create the dds objects for all the continuous variables 
Mn_ugg_log <- DESeqDataSetFromMatrix(countData = countdata,
                                     colData = coldata,
                                     design = ~SV1 + SV2+ score + bmicat + smoke + sex + Mn_ugg_log)
Cu_ugg_log <- DESeqDataSetFromMatrix(countData = countdata,
                                     colData = coldata,
                                     design = ~SV1 + SV2+ score + bmicat + smoke + sex + Cu_ugg_log)
Zn_ugg_log <- DESeqDataSetFromMatrix(countData = countdata,
                                     colData = coldata,
                                     design = ~SV1 + SV2+ score + bmicat + smoke + sex + Zn_ugg_log)
As_ngg_log <- DESeqDataSetFromMatrix(countData = countdata,
                                     colData = coldata,
                                     design = ~SV1 + SV2+ score + bmicat + smoke + sex + As_ngg_log)
Se_ugg_log <- DESeqDataSetFromMatrix(countData = countdata,
                                     colData = coldata,
                                     design = ~SV1 + SV2+ score + bmicat + smoke + sex + Se_ugg_log)
Sr_ugg_log <- DESeqDataSetFromMatrix(countData = countdata,
                                     colData = coldata,
                                     design = ~SV1 + SV2+ score + bmicat + smoke + sex + Sr_ugg_log)
Cd_ngg_log <- DESeqDataSetFromMatrix(countData = countdata,
                                     colData = coldata,
                                     design = ~SV1 + SV2+ score + bmicat + smoke + sex + Cd_ngg_log)
Sb_ngg_log <- DESeqDataSetFromMatrix(countData = countdata,
                                     colData = coldata,
                                     design = ~SV1 + SV2+ score + bmicat + smoke + sex + Sb_ngg_log)
Ba_ngg_log <- DESeqDataSetFromMatrix(countData = countdata,
                                     colData = coldata,
                                     design = ~SV1 + SV2+ score + bmicat + smoke + sex + Ba_ngg_log)
Hg_ngg_log <- DESeqDataSetFromMatrix(countData = countdata,
                                     colData = coldata,
                                     design = ~SV1 + SV2+ score + bmicat + smoke + sex + Hg_ngg_log)
Pb_ngg_log <- DESeqDataSetFromMatrix(countData = countdata,
                                     colData = coldata,
                                     design = ~SV1 + SV2+ score + bmicat + smoke + sex + Pb_ngg_log)

#Create the dds objects for the categorical variables
print(cat_metalvars)
As_cat <- DESeqDataSetFromMatrix(countData = countdata,
                                     colData = coldata,
                                     design = ~SV1 + SV2+ score + bmicat + smoke + sex + As_cat)
Ba_cat <- DESeqDataSetFromMatrix(countData = countdata,
                                     colData = coldata,
                                     design = ~SV1 + SV2+ score + bmicat + smoke + sex + Ba_cat)
Cd_cat <- DESeqDataSetFromMatrix(countData = countdata,
                                     colData = coldata,
                                     design = ~SV1 + SV2+ score + bmicat + smoke + sex + Cd_cat)
Cu_cat <- DESeqDataSetFromMatrix(countData = countdata,
                                     colData = coldata,
                                     design = ~SV1 + SV2+ score + bmicat + smoke + sex + Cu_cat)
Hg_cat <- DESeqDataSetFromMatrix(countData = countdata,
                                     colData = coldata,
                                     design = ~SV1 + SV2+ score + bmicat + smoke + sex + Hg_cat)
Mn_cat <- DESeqDataSetFromMatrix(countData = countdata,
                                     colData = coldata,
                                     design = ~SV1 + SV2+ score + bmicat + smoke + sex + Mn_cat)
Pb_cat <- DESeqDataSetFromMatrix(countData = countdata,
                                     colData = coldata,
                                     design = ~SV1 + SV2+ score + bmicat + smoke + sex + Pb_cat)
Sb_cat <- DESeqDataSetFromMatrix(countData = countdata,
                                     colData = coldata,
                                     design = ~SV1 + SV2+ score + bmicat + smoke + sex + Sb_cat)
Se_cat <- DESeqDataSetFromMatrix(countData = countdata,
                                     colData = coldata,
                                     design = ~SV1 + SV2+ score + bmicat + smoke + sex + Se_cat)
Sr_cat <- DESeqDataSetFromMatrix(countData = countdata,
                                     colData = coldata,
                                     design = ~SV1 + SV2+ score + bmicat + smoke + sex + Sr_cat)
Zn_cat <- DESeqDataSetFromMatrix(countData = countdata,
                                     colData = coldata,
                                     design = ~SV1 + SV2+ score + bmicat + smoke + sex + Zn_cat)
# View what the experiment contains
Zn_cat

# Let's also make sure that we have the main contrast (last term) in the order we want to calculate appropriate fold change values
# In this case, we want metal=0 as denominators (first level, aka, the "non-exposed" level), and metal=1, as numerators
As_cat$As_cat <- relevel (As_cat$As_cat, "0")
Ba_cat$Ba_cat <- relevel (Ba_cat$Ba_cat, "0")
Cd_cat$Cd_cat <- relevel (Cd_cat$Cd_cat, "0")
Cu_cat$Cu_cat <- relevel (Cu_cat$Cu_cat, "0")
Hg_cat$Hg_cat <- relevel (Hg_cat$Hg_cat, "0")
Mn_cat$Mn_cat <- relevel (Mn_cat$Mn_cat, "0")
Pb_cat$Pb_cat <- relevel (Pb_cat$Pb_cat, "0")
Sb_cat$Sb_cat <- relevel (Sb_cat$Sb_cat, "0")
Se_cat$Se_cat <- relevel (Se_cat$Se_cat, "0")
Sr_cat$Sr_cat <- relevel (Sr_cat$Sr_cat, "0")
Zn_cat$Zn_cat <- relevel (Zn_cat$Zn_cat, "0")

# Need to next, estimate the size factors, since size factors are used to normalize the counts (next step)
# The "iterate" estimator iterates between estimating the dispersion with a design of ~1, and finding a size factor vector by numerically optimizing the likelihood of the ~1 model.
for (i in 1:length(cat_metalvars)){
  dds <- estimateSizeFactors(eval(parse(text = paste0(cat_metalvars[[i]]))))
  sizeFactors(dds)
  assign(paste0(cat_metalvars[[i]]),dds) 
  print(cat_metalvars[[i]])
}
for (i in 1:length(cont_metalvars)){
  dds <- estimateSizeFactors(eval(parse(text = paste0(cont_metalvars[[i]]))))
  sizeFactors(dds)
  assign(paste0(cont_metalvars[[i]]),dds) 
  print(cont_metalvars[[i]])
}


#################################################################################################
#################################################################################################
#### Pulling and exporting normalized counts, for internal records and for potential future plots/figures
#################################################################################################
#################################################################################################
# Note that code is drafted to output several forms of normalized data - so users can choose which applies to their external purposes the best
# Also note that variance stabilized (vsd) values are commonly used for figures, which requires the experiment to run and adjust for covariates below, so will export these in the code below

# normalized counts:
normcounts<- counts(dds, normalized=TRUE)
write.csv(normcounts, paste0(Output_Folder,"/", cur_date, "_NormCounts_IncludedSubjects.csv"), row.names=TRUE)

# pseudocounts to make plots easier:
ps_normcounts <- normcounts + 1
write.csv(ps_normcounts, paste0(Output_Folder,"/",cur_date, "_NormCounts_ps_IncludedSubjects.csv"),row.names=TRUE)

# log2 pseudocounts (y=log2(n+1))
log2normcounts <- log2(normcounts+1)
write.csv(log2normcounts, paste0(Output_Folder,"/", cur_date, "_NormCounts_pslog2_IncludedSubjects.csv"), row.names=TRUE)

# Export variance stabilized counts, which will be influenced by the new design above
# vsd will be useful for plots and future figure generation
# CAUTION: THIS STEP TAKES SOME TIME! 
#continuous variables
for (i in 1:length(cont_metalvars)){
  vsd <- varianceStabilizingTransformation(eval(parse(text = paste0(cont_metalvars[[i]]))), blind=FALSE)
  vsd_matrix <-as.matrix(assay(vsd))
  write.csv(vsd_matrix, paste0(Output_Folder,"/", cur_date,"_", cont_metalvars[[i]], "_VSDCounts_IncludedSubjects.csv"), row.names=TRUE)
  assign(paste0("vsd_matrix.",cont_metalvars[[i]]),vsd_matrix) 
  print(cont_metalvars[[i]])
}
#categorical variables
for (i in 1:length(cat_metalvars)){
  vsd <- varianceStabilizingTransformation(eval(parse(text = paste0(cat_metalvars[[i]]))), blind=FALSE)
  vsd_matrix <-as.matrix(assay(vsd))
  write.csv(vsd_matrix, paste0(Output_Folder,"/", cur_date,"_", cat_metalvars[[i]], "_VSDCounts_IncludedSubjects.csv"), row.names=TRUE)
  assign(paste0("vsd_matrix.",cat_metalvars[[i]]),vsd_matrix) 
  print(cat_metalvars[[i]])
}


#################
#################
#### Evaluating variance stabilized values to guage whether SVs accounted for unwanted sources of variation between samples
#################
#################

#check Zn_cat 
plotPCA(vsd, intgroup="Zn_cat", returnData=FALSE)
plotPCA(vsd, intgroup="SV1", returnData=FALSE)
#SV1 accounts for a lot of variation along PC1 as shown by the steady gradient in color across the PC1 axis
plotPCA(vsd, intgroup="SV2", returnData=FALSE)

#In sum, it looks like the SVs are accounting for a lot of the unwanted variation which is great!

#################################################################################################
#################################################################################################
#### Statistical analysis to detect significantly differentially expressed genes
#################################################################################################
#################################################################################################

# Running the differential expression statistical pipeline
# CAUTION: THIS STEP TAKES A LOT OF TIME! 
for (i in 1:length(cont_metalvars)){
  dds <- DESeq(eval(parse(text = paste0(cont_metalvars[[i]]))), betaPrior=FALSE)  # because we used a user-defined model matrix, need to set betaPrior=FALSE
  res <- results(dds, pAdjustMethod = "BH") #with multiple test correction by the default, BH (aka FDR)
  write.csv(as.data.frame(res)[order(res$padj),], paste0(Output_Folder,"/", cur_date,"_", cont_metalvars[[i]],"_AllStatResults.csv"))# Exporting statistical results
  assign(paste0("res.",cont_metalvars[[i]]),res) 
  print(cont_metalvars[[i]])
}
for (i in 1:length(cat_metalvars)){
  dds <- DESeq(eval(parse(text = paste0(cat_metalvars[[i]]))), betaPrior=FALSE)  # because we used a user-defined model matrix, need to set betaPrior=FALSE
  res <- results(dds, pAdjustMethod = "BH") #with multiple test correction by the default, BH (aka FDR)
  write.csv(as.data.frame(res)[order(res$padj),], paste0(Output_Folder,"/", cur_date,"_", cat_metalvars[[i]],"_AllStatResults.csv"))# Exporting statistical results
  assign(paste0("res.",cat_metalvars[[i]]),res) 
  print(cat_metalvars[[i]])
}

#note receiving some messages: 156 rows did not converge in beta, labelled in mcols(object)$betaConv. Use larger maxit argument with nbinomWaldTest

#################################################################################################
#################################################################################################
#### MA plots
#################################################################################################
#################################################################################################

#Continuous variables 
for (i in 1:length(cont_metalvars)){
  
  MA <- as.data.frame(eval(parse(text = paste0("res.",cont_metalvars[[i]]))))[order(eval(parse(text = paste0("res.",cont_metalvars[[i]])))$padj),]
  MA[is.na(MA)] <- 1
  MA1 <- MA[ which(MA$padj>=0.1),]  # all the rest
  MA2 <- MA[ which(MA$padj<0.1 & MA$log2FoldChange > log2(1.5)),]   # sig up-regulated w/ FC filter-FIREBRICK
  MA3 <- MA[ which(MA$padj<0.1 & MA$log2FoldChange < -log2(1.5)),]  # sig down-regulated w/ FC filter-dodgerblue4
  MA4 <- MA[ which(MA$padj<0.1 & MA$log2FoldChange > 0 & MA$log2FoldChange <= log2(1.5)),]  # sig up-regulated w/o FC filter- darkorchid3
  MA5 <- MA[ which(MA$padj<0.1 & MA$log2FoldChange < 0 & MA$log2FoldChange >= -log2(1.5)), ] # sig down-regulated w/o FC filter- chartreuse4
  
  plot <-
    ggplot(subset(MA1, baseMean>=0), aes(x = baseMean, y = log2FoldChange)) + #can subset to exclude the very low counts (<1 average count) if we want
    geom_point(color="gray56", size = 1, alpha=0.5) + 
    geom_point(data = MA2, color="firebrick", size=1.5, show.legend = TRUE) + # colors used: firebrick, dodgerblue4, chocolate1, darkorchid3, chartreuse4
    geom_point(data = MA3, color="dodgerblue4", size=1.5, show.legend = TRUE) +
    geom_point(data = MA4, color="darkorchid3", size=1.5, show.legend = TRUE) +
    geom_point(data = MA5, color="chartreuse4", size=1.5, show.legend = TRUE) +
    theme_bw()+ theme(axis.text.x = element_text(size = 10), axis.text.y = element_text(size=10)) +
    scale_x_continuous(trans = "log10", breaks=c(1,10,100, 1000, 10000, 100000, 1000000), labels=c("1","10","100", "1000", "10000", "100000", "1000000")) + 
    xlab("Expression (Normalized Count)") + 
    ylab("(log2)FC, each log unit increase") +
    geom_hline(yintercept=0)+
    ggtitle(paste0(cont_metalvars[[i]])) +
    theme(plot.title = element_text(hjust = 0.5))
  
  assign(paste0("plot.",cont_metalvars[[i]]),plot) 
}

print(cont_metalvars)
all_cont <- ggarrange(plot.As_ngg_log, plot.Ba_ngg_log, plot.Cd_ngg_log, plot.Cu_ugg_log, plot.Hg_ngg_log, plot.Mn_ugg_log, 
                     plot.Pb_ngg_log, plot.Sb_ngg_log, plot.Se_ugg_log, plot.Sr_ugg_log, plot.Zn_ugg_log,
                     nrow=4, ncol=3)

all_cont

pdf(paste0(Output_Folder, "/", cur_date, "_","continuousvariables", "_MAplot.pdf"),
    width = 10, height = 12)
all_cont
dev.off()


pdf(paste0(Output_Folder, "/", cur_date, "_","continuousvariables", "_MAplot.pdf"),
    width = 10, height = 12)
all_cont
dev.off()
pdf(paste0(Output_Folder, "/", cur_date, "_","continuousvariables", "_MAplot_As.pdf"),
    width = 8, height = 6)
plot.As_ngg_log
dev.off()
pdf(paste0(Output_Folder, "/", cur_date, "_","continuousvariables", "_MAplot_Ba.pdf"),
    width = 8, height = 6)
plot.Ba_ngg_log
dev.off()
pdf(paste0(Output_Folder, "/", cur_date, "_","continuousvariables", "_MAplot_Cd.pdf"),
    width = 8, height = 6)
plot.Cd_ngg_log
dev.off()
pdf(paste0(Output_Folder, "/", cur_date, "_","continuousvariables", "_MAplot_Cu.pdf"),
    width = 8, height = 6)
plot.Cu_ugg_log
dev.off()
pdf(paste0(Output_Folder, "/", cur_date, "_","continuousvariables", "_MAplot_Hg.pdf"),
    width = 8, height = 6)
plot.Hg_ngg_log
dev.off()
pdf(paste0(Output_Folder, "/", cur_date, "_","continuousvariables", "_MAplot_Mn.pdf"),
    width = 8, height = 6)
plot.Mn_ugg_log
dev.off()
pdf(paste0(Output_Folder, "/", cur_date, "_","continuousvariables", "_MAplot_Pb.pdf"),
    width = 8, height = 6)
plot.Pb_ngg_log
dev.off()
pdf(paste0(Output_Folder, "/", cur_date, "_","continuousvariables", "_MAplot_Sb.pdf"),
    width = 8, height = 6)
plot.Sb_ngg_log
dev.off()
pdf(paste0(Output_Folder, "/", cur_date, "_","continuousvariables", "_MAplot_Se.pdf"),
    width = 8, height = 6)
plot.Se_ugg_log
dev.off()
pdf(paste0(Output_Folder, "/", cur_date, "_","continuousvariables", "_MAplot_Sr.pdf"),
    width = 8, height = 6)
plot.Sr_ugg_log
dev.off()
pdf(paste0(Output_Folder, "/", cur_date, "_","continuousvariables", "_MAplot_Zn.pdf"),
    width = 8, height = 6)
plot.Zn_ugg_log
dev.off()


#Categorical variables
for (i in 1:length(cat_metalvars)){
  
  MA <- as.data.frame(eval(parse(text = paste0("res.",cat_metalvars[[i]]))))[order(eval(parse(text = paste0("res.",cat_metalvars[[i]])))$padj),]
  MA[is.na(MA)] <- 1
  MA1 <- MA[ which(MA$padj>=0.1),]  # all the rest
  MA2 <- MA[ which(MA$padj<0.1 & MA$log2FoldChange > log2(1.5)),]   # sig up-regulated w/ FC filter-FIREBRICK
  MA3 <- MA[ which(MA$padj<0.1 & MA$log2FoldChange < -log2(1.5)),]  # sig down-regulated w/ FC filter-dodgerblue4
  MA4 <- MA[ which(MA$padj<0.1 & MA$log2FoldChange > 0 & MA$log2FoldChange <= log2(1.5)),]  # sig up-regulated w/o FC filter- darkorchid3
  MA5 <- MA[ which(MA$padj<0.1 & MA$log2FoldChange < 0 & MA$log2FoldChange >= -log2(1.5)), ] # sig down-regulated w/o FC filter- chartreuse4
  
  plot <-
    ggplot(subset(MA1, baseMean>=0), aes(x = baseMean, y = log2FoldChange)) + #can subset to exclude the very low counts (<1 average count) if we want
    geom_point(color="gray56", size = 1, alpha=0.5) + 
    geom_point(data = MA2, color="firebrick", size=1.5, show.legend = TRUE) + # colors used: firebrick, dodgerblue4, chocolate1, darkorchid3, chartreuse4
    geom_point(data = MA3, color="dodgerblue4", size=1.5, show.legend = TRUE) +
    geom_point(data = MA4, color="darkorchid3", size=1.5, show.legend = TRUE) +
    geom_point(data = MA5, color="chartreuse4", size=1.5, show.legend = TRUE) +
    theme_bw()+ theme(axis.text.x = element_text(size = 10), axis.text.y = element_text(size=10)) +
    scale_x_continuous(trans = "log10", breaks=c(1,10,100, 1000, 10000, 100000, 1000000), labels=c("1","10","100", "1000", "10000", "100000", "1000000")) + 
    xlab("Expression (Normalized Count)") + 
    ylab("(log2)FC, >= vs. < median") +
    geom_hline(yintercept=0)+
    ggtitle(paste0(cat_metalvars[[i]])) +
    theme(plot.title = element_text(hjust = 0.5))

  assign(paste0("plot.",cat_metalvars[[i]]),plot) 
}
#export plots 
print(cat_metalvars)
all_cat <- ggarrange(plot.As_cat, plot.Ba_cat, plot.Cd_cat, plot.Cu_cat, plot.Hg_cat, plot.Mn_cat, 
                     plot.Pb_cat, plot.Sb_cat, plot.Se_cat, plot.Sr_cat, plot.Zn_cat,
                     nrow=4, ncol=3)

all_cat

pdf(paste0(Output_Folder, "/", cur_date, "_","categoricalvariables", "_MAplot.pdf"),
      width = 8, height = 10)
      all_cat
dev.off()

#export individual plots with DE genes 
pdf(paste0(Output_Folder, "/", cur_date, "_","categoricalvariables", "_MAplot_Ba.pdf"),
    width = 8, height = 6)
plot.Ba_cat
dev.off()
pdf(paste0(Output_Folder, "/", cur_date, "_","categoricalvariables", "_MAplot_Cd.pdf"),
    width = 8, height = 6)
plot.Cd_cat
dev.off()
pdf(paste0(Output_Folder, "/", cur_date, "_","categoricalvariables", "_MAplot_Cu.pdf"),
    width = 8, height = 6)
plot.Cu_cat
dev.off()
pdf(paste0(Output_Folder, "/", cur_date, "_","categoricalvariables", "_MAplot_Hg.pdf"),
    width = 8, height = 6)
plot.Hg_cat
dev.off()
pdf(paste0(Output_Folder, "/", cur_date, "_","categoricalvariables", "_MAplot_Mn.pdf"),
    width = 8, height = 6)
plot.Mn_cat
dev.off()
pdf(paste0(Output_Folder, "/", cur_date, "_","categoricalvariables", "_MAplot_Pb.pdf"),
    width = 8, height = 6)
plot.Pb_cat
dev.off()


#################################################################################################
#################################################################################################
#### Volcano plots
#################################################################################################
#################################################################################################
#categorical variables 
for (i in 1:length(cat_metalvars)){
    siggenes <- as.data.frame(eval(parse(text = paste0("res.",cat_metalvars[[i]]))))[order(eval(parse(text = paste0("res.",cat_metalvars[[i]])))$padj),]
    siggenes <- as.data.frame(siggenes) %>% 
      rownames_to_column(var="gene") %>% 
      filter(padj <0.1) %>% 
      mutate(gene=as.character(gene)) 
    siggenes<-as.vector(siggenes$gene)
    
    FC_pvalue <- EnhancedVolcano(eval(parse(text = paste0("res.",cat_metalvars[[i]]))),
                                 lab = rownames(eval(parse(text = paste0("res.",cat_metalvars[[i]])))),
                                 selectLab = c(siggenes),
                                 x = 'log2FoldChange',
                                 y = 'padj',
                                 pCutoff= 0.1,
                                 FCcutoff = log2(1.5),
                                 cutoffLineWidth = 0.6,
                                 title="",
                                 ylim = c(0,5),
                                 subtitle = "",
                                 caption = "",
                                 gridlines.minor = FALSE,
                                 legendPosition = 'bottom',
                                 legendLabSize = 12,
                                 legendIconSize = 3.0,
                                 axisLabSize = 12,
                                 legend = c("Not sig.", "Log2 FC>1.5","adj p <0.1","Log2 FC>1.5 & adj p <0.1"))
    
    FC_pvalue <- FC_pvalue + ggtitle(paste0(cat_metalvars[[i]])) +theme(title = element_text(size=5))
    
    assign(paste0("plotvolcano.",cat_metalvars[[i]]),FC_pvalue)
}
all_cat <- ggarrange(plotvolcano.As_cat, plotvolcano.Ba_cat, plotvolcano.Cd_cat, plotvolcano.Cu_cat, plotvolcano.Hg_cat, plotvolcano.Mn_cat, 
                     plotvolcano.Pb_cat, plotvolcano.Sb_cat, plotvolcano.Se_cat, plotvolcano.Sr_cat, plotvolcano.Zn_cat,
                     nrow=4, ncol=3, common.legend=TRUE)

all_cat

pdf(paste0(Output_Folder, "/", cur_date, "_","categoricalvariables", "_volcanoplot.pdf"),
    width = 10, height = 20)
all_cat
dev.off()

#continuous variables 
for (i in 1:length(cont_metalvars)){
  siggenes <- as.data.frame(eval(parse(text = paste0("res.",cont_metalvars[[i]]))))[order(eval(parse(text = paste0("res.",cont_metalvars[[i]])))$padj),]
  siggenes <- as.data.frame(siggenes) %>% 
    rownames_to_column(var="gene") %>% 
    filter(padj <0.1) %>% 
    mutate(gene=as.character(gene)) 
  siggenes<-as.vector(siggenes$gene)
  
  FC_pvalue <- EnhancedVolcano(eval(parse(text = paste0("res.",cont_metalvars[[i]]))),
                               lab = rownames(eval(parse(text = paste0("res.",cont_metalvars[[i]])))),
                               selectLab = c(siggenes),
                               x = 'log2FoldChange',
                               y = 'padj',
                               pCutoff= 0.1,
                               FCcutoff = log2(1.5),
                               cutoffLineWidth = 0.6,
                               title="",
                               ylim = c(0,5),
                               subtitle = "",
                               caption = "",
                               gridlines.minor = FALSE,
                               legendPosition = 'bottom',
                               legendLabSize = 12,
                               legendIconSize = 3.0,
                               axisLabSize = 12,
                               legend = c("Not sig.", "Log2 FC>1.5","adj p <0.1","Log2 FC>1.5 & adj p <0.1"))
  
  FC_pvalue <- FC_pvalue + ggtitle(paste0(cont_metalvars[[i]])) +theme(title = element_text(size=5))
  
  assign(paste0("plotvolcano.",cont_metalvars[[i]]),FC_pvalue)
}
print(cont_metalvars)
all_cont <- ggarrange(plotvolcano.As_ngg_log, plotvolcano.Ba_ngg_log, plotvolcano.Cd_ngg_log, plotvolcano.Cu_ugg_log, plotvolcano.Hg_ngg_log, plotvolcano.Mn_ugg_log, 
                     plotvolcano.Pb_ngg_log, plotvolcano.Sb_ngg_log, plotvolcano.Se_ugg_log, plotvolcano.Sr_ugg_log, plotvolcano.Zn_ugg_log,
                     nrow=4, ncol=3, common.legend=TRUE)

all_cont

pdf(paste0(Output_Folder, "/", cur_date, "_","continuousvariables", "_volcanoplot.pdf"),
    width = 10, height = 20)
all_cont
dev.off()

#################################################################################################
#################################################################################################
#### Save everything in environment that might be useful in future analyses
#################################################################################################
#################################################################################################

allres <- mget(ls(pattern = "res."))
allsvseqs <- mget(ls(pattern = "svseq."))
allvsd <- mget(ls(pattern = "vsd_matrix."))
deseqobj_cat <- mget(cat_metalvars)
deseqobj_cont <- mget(cont_metalvars)
plots <-mget(ls(pattern = "plot"))

save(allres, allsvseqs, allvsd,deseqobj_cat,deseqobj_cont,plots,
     file = file.path(paste0(Output_Folder,"/",cur_date,"_DESeq2_Analysis_Data.RData")))