This notebook contains the workflow used for the processing and analysis of RNA-Seq and ATAC-Seq data underlying the manuscript "Investigation of the effects of phthalates on in vitro thyroid models with RNA-Seq and ATAC-Seq".

RNA-Seq data analysis

1 - Raw sequencing data processing

Raw sequencing data were processed with the script available at https://github.com/marta-nazzari/CODA/blob/main/CODA_preprocess.sh.

2 - Differential expression analysis

# load required packages
library(DESeq2)
library(magrittr)
library(dplyr)
library(data.table)
library(edgeR)
library(ggrepel)
library(biomaRt)
library(PCAtools)
library(apeglm)
library(RUVSeq)

# Define %notin% function
`%notin%` = Negate(`%in%`) 

# General parameters
Control = 'DMSO'  # reference condition
FDR = 0.01        # adjusted p-value (FDR) cutoff for calling differential expression
k_ruv = 2         # number of factors of unwanted variation estimated by RUVg
mart = read.table('D:/files/mouse_ensgid_to_gene_name.txt', header = T, fill = T) # table mapping Ensembl gene IDs to gene names and biotypes

Drugs = c('DEHP', 'DIDP', 'DINP', 'DnOP')
  
# Working directories
MAIN.DIR = '/project_folder/RNA-Seq/DE_analysis/'

if (dir.exists(MAIN.DIR)) { 
setwd(MAIN.DIR) 
print('Main directory exists')
} else { 
dir.create(MAIN.DIR)
setwd(MAIN.DIR) 
print('Created working directory')
}

# Raw gene read count table
genes = read.table('project_folder/RNA-Seq/DE_analysis/genes.data.tsv', header = TRUE)
rownames(genes) %<>% sub('\\.(\\d){1,2}', '', .) # remove gene version

# Raw miRNA read count table
mirna = read.table('project_folder/RNA-Seq/DE_analysis/miR_counts.txt', header = T)

# read the metadata table
samplekey = read.table('project_folder/RNA-Seq/DE_analysis/metadata.txt', header = T) %>% 
tibble::column_to_rownames(var = 'SampleName') %>% 
mutate_all(as.factor)

# Exclude sample DMSO_ctrl_1 because it is an outlier
samplekey %<>% filter(rownames(.) != 'DMSO_ctrl_1')

# update gene and mirna tables
genes %<>% dplyr::select(-DMSO_ctrl_1) 
mirna %<>% dplyr::select(-DMSO_ctrl_1) 

# colorblind palette for plotting mapped reads
cbPalette = c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2")


# Output directories for unfiltered and filtered DE results
# (names assumed here; `filtered_dir` matches the DE_filtered folder read later in this notebook)
unfiltered_dir = paste0(MAIN.DIR, 'DE_unfiltered/')
filtered_dir = paste0(MAIN.DIR, 'DE_filtered/')

# Where lists of genes that pass or fail the filters go
filters_dir = paste0(filtered_dir, 'Filtered_genes/')

if (dir.exists(unfiltered_dir)) { print('Directory exists') } else { 
  dir.create(unfiltered_dir)
  print('Created directory') 
}

if (dir.exists(filtered_dir)) { print('Directory exists') } else { 
  dir.create(filtered_dir)
  print('Created directory') 
}

if (dir.exists(filters_dir)) { print('Directory exists') } else { 
  dir.create(filters_dir)
  print('Created directory') 
}  

# PREPARE GENES TABLE
# check which samples are in `samplekey` but NOT in `genes`, and vice versa
rownames(samplekey) %notin% colnames(genes) %>% rownames(samplekey)[.]
colnames(genes) %notin% rownames(samplekey) %>% colnames(genes)[.]

# show colnames in alphabetical order
colnames(genes) %>% order %>% colnames(genes)[.]

# keep only samples in both `genes` and `samplekey`
common_samples = intersect(colnames(genes), rownames(samplekey))
samplekey %<>% .[rownames(samplekey) %in% common_samples, ] 
genes %<>% .[, colnames(genes) %in% common_samples] 

# reorder samples in genes to match the order of rownames(samplekey)
genes %<>% data.table::setcolorder(., rownames(samplekey))

# Check. Needed for DESeq2
colnames(genes) == rownames(samplekey)

# PREPARE MIRNA TABLE
# check which samples are in `samplekey` but NOT in `mirna`, and vice versa
rownames(samplekey) %notin% colnames(mirna) %>% rownames(samplekey)[.]
colnames(mirna) %notin% rownames(samplekey) %>% colnames(mirna)[.]

# show colnames in alphabetical order
colnames(mirna) %>% order %>% colnames(mirna)[.]

# keep only samples in both `mirna` and `samplekey`
common_samples = intersect(colnames(mirna), rownames(samplekey))
samplekey %<>% .[rownames(samplekey) %in% common_samples, ] 
mirna %<>% .[, colnames(mirna) %in% common_samples] 

# reorder samples in mirna to match the order of rownames(samplekey)
mirna %<>% data.table::setcolorder(., as.character(rownames(samplekey)))


# plot the number of aligned reads per sample
samplekey %<>% mutate(read_count_genes = colSums(genes))
samplekey %<>% mutate(read_count_mirna = colSums(mirna))

ggplot(samplekey, aes(x = rownames(samplekey), y = read_count_genes, fill = Compound)) +
  geom_bar(stat = 'identity') +
  scale_x_discrete(breaks = samplekey$SampleCode, labels = rownames(samplekey)) +
  scale_fill_manual(values = cbPalette) +
  ggtitle('Genes aligned reads') +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
        axis.title.x = element_blank(),
        plot.title = element_text(hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5))
ggsave('genes_read_count.png', width = 8000, height = 6000, dpi = 600, units = 'px')

ggplot(samplekey, aes(x = rownames(samplekey), y = read_count_mirna, fill = Compound)) +
  geom_bar(stat = 'identity') + 
  scale_fill_manual(values = cbPalette) +
  ggtitle('miRNA aligned reads') +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
        axis.title.x = element_blank(),
        plot.title = element_text(hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5))
ggsave('mirna_read_count.png', width = 8000, height = 6000, dpi = 600, units = 'px')


# Remove snoRNA genes. To analyze snoRNAs, change `!=` to `==`
remove_snorna = TRUE # not defined elsewhere in this chunk; assumed TRUE so that snoRNAs are removed as described
if (remove_snorna == T) { genes %<>% .[rownames(.) %in% mart$ensembl_gene_id[mart$gene_biotype != 'snoRNA'], ] }



###
# DE analysis genes
###

# PART 1: DE ANALYSIS
DEanalysisCompound = function (COMPARISON, DRUG, CONTROL, RNA, CONTRAST.FACTOR, OUTPUT.DIR) {	
  
  # Select output directory
  if (dir.exists(OUTPUT.DIR)) { 
    setwd(OUTPUT.DIR) 
  } else { 
    dir.create(OUTPUT.DIR)
    setwd(OUTPUT.DIR) }
  
  # Define DESeq2 design 
  DE_Design = samplekey[c(grep(DRUG[COMPARISON], samplekey$Compound), grep(CONTROL, samplekey$Compound)), ]
  
  # Select samples for DESeq2 analysis
  samples = RNA[, colnames(RNA) %in% rownames(DE_Design)]
  
  ColData = samplekey[rownames(samplekey) %in% colnames(samples), ] %>% droplevels
  
  print(paste(DRUG[COMPARISON], "vs", CONTROL))		
  
  dds = DESeqDataSetFromMatrix(countData = round(samples), 
                               colData = ColData,
                               design = ~ Compound)
  
  dds %<>% DESeq()
  
  res = results(dds, alpha = FDR, contrast = c(CONTRAST.FACTOR, DRUG[COMPARISON], CONTROL), pAdjustMethod = 'fdr')
  res = res[order(res$padj), ]	
  
  DEgenes = subset(res, res$padj < FDR)	
  DEgenes = DEgenes[order(DEgenes$padj), ]
  
  # Counts (all genes)
  normCounts = counts(dds, normalized = TRUE)
  
  ## Export tables
  FileName = paste(DRUG[COMPARISON], 'vs', CONTROL, 'FDR', FDR, sep = '_')		
  
  # results (all genes)
  write.table(res, file = paste0(FileName, '_Results_unfiltered.txt'), sep = '\t', quote = FALSE)
  
  # Normalized read count	(all genes)	
  write.table(normCounts, file = paste0(FileName, '_Norm_count_unfiltered.txt'), sep = '\t', quote = FALSE)
  
  # Normalized read count (DE genes)
  write.table(normCounts[rownames(normCounts) %in% rownames(DEgenes), ], file = paste0(FileName, '_Norm_count_DEG.txt'), sep = '\t', quote = F)
  
  # DE gene names only
  write.table(rownames(DEgenes), file = paste0(FileName, '_DEG_names.txt'), sep = '\t', quote = F, row.names = F, col.names = F)
  
  # assign variables to global env
  assign('dds', dds, envir = .GlobalEnv)
}	

# FILTER 1 - LOW COUNT: PASS if at least 75% of samples in either group have CPM >=1 
CPMfilter = function(COMPARISON, DRUG, CONTROL, DDS, CONTRAST.FACTOR, OUTPUT.DIR, ...) { 
  
  # Select output directory
  if (dir.exists(OUTPUT.DIR)) { 
    setwd(OUTPUT.DIR) 
  } else { 
    dir.create(OUTPUT.DIR)
    setwd(OUTPUT.DIR) }
  
  # Convert dds counts to CPM
  CPMdds = cpm(counts(DDS, normalized = TRUE))
  
  # Count how many DRUG and CONTROL samples there are
  nDrug = grep(DRUG[COMPARISON], colnames(CPMdds)) %>% length
  nCont = grep(CONTROL, colnames(CPMdds)) %>% length
  
  # Create df with only DRUG or CONTROL samples
  CPM_drug = CPMdds %>% as.data.frame %>% dplyr::select(grep(DRUG[COMPARISON], colnames(CPMdds)))
  CPM_drug[, 'PassFail'] = NA
  
  CPM_cont = CPMdds %>% as.data.frame %>% dplyr::select(grep(CONTROL, colnames(CPMdds)))
  CPM_cont[, 'PassFail'] = NA
  
  for (x in 1:nrow(CPM_drug)) {
    # Vector for gene x with TRUE if CPM >= 1, FALSE if CPM < 1
    countCPMpass_drug = CPM_drug %>% .[x,-ncol(CPM_drug)] >= 1  # Exclude PassFail column
    countCPMpass_cont = CPM_cont %>% .[x,-ncol(CPM_cont)] >= 1  # Exclude PassFail column
    
    # Sum samples for which CPM >= 1
    sumCPMpass_drug = countCPMpass_drug[countCPMpass_drug == TRUE] %>% length
    sumCPMpass_cont = countCPMpass_cont[countCPMpass_cont == TRUE] %>% length
    # If this value is at least 75% of the total samples in the group, the gene passes the filter
    if (sumCPMpass_drug >= 0.75*nDrug) { CPM_drug$PassFail[x] = 'PASS' }
    else { CPM_drug$PassFail[x] = 'FAIL' }
    if (sumCPMpass_cont >= 0.75*nCont) { CPM_cont$PassFail[x] = 'PASS' }
    else { CPM_cont$PassFail[x] = 'FAIL' }
  }
  
  # Select genes that pass the filter in DRUG or CONTROL
  CPMpass = data.frame(genes = rownames(DDS),
                       drug_filter = CPM_drug$PassFail,
                       ctrl_filter = CPM_cont$PassFail) %>%
    mutate(PassFail = if_else(drug_filter == 'PASS' | ctrl_filter == 'PASS', 'PASS', 'FAIL'))
  
  CPMpass = CPMpass$genes[CPMpass$PassFail == 'PASS'] %>% #droplevels %>% as.vector
    as.vector()
  
  # Extract dds results that pass FILTER 1
  res = results(DDS[rownames(DDS) %in% CPMpass, ], 
                alpha = 0.01, cooksCutoff = F, independentFiltering = F, 
                contrast = c(CONTRAST.FACTOR, DRUG[COMPARISON], CONTROL), 
                pAdjustMethod = 'fdr')
  
  # Apply RUVSeq
  set = newSeqExpressionSet(counts(DDS))
  png(paste0(DRUG[COMPARISON], '_before_RUVg.png'), width = 7000, height = 4000, units = 'px', res = 600)
  par(mar = c(8,5,2,2), mfrow = c(1, 2))
  plotRLE(set, outline = FALSE, col = ifelse(grepl(DRUG[COMPARISON], colnames(DDS)), 'red', 'blue'), las = 2,
          main = 'DESeq2 normalized data, no RUVSeq applied')
  plotPCA(set, col = ifelse(grepl(DRUG[COMPARISON], colnames(DDS)), 'red', 'blue'), 
          las = 2, pch = 20, labels = F,
          main = 'DESeq2 normalized data, no RUVSeq applied')
  dev.off()
  
  set = betweenLaneNormalization(set[rownames(set) %in% CPMpass, ], which = "upper")
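  # RUVg step (empirical control genes, as in the RUVSeq vignette): genes with a first-pass
  # p-value > 0.1 are treated as 'not DE' and used to estimate k_ruv factors of unwanted variation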
  not.sig = rownames(res)[which(res$pvalue > .1)]
  empirical = rownames(set)[ rownames(set) %in% not.sig ]
  set = RUVg(set, empirical, k = k_ruv)
  pData(set)
  
  png(paste0(DRUG[COMPARISON], '_after_RUVg.png'), width = 7000, height = 4000, units = 'px', res = 600)
  par(mar = c(8,5,2,2), mfrow = c(1, 2))
  plotRLE(set, outline = FALSE, col = ifelse(grepl(DRUG[COMPARISON], colnames(DDS)), 'red', 'blue'), las = 2,
          main = paste0('DESeq2 normalized data, RUVg (k = ', k_ruv, ')'))
  plotPCA(set, col = ifelse(grepl(DRUG[COMPARISON], colnames(DDS)), 'red', 'blue'), 
          las = 2, pch = 20, labels = F,
          main = paste0('DESeq2 normalized data, RUVg (k = ', k_ruv, ')'))
  dev.off()
  
  # plot the factors estimated by RUV
  # par(mfrow = c(2, 1), mar = c(3,5,3,1))
  # for (i in 1:2) {
  #   stripchart(as.formula(paste0('pData(set)[, i] ~ DDS$', CONTRAST.FACTOR)), vertical = TRUE, main = paste0("W", i))
  #   abline(h = 0)
  # }
  
  # control for these factors by adding them to the DESeqDataSet and to the design
  DDSRUV = DDS
  DDSRUV$W1 = set$W_1
  DDSRUV$W2 = set$W_2
  design(DDSRUV) = as.formula(paste0('~ W1 + W2 + ', CONTRAST.FACTOR))
  
  DDSRUV %<>% DESeq()
  
  # re-estimate the DESeq2 parameters and results
  res = results(DDSRUV[rownames(DDSRUV) %in% CPMpass, ], 
                alpha = 0.01, cooksCutoff = F, independentFiltering = F, 
                contrast = c(CONTRAST.FACTOR, DRUG[COMPARISON], CONTROL), 
                pAdjustMethod = 'fdr')  
  
  # Normalized read count for genes that pass FILTER 1
  normData_all = counts(DDSRUV[rownames(DDSRUV) %in% CPMpass, ], normalized = T)
  
  # DE genes (choose FDR)
  DEgenes = subset(res, res$padj < FDR)
  
  # Normalized read count of DE genes
  normData_DEG = normData_all[rownames(normData_all) %in% rownames(DEgenes), , drop = F]
  
  # Print info on filtering
  print(paste0(nrow(CPM_drug), ' genes in total.'))
  print(paste0(length(CPMpass), ' genes passed the CPM filter.'))
  print(paste0(nrow(CPM_drug[rownames(CPM_drug) %notin% rownames(res), ]), ' genes failed the CPM filter.'))
  print(paste0(nrow(DEgenes), ' genes that passed the CPM filter are differentially expressed (FDR < ', FDR, ').'))
  print(paste0(nrow(res[rownames(res) %notin% rownames(DEgenes), ]), ' genes that passed the CPM filter are not differentially expressed.'))
  
  # Save list of genes that pass FILTER 1
  write.table(CPMpass, file = paste0('Filtered_genes/', DRUG[COMPARISON], ..., '_vs_', CONTROL, '_CPMpass.txt'), sep = '\t', quote = FALSE, row.names = FALSE, col.names = FALSE)
  
  # Save results of DE analysis (for the volcano plot)
  FileName = paste(DRUG[COMPARISON], ...,  'vs', CONTROL, 'FDR', FDR, sep = '_')
  write.table(res, file = paste0(FileName, '_Results_all.txt'), sep = '\t', quote = F)
  write.table(normData_all, file = paste0(FileName, '_Norm_count_all.txt'), sep = '\t', quote = F)
  write.table(normData_DEG, file = paste0(FileName, '_Norm_count_DEG_prefilters.txt'), sep = '\t', quote = F)
  
  # assign variables to global env 
  assign('res', DEgenes, envir = .GlobalEnv)
  assign('normData_DEG', normData_DEG, envir = .GlobalEnv)
  assign('CPMpass', CPMpass, envir = .GlobalEnv)	
}
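
The per-gene loop in CPMfilter() applies the low-count rule literally. For reference only (not part of the pipeline), the same "at least 75% of samples in either group with CPM >= 1" rule can be written without a loop; the sketch below demonstrates it on a small made-up CPM matrix.

# Illustration only: vectorized form of the CPM filter rule on a toy matrix
# (4 'drug' columns, 4 'control' columns)
cpm_toy = matrix(c(5, 5, 5, 0,  0, 0, 0, 0,   # gene A: passes in the drug group (3/4 samples with CPM >= 1)
                   0, 0, 0, 0,  2, 2, 2, 2,   # gene B: passes in the control group (4/4)
                   0, 1, 0, 0,  0, 0, 1, 0),  # gene C: fails in both groups
                 nrow = 3, byrow = TRUE,
                 dimnames = list(c('geneA', 'geneB', 'geneC'), NULL))
drug_cols = 1:4
cont_cols = 5:8
pass = rowSums(cpm_toy[, drug_cols] >= 1) >= 0.75 * length(drug_cols) |
  rowSums(cpm_toy[, cont_cols] >= 1) >= 0.75 * length(cont_cols)
rownames(cpm_toy)[pass] # "geneA" "geneB"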


# FILTER 2 - SPIKE: DE gene PASSES if (Max/Sum) < 1.4*Samples of a group^(-0.66)
spikeFilter = function(COMPARISON, DRUG, CONTROL, NORM.DEG, OUTPUT.DIR, ...) {
  
  if (dir.exists(OUTPUT.DIR)) { 
    setwd(OUTPUT.DIR) 
  } else { 
    dir.create(OUTPUT.DIR)
    setwd(OUTPUT.DIR) }
  
  Drug = grep(DRUG[COMPARISON], colnames(NORM.DEG))
  Cont = grep(CONTROL, colnames(NORM.DEG))
  
  # Check genes in DRUG group
  Check_drug = data.frame(Max = apply(NORM.DEG[, Drug, drop = F], 1, max),
                          Sum = apply(NORM.DEG[, Drug, drop = F], 1, sum),
                          Length = length(Drug)) %>% 
    mutate(SpikeCheck = if_else( (Max/Sum) < (1.4*Length^(-0.66)), 'PASS', 'FAIL')) 
  
  
  # Check genes in CONTROL group
  Check_cont = data.frame(Max = apply(NORM.DEG[, Cont, drop = F], 1, max),
                          Sum = apply(NORM.DEG[, Cont, drop = F], 1, sum),
                          Length = length(Cont)) %>% 
    mutate(SpikeCheck = if_else( (Max/Sum) < (1.4*Length^(-0.66)) , 'PASS', 'FAIL')) 
  
  # If read count in all replicates is 0, SpikeCheck = <NA>
  # Change all <NA> to PASS (done before extracting the PASS genes from either group)
  Check_drug[is.na(Check_drug)] = 'PASS' 
  Check_cont[is.na(Check_cont)] = 'PASS' 
  
  # Extract genes that PASS in DRUG
  SpikePass_drug = Check_drug[grep('PASS', Check_drug$SpikeCheck), ] %>% rownames
  
  # Extract genes that PASS in CONTROL
  SpikePass_cont = Check_cont[grep('PASS', Check_cont$SpikeCheck), ] %>% rownames
  
  # Intersect the common genes: list of genes that pass the spike filter
  SpikePass = intersect(SpikePass_drug, SpikePass_cont) 
  SpikeFail = NORM.DEG[rownames(NORM.DEG) %notin% SpikePass, , drop = F] %>% rownames	# drop = F is needed when length(SpikeFail)=1
  
  # print info on filtering
  print(paste0(length(SpikePass), ' genes passed the Spike filter.'))
  print(paste0(length(SpikeFail), ' genes failed the Spike filter.'))
  
  # Save list of genes that pass FILTER 2
  write.table(SpikePass, file = paste(DRUG[COMPARISON], ..., 'vs', CONTROL, 'SpikePass.txt', sep = '_'), sep = '\t', quote = FALSE, row.names = FALSE, col.names = FALSE)
  write.table(SpikeFail, file = paste(DRUG[COMPARISON], ..., 'vs', CONTROL, 'SpikeFail.txt', sep = '_'), sep = '\t', quote = FALSE, row.names = FALSE, col.names = FALSE)
  
  # assign variable to global env
  assign('SpikePass', SpikePass, envir = .GlobalEnv)
}
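
The spike-filter cutoff 1.4 * n^(-0.66) depends only on the number of replicates n in a group. For reference only, the snippet below tabulates the maximum fraction of a group's total normalized counts that a single sample may contribute before a DE gene is flagged as a spike (e.g. about 43% with 6 replicates).

# For reference: spike-filter threshold as a function of group size n
n = 3:6
data.frame(n, max_fraction = round(1.4 * n^(-0.66), 3))
#   n max_fraction
# 1 3        0.678
# 2 4        0.561
# 3 5        0.484
# 4 6        0.429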

# Function that keeps normalized data of DE genes that pass FILTER 2 
filterNormData = function(COMPARISON, DRUG, CONTROL, SPIKEPASS, RES.DEG, NORM.DEG, OUTPUT.DIR, ...) {
  
  # Select output directory
  if (dir.exists(OUTPUT.DIR)) { 
    setwd(OUTPUT.DIR) 
  } else { 
    dir.create(OUTPUT.DIR)
    setwd(OUTPUT.DIR) }
  
  # Extract genes that pass FILTER 2
  idx = SPIKEPASS
  
  FileName = paste(DRUG[COMPARISON], ..., 'vs', CONTROL, 'FDR', FDR, sep = '_')	
  
  # Results of ONLY DE genes that PASS all filters
  res_filtered = RES.DEG[rownames(RES.DEG) %in% idx, , drop = F]
  
  # Normalized read count of ONLY DE genes that PASS all filters
  normData_filtered = NORM.DEG[rownames(NORM.DEG) %in% idx, , drop = F]
  
  # Info on all filters applied
  print(paste0(nrow(res_filtered), ' out of ', nrow(RES.DEG), ' DE genes passed the Spike filter - ', nrow(RES.DEG[rownames(RES.DEG) %notin% idx, ]), ' DE genes excluded.'))
  
  # Save tables
  write.table(res_filtered, file = paste0(FileName, '_Results_DEG.txt'), sep = '\t', quote = FALSE)
  write.table(rownames(res_filtered), file = paste0(FileName, '_DEG_names.txt'), sep = '\t', quote = FALSE, row.names = FALSE, col.names = FALSE)	
  write.table(normData_filtered, file = paste0(FileName, '_Norm_count_DEG.txt'), sep = '\t', quote = FALSE)
  
  # assign variable to global env
  assign('res_filtered', res_filtered, envir = .GlobalEnv)
}


###
# DE ANALYSIS miRNAs
###

# PART 1: DE ANALYSIS
DEanalysisCompound_miRNA = function (COMPARISON, DRUG, CONTROL, RNA, CONTRAST.FACTOR, OUTPUT.DIR) {	
  
  # Select output directory
  if (dir.exists(OUTPUT.DIR)) { 
    setwd(OUTPUT.DIR) 
  } else { 
    dir.create(OUTPUT.DIR)
    setwd(OUTPUT.DIR) }
  
  # Define DESeq2 design 
  DE_Design = samplekey[c(grep(DRUG[COMPARISON], samplekey$Compound), grep(CONTROL, samplekey$Compound)), ]
  
  # Select samples for DESeq2 analysis
  samples = RNA[, rownames(DE_Design) ]
  ColData = samplekey[rownames(samplekey) %in% colnames(samples), ] %>% droplevels
  
  print(paste(DRUG[COMPARISON], "vs", CONTROL))		
  
  dds = DESeqDataSetFromMatrix(countData = round(samples), 
                               colData = ColData,
                               design = ~ Compound)
  
  dds %<>% DESeq()
  
  res = results(dds, alpha = 0.01, contrast = c(CONTRAST.FACTOR, DRUG[COMPARISON], CONTROL), pAdjustMethod = 'fdr')
  res = res[order(res$padj), ]	
  
  DEgenes = subset(res, res$padj < 0.01)	
  DEgenes = DEgenes[order(DEgenes$padj), ]
  
  # Counts (all genes)
  normCounts = counts(dds, normalized = TRUE)
  
  ## Export tables
  FileName = paste('miRNA', DRUG[COMPARISON], 'vs', CONTROL, 'FDR_0.01', sep = '_')		
  
  # results (all genes)
  write.table(res, file = paste0(FileName, '_Results_unfiltered.txt'), sep = '\t', quote = FALSE)
  
  # Normalized read count	(all genes)	
  write.table(normCounts, file = paste0(FileName, '_Norm_count_unfiltered.txt'), sep = '\t', quote = FALSE)
  
  # Normalized read count (DE genes)
  write.table(normCounts[rownames(normCounts) %in% rownames(DEgenes), ], file = paste0(FileName, '_Norm_count_DEG.txt'), sep = '\t', quote = F)
  
  # DE gene names only
  write.table(rownames(DEgenes), file = paste0(FileName, '_DEG_names.txt'), sep = '\t', quote = F, row.names = F, col.names = F)
  
  # assign variables to global env
  assign('dds', dds, envir = .GlobalEnv)
}	

# FILTER 1 - LOW COUNT: PASS if at least 75% of samples in either group have CPM >=1 
CPMfilter_miRNA = function(COMPARISON, DRUG, CONTROL, DDS, CONTRAST.FACTOR, OUTPUT.DIR, ...) { 
  
  # Select output directory
  if (dir.exists(OUTPUT.DIR)) { 
    setwd(OUTPUT.DIR) 
  } else { 
    dir.create(OUTPUT.DIR)
    setwd(OUTPUT.DIR) }
  
  # Convert dds counts to CPM
  CPMdds = cpm(counts(DDS, normalized = TRUE))
  
  # Count how many DRUG and CONTROL samples there are
  nDrug = grep(DRUG[COMPARISON], colnames(CPMdds)) %>% length
  nCont = grep(CONTROL, colnames(CPMdds)) %>% length
  
  # Create df with only DRUG or CONTROL samples
  CPM_drug = CPMdds %>% as.data.frame %>% dplyr::select(grep(DRUG[COMPARISON], colnames(CPMdds)))
  CPM_drug[, 'PassFail'] = NA
  
  CPM_cont = CPMdds %>% as.data.frame %>% dplyr::select(grep(CONTROL, colnames(CPMdds)))
  CPM_cont[, 'PassFail'] = NA
  
  for (x in 1:nrow(CPM_drug)) {
    # Vector for gene x with TRUE if CPM >= 1, FALSE if CPM < 1
    countCPMpass_drug = CPM_drug %>% .[x,-ncol(CPM_drug)] >= 1  # Exclude PassFail column
    countCPMpass_cont = CPM_cont %>% .[x,-ncol(CPM_cont)] >= 1  # Exclude PassFail column
    
    # Sum samples for which CPM >= 1
    sumCPMpass_drug = countCPMpass_drug[countCPMpass_drug == TRUE] %>% length
    sumCPMpass_cont = countCPMpass_cont[countCPMpass_cont == TRUE] %>% length
    # If this value is at least 75% of the total samples in the group, the gene passes the filter
    if (sumCPMpass_drug >= 0.75*nDrug) { CPM_drug$PassFail[x] = 'PASS' }
    else { CPM_drug$PassFail[x] = 'FAIL' }
    if (sumCPMpass_cont >= 0.75*nCont) { CPM_cont$PassFail[x] = 'PASS' }
    else { CPM_cont$PassFail[x] = 'FAIL' }
  }
  
  # Select genes that pass the filter in DRUG or CONTROL
  CPMpass = data.frame(genes = rownames(DDS),
                       drug_filter = CPM_drug$PassFail,
                       ctrl_filter = CPM_cont$PassFail) %>%
    mutate(PassFail = if_else(drug_filter == 'PASS' | ctrl_filter == 'PASS', 'PASS', 'FAIL'))
  
  CPMpass = CPMpass$genes[CPMpass$PassFail == 'PASS'] %>% #droplevels %>% as.vector
    as.vector()
  
  # Extract dds results that pass FILTER 1
  res = results(DDS[rownames(DDS) %in% CPMpass, ], 
                alpha = 0.01, cooksCutoff = F, independentFiltering = F, 
                contrast = c(CONTRAST.FACTOR, DRUG[COMPARISON], CONTROL), 
                pAdjustMethod = 'fdr')
  
  # Apply RUVSeq
  set = newSeqExpressionSet(counts(DDS))
  png(paste0('miRNA_', DRUG[COMPARISON], '_before_RUVg.png'), width = 7000, height = 4000, units = 'px', res = 600)
  par(mar = c(8,5,2,2), mfrow = c(1, 2))
  plotRLE(set, outline = FALSE, col = ifelse(grepl(DRUG[COMPARISON], colnames(DDS)), 'red', 'blue'), las = 2,
          main = 'DESeq2 normalized data, no RUVSeq applied')
  plotPCA(set, col = ifelse(grepl(DRUG[COMPARISON], colnames(DDS)), 'red', 'blue'), 
          las = 2, pch = 20, labels = F,
          main = 'DESeq2 normalized data, no RUVSeq applied')
  dev.off()
  
  set = betweenLaneNormalization(set[rownames(set) %in% CPMpass, ], which = "upper")
  not.sig = rownames(res)[which(res$pvalue > .1)]
  empirical = rownames(set)[ rownames(set) %in% not.sig ]
  set = RUVg(set, empirical, k = k_ruv)
  pData(set)
  
  png(paste0('miRNA_', DRUG[COMPARISON], '_after_RUVg.png'), width = 7000, height = 4000, units = 'px', res = 600)
  par(mar = c(8,5,2,2), mfrow = c(1, 2))
  plotRLE(set, outline = FALSE, col = ifelse(grepl(DRUG[COMPARISON], colnames(DDS)), 'red', 'blue'), las = 2,
          main = paste0('DESeq2 normalized data, RUVg (k = ', k_ruv, ')'))
  plotPCA(set, col = ifelse(grepl(DRUG[COMPARISON], colnames(DDS)), 'red', 'blue'), 
          las = 2, pch = 20, labels = F,
          main = paste0('DESeq2 normalized data, RUVg (k = ', k_ruv, ')'))
  dev.off()
  
  # control for these factors by adding them to the DESeqDataSet and to the design
  DDSRUV = DDS
  DDSRUV$W1 = set$W_1
  DDSRUV$W2 = set$W_2
  design(DDSRUV) = as.formula(paste0('~ W1 + W2 + ', CONTRAST.FACTOR))
  
  DDSRUV %<>% DESeq()
  
  # re-estimate the DESeq2 parameters and results
  res = results(DDSRUV[rownames(DDSRUV) %in% CPMpass, ], 
                alpha = 0.01, cooksCutoff = F, independentFiltering = F, 
                contrast = c(CONTRAST.FACTOR, DRUG[COMPARISON], CONTROL), 
                pAdjustMethod = 'fdr')  
  
  # Normalized read count for genes that pass FILTER 1
  normData_all = counts(DDS[rownames(DDS) %in% CPMpass, ], normalized = T)
  
  # DE genes (fdr < 0.01)
  DEgenes = subset(res, res$padj < 0.01)
  
  # Normalized read count of DE genes
  normData_DEG = normData_all[rownames(normData_all) %in% rownames(DEgenes), , drop = F]
  
  # Print info on filtering
  print(paste0(nrow(CPM_drug), ' genes in total.'))
  print(paste0(length(CPMpass), ' genes passed the CPM filter.'))
  print(paste0(nrow(CPM_drug[rownames(CPM_drug) %notin% rownames(res), ]), ' genes failed the CPM filter.'))
  print(paste0(nrow(DEgenes), ' genes that passed the CPM filter are differentially expressed (fdr < 0.01).'))
  print(paste0(nrow(res[rownames(res) %notin% rownames(DEgenes), ]), ' genes that passed the CPM filter are not differentially expressed.'))
  
  # Save list of genes that pass FILTER 1	
  write.table(CPMpass, file = paste('Filtered_genes/miRNA', DRUG[COMPARISON], ..., 'vs', CONTROL, 'CPMpass.txt', sep = '_'), sep = '\t', quote = FALSE, row.names = FALSE, col.names = FALSE)
  
  # Save results of DE analysis (for the volcano plot)
  FileName = paste('miRNA', DRUG[COMPARISON], ...,  'vs', CONTROL, 'FDR_0.01', sep = '_')
  write.table(res, file = paste0(FileName, '_Results_all.txt'), sep = '\t', quote = F)
  write.table(normData_all, file = paste0(FileName, '_Norm_count_all.txt'), sep = '\t', quote = F)
  write.table(normData_DEG, file = paste0(FileName, '_Norm_count_DEG_prefilters.txt'), sep = '\t', quote = F)
  
  # assign variables to global env 
  assign('res', DEgenes, envir = .GlobalEnv)
  assign('normData_DEG',normData_DEG, envir = .GlobalEnv)
  assign('CPMpass', CPMpass, envir = .GlobalEnv)
}


# FILTER 2 - SPIKE: DE gene PASSES if (Max/Sum) < 1.4*Samples of a group^(-0.66)
spikeFilter_miRNA = function(COMPARISON, DRUG, CONTROL, NORM.DEG, OUTPUT.DIR, ...) {
  
  if (dir.exists(OUTPUT.DIR)) { 
    setwd(OUTPUT.DIR) 
  } else { 
    dir.create(OUTPUT.DIR)
    setwd(OUTPUT.DIR) }
  
  Drug = grep(DRUG[COMPARISON], colnames(NORM.DEG))
  Cont = grep(CONTROL, colnames(NORM.DEG))
  
  # Check genes in DRUG group
  Check_drug = data.frame(Max = apply(NORM.DEG[, Drug, drop = F], 1, max),
                          Sum = apply(NORM.DEG[, Drug, drop = F], 1, sum),
                          Length = length(Drug)) %>% 
    mutate(SpikeCheck = if_else( (Max/Sum) < (1.4*Length^(-0.66)), 'PASS', 'FAIL')) 
  
  
  # Check genes in CONTROL group
  Check_cont = data.frame(Max = apply(NORM.DEG[, Cont, drop = F], 1, max),
                          Sum = apply(NORM.DEG[, Cont, drop = F], 1, sum),
                          Length = length(Cont)) %>% 
    mutate(SpikeCheck = if_else( (Max/Sum) < (1.4*Length^(-0.66)) , 'PASS', 'FAIL')) 
  
  # If read count in all replicates is 0, SpikeCheck = <NA>
  # Change all <NA> to PASS (done before extracting the PASS genes from either group)
  Check_drug[is.na(Check_drug)] = 'PASS' 
  Check_cont[is.na(Check_cont)] = 'PASS' 
  
  # Extract genes that PASS in DRUG
  SpikePass_drug = Check_drug[grep('PASS', Check_drug$SpikeCheck), ] %>% rownames
  
  # Extract genes that PASS in CONTROL
  SpikePass_cont = Check_cont[grep('PASS', Check_cont$SpikeCheck), ] %>% rownames
  
  # Intersect the common genes: list of genes that pass the spike filter
  SpikePass = intersect(SpikePass_drug, SpikePass_cont) 
  SpikeFail = NORM.DEG[rownames(NORM.DEG) %notin% SpikePass, , drop = F] %>% rownames	# drop = F is needed when length(SpikeFail)=1
  
  # print info on filtering
  print(paste0(length(SpikePass), ' genes passed the Spike filter.'))
  print(paste0(length(SpikeFail), ' genes failed the Spike filter.'))
  
  # Save list of genes that pass FILTER 2
  write.table(SpikePass, file = paste('miRNA', DRUG[COMPARISON], ..., 'vs', CONTROL, 'SpikePass.txt', sep = '_'), sep = '\t', quote = FALSE, row.names = FALSE, col.names = FALSE)
  write.table(SpikeFail, file = paste('miRNA', DRUG[COMPARISON], ..., 'vs', CONTROL, 'SpikeFail.txt', sep = '_'), sep = '\t', quote = FALSE, row.names = FALSE, col.names = FALSE)
  
  # assign variable to global env
  assign('SpikePass', SpikePass, envir = .GlobalEnv)
}

# Function that keeps normalized data of DE genes that pass FILTER 2
filterNormData_miRNA = function(COMPARISON, DRUG, CONTROL, SPIKEPASS, RES.DEG, NORM.DEG, OUTPUT.DIR, ...) {
  
  # Select output directory
  if (dir.exists(OUTPUT.DIR)) { 
    setwd(OUTPUT.DIR) 
  } else { 
    dir.create(OUTPUT.DIR)
    setwd(OUTPUT.DIR) }
  
  # Extract genes that pass FILTER 2
  idx = SPIKEPASS
  
  FileName = paste('miRNA', DRUG[COMPARISON], ..., 'vs', CONTROL, 'FDR_0.01', sep = '_')	
  
  # Results of ONLY DE genes that PASS all filters
  res_filtered = RES.DEG[rownames(RES.DEG) %in% idx, ]
  
  # Normalized read count of ONLY DE genes that PASS all filters
  normData_filtered = NORM.DEG[rownames(NORM.DEG) %in% idx, ]
  
  # Info on all filters applied
  print(paste0(nrow(res_filtered), ' out of ', nrow(RES.DEG), ' DE genes passed the Spike filter - ', nrow(RES.DEG[rownames(RES.DEG) %notin% idx, ]), ' DE genes excluded.'))
  
  # Save tables
  write.table(res_filtered, file = paste0(FileName, '_Results_DEG.txt'), sep = '\t', quote = FALSE)
  write.table(rownames(res_filtered), file = paste0(FileName, '_DEG_names.txt'), sep = '\t', quote = FALSE, row.names = FALSE, col.names = FALSE)	
  write.table(normData_filtered, file = paste0(FileName, '_Norm_count_DEG.txt'), sep = '\t', quote = FALSE)
  
  # assign variable to global env
  assign('res_filtered', res_filtered, envir = .GlobalEnv)
}


# Perform DE analysis on genes
for (x in seq(1, length(Drugs))) { 
  
  DEanalysisCompound(COMPARISON = x, Drugs, Control, genes, 'Compound', unfiltered_dir) 
  CPMfilter(COMPARISON = x, Drugs, Control, dds, 'Compound', filtered_dir)
  spikeFilter(COMPARISON = x, Drugs, Control, normData_DEG, filters_dir)
  filterNormData(COMPARISON = x, Drugs, Control, SpikePass, res, normData_DEG, filtered_dir)
  
} 

# Perform DE analysis on miRNAs
for (x in seq(1, length(Drugs))) { 
  
  DEanalysisCompound_miRNA(COMPARISON = x, Drugs, Control, mirna, 'Compound', unfiltered_dir) 
  CPMfilter_miRNA(COMPARISON = x, Drugs, Control, dds, 'Compound', filtered_dir)
  spikeFilter_miRNA(COMPARISON = x, Drugs, Control, normData_DEG, filters_dir)
  filterNormData_miRNA(COMPARISON = x, Drugs, Control, SpikePass, res, normData_DEG, filtered_dir)
  
} 


# Add gene names to all the output files in the `filtered_dir` to make files inspection easier
setwd(filtered_dir)


for (i in seq(1, length(Drugs))) {
  
  # Result_all
  a = read.table(paste0(Drugs[i], '_vs_', Control, '_FDR_0.01_Results_all.txt'), header = T, sep = '\t')
  a %<>% tibble::rownames_to_column(., 'ensembl_gene_id')
  
  a = left_join(a, mart, by = 'ensembl_gene_id')
  write.table(a, file = paste0(Drugs[i], '_vs_', Control, '_FDR_0.01_Results_all.txt'), quote = F, sep ='\t', row.names = F)
  
  # Results_DEG
  a = read.table(paste0(Drugs[i], '_vs_', Control, '_FDR_0.01_Results_DEG.txt'), header = T, sep = '\t')
  a %<>% tibble::rownames_to_column(., 'ensembl_gene_id')
  
  a = left_join(a, mart, by = 'ensembl_gene_id')
  write.table(a, file = paste0(Drugs[i], '_vs_', Control, '_FDR_0.01_Results_DEG.txt'), quote = F, sep ='\t', row.names = F)
  
  # Norm_count_all
  a = read.table(paste0(Drugs[i], '_vs_', Control, '_FDR_0.01_Norm_count_all.txt'), header = T, sep = '\t')
  a %<>% tibble::rownames_to_column(., 'ensembl_gene_id')
  
  a = left_join(a, mart, by = 'ensembl_gene_id')
  write.table(a, file = paste0(Drugs[i], '_vs_', Control, '_FDR_0.01_Norm_count_all.txt'), quote = F, sep ='\t', row.names = F)
  
  # Norm_count_DEG
  a = read.table(paste0(Drugs[i], '_vs_', Control, '_FDR_0.01_Norm_count_DEG.txt'), header = T, sep = '\t')
  a %<>% tibble::rownames_to_column(., 'ensembl_gene_id')
  
  a = left_join(a, mart, by = 'ensembl_gene_id')
  write.table(a, file = paste0(Drugs[i], '_vs_', Control, '_FDR_0.01_Norm_count_DEG.txt'), quote = F, sep ='\t', row.names = F)
  
  # DEG_names
  a = scan(paste0(Drugs[i], '_vs_', Control, '_FDR_0.01_DEG_names.txt'), what = character()) %>% as.data.frame()
  colnames(a) = 'ensembl_gene_id'
  
  mart_names = mart %>% dplyr::select(ensembl_gene_id, external_gene_name)
  
  a = left_join(a, mart_names, by = 'ensembl_gene_id')
  write.table(a, file = paste0(Drugs[i], '_vs_', Control, '_FDR_0.01_DEG_names.txt'), quote = F, sep ='\t', row.names = F)
  
}

3 - MaSigPro analysis

Note: I have modified some maSigPro plotting functions to place the doses (with the right labels, instead of time) on the x-axis annotation, and to plot clusters even when they contain only one gene. Before running this code, redefine the maSigPro functions given at the bottom of this chunk.
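
In this design, maSigPro's "time" variable is the dose, coded as an ordinal value 0-5; the modified PlotGroups() below simply relabels the x-axis accordingly. The short sketch below only illustrates the assumed coding (matching the axis() call in the modified PlotGroups()) and is not part of the analysis.

# Assumed dose coding used throughout this chunk (see the axis() call in PlotGroups below)
dose_code = 0:5
dose_label = c('0', '1 nM', '10 nM', '100 nM', '1 uM', '10 uM')
data.frame(dose_code, dose_label)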

 # MaSigPro
library(maSigPro)
library(MASS)
library(tidyverse)
library(magrittr)

cluster_all_edcs = tibble(Gene = character())

drug = c('DEHP', 'DIDP', 'DINP', 'DnOP')

# iterate over all compounds 
for (i in 1:length(drug)) {
  
  print(drug[i])
  
  norm_count = read.table(paste0('project_folder/RNA-Seq/DE_analysis/DE_filtered/', drug[i], '_vs_DMSO_FDR_0.01_Norm_count_all.txt'), header = T, fill = T)
  
  norm_count %<>% mutate(gene_id = paste0(ensembl_gene_id, ' - ', external_gene_name)) %>% # make ids with name 
    mutate_at('gene_id', ~ str_replace(., pattern = ' - $', replacement = '')) %>% # remove empty names
    dplyr::select(-gene_biotype, -ensembl_gene_id, -external_gene_name) %>% 
    column_to_rownames(var = 'gene_id') 

  samplekey = read.table('project_folder/RNA-Seq/masigpro_analysis/masigpro_metadata.txt', header = T) %>% 
    .[rownames(.) %in% colnames(norm_count), ] %>% 
    dplyr::select(c('Dose', 'Replicates', contains(drug[i])))
  
  num_degrees = 3
  d = make.design.matrix(samplekey, degree = num_degrees) # degree = 2 is default

  NBp = p.vector(norm_count, d, counts = TRUE) # 1st regression step
  NBt = T.fit(NBp) # 2nd regression step
  get = get.siggenes(NBt, vars = "groups", rsq = 0.7) # rsq = 0.7 is default
  get$summary
  get$sig.genes

  # get the genes in the clusters
  num_clusters = 9 # 9 is the default

  # see the genes and sort by cluster number
  drug_name = drug[i] 
  
  cluster_all_edcs = see.genes(get$sig.genes[[1]], k = num_clusters)$cut %>% 
    as.data.frame() %>%
    rename({{drug_name}} := '.') %>% # note: putting {{drug[i]}} throws an error
    arrange({{drug_name}}) %>% 
    rownames_to_column(var = 'Gene') %>% 
    full_join(cluster_all_edcs, ., by = 'Gene')

  # save plot with clusters
  svg(filename = paste0('project_folder/RNA-Seq/masigpro_analysis/masigpro_k', num_clusters, 
                        '_degree', num_degrees, '/masigpro_', drug[i], '.svg'), width = 6, height = 6)
  see.genes(get$sig.genes[[1]], show.fit = T, dis = d$dis, color.mode = 'rainbow', col = i, # 1 = black, 2 = red, 3 = green, 4 = blue, 5 = turquoise
            cluster.method = "hclust", cluster.data = 1, k = num_clusters)
  dev.off()

  # after the last compound has been processed, save the table with the cluster assignments for all phthalates
  if (i == length(drug)) { write.table(cluster_all_edcs, quote = F, sep = '\t', row.names = F,
                            file = paste0('project_folder/RNA-Seq/masigpro_analysis/masigpro_degree', 
                                          num_degrees, '_all_phthalates.txt')) }
}

# NEED TO MODIFY MaSigPro FUNCTIONS see.genes(), PlotProfiles() AND PlotGroups() TO PERSONALIZE PLOT WITH DOSES (INSTEAD OF TIME) AND TO PLOT CLUSTERS WITH ONLY 1 GENE
PlotProfiles = function (data, cond, cex.axis = 0.5, ylim = NULL, repvect, main = NULL, 
          sub = NULL, color.mode = "rainbow", item = NULL) {
  pos.vline <- repvect[1:length(repvect)] - c(0, repvect[1:(length(repvect) - 
                                                              1)])
  for (i in 1:length(pos.vline)) {
    if (pos.vline[i] != 0) 
      pos.vline[i] <- i
    else pos.vline[i] <- pos.vline[i - 1]
  }
  if (is.null(ylim)) {
    ylim = c(min(as.matrix(data), na.rm = TRUE) * 1.1, max(as.matrix(data), 
                                                           na.rm = TRUE) * 1.1)
  }
  if (!is.vector(data)) {
    n = dim(data)[2]
    m = dim(data)[1]
    if (m == 1)
      nom <- rownames(data)
    else nom <- paste("Cluster", main, "(", m, item, ")",
                      sep = " ")
    plot(x = c(1:n), y = as.numeric(data[1, ]), type = "b", col = 1, # modified: added as.numeric() and drop = F
         ylim = ylim, ylab = "expression value", xlab = " ",
         main = nom, xaxt = "n")
    axis(1, at = 1:n, labels = substr(cond, 1, 26), cex.axis = cex.axis,
         las = 2)
    if (color.mode == "rainbow") {
      abline(v = pos.vline, col = "light gray")
      for (i in 1:dim(data)[1]) {
        lines(x = c(1:n), y = data[i, , drop = F], col = i)
      }
    }
    else if (color.mode == "gray") {
      for (i in 1:dim(data)[1]) {
        lines(x = c(1:n), y = data[i, , drop = F], col = "light gray")
      }
      yy <- apply(as.matrix(data), 2, median, na.rm = TRUE)
      lines(x = c(1:n), y = yy, col = "black")
    }
    else stop("Invalid mode, must be one of rainbow, gray")
  }
  else {
    n = length(data)
    plot(x = c(1:n), y = data, type = "l", col = 1, ylim = ylim,
         ylab = "expression value", sub, xaxt = "n", xlab = " ")
    axis(1, at = 1:n, labels = cond, cex.axis = cex.axis,
         las = 2)
    abline(v = pos.vline, col = "light gray")
  }
}

PlotGroups = function (data, edesign = NULL, time = edesign[, 1], groups = edesign[, 
                                                                                   c(3:ncol(edesign))], repvect = edesign[, 2], show.lines = TRUE, 
                       show.fit = FALSE, dis = NULL, step.method = "backward", min.obs = 2, 
                       alfa = 0.05, nvar.correction = FALSE, summary.mode = "median", 
                       groups.vector = NULL, main = NULL, sub = NULL, xlab = "Dose", 
                       ylab = "Expression value", item = NULL, ylim = NULL, pch = 21, 
                       col = NULL, legend = TRUE, cex.legend = 1, lty.legend = NULL, 
                       ...) 
{
  if (!is.vector(data)) {
    if (summary.mode == "representative") {
      distances <- apply(as.matrix(dist(data, diag = TRUE, 
                                        upper = TRUE)), 1, sum)
      representative <- names(distances)[distances == min(distances)]
      yy <- as.numeric(data[rownames(data) == representative, 
      ])
      sub <- paste("Representative:", representative)
    }
    else if (summary.mode == "median") {
      yy <- apply(as.matrix(data), 2, median, na.rm = TRUE)
      if (is.null(sub)) {
        sub <- paste("Median profile of", nrow(data), 
                     item, sep = " ")
      }
    }
    else stop("not valid summary.mode")
    if (dim(data)[1] == 1) {
      main <- rownames(data)
      sub = NULL
    }
  }
  else if (length(data) != 0) {
    yy <- as.numeric(data)
    sub <- rownames(data)
  }
  else stop("empty data")
  if (is.null(ncol(groups))) {
    ncol = 1
    legend = FALSE
    codeg = "group"
  }
  else {
    ncol = ncol(groups)
    codeg <- as.character(colnames(groups))
  }
  reps <- i.rank(repvect)
  y <- vector(mode = "numeric", length = length(unique(reps)))
  x <- vector(mode = "numeric", length = length(unique(reps)))
  g <- matrix(nrow = length(unique(reps)), ncol = ncol)
  for (k in 1:length(y)) {
    y[k] <- mean(yy[reps == k], na.rm = TRUE)
    x[k] <- mean(time[reps == k])
    for (j in 1:ncol) {
      g[k, j] <- mean(groups[reps == k, j])
    }
  }
  if (is.null(ylim)) 
    ylim = c(min(as.numeric(yy), na.rm = TRUE), max(as.numeric(yy), 
                                                    na.rm = TRUE))
  abcissa <- x
  xlim = c(min(abcissa, na.rm = TRUE), max(abcissa, na.rm = TRUE) * 
             1.3)
  if (is.null(col)) {
    color1 <- as.numeric(sort(factor(colnames(groups)))) + 
      1
    color2 <- groups
    for (j in 1:ncol) {
      color2[, j] <- color2[, j] * j
    }
    color2 <- as.vector(apply(color2, 1, sum) + 1)
  }
  else {
    color1 <- col
    color2 <- groups
    for (j in 1:ncol) {
      color2[, j] <- color2[, j] * col[j]
    }
    color2 <- as.vector(apply(color2, 1, sum))
  }
  plot(x = time, y = yy, pch = pch, xlab = xlab, ylab = ylab, 
       main = main, xaxt = "n", sub = sub, ylim = ylim, xlim = xlim, 
       col = color2, ...)
  arg = list(...)
  axis(1, at = 0:5, labels = c('0', '1 nM', '10 nM', '100 nM', '1 uM', '10 uM'), lwd = 1, # EDCs
  #axis(1, at = 0:3, labels = c('0', '10 uM', '100 uM', '1 mM'), lwd = 1, # reference compounds
       cex.axis = arg$cex.axis, las = 3)
  if (show.fit) {
    rm <- matrix(yy, nrow = 1, ncol = length(yy))
    rownames(rm) <- c("ratio medio")
    colnames(rm) <- rownames(dis)
    fit.y <- T.fit(rm, design = dis, step.method = step.method, 
                   min.obs = min.obs, alfa = alfa, nvar.correction = nvar.correction)
    betas <- fit.y$coefficients
  }
  for (i in 1:ncol(groups)) {
    group <- g[, i]
    if ((show.fit) && !is.null(betas)) {
      li <- c(2:6)
      a <- reg.coeffs(coefficients = betas, groups.vector = groups.vector, 
                      group = colnames(groups)[i])
      a <- c(a, rep(0, (7 - length(a))))
      curve(a[1] + a[2] * x + a[3] * (x^2) + a[4] * (x^3) + 
              a[5] * (x^4) + a[6] * (x^5) + a[7] * (x^5), from = min(time), 
            to = max(time), col = color1[i], add = TRUE, 
            lty = li[i], ...)
    }
    if (show.lines) {
      lx <- abcissa[group != 0]
      ly <- y[group != 0]
      ord <- order(lx)
      lxo <- lx[ord]
      lyo <- ly[ord]
      lines(lxo, lyo, col = color1[i], ...)
    }
  }
  op <- par(bg = "white")
  if (legend) 
    legend(max(abcissa, na.rm = TRUE) * 1.02, ylim[1], legend = codeg, 
           text.col = color1, col = color1, lty = lty.legend, 
           yjust = 0, bty = "n", cex = cex.legend)
  par(op)
}


see.genes = function (data, edesign = data$edesign, time.col = 1, repl.col = 2, 
                      group.cols = c(3:ncol(edesign)), names.groups = colnames(edesign)[3:ncol(edesign)], 
                      cluster.data = 1, groups.vector = data$groups.vector, k = 9, 
                      k.mclust = FALSE, cluster.method = "hclust", distance = "cor", 
                      agglo.method = "ward.D", show.lines = TRUE, show.fit = FALSE, 
                      dis = NULL, step.method = "backward", min.obs = 3, alfa = 0.05, 
                      nvar.correction = FALSE, iter.max = 500, summary.mode = "median", 
                      color.mode = "rainbow", ylim = NULL, item = "genes", legend = TRUE, 
                      cex.legend = 1, lty.legend = NULL, ...) 
{
  time = edesign[, time.col]
  repvect = edesign[, repl.col]
  groups = edesign[, group.cols]
  narrays <- length(time)
  if (!is.null(dim(data))) {
    dat <- as.data.frame(data)
    clusterdata <- data
  }
  else {
    clusterdata <- data[[cluster.data]]
    dat <- as.data.frame(data$sig.profiles)
  }
  if (nrow(dat) > 1) {
    dat <- as.data.frame(dat[, (ncol(dat) - length(time) + 
                                  1):ncol(dat)])
    count.noNa <- function(x) (length(x) - length(x[is.na(x)]))
    dat <- dat[which(apply(as.matrix(dat), 1, count.noNa) >= 
                       length(unique(repvect))), ]
    clusterdata <- dat
    if (any(is.na(clusterdata))) {
      if (cluster.method == "kmeans" || cluster.method == 
          "Mclust") {
        if (all(cluster.data != 1, cluster.data != "sig.profiles")) {
          clusterdata[is.na(clusterdata)] <- 0
        }
        else {
          mean.replic <- function(x) {
            tapply(as.numeric(x), repvect, mean, na.rm = TRUE)
          }
          MR <- t(apply(clusterdata, 1, mean.replic))
          if (any(is.na(MR))) {
            row.mean <- t(apply(MR, 1, mean, na.rm = TRUE))
            MRR <- matrix(row.mean, nrow(MR), ncol(MR))
            MR[is.na(MR)] <- MRR[is.na(MR)]
          }
          data.noNA <- matrix(NA, nrow(clusterdata), 
                              ncol(clusterdata))
          u.repvect <- unique(repvect)
          for (i in 1:nrow(clusterdata)) {
            for (j in 1:length(u.repvect)) {
              data.noNA[i, repvect == u.repvect[j]] = MR[i, 
                                                         u.repvect[j]]
            }
          }
          clusterdata <- data.noNA
        }
      }
    }
    if (!is.null(clusterdata)) {
      k <- min(k, nrow(dat), na.rm = TRUE)
      if (cluster.method == "hclust") {
        if (distance == "cor") {
          dcorrel <- matrix(rep(1, nrow(clusterdata)^2), 
                            nrow(clusterdata), nrow(clusterdata)) - cor(t(clusterdata), 
                                                                        use = "pairwise.complete.obs")
          clust <- hclust(as.dist(dcorrel), method = agglo.method)
          c.algo.used = paste(cluster.method, "cor", 
                              agglo.method, sep = "_")
        }
        else {
          clust <- hclust(dist(clusterdata, method = distance), 
                          method = agglo.method)
          c.algo.used = paste(cluster.method, distance, 
                              agglo.method, sep = "_")
        }
        cut <- cutree(clust, k = k)
      }
      else if (cluster.method == "kmeans") {
        cut <- kmeans(clusterdata, k, iter.max)$cluster
        c.algo.used = paste("kmeans", k, iter.max, sep = "_")
      }
      else if (cluster.method == "Mclust") {
        if (k.mclust) {
          my.mclust <- Mclust(clusterdata)
          k = my.mclust$G
        }
        else {
          my.mclust <- Mclust(clusterdata, k)
        }
        cut <- my.mclust$class
        c.algo.used = paste("Mclust", k, sep = "_")
      }
      else stop("Invalid cluster algorithm")
      groups <- as.matrix(groups)
      colnames(groups) <- names.groups
      if (k <= 4) 
        par(mfrow = c(3, 3))
      else if (k <= 6) 
        par(mfrow = c(3, 3))
      else if (k > 6) 
        par(mfrow = c(3, 3))
      for (i in 1:(k)) {
        PlotProfiles(data = dat[cut == i, , drop = F], repvect = repvect, 
                     main = i, ylim = ylim, color.mode = color.mode, 
                     cond = rownames(edesign), item = item)
      }
      if (k <= 4) {
        par(mfrow = c(3, 3))
      }
      else if (k <= 6) {
        par(mfrow = c(3, 3))
      }
      else if (k > 6) {
        par(mfrow = c(3, 3))
      }
      for (j in 1:(k)) {
        PlotGroups(data = dat[cut == j, ], show.fit = show.fit, 
                   dis = dis, step.method = step.method, min.obs = min.obs, 
                   alfa = alfa, nvar.correction = nvar.correction, 
                   show.lines = show.lines, time = time, groups = groups, 
                   repvect = repvect, summary.mode = summary.mode, 
                   xlab = "", main = paste("Cluster", j, sep = " "), 
                   ylim = ylim, legend = legend, groups.vector = groups.vector, 
                   item = item, cex.legend = cex.legend, lty.legend = lty.legend, 
                   ...)
      }
    }
    else {
      print("warning: impossible to compute hierarchical clustering")
      c.algo.used <- NULL
      cut <- 1
    }
  }
  else if (nrow(dat) == 1) {
    PlotProfiles(data = dat, repvect = repvect, ylim = ylim, 
                 color.mode = color.mode, cond = rownames(edesign))
    PlotGroups(data = dat, show.fit = show.fit, dis = dis, 
               step.method = step.method, min.obs = min.obs, alfa = alfa, 
               nvar.correction = nvar.correction, show.lines = show.lines, 
               time = time, groups = groups, repvect = repvect, 
               summary.mode = summary.mode, xlab = "time", ylim = ylim, 
               legend = legend, groups.vector = groups.vector, cex.legend = cex.legend, 
               lty.legend = lty.legend, ...)
    c.algo.used <- NULL
    cut <- 1
  }
  else {
    print("warning: NULL data. No visualization possible")
    c.algo.used <- NULL
    cut <- NULL
  }
  OUTPUT <- list(cut, c.algo.used, groups)
  names(OUTPUT) <- c("cut", "cluster.algorithm.used", "groups")
  OUTPUT
}

4 - Gene Set Enrichment Analysis on Reactome

How to interpret GSEA results: http://www.gsea-msigdb.org/gsea/doc/GSEAUserGuideFrame.html?_Interpreting_GSEA_Results

library(clusterProfiler)
library(enrichplot)
library(ggridges)
library(pathview)
library(org.Mm.eg.db)
library(tidyverse)
library(ReactomePA)


# set seed
set.seed(1234)

drug = 'DnOP' # can be any of 'DEHP', 'DIDP', 'DINP', 'DnOP'

res = read.table(paste0('project_folder/RNA-Seq/DE_analysis/DE_filtered/', drug, '_vs_DMSO_FDR_0.01_Results_all.txt'), header = T, fill = T) %>% 
  dplyr::rename(ENSEMBL = ensembl_gene_id)


# convert gene names to Entrez ID
ENTREZ = clusterProfiler::bitr(res$ENSEMBL, fromType = 'ENSEMBL', toType = 'ENTREZID', OrgDb = 'org.Mm.eg.db')

# Not all ENSEMBL IDs map to an ENTREZ ID. Keep only the `stat` values of mapped IDs
res_ENTREZ = left_join(res, ENTREZ, by = 'ENSEMBL') %>% 
  filter(!is.na(ENTREZID)) 

# named vector for analysis (ENTREZ IDs)
gl_entrez = res_ENTREZ$stat
names(gl_entrez) = make.names(res_ENTREZ$ENTREZID, unique = T) %>% 
  gsub('^X', '', .) # make.names() prepends 'X' to names that start with a digit; remove it
gl_entrez = gl_entrez[order(gl_entrez, decreasing = T)]
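# gsePathway() expects a named vector sorted in decreasing order; here the DESeq2 test statistic (`stat`) is used as the ranking metric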


Reactome = ReactomePA::gsePathway(gl_entrez, 
                                  organism = 'mouse',
                                  pvalueCutoff = 1, # to retrieve all terms
                                  pAdjustMethod = "BH", 
                                  verbose = FALSE,
                                  eps = 1e-50,  # or try using eps = 0
                                  seed = T)
View(Reactome@result)

# save table
write.table(Reactome@result, file = paste0('project_folder/RNA-Seq/DE_analysis/reactome_GSEA_', drug, '.txt'), row.names = F, quote = F, sep = '\t')


# Example of leading edge plot ('Fatty acid metabolism') 
pw = 'Fatty acid metabolism'

p = gseaplot2(Reactome, geneSetID = Reactome@result[Reactome@result$Description == pw, ]$ID, 
            title = paste0(drug, ' vs DMSO\n', pw,
                           '\n(NES = ', round(Reactome@result[Reactome@result$Description == pw, ]$NES, digits = 3),
                           ', q-value = ', round(Reactome@result[Reactome@result$Description == pw, ]$qvalue, digits = 3), 
                           ')')) 

ggsave(plot = p, file = paste0('project_folder/RNA-Seq/DE_analysis/plots/Fatty_acid_metabolism_', drug, '.png'), 
         dpi = 600, height = 3000, width = 4000, units = 'px')

ATAC-Seq data analysis

1 - Process raw data with PEPATAC

The PEPATAC paper is available at https://doi.org/10.1093/nargab/lqab101.

Script: 1_atac_preprocess_pepatac.sh.

#!/bin/bash 

# PEPATAC version 0.10.4

source /home/m.nazzari/miniconda3/etc/profile.d/conda.sh
conda activate pepatac

DataDir='/project_folder/ATAC-Seq/raw_data/'
cd ${DataDir}

FileNames=( DEHP_1* DEHP_2* DEHP_3* DEHP_4* DEHP_5* DEHP_6* DMSO_1* DMSO_3* DMSO_4* DMSO_5* DMSO_6* DINP_1* DINP_2* DINP_3* DINP_4* DINP_5* DINP_6* )

# get only sample names (e.g. DEHP_1)
Samples=($(printf "%s\n" "${FileNames[@]}" | sed "s/_R/ /" | awk '{print $1}' | sort -u))

# Run sample-level pipeline
for S in ${!Samples[@]}; do

echo "Analyzing sample: ${Samples[S]}"

INPUT_R1=${DataDir}${Samples[S]}"_R1_001.fastq.gz"
INPUT_R2=${DataDir}${Samples[S]}"_R2_001.fastq.gz"
	
/project_folder/pepatac/pipelines/pepatac.py -R \
--input ${INPUT_R1} \
--input2 ${INPUT_R2} \
--output-parent /project_folder/ATAC-Seq/processed_data/ \
--sample-name ${Samples[S]:0:6} \
--single-or-paired paired \
--trimmer pyadapt \
--aligner bowtie2 \
--deduplicator samtools \
--peak-caller hmmratac \
--genome GRCh38 \
--genome-size 2.7e9 \
--peak-type variable \
--prealignment-names GRCh38_chrM \
--prealignment-index GRCh38_chrM=/project_folder/genomes/ATAC-Seq_GRCh38/GRCh38_chrM_bowtie2 \
--genome-index /project_folder/genomes/ATAC-Seq_GRCh38/GRCh38_bowtie2 \
--chrom-sizes /project_folder/genomes/ATAC-Seq_GRCh38/GRCh38_chr_size.txt \
--TSS-name /project_folder/genomes/ATAC-Seq_GRCh38/GRCh38_TSS.bed \
--blacklist /project_folder/genomes/hg38-blacklist.v2.bed \
--anno-name /project_folder/genomes/ATAC-Seq_GRCh38/GRCh38_feat_annotations.bed

done

# Run project-level pipeline
/project_folder/pepatac/pipelines/pepatac_collator.py \
--config /project_folder/ATAC-Seq/analysis_config.yaml \
--output-parent /project_folder/ATAC-Seq/processed_data/ \
--results /project_folder/ATAC-Seq/processed_data/ \
--new-start

conda deactivate 

2 - Shift BAM files to account for the Tn5 transposase insertion offset

Tn5 inserts its two adapters with a 9-bp duplication, so aligned reads are conventionally shifted (+4 bp on the plus strand, -5 bp on the minus strand) so that the read start marks the actual insertion site; this is what the deeptools --ATACshift option does in the script below.

Script: 2_shift_bam.sh.

#!/bin/bash 

# using deeptools to shift the aligned BAM files generated by PEPATAC

# deeptools v3.5.1

Samples=(DEHP_1 DEHP_2 DEHP_3 DEHP_4 DEHP_5 DEHP_6 DMSO_1 DMSO_2 DMSO_3 DMSO_4 DMSO_5 DMSO_6 DINP_1 DINP_2 DINP_3 DINP_4 DINP_5 DINP_6)


for S in ${!Samples[@]}; do

echo "Analyzing sample: ${Samples[S]}"

# Perform ATAC shifting of the BAM file
/home/m.nazzari/Tools/deeptools3.5.1/bin/alignmentSieve \
--numberOfProcessors 16 \
--bam /project_folder/ATAC-Seq/processed_data/${Samples[S]}/aligned_GRCh38/${Samples[S]}_fixed_header.bam \
--ATACshift \
--outFile /project_folder/ATAC-Seq/shifted_bam/${Samples[S]}_shifted_presort.bam

# Sort the new ATAC shifted BAM file
samtools sort -@ 16 /project_folder/ATAC-Seq/shifted_bam/${Samples[S]}_shifted_presort.bam \
-o /project_folder/ATAC-Seq/shifted_bam/${Samples[S]}_ATACshift.bam

# Index the new ATAC shifted BAM file
samtools index /project_folder/ATAC-Seq/shifted_bam/${Samples[S]}_ATACshift.bam \
/project_folder/ATAC-Seq/shifted_bam/${Samples[S]}_ATACshift.bam.bai

rm /project_folder/ATAC-Seq/shifted_bam/${Samples[S]}_shifted_presort.bam

done

3 - Accessibility analysis using CSAW and sliding window approach

Modified from Sheik and Blais (2022) on bioRxiv (https://doi.org/10.1101/2022.03.16.484118).

# Set working directory 
setwd('/project_folder/ATAC-Seq/DA_analysis/')

# Load libraries
library(BiocParallel) # Enables parallelization
library(csaw) # Used for window based analysis
library(edgeR) # Used for differential testing
library(rtracklayer) # Used to import BED files as GRanges or export GRanges as BED files
library(RUVSeq) # Used for normalization, remove unwanted variance
library(dplyr)
library(magrittr)

# Specify directory to export results to
read_type = '5-prime-reads' # 'pe-reads' or '5-prime-reads'
window_size = 'window50'
compound = 'DEHP'
exportDir_analysis = paste0('/project_folder/ATAC-Seq/DA_analysis/', compound, '_', read_type, '_', window_size)

# Create export directory in case it doesn't exist
dir.create(exportDir_analysis, showWarnings = TRUE, recursive = TRUE)

# Create a BiocParallel object that uses specified number of CPU threads with a progress bar
my_multicoreParam = MulticoreParam(workers = 12, progressbar = TRUE, exportglobals = FALSE)
register(my_multicoreParam)

# Load a tab-delimited TXT file with information on the ATAC-Seq samples (i.e., name, treatment, drug, and the path to the appropriate BAM file)
# ATAC-shifted BAM files are used.
atac_samples = read.table('metadata.txt', sep = '\t', header = TRUE) %>%
	as.data.frame() %>%
	dplyr::filter(Group == 'DMSO' | Group == compound)
	
# Set sample names as row names for later downstream analysis
rownames(atac_samples) = atac_samples$Sample

# Specify path to bam files of ATAC samples
bam_files = atac_samples$BAM_Path

# Design to compare samples, matching order of BAM files
# Adjust as needed depending on the study design
treatment = factor(atac_samples$Group)
design = model.matrix(~0 + treatment)
colnames(design) = levels(treatment)
treat_vs_ctrl = makeContrasts(DEHP - DMSO, levels = design) # change the contrast to match `compound` when analyzing another treatment

# Specify chromosomes to limit analysis to (autosomes and sex chrs)
chrLimit = paste0('chr', c(1:22, 'X', 'Y'))

# Set regions to discard by importing blacklist loci BED file
blacklistPath = '/project_folder/genomes/hg38-blacklist.v2.bed'


##### CSAW-specific parameters
# Set parameters for CSAW on regions to discard and chromosomes to restrict analysis to, indicate if data is paired-end (see also CSAW manual)
# To count each read end independently (Tn5 insertions) (i.e. single-end), set `pe = 'none'`
# To use reads as paired end, set `pe = 'both'`
if (read_type == 'pe-reads') { my_pe_mode = 'both' } else if (read_type == '5-prime-reads') { my_pe_mode = 'none' }

# indicate whether to remove duplicates (TRUE) or not (FALSE)
duplicate_removal = F

# Set Window Size
windowSize = 50

# Set spacing between windows. Use `windowSize` to have adjacent windows without intervening gaps
windowSpacing = windowSize

# Set the minimum number of counts for a window to be retained 
minCount_filterSize = 50

# Set bin size for calculating global enrichment
bin_size = 10000

# Set the fold change value to filter windows for enrichment over global enrichment in the binned dataset
globalBckg_foldChange = 3

##### No further parameters to adjust #####


##### CSAW Analysis #####
blacklist = import.bed(blacklistPath)
insertParam = readParam(discard = blacklist, 
                        dedup = duplicate_removal, 
                        minq = NA, 
                        pe = my_pe_mode, 
                        restrict = chrLimit)

# Count genome into bins for filtering via fold change over global background using the insertParam
binned_data = windowCounts(bam_files,
                           bin = TRUE,  
                           param = insertParam, 
                           width = bin_size, 
                           BPPARAM = my_multicoreParam)

# Count Tn5 integration sites with CSAW
window_data = windowCounts(bam_files, 
                           bin = FALSE,
                           ext = 1,
                           filter = minCount_filterSize,
                           param = insertParam,
                           shift = 0,
                           spacing = windowSpacing, 
                           width = windowSize,      
                           BPPARAM = my_multicoreParam)


# Distinguish signal from noise based on global or local background?
filter_criteria = 'global'

if (filter_criteria == 'global') {
	# Filter data by fold change
  
	# Create a filter of windows that have signal well above global background
	keep_fcFilter = filterWindowsGlobal(window_data, background = binned_data)$filter > log2(globalBckg_foldChange)
	summary(keep_fcFilter)
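
	# Optional diagnostic (a sketch, not part of the original workflow): visualize window abundances
	# relative to the global background and mark the chosen fold-change cut-off
	fc_stats = filterWindowsGlobal(window_data, background = binned_data)
	hist(fc_stats$filter, breaks = 100,
	     xlab = 'log2(window abundance / global background)',
	     main = 'Enrichment over global background')
	abline(v = log2(globalBckg_foldChange), col = 'red', lwd = 2)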
	
	# Filter out unwanted windows by fold change
	filtered_winData = window_data[keep_fcFilter, ]

	# Calculate normalization factors on the filtered high-abundance windows with CSAW's implementation of the TMM method to remove efficiency biases. 
	filtered_winData = normFactors(filtered_winData)

} else if (filter_criteria == 'local') {
	
  # filter uninteresting features (windows) by local enrichment
	# local background estimator: 2kb neighborhood
	# change width parameter as desired
	neighbor = resize(rowRanges(window_data), width = 2000, fix = 'center') %>% suppressWarnings()
	wider_data = regionCounts(bam_files, regions = neighbor, param = insertParam) # count reads in neighborhoods
	keep_fcFilter = filterWindowsLocal(window_data, wider_data)
	
	# threshold of 3-fold increase in enrichment over 2 kb neighborhood abundance; change as desired 
	filtered_winData = window_data[keep_fcFilter$filter > log2(3), ] 
	filtered_winData = normOffsets(filtered_winData, se.out = TRUE)
}

## Differential accessibility analysis
# Create an edgeR object from the normalized data, set appropriate column and row names
y_TMM = asDGEList(filtered_winData)
colnames(y_TMM$counts) = atac_samples$Sample
rownames(y_TMM$samples) = atac_samples$Sample

# Estimate dispersions and fit a GLM to data
y_TMM = estimateDisp(y_TMM, design)
fit_TMM = glmQLFit(y_TMM, design, robust = TRUE)

# Generate a SeqExpressionSet from TMM data to use with RUVs
set_TMM = newSeqExpressionSet(counts = y_TMM$counts, phenoData = atac_samples)
cpm_TMM = cpm(y_TMM, normalized.lib.sizes = TRUE, log = FALSE)
multiplier_TMM = 1e-6*(mean(y_TMM$samples$lib.size))
normCounts(set_TMM) = round(multiplier_TMM * cpm_TMM, digits = 0)
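# The two lines above rescale CPM back to the scale of an average-depth library and round,
# since RUVs works on a matrix of (near-)integer counts; a quick check of the result:
summary(as.vector(normCounts(set_TMM)))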

# See normalized count matrix
set_TMM@assayData$normalizedCounts 

# Plot PCA of data before RUV-Seq 
png(paste0(exportDir_analysis, 'before_RUVs.png'), width = 7000, height = 4000, units = 'px', res = 600)
par(mar = c(8,5,2,2), mfrow = c(1, 2))
plotPCA(set_TMM, col = c(rep('red', 6), rep('blue', 6)), pch = 20, labels = F)
plotRLE(set_TMM, col= c(rep('red',6), rep('blue',6)), las = 2)
dev.off()

# Define the replicate groups
replicateGroups = makeGroups(atac_samples$Group)
replicateGroups

# Create a RUVs set
set_RUVs = RUVs(set_TMM, rownames(set_TMM), k = 5, replicateGroups)

# plot PCA and RLE of data with RUV-Seq
png(paste0(exportDir_analysis, 'after_RUVs.png'), width = 7000, height = 4000, units = 'px', res = 600)
par(mar = c(8,5,2,2), mfrow = c(1, 2))
plotPCA(set_RUVs, col = c(rep('red', 6), rep('blue', 6)), main = 'k = 5', pch = 20, labels = F)
plotRLE(set_RUVs, col = c(rep('red',6), rep('blue',6)), las = 2, main = 'k = 5')
dev.off()

# Create a design matrix for differential testing that incorporates the W_1..W_5 covariates estimated by RUVs (the factors of unwanted variation). 
design_RUVs = model.matrix(~0 + treatment + W_1 + W_2 + W_3 + W_4 + W_5, data = pData(set_RUVs))

# Make contrasts for RUVs data based on the new design matrix
treat_vs_ctrl_RUVs = makeContrasts(treatmentDEHP - treatmentDMSO, levels = design_RUVs)


## Estimate dispersion with one command
y_RUVs = estimateDisp(y_TMM, design_RUVs)

# Fit QL model to data
fit_RUVs = glmQLFit(y_RUVs, design_RUVs)

# Test for differential accessibility
results_RUVs = glmQLFTest(fit_RUVs, contrast = treat_vs_ctrl_RUVs)

# Apply FDR to results table
resultsTable_RUVs = topTags(results_RUVs, adjust.method = 'fdr', n = Inf, sort.by = 'none')$table

# Calculate scores using the FDR values
resultsTable_RUVs$score = -10*log10(resultsTable_RUVs$FDR)

# Add FDR corrected results data to our filtered dataset of windows
winResults_Data = filtered_winData
rowData(winResults_Data) = cbind(rowData(winResults_Data), resultsTable_RUVs)

# Use base R to explore results table
winResults_Data@rowRanges %>% .[order(.$FDR),] %>% head
set_TMM@assayData$normalizedCounts # windows normalized counts 



# Merge nearby windows that are at most `tol` apart (500 bp here); cap the merged region width at 5000 bp (`max.width`)
# Reasoning: FDR correction should be done on `regions` and not `windows` (https://bioconductor.org/books/release/csawBook/correction-for-multiple-testing.html)
merged_peaks = mergeWindows(rowRanges(winResults_Data), tol = 500L, max.width = 5000L)

summary(width(merged_peaks$regions))

# For every region, use the window with lowest p-val as being representative of the whole region (https://bioconductor.org/books/release/csawBook/correction-for-multiple-testing.html). 
tab_best = getBestTest(merged_peaks$id, rowData(winResults_Data))
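
# Alternative (a sketch, not used downstream): csaw's combineTests() computes a combined
# p-value and FDR per merged region across all of its windows, i.e. the region-level
# multiplicity correction described in the csaw book
tab_combined = combineTests(merged_peaks$id, rowData(winResults_Data))
head(tab_combined[order(tab_combined$FDR), ])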

# Boxplot of the region size distribution
boxplot(width(merged_peaks$regions), main = paste0(compound, ' vs DMSO\nregion size distribution'), ylab = 'Region width, bp')

# Report the start location of the best window within each cluster. For example, the sequence of the DB subinterval can be extracted for motif discovery.
tab_best$rep.start = start(rowRanges(winResults_Data))[tab_best$rep.test]

# combine the merged-region ranges with the per-region statistics
final_merged_peaks = merged_peaks$regions
final_merged_peaks@elementMetadata = cbind(final_merged_peaks@elementMetadata, tab_best)

# give score to regions
final_merged_peaks$score = -10*log10(final_merged_peaks$PValue)
names(final_merged_peaks) = paste0('region_', 1:length(final_merged_peaks))

# Export table of merged regions 
# with dplyr...
final_merged_peaks %>% 
	as.data.frame() %>% 
	write.table(., paste0(exportDir_analysis, 'all_regions.txt'), quote = F, row.names = T, col.names = T, sep = '\t')

# ... or GRanges
export(final_merged_peaks, file = paste0(exportDir_analysis, 'all_regions.bed'))


# isolate significant regions, and ones that go up/down
PVAL = 0.01
final_merged_peaks_sig = subset(final_merged_peaks, abs(rep.logFC) >= 1 & PValue < PVAL)
names(final_merged_peaks_sig) = paste(names(final_merged_peaks_sig), final_merged_peaks_sig$direction, sep = '.')
final_merged_peaks_sig_up = subset(final_merged_peaks_sig, rep.logFC >= 1)
final_merged_peaks_sig_down = subset(final_merged_peaks_sig, rep.logFC <= -1)

# The significant subsets are already GRanges objects; keep them under shorter names for export
GR_Sig_All = final_merged_peaks_sig
GR_Sig_Up = final_merged_peaks_sig_up
GR_Sig_Down = final_merged_peaks_sig_down

# export significant regions as .bed files
export(GR_Sig_All, file = paste0(exportDir_analysis, compound, '_merged_sigAll_', PVAL, '.bed'))
export(GR_Sig_Up, file = paste0(exportDir_analysis, compound, '_merged_sigUp_', PVAL, '.bed'))
export(GR_Sig_Down, file = paste0(exportDir_analysis, compound, '_merged_sigDown_', PVAL, '.bed'))
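
# Quick summary (a sketch): number of significant regions in each direction
data.frame(direction = c('all', 'up', 'down'),
           n_regions = c(length(GR_Sig_All), length(GR_Sig_Up), length(GR_Sig_Down)))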

# Save image of workspace
save.image(file = paste0(exportDir_analysis, '/', compound, '.RData'))

4 - Annotate peaks with HOMER

HOMER main page http://homer.ucsd.edu/homer/index.html. Script 4_annotate_peaks.sh.

#!/bin/bash 

# some of these variables depend on the parameters chosen during the previous DA analysis (step 3)
READ_TYPE="5-prime-reads" # pe-reads, 5-prime-reads
WINDOW="window50"
DIRECTION="All" # All, Up, Down
PVAL="0.01" # p-value cut-off used in the BED file names exported in step 3

# DEHP vs DMSO 
COMPOUND="DEHP"
MAIN_DIR="/project_folder/ATAC-Seq/DA_analysis/${COMPOUND}_${READ_TYPE}_${WINDOW}/"

mkdir -p "${MAIN_DIR}HOMER/"

annotatePeaks.pl \
${MAIN_DIR}${COMPOUND}_merged_sig${DIRECTION}_${PVAL}.bed \
hg38 \
-go ${MAIN_DIR}HOMER/${PVAL}_${DIRECTION}_GO \
-annStats ${MAIN_DIR}HOMER/annStats_${DIRECTION}.txt \
> ${MAIN_DIR}HOMER/${COMPOUND}_sig${DIRECTION}.txt

# DINP vs DMSO
COMPOUND="DINP"
MAIN_DIR="/project_folder/ATAC-Seq/DA_analysis/${COMPOUND}_${READ_TYPE}_${WINDOW}/"

mkdir -p "${MAIN_DIR}HOMER/"

annotatePeaks.pl \
${MAIN_DIR}${COMPOUND}_merged_sig${DIRECTION}_${PVAL}.bed \
hg38 \
-go ${MAIN_DIR}HOMER/${PVAL}_${DIRECTION}_GO \
-annStats ${MAIN_DIR}HOMER/annStats_${DIRECTION}.txt \
> ${MAIN_DIR}HOMER/${COMPOUND}_sig${DIRECTION}.txt
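
After annotation, the resulting table can be summarized quickly in R, for example (a sketch; the exact column names depend on the HOMER version, and the path assumes the DEHP run with the variables set above):

# read the HOMER annotation table written by the script above
homer_ann = read.table('/project_folder/ATAC-Seq/DA_analysis/DEHP_5-prime-reads_window50/HOMER/DEHP_sigAll.txt',
                       header = TRUE, sep = '\t', quote = '', comment.char = '')

# tabulate the basic annotation categories (promoter-TSS, intron, exon, Intergenic, ...)
table(sub(' \\(.*', '', homer_ann$Annotation))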

5 - Take screenshots of IGV tracks normalized with BeCorrect

BeCorrect scales track files based on the normalization values and is used purely for visualization. Here I used it to obtain scaled track files to visualize in the Integrative Genomics Viewer (IGV). BeCorrect is available at https://doi.org/10.1038/s41598-020-66998-4.

5.1 - Convert shifted BAM to BEDGRAPH with deepTools

Script 5.1_BAM_to_BEDGRAPH.sh.

#!/bin/bash 

# deepTools v3.5.1

# DEHP vs DMSO
Samples=(DEHP_1 DEHP_2 DEHP_3 DEHP_4 DEHP_5 DEHP_6 DMSO_1 DMSO_2 DMSO_3 DMSO_4 DMSO_5 DMSO_6)
fileNames=(/project_folder/ATAC-Seq/shifted_bam/DEHP*ATACshift.bam /project_folder/ATAC-Seq/shifted_bam/DMSO*ATACshift.bam)

COMPOUND="DEHP"
READ_TYPE="5-prime-reads" # pe-reads, 5-prime-reads
WINDOW="window50"
MAIN_DIR="/project_folder/ATAC-Seq/DA_analysis/${COMPOUND}_${READ_TYPE}_${WINDOW}/"

mkdir -p "${MAIN_DIR}BeCorrect/1_raw_bedgraph/"


for S in ${!Samples[@]}; do

echo "Converting sample: ${Samples[S]}"

/home/Tools/deeptools3.5.1/bin/bamCoverage --bam ${fileNames[S]} -o ${MAIN_DIR}/BeCorrect/1_raw_bedgraph/${Samples[S]}.bg --outFileFormat "bedgraph" --binSize 1 --numberOfProcessors 25

done


# DINP vs DMSO
Samples=(DINP_1 DINP_2 DINP_3 DINP_4 DINP_5 DINP_6 DMSO_1 DMSO_2 DMSO_3 DMSO_4 DMSO_5 DMSO_6)
fileNames=(/project_folder/ATAC-Seq/shifted_bam/DINP*ATACshift.bam /project_folder/ATAC-Seq/shifted_bam/DMSO*ATACshift.bam)

COMPOUND="DINP"
READ_TYPE="5-prime-reads" # pe-reads, 5-prime-reads
WINDOW="window50"
MAIN_DIR="/project_folder/ATAC-Seq/DA_analysis/${COMPOUND}_${READ_TYPE}_${WINDOW}/"

mkdir -p "${MAIN_DIR}BeCorrect/1_raw_bedgraph/"


for S in ${!Samples[@]}; do

echo "Converting sample: ${Samples[S]}"

/home/Tools/deeptools3.5.1/bin/bamCoverage --bam ${fileNames[S]} -o ${MAIN_DIR}/BeCorrect/1_raw_bedgraph/${Samples[S]}.bg --outFileFormat "bedgraph" --binSize 1 --numberOfProcessors 25

done

5.2 - Retrieve raw and normalized window counts for BeCorrect visualization

library(tidyverse)

# load image saved on step 3
COMPOUND = 'DINP' # or 'DEHP'
READ_TYPE = '5-prime-reads' # pe-reads, 5-prime-reads
WINDOW = 'window50'
MAIN_DIR = paste0('/project_folder/ATAC-Seq/DA_analysis/', COMPOUND, '_', READ_TYPE, '_', WINDOW, '/')

setwd(MAIN_DIR)

load(paste0(COMPOUND, '.RData'))

# retrieve counts table with chr start end as columns. Remove any chromosome that doesn't start with 'chr'
normalized_counts = do.call('cbind', list( 
				chr = as.data.frame(winResults_Data@rowRanges@seqnames),
				start = as.data.frame(winResults_Data@rowRanges@ranges)[, 1],
				end = as.data.frame(winResults_Data@rowRanges@ranges)[, 2],
				set_TMM@assayData$normalizedCounts)) %>% 
	dplyr::rename(., chr = 'value') %>% # if column isn't automatically named `chr`, set it manually
	dplyr::filter(str_detect(chr, 'chr'))

# normalized count file 
write.table(normalized_counts, paste0(COMPOUND, '_normalized_counts.txt'), quote = F, row.names = F, col.names = T, sep = '\t')

# raw count file
raw_count = do.call('cbind', list( 
				chr = as.data.frame(winResults_Data@rowRanges@seqnames),
				start = as.data.frame(winResults_Data@rowRanges@ranges)[, 1],
				end = as.data.frame(winResults_Data@rowRanges@ranges)[, 2],
				set_TMM@assayData$counts)) %>% 
	dplyr::rename(., chr = 'value') %>% # if column isn't automatically named `chr`, set it manually
	dplyr::filter(str_detect(chr, 'chr'))


write.table(raw_count, paste0(COMPOUND, '_raw_counts.txt'), quote = F, row.names = F, col.names = T, sep = '\t')
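
# Sanity check (a sketch, not in the original workflow): the two tables should describe
# the same windows in the same order, so their dimensions and coordinate columns should match
stopifnot(identical(dim(raw_count), dim(normalized_counts)),
          identical(raw_count[, 1:3], normalized_counts[, 1:3]))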

5.3 - Make chromosomes names file

Create a TXT file with the names of the chromosomes. Retrieve them from the file used for PEPATAC (/project_folder/genomes/ATAC-Seq_GRCh38/GRCh38_chr_size.txt). If needed, modify the file to retain only autosomes and sex chromosomes.
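
A minimal sketch to build this file, assuming the chromosome name is in the first column of GRCh38_chr_size.txt and that a plain list of names (one per line) is what is needed downstream:

# keep only autosomes and sex chromosomes and write one chromosome name per line
chr_sizes = read.table('/project_folder/genomes/ATAC-Seq_GRCh38/GRCh38_chr_size.txt', header = FALSE)
keep = grepl('^chr([0-9]+|X|Y)$', chr_sizes$V1)
writeLines(as.character(chr_sizes$V1[keep]), '/project_folder/DA_analysis/Chr_names_GRCh38.txt')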

5.4 - Retain only regions that fall within autosomes and sex chromosomes in the BEDGRAPH files created in Step 5.1

library(tidyverse) # for str_detect() used below

READ_TYPE = '5-prime-reads' # pe-reads, 5-prime-reads
WINDOW = 'window50'

COMPOUND = c('DEHP', 'DINP')

# For both DEHP and DINP
for (x in 1:2) {

	MAIN_DIR = paste0('/project_folder/ATAC-Seq/DA_analysis/', COMPOUND[x], '_', READ_TYPE, '_', WINDOW, '/')
	
	bg_list = list.files(path = paste0(MAIN_DIR, 'BeCorrect/1_raw_bedgraph'), pattern = '.bg', full.names = T)

	for (i in seq(1, length(bg_list))) {
	
		read.table(bg_list[i], header = F) %>%
			dplyr::filter(V4 != 0) %>% # remove lines which are 0 to decrease file size
			dplyr::filter(str_detect(V1, 'chr')) %>%
			write.table(., file = bg_list[i], quote = F, row.names = F, col.names = F, sep = '\t')
	}
}

5.5 - Download the BeCorrect script and compile it in the terminal

wget -P /project_folder/DA_analysis/ https://raw.githubusercontent.com/Zhang-lab/BeCorrect/master/BeCorrect.cpp 
g++ /project_folder/DA_analysis/BeCorrect.cpp -o /project_folder/DA_analysis/BeCorrect

5.6 - Run BeCorrect

Script 5.6_BeCorrect.sh.

READ_TYPE="5-prime-reads" # pe-reads, 5-prime-reads
WINDOW="window50"

# DEHP
COMPOUND="DEHP"
MAIN_DIR="/project_folder/DA_analysis/${COMPOUND}_${READ_TYPE}_${WINDOW}/"

mkdir -p ${MAIN_DIR}BeCorrect/2_scaled_bedgraph/
cd -p ${MAIN_DIR}BeCorrect/2_scaled_bedgraph/

/project_folder/DA_analysis/BeCorrect \
/project_folder/DA_analysis/${MAIN_DIR}/DEHP_raw_counts.txt \
/project_folder/DA_analysis/${MAIN_DIR}/DEHP_normalized_counts.txt \
/project_folder/DA_analysis/Chr_names_GRCh38.txt \
/project_folder/DA_analysis/${MAIN_DIR}/BeCorrect/1_raw_bedgraph/DEHP_1.bg \
/project_folder/DA_analysis/${MAIN_DIR}/BeCorrect/1_raw_bedgraph/DEHP_2.bg \
/project_folder/DA_analysis/${MAIN_DIR}/BeCorrect/1_raw_bedgraph/DEHP_3.bg \
/project_folder/DA_analysis/${MAIN_DIR}/BeCorrect/1_raw_bedgraph/DEHP_4.bg \
/project_folder/DA_analysis/${MAIN_DIR}/BeCorrect/1_raw_bedgraph/DEHP_5.bg \
/project_folder/DA_analysis/${MAIN_DIR}/BeCorrect/1_raw_bedgraph/DEHP_6.bg \
/project_folder/DA_analysis/${MAIN_DIR}/BeCorrect/1_raw_bedgraph/DMSO_1.bg \
/project_folder/DA_analysis/${MAIN_DIR}/BeCorrect/1_raw_bedgraph/DMSO_2.bg \
/project_folder/DA_analysis/${MAIN_DIR}/BeCorrect/1_raw_bedgraph/DMSO_3.bg \
/project_folder/DA_analysis/${MAIN_DIR}/BeCorrect/1_raw_bedgraph/DMSO_4.bg \
/project_folder/DA_analysis/${MAIN_DIR}/BeCorrect/1_raw_bedgraph/DMSO_5.bg \
/project_folder/DA_analysis/${MAIN_DIR}/BeCorrect/1_raw_bedgraph/DMSO_6.bg


# DINP
COMPOUND="DINP"
MAIN_DIR="/project_folder/DA_analysis/${COMPOUND}_${READ_TYPE}_${WINDOW}/"

mkdir -p ${MAIN_DIR}BeCorrect/2_scaled_bedgraph/
cd -p ${MAIN_DIR}BeCorrect/2_scaled_bedgraph/

/project_folder/DA_analysis/BeCorrect \
/project_folder/DA_analysis/${MAIN_DIR}/DINP_raw_counts.txt \
/project_folder/DA_analysis/${MAIN_DIR}/DINP_normalized_counts.txt \
/project_folder/DA_analysis/Chr_names_GRCh38.txt \
/project_folder/DA_analysis/${MAIN_DIR}/BeCorrect/1_raw_bedgraph/DINP_1.bg \
/project_folder/DA_analysis/${MAIN_DIR}/BeCorrect/1_raw_bedgraph/DINP_2.bg \
/project_folder/DA_analysis/${MAIN_DIR}/BeCorrect/1_raw_bedgraph/DINP_3.bg \
/project_folder/DA_analysis/${MAIN_DIR}/BeCorrect/1_raw_bedgraph/DINP_4.bg \
/project_folder/DA_analysis/${MAIN_DIR}/BeCorrect/1_raw_bedgraph/DINP_5.bg \
/project_folder/DA_analysis/${MAIN_DIR}/BeCorrect/1_raw_bedgraph/DINP_6.bg \
/project_folder/DA_analysis/${MAIN_DIR}/BeCorrect/1_raw_bedgraph/DMSO_1.bg \
/project_folder/DA_analysis/${MAIN_DIR}/BeCorrect/1_raw_bedgraph/DMSO_2.bg \
/project_folder/DA_analysis/${MAIN_DIR}/BeCorrect/1_raw_bedgraph/DMSO_3.bg \
/project_folder/DA_analysis/${MAIN_DIR}/BeCorrect/1_raw_bedgraph/DMSO_4.bg \
/project_folder/DA_analysis/${MAIN_DIR}/BeCorrect/1_raw_bedgraph/DMSO_5.bg \
/project_folder/DA_analysis/${MAIN_DIR}/BeCorrect/1_raw_bedgraph/DMSO_6.bg

5.7 - If the BEDGRAPH files are too big and slow down IGV, you can convert to BIGWIG (the binary equivalent)

Script 5.7_BEDGRAPH_to_BIGWIG. Use the UCSC bedGraphToBigWig utility from http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/

READ_TYPE="5-prime-reads" # pe-reads, 5-prime-reads
WINDOW="window50"

## DEHP vs DMSO
COMPOUND="DEHP"
MAIN_DIR="/project_folder/DA_analysis/${COMPOUND}_${READ_TYPE}_${WINDOW}/"

cd -p ${MAIN_DIR}BeCorrect/2_scaled_bedgraph/

# rename files to change extension from .bg to .bedGraph (the official one)
for file in *.bg; do mv -- '$file' '${file%.bg}.bedGraph'; done

# convert to BIGWIG
Samples=(DEHP_1 DEHP_2 DEHP_3 DEHP_4 DEHP_5 DEHP_6 DMSO_1 DMSO_2 DMSO_3 DMSO_4 DMSO_5 DMSO_6)
fileNames=( ${MAIN_DIR}BeCorrect/2_scaled_bedgraph/* )

for S in ${!Samples[@]}; do

echo "Converting from BEDGRAPH to BIGWIG sample: ${Samples[S]}"

# sort BEDGRAPH
sort -k1,1 -k2,2n ${fileNames[S]} > ${MAIN_DIR}BeCorrect/3_sorted_scaled_bedgraph/${Samples[S]}_sorted.bedGraph

/home/bedGraphToBigWig \
${MAIN_DIR}BeCorrect/3_sorted_scaled_bedgraph/${Samples[S]}_sorted.bedGraph \
/project_folder/genomes/ATAC-Seq_GRCh38/GRCh38_chr_size.txt \
${MAIN_DIR}BeCorrect/4_scaled_bigwig/${Samples[S]}.bw

done


## DINP vs DMSO
COMPOUND="DINP"
MAIN_DIR="/project_folder/DA_analysis/${COMPOUND}_${READ_TYPE}_${WINDOW}/"

cd -p ${MAIN_DIR}BeCorrect/2_scaled_bedgraph/

# rename files to change extension from .bg to .bedGraph (the official one)
for file in *.bg; do mv -- '$file' '${file%.bg}.bedGraph'; done

# convert to BIGWIG
Samples=(DINP_1 DINP_2 DINP_3 DINP_4 DINP_5 DINP_6 DMSO_1 DMSO_2 DMSO_3 DMSO_4 DMSO_5 DMSO_6)
fileNames=( ${MAIN_DIR}BeCorrect/2_scaled_bedgraph/* )

for S in ${!Samples[@]}; do

echo "Converting from BEDGRAPH to BIGWIG sample: ${Samples[S]}"

# sort BEDGRAPH
sort -k1,1 -k2,2n ${fileNames[S]} > ${MAIN_DIR}BeCorrect/3_sorted_scaled_bedgraph/${Samples[S]}_sorted.bedGraph

/home/bedGraphToBigWig \
${MAIN_DIR}BeCorrect/3_sorted_scaled_bedgraph/${Samples[S]}_sorted.bedGraph \
/project_folder/genomes/ATAC-Seq_GRCh38/GRCh38_chr_size.txt \
${MAIN_DIR}BeCorrect/4_scaled_bigwig/${Samples[S]}.bw

done

5.8 - Make IGV batch script

Two scripts are created, PART1 and PART2. Usage: run PART1, group all tracks and autoscale manually, then run PART2. I have not yet found a reliable way to group and autoscale from within the batch script!

# add -1000 and +1000 to BED file coordinates 
padding = 1000

READ_TYPE = '5-prime-reads' # pe-reads, 5-prime-reads
WINDOW = 'window50'
DIRECTION = 'All' # All, Up, Down
COMPOUND = c('DEHP', 'DINP')

# For both DEHP and DINP
for (x in 1:2) {

	MAIN_DIR = paste0('/project_folder/ATAC-Seq/DA_analysis/', COMPOUND[x], '_', READ_TYPE, '_', WINDOW, '/')
	compound = COMPOUND[x] # lower-case `compound` is used below for file names and track colors

	bigwig_compound = list.files(paste0(MAIN_DIR, 'BeCorrect/4_scaled_bigwig'), pattern = compound, full.names = T)
	bigwig_dmso = list.files(paste0(MAIN_DIR, 'BeCorrect/4_scaled_bigwig'), pattern = 'DMSO', full.names = T)

	# read BED file with regions to screenshot
	mybed = read.table(paste0(MAIN_DIR, compound, '_merged_sigAll_0.01.bed'), header = F)

	## PART 1: load and color tracks
	# add LOAD GENOME and ANNOTATIONS lines
	part1 = do.call('rbind', list('genome /project_folder/genomes/ATAC-Seq_GRCh38/GRCh38.primary_assembly.genome.fa',
					'load /project_folder/genomes/ATAC-Seq_GRCh38/gencode.v38.primary_assembly.annotation.sorted.gtf',
					'load /project_folder/genomes/ATAC-Seq_GRCh38/GRCh38-ccREs.bed',
					paste0('load ', MAIN_DIR, '/', compound, '_merged_sigAll_0.01.bed')))

	# load and color compound tracks. Different colors for DEHP and DINP
	if (compound == 'DEHP') {
		for (i in seq(1, 6)) {
			part1 = do.call('rbind', list(part1, 
							paste0('load ', bigwig_compound[i]),
							paste0('setColor #85CC6F ', bigwig_compound[i]),
							paste0('setDataRange auto ', bigwig_compound[i])))
					
	}} else if (compound == 'DINP') {
		for (i in seq(1, 6)) {
			part1 = do.call('rbind', list(part1, 
							paste0('load ', bigwig_compound[i]),
							paste0('setColor #7C80D1 ', bigwig_compound[i]),
							paste0('setDataRange auto ', bigwig_compound[i])))
	}}	
	
	# load and color DMSO tracks
	for (i in seq(1, 6)) {
		part1 = do.call('rbind', list(part1, 
						paste0('load ', bigwig_dmso[i]),
						paste0('setColor #7F7F7F ', bigwig_dmso[i]),
						paste0('setDataRange auto ', bigwig_dmso[i])))
	}


	## PART 2: add the lines that take the screenshots. The file name structure is geneName_compound_regionName.png
	part2 = paste0('snapshotDirectory ', MAIN_DIR, '/IGV_snapshots/')
			  
	for (i in seq(1, nrow(mybed))) {
		
		# gene name comes from the HOMER annotation file created in step 4
		homer = read.table(paste0(MAIN_DIR, 'HOMER/', compound, '_sig', DIRECTION, '.txt'), header = T, sep = '\t')
		
	part2 = do.call('rbind', list(part2, 
					paste0('goto ', mybed$V1[i], ':', mybed$V2[i]-padding, '-', mybed$V3[i]+padding),
					'maxPanelHeight 3000',
					paste0('snapshot ', homer[homer$PeakID == mybed$V4[i], ]$Gene_Name, '_', compound, '_', mybed$V4[i], '.png')))
					
	}

	write.table(part1, paste0(MAIN_DIR, 'PART1_script_IGV_batch_screenshot.txt'), quote = F, row.names = F, col.names = F)
	write.table(part2, paste0(MAIN_DIR, 'PART2_script_IGV_batch_screenshot.txt'), quote = F, row.names = F, col.names = F)
}

5.9 - Run script on IGV

How to (DEHP or DINP):

  1. Open IGV
  2. Run part 1 script PART1_script_IGV_batch_screenshot.txt
  3. Group and autoscale manually
  4. Run part 2 script PART2_script_IGV_batch_screenshot.txt

5.10 - Move UP and DOWN screenshots to their respective folder

Script 5.10_move_screenshots.sh.

READ_TYPE="5-prime-reads" # pe-reads, 5-prime-reads
WINDOW="window50"

# DEHP
COMPOUND="DEHP"
MAIN_DIR="/project_folder/ATAC-Seq/DA_analysis/${COMPOUND}_${READ_TYPE}_${WINDOW}/"

cd ${MAIN_DIR}IGV_snapshots/
mkdir -p ./up/
mkdir -p ./down/
mv *up.png ./up/
mv *down.png ./down/


# DINP
COMPOUND="DINP"
MAIN_DIR="/project_folder/ATAC-Seq/DA_analysis/${COMPOUND}_${READ_TYPE}_${WINDOW}/"

cd ${MAIN_DIR}IGV_snapshots/
mkdir -p ./up/
mkdir -p ./down/
mv *up.png ./up/
mv *down.png ./down/