-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathWGBS_bioinformatics.txt
226 lines (184 loc) · 8.9 KB
/
WGBS_bioinformatics.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
##Marianthi Tangili
##Nov 2022
#this script is based on jackdaw samples (JD001-JD022), analogous process was followed for zebra finch samples
#all samples were run in 2 lanes so we processed data for *L001* and *L002* and with *R1* (forward read) and *R2* (reverse read) per sample
#samples were received from sequencing facility as *.fastq and *.fastq.gz.md5 files, a total of 8 files per sample
###STEP 1
##this step was performed via computational cluster (Peregrine/Habrok)
#reference genome preparation/bisulfite conversion in Bismark
#download NCBI: bCorHaw1.pri.cur to a Genome folder
#prepare separately for males without W chromosome, remove W from the Genome folder and prepare again for females
#this step results in *.bt2 files for CT conversion and GA_conversion
bismark_genome_preparation /corvus_hawaiiensis_Genome/
###STEP 2
##this step was performed via computational cluster (Peregrine)
#Check all file integrity
md5sum -c *.md5
###STEP 3
##this step was performed via computational cluster (Peregrine)
#check if R1 and R2 match
#all files
sdiff <(zcat $_JD0$_S$_L001_R1_001.fastq.gz| grep @| head -20) <(zcat $_JD0$_S$_L001_R2_001.fastq.gz| grep @| head -20) | less
sdiff <(zcat $_JD0$_S$_L002_R1_001.fastq.gz| grep @| head -20) <(zcat $_JD0$_S$_L002_R2_001.fastq.gz| grep @| head -20) | less
#single file
sdiff <(zcat *L001_R2_001.fastq.gz| grep @| head -20) <(zcat *L001_R1_001.fastq.gz| grep @| head -20) | less
###STEP 4
##this step was performed via computational cluster (Peregrine)
#quality control via FastQC
#this step results in *fastqc.html files readable online: FastQC Report
fastqc *fastq.gz
#quality control via MultiQC
#this step results in *multiqc_report.html readable online: MultiQC Report
multiqc -- *fastqc.zip
###STEP 5
##this step was performed via computational cluster (Peregrine)
#adapter sequence trimming via Trim_Galore
#we trimmed 10 bp from the forward read and 20 bp from the reverse read
#this step results in *val_1.fq.gz (for R1) and *val_2.fq.gz (for R2) files
#FastQC is performed automatically on the output files after trimming
trim_galore --clip_r1 10 --clip_r2 20 --paired --fastqc *L001_R1_001.fastq.gz *L001_R2_001.fastq.gz
trim_galore --clip_r1 10 --clip_r2 20 --paired --fastqc *L002_R1_001.fastq.gz *L002_R2_001.fastq.gz
###STEP 6
##this step was performed via computational cluster (Peregrine)
#quality control via MultiQC after trimming
#this step results in *multiqc_report.html readable online: MiltiQC Report
multiqc -- *fastqc.zip
###STEP 7
##this step was performed via computational cluster (Peregrine)
#alignment to the reference genome in Bismark
#separate for female samples and male samples
#this step results in *.val_1_bismark_bt2_pe.bam files
#R1 and R2 are merged in this step so all steps from now on are run for one file per sample, per lane
#lane 1
bismark /corvus_hawaiiensis_Genome/ -1 *L001_R1_001_val_1.fq.gz -2 *L001_R2_001_val_2.fq.gz
#lane 2
bismark /corvus_hawaiiensis_Genome/ -1 *L002_R1_001_val_1.fq.gz -2 *L002_R2_001_val_2.fq.gz
###STEP 8
##this step was performed via computational cluster (Peregrine)
#merge *.val_1_bismark_bt2_pe.bam files from all lanes per each sample via SAMtools
#this step results in a single *.bam file per sample
samtools merge S*.bam *_L001_R1_001_val_1_bismark_bt2_pe.bam *_L002_R1_001_val_1_bismark_bt2_pe.bam
###STEP 9
##this step was performed via computational cluster (Peregrine)
#deduplication
#this step results in *.deduplicated.bam file
deduplicate_bismark --bam S*.bam
###STEP 10
##this step was performed via computational cluster (Peregrine)
#methylation extraction in Bismark
#this step results in *.deduplicated.bismark.cov file, *.deduplicated.bedGraph, *.deduplicated.M-bias and *.deduplicated_splitting_report files per sample
bismark_methylation_extractor --bedGraph --gzip S*.deduplicated.bam
###STEP 11
##this step was performed via computational cluster (Peregrine)
#save all sample information in one data frame
#read all coverage files
# Initialize an empty list to store the data frames
cov_data_list <- list()
# Loop through files, read data, add Sample column, and set column names
for (i in 1:22) {
file_path <- paste0(directory_path, "S", i, ".deduplicated.bismark.cov.gz")
cov_data <- read.delim(file_path)
cov_data$Sample <- as.character(i)
# Set column names
colnames(cov_data) <- c("Chromosome", "StartPosition", "EndPosition", "MethylationPercentage", "CountMethylated", "CountNonMethylated", "Sample")
cov_data_list[[paste0("Cov", i)]] <- cov_data
}
# Combine the data frames into a single data frame
CovN <- do.call(rbind, cov_data_list)
#replace chromosome names
CovN[CovN == "NC_063213.1"] <- "1"
CovN[CovN == "NC_063214.1"] <- "2"
CovN[CovN == "NC_063215.1"] <- "3"
CovN[CovN == "NC_063216.1"] <- "4"
CovN[CovN == "NC_063217.1"] <- "5"
CovN[CovN == "NC_063218.1"] <- "6"
CovN[CovN == "NC_063219.1"] <- "7"
CovN[CovN == "NC_063220.1"] <- "8"
CovN[CovN == "NC_063221.1"] <- "9"
CovN[CovN == "NC_063222.1"] <- "10"
CovN[CovN == "NC_063223.1"] <- "11"
CovN[CovN == "NC_063224.1"] <- "12"
CovN[CovN == "NC_063225.1"] <- "13"
CovN[CovN == "NC_063226.1"] <- "14"
CovN[CovN == "NC_063227.1"] <- "15"
CovN[CovN == "NC_063228.1"] <- "16"
CovN[CovN == "NC_063229.1"] <- "17"
CovN[CovN == "NC_063230.1"] <- "18"
CovN[CovN == "NC_063231.1"] <- "19"
CovN[CovN == "NC_063232.1"] <- "20"
CovN[CovN == "NC_063233.1"] <- "21"
CovN[CovN == "NC_063234.1"] <- "22"
CovN[CovN == "NC_063235.1"] <- "23"
CovN[CovN == "NC_063236.1"] <- "24"
CovN[CovN == "NC_063237.1"] <- "25"
CovN[CovN == "NC_063238.1"] <- "26"
CovN[CovN == "NC_063239.1"] <- "27"
CovN[CovN == "NC_063240.1"] <- "28"
CovN[CovN == "NC_063241.1"] <- "29"
CovN[CovN == "NC_063242.1"] <- "30"
CovN[CovN == "NC_063243.1"] <- "31"
CovN[CovN == "NC_063244.1"] <- "32"
CovN[CovN == "NC_063245.1"] <- "33"
CovN[CovN == "NC_063246.1"] <- "34"
CovN[CovN == "NC_063247.1"] <- "35"
CovN[CovN == "NC_063248.1"] <- "36"
CovN[CovN == "NC_063249.1"] <- "37"
CovN[CovN == "NC_063250.1"] <- "38"
CovN[CovN == "NC_063251.1"] <- "39"
CovN[CovN == "NC_063252.1"] <- "40"
CovN[CovN == "NC_063253.1"] <- "41"
CovN[CovN == "NC_063255.1"] <- "Z"
CovN[CovN == "NC_063254.1"] <- "W"
CovN[CovN == "NC_026783.1"] <- "MT"
#save data frame
saveRDS(CovN, file="CovJD.rds")
###STEP 12
##this step was performed via computational cluster (Peregrine)
#calculate average DNAm per sample per chromosome
library(dplyr)
CovN <- readRDS("CovJD.rds")
m<- CovN %>% group_by(Chromosome, Sample) %>% summarise(mean = mean(MethylationPercentage),sd = sd(MethylationPercentage))
write.csv(m, "NonDupAllAvgMethCov_JD.csv")
###STEP 13
#this step was performed via PC
library(readr)
library(ggplot2)
library(dplyr)
#read in avg DNAm file
ch<- read.csv("NonDupAllAvgMethCov_JD.csv")
#insert chromosome length information, derived from NCBI: bCorHaw1.pri.cur
lh <- read.csv("HCChromosome_Length.csv")
lh$Length<-as.numeric(lh$Length)
#add chromosome length to intitial data set
ch<- merge(ch,lh, by=c("Chromosome"))
#add chromosome information
ch$gen<- NA
ch$gen[ch$Chromosome=="Z"]<- "Z"
ch$gen[ch$Chromosome=="W"]<- "W"
ch$gen[is.na(ch$gen)] = "Autosomes"
ch$gen[ch$Chromosome=="MT"]<- "MT"
#add individual sex information
ch$Sex<- NA
ch$Sex[ch$Sample=="1"| ch$Sample=="2"| ch$Sample=="3"| ch$Sample=="4"|ch$Sample=="5"|ch$Sample=="10"|ch$Sample=="11"|ch$Sample=="12"|ch$Sample=="13"|ch$Sample=="14"|ch$Sample=="19"|ch$Sample=="21"]<- "Female"
ch$Sex[ch$Sample=="6"| ch$Sample=="7"| ch$Sample=="8"| ch$Sample=="9"|ch$Sample=="15"|ch$Sample=="16"|ch$Sample=="17"|ch$Sample=="18"|ch$Sample=="20"|ch$Sample=="22"]<- "Male"
#add individual age information
ch$Age<- NA
ch$Age[ch$Sample=="1"|ch$Sample=="2"| ch$Sample=="3"| ch$Sample=="4"| ch$Sample=="5"| ch$Sample=="6"| ch$Sample=="7"| ch$Sample=="8"|ch$Sample=="9"|ch$Sample=="19"|ch$Sample=="20"]<- "Young"
ch$Age[ch$Sample=="10"|ch$Sample=="11"|ch$Sample=="12"|ch$Sample=="13"|ch$Sample=="14"|ch$Sample=="15" |ch$Sample=="16"| ch$Sample=="17"| ch$Sample=="18" |ch$Sample=="21"|ch$Sample=="22"]<- "Old"
#add longitudinal sample information
ch$SampleL<- NA
ch$SampleL[ch$Sample=="1" | ch$Sample=="10"]<- 1
ch$SampleL[ch$Sample=="2" | ch$Sample=="11"]<- 2
ch$SampleL[ch$Sample=="3" | ch$Sample=="12"]<- 3
ch$SampleL[ch$Sample=="4" | ch$Sample=="13"]<- 4
ch$SampleL[ch$Sample=="5" | ch$Sample=="14"]<- 5
ch$SampleL[ch$Sample=="6" | ch$Sample=="15"]<- 6
ch$SampleL[ch$Sample=="7" | ch$Sample=="16"]<- 7
ch$SampleL[ch$Sample=="8" | ch$Sample=="17"]<- 8
ch$SampleL[ch$Sample=="9" | ch$Sample=="18"]<- 9
ch$SampleL[ch$Sample=="19" | ch$Sample=="21"]<- 10
ch$SampleL[ch$Sample=="20" | ch$Sample=="22"]<- 11
ch<-ch %>%
rename(MeanM =mean.y,SDM = sd.y)
#prepare a subset without mitochondrial DNA ready for the subsequent analyses=> R script "avianDNAm"
jdnoMT <- subset(ch, gen == "Autosomes" | gen == "Z" | gen == "W",)