diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 16cabbb91..ac22f08a8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -21,7 +21,7 @@ jobs: matrix: # Nextflow versions: check pipeline minimum and current latest nxf_ver: ['20.07.1', '21.03.0-edge'] - aligner: ['bismark', 'bismark_hisat', 'bwameth'] + aligner: ['bismark', 'bismark_hisat', 'bwameth', 'biscuit'] include: - aligner: 'bismark' ref_index: --bismark_index results/reference_genome/BismarkIndex/ @@ -29,6 +29,8 @@ jobs: ref_index: --bismark_index results/reference_genome/BismarkIndex/ - aligner: 'bwameth' ref_index: --bwa_meth_index results/reference_genome/genome.fa + - aligner: 'biscuit' + ref_index: --bwa_biscuit_index results/reference_genome/genome.fa steps: - name: Check out pipeline code uses: actions/checkout@v2 diff --git a/CHANGELOG.md b/CHANGELOG.md index e8d617bad..b2b9a396c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,7 +2,20 @@ ## v1.7dev -_..nothing yet.._ +### Pipeline Updates + +* Added Picard CollectInsertSizeMetrics and Picard CollectGcBiasMetrics +* Improved qulimap and preseq by adding `samtools sort` and `samtools index` step in the Bismark aligner +* Added BISCUIT aligner as an optional aligner, with all relative steps (alignment, mark duplicates with [samblaster](https://github.com/GregoryFaust/samblaster), methylation extraction, QC for biscuit, and optional [Epi-read](https://huishenlab.github.io/biscuit/epiread_format/) file creation with SNP information). 
+ +### New software + +* samblaster `0.1.26` +* bedtools `2.30.0` +* biscuit `0.3.16.20200420` +* bcftools`1.10` +* parallel `20201122` +* gawk `5.1.0` ## [v1.6](https://github.com/nf-core/methylseq/releases/tag/1.6) - 2021-03-26 diff --git a/README.md b/README.md index 956c80a14..7a68f276c 100644 --- a/README.md +++ b/README.md @@ -18,21 +18,22 @@ The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool ## Pipeline Summary The pipeline allows you to choose between running either [Bismark](https://github.com/FelixKrueger/Bismark) or [bwa-meth](https://github.com/brentp/bwa-meth) / [MethylDackel](https://github.com/dpryan79/methyldackel). -Choose between workflows by using `--aligner bismark` (default, uses bowtie2 for alignment), `--aligner bismark_hisat` or `--aligner bwameth`. - -| Step | Bismark workflow | bwa-meth workflow | -|----------------------------------------------|------------------|-----------------------| -| Generate Reference Genome Index _(optional)_ | Bismark | bwa-meth | -| Raw data QC | FastQC | FastQC | -| Adapter sequence trimming | Trim Galore! | Trim Galore! | -| Align Reads | Bismark | bwa-meth | -| Deduplicate Alignments | Bismark | Picard MarkDuplicates | -| Extract methylation calls | Bismark | MethylDackel | -| Sample report | Bismark | - | -| Summary Report | Bismark | - | -| Alignment QC | Qualimap | Qualimap | -| Sample complexity | Preseq | Preseq | -| Project Report | MultiQC | MultiQC | +Choose between workflows by using `--aligner bismark` (default, uses bowtie2 for alignment), `--aligner bismark_hisat` or `--aligner bwameth` or `--aligner biscuit`. + +| Step | Bismark workflow | bwa-meth workflow | biscuit | +|----------------------------------------------|------------------|-----------------------|-------------------| +| Generate Reference Genome Index _(optional)_ | Bismark | bwa-meth | biscuit | +| Raw data QC | FastQC | FastQC | FastQC | +| Adapter sequence trimming | Trim Galore! | Trim Galore! 
| Trim Galore! | +| Align Reads | Bismark | bwa-meth | biscuit | +| Deduplicate Alignments | Bismark | Picard MarkDuplicates | samblaster | +| Extract methylation calls | Bismark | MethylDackel | biscuit | +| Sample Report | Bismark | - | biscuit QC | +| Summary Report | Bismark | - | - | +| Picard Metrics | Picard | Picard | Picard | +| Alignment QC | Qualimap | Qualimap | Qualimap | +| Sample complexity | Preseq | Preseq | Preseq | +| Project Report | MultiQC | MultiQC | MultiQC | ## Quick Start diff --git a/assets/common_dbsnp.hdr b/assets/common_dbsnp.hdr new file mode 100755 index 000000000..96af77474 --- /dev/null +++ b/assets/common_dbsnp.hdr @@ -0,0 +1,11 @@ +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= diff --git a/bin/epiread_pairedEnd_conversion b/bin/epiread_pairedEnd_conversion new file mode 100755 index 000000000..eebec5dbe Binary files /dev/null and b/bin/epiread_pairedEnd_conversion differ diff --git a/bin/epiread_pairedEnd_conversion.cpp b/bin/epiread_pairedEnd_conversion.cpp new file mode 100644 index 000000000..f974f9536 --- /dev/null +++ b/bin/epiread_pairedEnd_conversion.cpp @@ -0,0 +1,643 @@ +#include +#include +#include // std::stringstream +#include +#include + +#include +using namespace std; + +// Based on N. 
Loyfer Pattern algorithm + +string TAB = "\t"; +int MAX_PAT_LEN = 300; +struct CpG_record +{ + + string next_cpg; + int index; + CpG_record(){} + + CpG_record(string _next_cpg, int _index) : + next_cpg(_next_cpg), index(_index) {} +}; +unordered_map dictCpG; +struct SNP_record +{ + char ref; + char alt; + string next_snp; + string sp; + SNP_record(){} + + SNP_record(char _ref, char _alt,string snp,string _sp) : + ref(_ref), alt(_alt), next_snp(snp), sp(_sp) {} +}; +unordered_map dictSNP; + + +bool DEBUG = false; +bool SNP_data = true; //file is probably not empty, if file is empty-skip adding SMP data + +vector line2tokens(string &line); +//void convert_epiread(string genome_cpg); +void convert_epiread(ofstream& merged_epiread); +//int execute(string cmd, string& output); +string vec2string(vector &vec, string coordinates = string()); +//void merge_paired_and_print(vector l1, vector l2, string &genome); +void merge_paired_and_print(vector l1, vector l2,ofstream& merged_epiread); + +vector line2tokens(string &line) { + /** Break string line to words (a vector of string tokens) */ + vector result; + string cell; + stringstream lineStream(line); + while (getline(lineStream, cell, '\t')) { + result.push_back(cell); + } + return result; +} + +void convert_epiread(ofstream& merged_epiread) { + /** parse stdin for epiread paired-end file, sorted by name and order of mate + * Translate them to single-end like epiread format, and output to a file */ + + vector row1, row2, row_from_line; + + bool first_in_pair = true; + for (string line_str; getline(cin, line_str);) { + row_from_line = line2tokens(line_str); + // row is first out of a couple of rows + if (first_in_pair) { + //cout < &vec, string coordinates) { + /** print a epiread-like vector to stdout, tab separated */ + // vec length must be 8, or 6 if there is no SNP. 
+ int num_of_column=8; + if (!SNP_data) + num_of_column=6; + string str_vec = vec[0] + coordinates; + //for (int i=1; isecond).index; + } else { + // This is an internal error - should never happen. + throw logic_error("Internal Error. Unknown CpG locus: " + locus); + } + return start_site; + +} + +int CpGLastLoci(string &chr,string &pos,int length_cpg) { + //get the locus of the last CpG according to window and length of string + if (length_cpg==0) return stoi(pos); + string locus = chr + TAB + pos; + auto search = dictCpG.find(locus); + if (search != dictCpG.end()) { + for (int i=0;isecond).next_cpg; + search = dictCpG.find(locus); + if (search == dictCpG.end()) + throw logic_error("Internal Error. Unknown CpG locus: " + locus); + } + return stoi((search->second).next_cpg); + + } + + + // This is an internal error - should never happen. + throw logic_error("Internal Error. Unknown CpG locus: " + locus); + +} + +SNP_record FindSNPRecord(string &locus) { + /** translate CpG index (in range 1,...,28M~) to dictionary */ + SNP_record variant; + auto search = dictSNP.find(locus); + if (search != dictSNP.end()) { + variant = search->second; + } else { + // This is an internal error - should never happen. + throw logic_error("Internal Error. 
Unknown SNP locus: " + locus); + } + return variant; +} + + +void initializeDictCpG(string cpg) +{ + + int cpg_index=1; + vector record, next_record; + ifstream cpg_file(cpg, ios_base::in); + string line; + string next_cpg; + + //get first CpG + getline(cpg_file, line); + record = line2tokens(line); + + while (getline(cpg_file, line)) { + next_record = line2tokens(line); + next_cpg = next_record[1]; + dictCpG.insert(make_pair(record[0]+TAB+record[1],CpG_record(next_cpg,cpg_index++))); + record = next_record; + } + //last record-no "next cpg" + dictCpG.insert(make_pair(record[0]+TAB+record[1],CpG_record("",cpg_index++))); + +} + + +void initializeDictSNP(string snp) +{ + ifstream snp_file(snp, ios::in); + vector record,next_record; + string line_str; + if ( snp_file.peek() == fstream::traits_type::eof() ) + { + cerr <<"SNP file is empty"< &line) +{ // get the final merged SNP + string debug_data; + string cuttent_snp = line[7]; + string final_snp = "0:"; + int snp_length = line[7].length(); + SNP_record snp_rec = checkLocus(line[0],line[6],line[3],cuttent_snp[0]); + debug_data = "(Ref-" +convertChar2srting(snp_rec.ref)+",Alt-"+convertChar2srting(snp_rec.alt)+",SP-"+snp_rec.sp+")"; + final_snp += convertChar2srting(cuttent_snp[0]); + if (DEBUG) { + final_snp += debug_data; + } + if (snp_length ==1) + return final_snp; + + string next_pos = snp_rec.next_snp; + string absolute_pos = line[6]; + for (int i=1; i &line) +{ + string final_snp = ""; + if (line[6]==".") + return final_snp; + string debug_data; + string cuttent_snp = line[7]; + final_snp = "0:"; + int snp_length = line[7].length(); + string locus = line[0] + TAB + line[6]; + SNP_record snp_rec = FindSNPRecord(locus); + debug_data = "(Ref-" +convertChar2srting(snp_rec.ref)+",Alt-"+convertChar2srting(snp_rec.alt)+",SP-"+snp_rec.sp+")"; + final_snp += convertChar2srting(cuttent_snp[0]); + if (DEBUG) { + final_snp += debug_data; + } + if (snp_length ==1) + return final_snp; + + string next_pos = snp_rec.next_snp; 
+ string absolute_pos = line[6]; + for (int i=1; i mergeSNP(vector l1, vector l2) +{//change snp to desired format, with relative index for each variant in format 0:var1:12:var2:13:var3 + + vector returned_snp; + + //if one mate has missing value-make it be the first vector + if (l2[6] == "." && l1[6] != ".") { + vector tmp = l1; + l1 = l2; + l2 = tmp; + } + + //if both mates have SNP in the same variant-make the one with more variants be the first mate + if (l1[6] == l2[6] && l1[7].length() < l2[7].length()) { + vector tmp = l1; + l1 = l2; + l2 = tmp; + } + + //if both mates are "-" strand, but same position + if (l1[6] != "." && l2[6] != "." && l1[4] == l2[4] && stoi(l1[6]) > stoi(l2[6])) { + vector tmp = l1; + l1 = l2; + l2 = tmp; + } + + + //get SNP-length: number of variants for each line: + string snp1 = l1[7]; + string snp2 = l2[7]; + int snp1_length = l1[7].length(); + int snp2_length = l2[7].length(); + //if both snp has missing values + if (l1[6] == "." && l2[6] == ".") returned_snp.assign( {".","."} ); + //get SNP data + //if one read has missing value (".") than use the other value. + else if (l1[6] == "." ) + returned_snp.assign( {l2[6],GetFinalSNP(l2)} ); + + //both mates have values in the SNP column, in the same position. l1 has more variants + else if (l1[6] == l2[6]) { + //both mates have the same SNP variants + if (snp1==snp2) + returned_snp.assign( {l2[6],GetFinalSNP(l2)} ); + //same position, different variants in both mates, the first mate has longer SNP variant list + else + { + //check all common variants between mates are the same. 
If not- put "N" in the relevant position + for (int i=0; i &l1) +{//add first and last coordinates in case where there is SNP data + int last_snp = 0; // works even if there's no SNP data + string coordinates = ""; + if (SNP_data && l1[7] != ".") //if there's SNP data + { //get last index of SNP + string str_tmp = l1[7].substr(0,l1[7].rfind(":")); + str_tmp = str_tmp.substr(str_tmp.rfind(":")+1); + int last_snp = stoi(l1[6])+stoi(str_tmp); + } + + //get last index of CpG + //string window = l1[0] + "\t" + l1[4]; + //int index = CpGFirstIndex(window) + l1[5].length() - 1; + int last_CpG = CpGLastLoci(l1[0],l1[4], l1[5].length() - 1); + + //insert values to vector and print + if (SNP_data && l1[6] != ".") + //l1.insert(l1.begin()+1, to_string(min(stoi(l1[4]),stoi(l1[6])))); + coordinates += TAB+ to_string(min(stoi(l1[4]),stoi(l1[6]))); + else // no SNP data, coordinates depends only on first CpG + //l1.insert(l1.begin()+1, l1[4]); + coordinates += TAB+ l1[4] ; + //l1.insert(l1.begin()+2, to_string(max(last_snp,last_CpG))); + coordinates += TAB+ to_string(max(last_snp,last_CpG)); + return coordinates; +} + +//void merge_paired_and_print(vector l1, vector l2, string &genome) { +void merge_paired_and_print(vector l1, vector l2, ofstream& merged_epiread) { + /*Merge two epiread-formated line into one */ + + if (!DEBUG) { + l1[1] = "."; + l2[1] = "."; + } + + + bool flag_SNP_identical = (SNP_data) ? l1[6] == l2[6] && l1[7] == l2[7] : true; + //if l2 doesn't add any information to the read-length, sequence and SNP data: + if (l1[4] == l2[4] && l1[5] == l2[5] && ( flag_SNP_identical )) { + //there is an snp value on the identical lines + try { + if ( SNP_data && l1[6] != "." 
) + l1[7] = GetFinalSNP(l1); + + //add_coordintes(l1); + string coordinates = add_coordintes(l1); + merged_epiread << vec2string(l1,coordinates) < stoi(l2[4])) { + vector tmp = l1; + l1 = l2; + l2 = tmp; + } + + + string pattern1 = l1[5]; + string pattern2 = l2[5]; + int pattern1_len = pattern1.length(); + int pattern2_len = pattern2.length(); + + //0-based file + //int first_cpg_site1 = stoi(l1[4]); + //int first_cpg_site2 = stoi(l2[4]); + + string window1,window2; + //window1 = l1[0] + "\t" + to_string(first_cpg_site1) + "\t" + to_string(first_cpg_site1+1); + //window2 = l2[0] + "\t" + to_string(first_cpg_site2) + "\t" + to_string(first_cpg_site2+1); + window1 = l1[0] + "\t" + l1[4]; + window2 = l2[0] + "\t" + l2[4]; + int first_cpg1,first_cpg2; + try { + first_cpg1 = CpGFirstIndex(window1); + first_cpg2 = CpGFirstIndex(window2); + } + catch (std::exception &e) { + cout << vec2string(l1) << endl; + cout << vec2string(l2) << endl; + return; + } + + + int last_site = max(first_cpg1 + pattern1_len, first_cpg2 + pattern2_len); + int overall_len = last_site-first_cpg1; + + string merged_pattern; // output pattern + + if (overall_len > MAX_PAT_LEN) // sanity check: make sure the two reads are not too far apart + { + // throw invalid_argument("invalid pairing. 
merged read is too long "); + string output_error = "Problem with:\n" + l1[0] + "\t" + l1[1] + "\t" + l1[2] + "\t" + l1[3] + "\t" + l1[4] + "\n" + l2[0] + "\t" + l2[1] + "\t" + l2[2] + "\t" + l2[3] + "\t" + l2[4] ; + cerr < merged_snp; + try { + if (SNP_data) //SNP file is not empty + { + merged_snp = mergeSNP(l1,l2); + l1[5] = merged_pattern; + l1[6] = merged_snp[0]; + l1[7] = merged_snp[1]; + + } + //add_coordintes(l1); + merged_epiread << vec2string(l1,add_coordintes(l1)) <) +{ + chomp $line; + my @f = split(/\t/,$line); + + # get max MAF + my @allMafs = uniqVals($f[9]); + my $maxMaf = maxval(@allMafs); + + # get common alleles + my @allRef = uniqVals($f[10]); + my @allAlt = uniqVals($f[11]); + if (0) + { + print STDERR sprintf("refAlleles has more than one:\t%s\t%s\n",$f[10],makeList(@allRef)) + if (scalar(@allRef)>1); + print STDERR sprintf("altAlleles has more than one:\t%s\t%s\n",$f[11],makeList(@allAlt)) + if (scalar(@allAlt)>1); + } + + my @dbsnpRef = uniqVals($f[4]); + my @dbsnpAlt = uniqVals($f[6]); + my $usedbsnp = ( (scalar(@dbsnpAlt)$maxv); + } + return $maxv; +} + +sub makeList +{ + my (@flds) = @_; + return join(",",@flds); +} + +sub uniqVals +{ + my ($fld) = @_; + + my @allAlleles = split(/,/,$fld); + my $outAllelesH = {}; + foreach my $allele (@allAlleles) { $outAllelesH->{$allele}++ if ($allele); } + my @outAlleles = keys(%$outAllelesH); + +# print STDERR join("\t",$fld,@outAlleles)."\n"; + + return @outAlleles; +} diff --git a/bin/scrape_software_versions.py b/bin/scrape_software_versions.py index ea8e4315b..fdf3b1653 100755 --- a/bin/scrape_software_versions.py +++ b/bin/scrape_software_versions.py @@ -20,10 +20,20 @@ 'BWA': ['v_bwa.txt', r"Version: (\S+)"], 'bwa-meth': ['v_bwameth.txt', r"bwa-meth\.py (\S+)"], 'Picard MarkDuplicates': ['v_picard_markdups.txt', r"([\d\.]+)"], + 'Picard CreateSequenceDictionary': ['v_picard_createseqdict.txt', r"([\d\.]+)"], + 'Picard CollectInsertSizeMetrics': ['v_picard_collectinssize.txt', r"([\d\.]+)"], + 
'Picard CollectGcBiasMetrics': ['v_picard_collectgcbias.txt', r"([\d\.]+)"], + 'samblaster': ['v_samblaster.txt', r"samblaster: Version (\S+)"], + 'biscuit': ['v_biscuit.txt', r"Version: (\S+)"], + 'bcftools': ['v_bcftools.txt', r"bcftools (\S+)"], + 'bedtools': ['v_bedtools.txt', r"bedtools (\S+)"], + 'parallel': ['v_parallel.txt', r"GNU parallel (\S+)"], + 'gawk': ['v_gawk.txt', r"GNU Awk (\S+)"], 'MethylDackel': ['v_methyldackel.txt', r"(.+)"], 'Qualimap': ['v_qualimap.txt', r"QualiMap v.(\S+)"], 'Preseq': ['v_preseq.txt', r"Version: (\S+)"], 'MultiQC': ['v_multiqc.txt', r"multiqc, version (\S+)"], + } results = OrderedDict() results['nf-core/methylseq'] = 'N/A' @@ -45,6 +55,17 @@ results['Qualimap'] = 'N/A' results['Preseq'] = 'N/A' results['MultiQC'] = 'N/A' +results['samblaster'] = 'N/A' +results['biscuit'] = 'N/A' +results['fastasort'] = 'N/A' +results['Picard CreateSequenceDictionary'] = 'N/A' +results['Picard CollectInsertSizeMetrics'] = 'N/A' +results['Picard CollectGcBiasMetrics'] = 'N/A' +results['bcftools'] = 'N/A' +results['bedtools'] = 'N/A' +results['gawk'] = 'N/A' +results['parallel'] = 'N/A' + # Search each file using its regex for k, v in regexes.items(): @@ -63,6 +84,7 @@ del results[k] # Dump to YAML + print( """ id: 'software_versions' diff --git a/conf/base.config b/conf/base.config index 523f5fd22..53228e36f 100644 --- a/conf/base.config +++ b/conf/base.config @@ -26,13 +26,19 @@ process { memory = { check_max( 64.GB * task.attempt, 'memory') } time = { check_max( 36.h * task.attempt, 'time') } } - withName:fastqc { - cpus = { check_max( 2 * task.attempt, 'cpus') } - memory = { check_max( 16.GB * task.attempt, 'memory') } - time = { check_max( 6.h * task.attempt, 'time') } + withName:makeBwaMemIndex { + cpus = { check_max( 8 * task.attempt, 'cpus') } + memory = { check_max( 64.GB * task.attempt, 'memory') } + time = { check_max( 36.h * task.attempt, 'time') } + } + + withName:fastqc { + cpus = { check_max( 6 * task.attempt, 'cpus') } + 
memory = { check_max( 32.GB * task.attempt, 'memory') } + time = { check_max( 2.d * task.attempt, 'time') } } withName:trim_galore { - cpus = { check_max( 15 * task.attempt, 'cpus') } + cpus = { check_max( 8 * task.attempt, 'cpus') } memory = { check_max( 6.GB * task.attempt, 'memory') } time = { check_max( 1.d * task.attempt, 'time') } } @@ -41,6 +47,11 @@ process { memory = { check_max( 64.GB * task.attempt, 'memory') } time = { check_max( 8.d * task.attempt, 'time') } } + withName:samtools_sort_index_flagstat_bismark { + cpus = { check_max( 4 * task.attempt, 'cpus') } + memory = { check_max( 32.GB * task.attempt, 'memory') } + time = { check_max( 1.d * task.attempt, 'time') } + } withName:bismark_deduplicate { cpus = { check_max( 8 * task.attempt, 'cpus') } memory = { check_max( 64.GB * task.attempt, 'memory') } @@ -53,11 +64,14 @@ process { } withName:qualimap { cpus = { check_max( 4 * task.attempt, 'cpus') } - memory = { check_max( 32.GB * task.attempt, 'memory') } - time = { check_max( 6.h * task.attempt, 'time') } + memory = { check_max( 40.GB * task.attempt, 'memory') } + time = { check_max( 20.h * task.attempt, 'time') } } withName:preseq { errorStrategy = 'ignore' + cpus = { check_max( 4 * task.attempt, 'cpus') } + memory = { check_max( 8.GB * task.attempt, 'memory') } + time = { check_max( 15.h * task.attempt, 'time') } } withName:get_software_versions { cache = false @@ -82,6 +96,63 @@ process { memory = { check_max( 48.GB * task.attempt, 'memory') } time = { check_max( 1.d * task.attempt, 'time') } } + + withName:biscuit_align { + cpus = { check_max( 10 * task.attempt, 'cpus') } + memory = { check_max( 64.GB * task.attempt, 'memory') } + time = { check_max( 6.d * task.attempt, 'time') } + } + withName:makeBwaBISCUITIndex { + cpus = { check_max( 8 * task.attempt, 'cpus') } + memory = { check_max( 64.GB * task.attempt, 'memory') } + time = { check_max( 36.h * task.attempt, 'time') } + } + withName:samtools_sort_index_flagstat_biscuit { + cpus = { 
check_max( 6 * task.attempt, 'cpus') } + memory = { check_max( 36.GB * task.attempt, 'memory') } + time = { check_max( 2.d * task.attempt, 'time') } + } + + withName:markDuplicates_samblaster { + cpus = { check_max( 10 * task.attempt, 'cpus') } + memory = { check_max( 32.GB * task.attempt, 'memory') } + time = { check_max( 3.d * task.attempt, 'time') } + } + withName:create_VCF { + cpus = { check_max( 4 * task.attempt, 'cpus') } + memory = { check_max( 32.GB * task.attempt, 'memory') } + time = { check_max( 3.d * task.attempt, 'time') } + } + withName:biscuit_QC { + cpus = { check_max( 4 * task.attempt, 'cpus') } + memory = { check_max( 32.GB * task.attempt, 'memory') } + time = { check_max( 5.d * task.attempt, 'time') } + } + withName:create_Bedgraph { + cpus = { check_max( 1 * task.attempt, 'cpus') } + memory = { check_max( 32.GB * task.attempt, 'memory') } + time = { check_max( 12.h * task.attempt, 'time') } + } + withName:picard_metrics { + cpus = { check_max( 4 * task.attempt, 'cpus') } + memory = { check_max( 32.GB * task.attempt, 'memory') } + time = { check_max( 2.d * task.attempt, 'time') } + } + withName:prepare_genome_to_picard { + cpus = { check_max( 2 * task.attempt, 'cpus') } + memory = { check_max( 32.GB * task.attempt, 'memory') } + time = { check_max( 1.d * task.attempt, 'time') } + } + withName:epiread_conversion { + cpus = { check_max( 4 * task.attempt, 'cpus') } + memory = { check_max( 32.GB * task.attempt, 'memory') } + time = { check_max( 2.d * task.attempt, 'time') } + } + withName:get_SNP_file { + cpus = { check_max( 2 * task.attempt, 'cpus') } + memory = { check_max( 32.GB * task.attempt, 'memory') } + time = { check_max( 2.d * task.attempt, 'time') } + } } // Defaults config params, may be overwritten by later configs diff --git a/docs/output.md b/docs/output.md index fac138813..2dae70a61 100644 --- a/docs/output.md +++ b/docs/output.md @@ -1,3 +1,4 @@ + # nf-core/methylseq Output ## Introduction @@ -19,8 +20,10 @@ and processes data using 
the following steps: * [Deduplication](#deduplication) - Deduplicating reads * [Methylation Extraction](#methylation-extraction) - Calling cytosine methylation steps * [Bismark Reports](#bismark-reports) - Single-sample and summary analysis reports +* [BISCUIT Reports](#biscuit-reports) - Single-sample analysis reports for BISCUIT aligner * [Qualimap](#qualimap) - Tool for genome alignments QC * [Preseq](#preseq) - Tool for estimating sample complexity +* [Picard](#picard) - Tool for generating metrics of statistics * [MultiQC](#multiqc) - Aggregate report describing results of the whole pipeline * [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution @@ -66,7 +69,7 @@ Single-end data will have slightly different file names and only one FastQ file ### Alignment -Bismark and bwa-meth convert all Cytosines contained within the sequenced reads to Thymine _in-silico_ and then align against a three-letter reference genome. This method avoids methylation-specific alignment bias. The alignment produces a BAM file of genomic alignments. +Bismark, bwa-meth and BISCUIT convert all Cytosines contained within the sequenced reads to Thymine _in-silico_ and then align against a three-letter reference genome. This method avoids methylation-specific alignment bias. The alignment produces a BAM file of genomic alignments. **Bismark output directory: `results/bismark_alignments/`** _Note that bismark can use either use Bowtie2 (default) or HISAT2 as alignment tool and the output file names will not differ between the options._ @@ -96,9 +99,27 @@ _Note that bismark can use either use Bowtie2 (default) or HISAT2 as alignment t * `logs/sample_stats.txt` * Summary file giving lots of metrics about the aligned BAM file. +**BISCUIT output directory: `results/biscuit_alignnts/`** + +* `sample.assembly.bam` + * Aligned reads in BAM format. 
+ * **NB:** Only saved if `--save_align_intermeds` is used +* `sample.assembly.sorted.bam` + * Aligned reads in a sorted BAM file. + * **NB:** Only saved if `--save_align_intermeds`, `--skip_deduplication` or `--rrbs` is specified when running the pipeline. + * **NB:** If `--skip_deduplication` is not specified when running the pipeline, file name would be `sample.assembly.samblaster.sorted.bam` +* `sample.assembly.sorted.bam.bai` + * Index of sorted BAM file + * **NB:** Only saved if `--save_align_intermeds`, `--skip_deduplication` or `--rrbs` is specified when running the pipeline. + * **NB:** If `--skip_deduplication` is not specified when running the pipeline, file name would be `sample.assembly.samblaster.sorted.bam.bai` +* `logs/sample_flagstat.txt` + * Summary file describing the number of reads which aligned in different ways. +* `logs/sample_stats.txt` + * Summary file giving lots of metrics about the aligned BAM file. + ### Deduplication -This step removes alignments with identical mapping position to avoid technical duplication in the results. Note that it is skipped if `--save_align_intermeds`, `--skip_deduplication` or `--rrbs` is specified when running the pipeline. +This step removes alignments with identical mapping position to avoid technical duplication in the results. Note that it is skipped if `--skip_deduplication` or `--rrbs` is specified when running the pipeline. **Bismark output directory: `results/bismark_deduplicated/`** @@ -118,11 +139,18 @@ This step removes alignments with identical mapping position to avoid technical * `logs/sample.sorted.markDups_metrics.txt` * Log file giving summary statistics about deduplication. +**BISCUIT output directory: `results/biscuit_markDuplicates/`** + +> **NB:** The BISCUIT (samblaster) step doesn't remove duplicate reads from the BAM file, it just labels them. + +* `sample.assembly.txt` + * Log file giving summary statistics about deduplication. 
+ ### Methylation Extraction The methylation extractor step takes a BAM file with aligned reads and generates files containing cytosine methylation calls. It produces a few different output formats, described below. -Note that the output may vary a little depending on whether you specify `--comprehensive` or `--non_directional` when running the pipeline. +Note that the output may vary a little depending on whether you specify `--comprehensive` or `--non_directional` or `--skip_deduplication` or `--rrbs` when running the pipeline. Filename abbreviations stand for the following reference alignment strands: @@ -151,6 +179,38 @@ Filename abbreviations stand for the following reference alignment strands: * `sample.bedGraph` * Methylation statuses in [bedGraph](http://genome.ucsc.edu/goldenPath/help/bedgraph.html) format. +**BISCUIT workflow output directory: `results/methylation_extract/`** + +* `sample.bedgraph` + * Methylation statuses in [bedGraph](http://genome.ucsc.edu/goldenPath/help/bedgraph.html) format. +* `sample.vcf.gz` + * VCF file with the pileup information, used for creating the bedGraph file. + * **NB:** Only saved if `--save_align_intermeds` is specified when running the pipeline. +* `sample.vcf.gz.tbi` + * Index file for `sample.vcf.gz` + * **NB:** Only saved if `--save_align_intermeds` is specified when running the pipeline. + +**NB** if `--epriread` is specified in the pipeline, then: +**output directory:** `results/epireads` : + +* `sample.epiread.gz` + * Storing CpG retention pattern on the read in a compact way. For paired end mode, two adjacent rows of the read mates in Epi-read format are merged. +* `sample.epiread.gz.tbi` + * Index file for `sample.epiread.gz`. +* `sample.err.gz` + * In paired end mode, storing all CpG retention pattern of the reads that failed to be merged together. +* `sample.err.gz.tbi` + * Index file for `sample.err.gz`. 
+* `sample.original.epiread.gz` + * In paired end mode, storing all CpG retention pattern of the reads before the merging. + * **NB:** Only created if `--debug_epiread` is specified when running the pipeline. +* `sample.original.epiread.gz.tbi` + * Index file for `sample.original.epiread.gz`. + * **NB:** Only created if `--debug_epiread` is specified when running the pipeline. +* `snp/sample.snp.bed` + * bed file with SNP information about the sample. + * **NB:** Only saved if `--save_snp_file` is specified when running the pipeline. + ### Bismark Reports Bismark generates a HTML reports describing results for each sample, as well as a summary report for the whole run. @@ -159,6 +219,12 @@ Bismark generates a HTML reports describing results for each sample, as well as **Output directory: `results/bismark_summary`** +### BISCUIT Reports + +BISCUIT generates a directory with different statistical reports describing results for each sample. The statistical reports are converted to plots plotted in the MultiQC report. + +**Output directory: `results/biscuit_QC/sample.assembly_biscuitQC/`** + ## Qualimap [Qualimap BamQC](http://qualimap.bioinfo.cipf.es/doc_html/analysis.html#bam-qc) is a general-use quality-control tool that generates a number of statistics about aligned BAM files. It's not specific to bisulfite data, but it produces several useful stats - for example, insert size and coverage statistics. @@ -181,6 +247,28 @@ Note that these are predictive numbers only, not absolute. The MultiQC plot can * `sample_ccurve.txt` * This file contains plot values for the complexity curve, plotted in the MultiQC report. +## Picard + +[Picard]([https://broadinstitute.github.io/picard/picard-metric-definitions.html](https://broadinstitute.github.io/picard/picard-metric-definitions.html)) is a set of command line tools (in Java) for manipulating high-throughput sequencing (HTS) data and formats such as SAM/BAM/CRAM and VCF. 
+ +The two metrics created here are: + +* [GcBiasMetrics]([https://broadinstitute.github.io/picard/picard-metric-definitions.html#GcBiasMetrics](https://broadinstitute.github.io/picard/picard-metric-definitions.html#GcBiasMetrics)) +* [InsertSizeMetrics]([https://broadinstitute.github.io/picard/picard-metric-definitions.html#InsertSizeMetrics](https://broadinstitute.github.io/picard/picard-metric-definitions.html#InsertSizeMetrics)) - Metrics about the insert size distribution of a paired-end library, created by the CollectInsertSizeMetrics program and usually written to a file with the extension ".insert_size_metrics". + +**Output directory: `results/picardMetrics`** + +* `sample.insert_size_metrics.txt` + * This file contains plot values for the number of reads at a given insert size, plotted in the MultiQC report. +* `sample.gc_bias_metrics.txt` + * This file contains plot values for the bias in coverage across regions of the genome with varying GC content, plotted in the MultiQC report. +* `sample.summary_metrics.txt` + * This file contains a table summerizing the `sample.gc_bias_metrics.txt` data. +* `pdf/sample.insert_size_histogram.pdf` +* This file contains a plot of insert size histogram, created by Picard. +* `pdf/sample.gc_bias_metrics.pdf` + * This file contains a plot of GC bias of all reads, created by Picard. + ## MultiQC [MultiQC](http://multiqc.info) is a visualization tool that generates a single HTML report summarizing all samples in your project. Most of the pipeline QC results are visualised in the report and further statistics are available in the report data directory. diff --git a/docs/usage.md b/docs/usage.md index 899b3337b..e66b42760 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -15,12 +15,14 @@ ## Introduction -The nf-core/methylseq package is actually two pipelines in one. 
The default workflow uses [Bismark](http://www.bioinformatics.babraham.ac.uk/projects/bismark/) with [Bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml) as alignment tool: unless specified otherwise, nf-core/methylseq will run this pipeline. +The nf-core/methylseq package is actually three pipelines in one. The default workflow uses [Bismark](http://www.bioinformatics.babraham.ac.uk/projects/bismark/) with [Bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml) as alignment tool: unless specified otherwise, nf-core/methylseq will run this pipeline. Since bismark v0.21.0 it is also possible to use [HISAT2](https://ccb.jhu.edu/software/hisat2/index.shtml) as alignment tool. To run this workflow, invoke the pipeline with the command line flag `--aligner bismark_hisat`. HISAT2 also supports splice-aware alignment if analysis of RNA is desired (e.g. [SLAMseq](https://science.sciencemag.org/content/360/6390/800) experiments), a file containing a list of known splicesites can be provided with `--known_splices`. The second workflow uses [BWA-Meth](https://github.com/brentp/bwa-meth) and [MethylDackel](https://github.com/dpryan79/methyldackel) instead of Bismark. To run this workflow, run the pipeline with the command line flag `--aligner bwameth`. +The third workflow uses [BISCUIT](https://github.com/huishenlab/biscuit) and [samblaster](https://github.com/GregoryFaust/samblaster) for mark duplicates instead of Bismark. To run this workflow, run the pipeline with the command line flag `--aligner biscuit`. 
+ ## Running the pipeline The typical command for running the pipeline is as follows: diff --git a/environment.yml b/environment.yml index 63089ea2d..8e4a8477f 100644 --- a/environment.yml +++ b/environment.yml @@ -26,3 +26,10 @@ dependencies: - bioconda::picard=2.25.1 - bioconda::bwameth=0.2.2 - bioconda::methyldackel=0.5.2 + # BISCUIT pipeline + - bioconda::samblaster=0.1.26 + - bioconda::bedtools=2.30.0 + - bioconda::biscuit=0.3.16.20200420 + - bioconda::bcftools=1.9 + - conda-forge::parallel=20201122 + - gawk=5.1.0 diff --git a/main.nf b/main.nf index 3d9a8cf07..775e970f9 100644 --- a/main.nf +++ b/main.nf @@ -50,29 +50,29 @@ ch_splicesites_for_bismark_hisat_align = params.known_splices ? Channel.fromPath if( params.aligner =~ /bismark/ ){ params.bismark_index = params.genome && params.aligner == 'bismark' ? params.genomes[ params.genome ].bismark ?: false : false assert params.bismark_index || params.fasta : "No reference genome index or fasta file specified" - ch_wherearemyfiles_for_alignment.set { ch_wherearemyfiles_for_bismark_align } + ch_wherearemyfiles_for_alignment.into { ch_wherearemyfiles_for_bismark_align; ch_wherearemyfiles_for_bismark_samtools_sort; ch_wherearemyfiles_for_bismark_dedup_samtools_sort } + Channel + .fromPath(params.fasta, checkIfExists: true) + .ifEmpty { exit 1, "fasta file not found : ${params.fasta}" } + .into { ch_fasta_for_makeBismarkIndex; ch_fasta_for_picard } if( params.bismark_index ){ Channel .fromPath(params.bismark_index, checkIfExists: true) .ifEmpty { exit 1, "Bismark index file not found: ${params.bismark_index}" } .into { ch_bismark_index_for_bismark_align; ch_bismark_index_for_bismark_methXtract } + ch_fasta_for_makeBismarkIndex.close() } - else if( params.fasta ){ - Channel - .fromPath(params.fasta, checkIfExists: true) - .ifEmpty { exit 1, "fasta file not found : ${params.fasta}" } - .set { ch_fasta_for_makeBismarkIndex } - } + } -else if( params.aligner == 'bwameth' ){ - assert params.fasta : "No Fasta reference 
specified! This is required by MethylDackel." - ch_wherearemyfiles_for_alignment.into { ch_wherearemyfiles_for_bwamem_align; ch_wherearemyfiles_for_samtools_sort_index_flagstat } +else if( params.aligner == 'bwameth' || params.aligner == 'biscuit'){ + assert params.fasta : "No Fasta reference specified!" + ch_wherearemyfiles_for_alignment.into { ch_wherearemyfiles_for_bwamem_align; ch_wherearemyfiles_for_biscuit_align; ch_wherearemyfiles_for_samtools_sort_index_flagstat; ch_wherearemyfiles_for_samblaster } Channel .fromPath(params.fasta, checkIfExists: true) .ifEmpty { exit 1, "fasta file not found : ${params.fasta}" } - .into { ch_fasta_for_makeBwaMemIndex; ch_fasta_for_makeFastaIndex; ch_fasta_for_methyldackel } + .into { ch_fasta_for_makeBwaMemIndex; ch_fasta_for_makeFastaIndex; ch_fasta_for_build_biscuit_QC_assets; ch_fasta_for_methyldackel; ch_fasta_for_pileup; ch_fasta_for_epiread; ch_fasta_for_biscuitQC; ch_fasta_for_picard} params.bwa_meth_index = params.genome ? params.genomes[ params.genome ].bwa_meth ?: false : false if( params.bwa_meth_index ){ @@ -83,14 +83,30 @@ else if( params.aligner == 'bwameth' ){ ch_fasta_for_makeBwaMemIndex.close() } + if( params.bwa_biscuit_index ){ + Channel + .fromPath("${params.bwa_biscuit_index}*", checkIfExists: true) + .ifEmpty { exit 1, "bwa (biscuit) index file(s) not found: ${params.bwa_biscuit_index}" } + .set { ch_bwa_index_for_biscuit } + ch_fasta_for_makeBwaMemIndex.close() + } + params.fasta_index = params.genome ? 
params.genomes[ params.genome ].fasta_index ?: false : false if( params.fasta_index ){ Channel .fromPath(params.fasta_index, checkIfExists: true) .ifEmpty { exit 1, "fasta index file not found: ${params.fasta_index}" } - .set { ch_fasta_index_for_methyldackel } + .into { ch_fasta_index_for_methyldackel; ch_fasta_index_for_biscuitQC; ch_fasta_index_for_create_VCF; ch_fasta_for_create_whitelist; ch_fasta_index_for_epiread } ch_fasta_for_makeFastaIndex.close() } + } + +if( params.aligner == 'biscuit' && params.assets_dir ) { + Channel + .fromPath("${params.assets_dir}", checkIfExists: true) + .ifEmpty { exit 1, "Assets directory for biscuit QC not found: ${params.assets_dir}" } + .into { ch_assets_dir_for_biscuit_qc; ch_assets_dir_with_cpg_for_epiread } + ch_fasta_for_build_biscuit_QC_assets.close() } // Trimming / kit presets @@ -165,6 +181,7 @@ if (params.input_paths) { .map { row -> [ row[0], [ file(row[1][0], checkIfExists: true) ] ] } .ifEmpty { exit 1, 'params.input_paths was empty - no input files supplied' } .into { ch_read_files_fastqc; ch_read_files_trimming } + } else { Channel .from(params.input_paths) @@ -174,11 +191,39 @@ if (params.input_paths) { } } else { Channel - .fromFilePairs(params.input, size: params.single_end ? 1 : 2) + .fromFilePairs( params.input, size: params.single_end ? 1 : 2 ) .ifEmpty { exit 1, "Cannot find any reads matching: ${params.input}\nNB: Path needs to be enclosed in quotes!\nIf this is single-end data, please specify --single_end on the command line." 
} .into { ch_read_files_fastqc; ch_read_files_trimming } } +if (params.aligner == 'biscuit') { + if (params.epiread) { + assert params.blacklist || params.whitelist : "Cannot find any blacklist/whitelist file matching: ${params.whitelist}\nEither whitelist or blacklist are needed if \'--epiread\' is specified" + + if (params.whitelist) { + Channel + .fromPath(params.whitelist, checkIfExists: true) + .ifEmpty { exit 1, "Cannot find any whitelist file matching: ${params.whitelist}" } + .into { ch_whitelist_for_SNP; ch_whitelist_for_epiread } + } + else { + Channel + .fromPath(params.blacklist, checkIfExists: true) + .ifEmpty { exit 1, "Cannot find any blacklist file matching: ${params.blacklist}" } + .set { ch_blacklist_for_create_whitelist } + } + + if (params.common_dbsnp) { + Channel + .fromPath(params.common_dbsnp, checkIfExists: true) + .ifEmpty { exit 1, "Cannot find any dbSNP file matching: ${params.common_dbsnp}\n" } + .set { ch_commonSNP_for_SNP } + } + + } else { + ch_fasta_for_create_whitelist.close() + } +} //////////////////////////////////////////////////// /* -- PRINT PARAMETER SUMMARY -- */ //////////////////////////////////////////////////// @@ -197,6 +242,7 @@ if(params.local_alignment) summary['Local alignment'] = 'Yes' if(params.genome) summary['Genome'] = params.genome if(params.bismark_index) summary['Bismark Index'] = params.bismark_index if(params.bwa_meth_index) summary['BWA-Meth Index'] = "${params.bwa_meth_index}*" +if(params.bwa_biscuit_index)summary['BWA Index'] = "${params.bwa_biscuit_index}*" if(params.fasta) summary['Fasta Ref'] = params.fasta if(params.fasta_index) summary['Fasta Index'] = params.fasta_index if(params.rrbs) summary['RRBS Mode'] = 'On' @@ -222,28 +268,34 @@ if(params.save_reference) save_intermeds.add('Reference genome build') if(params.save_trimmed) save_intermeds.add('Trimmed FastQ files') if(params.unmapped) save_intermeds.add('Unmapped reads') if(params.save_align_intermeds) save_intermeds.add('Intermediate BAM 
files') +if(params.save_snp_file) save_intermeds.add('SNP bed-files') if(save_intermeds.size() > 0) summary['Save Intermediates'] = save_intermeds.join(', ') +debug_mode = []; +if(params.debug_epiread) debug_mode.add('Debug epiread step') +if(params.debug_epiread_merging) debug_mode.add('Debug epiread merging') +if(debug_mode.size() > 0) summary['Debug mode'] = debug_mode.join(', ') if(params.minins) summary['Bismark min insert size'] = bismark_minins if(params.maxins || params.em_seq) summary['Bismark max insert size'] = bismark_maxins if(params.bismark_align_cpu_per_multicore) summary['Bismark align CPUs per --multicore'] = params.bismark_align_cpu_per_multicore if(params.bismark_align_mem_per_multicore) summary['Bismark align memory per --multicore'] = params.bismark_align_mem_per_multicore -summary['Output dir'] = params.outdir -summary['Launch dir'] = workflow.launchDir -summary['Working dir'] = workflow.workDir -summary['Pipeline dir'] = workflow.projectDir -summary['User'] = workflow.userName -summary['Config Profile'] = workflow.profile +if(params.assets_dir) summary['Assets Directory'] = params.assets_dir +if(params.whitelist) summary['Whitelist'] = params.whitelist +if(params.blacklist) summary['Blacklist'] = params.blacklist +if(params.common_dbsnp) summary['Common SNP'] = params.common_dbsnp +if(params.epiread) summary['Epiread'] = 'Yes' +summary['Output dir'] = params.outdir +summary['Launch dir'] = workflow.launchDir +summary['Working dir'] = workflow.workDir +summary['Pipeline dir'] = workflow.projectDir +summary['User'] = workflow.userName +summary['Config Profile'] = workflow.profile if (workflow.containerEngine) summary['Container'] = "$workflow.containerEngine - $workflow.container" if (workflow.profile.contains('awsbatch')) { - summary['AWS Region'] = params.awsregion - summary['AWS Queue'] = params.awsqueue - summary['AWS CLI'] = params.awscli + summary['AWS Region'] = params.awsregion + summary['AWS Queue'] = params.awsqueue + summary['AWS 
CLI'] = params.awscli } summary['Max Resources'] = "$params.max_memory memory, $params.max_cpus cpus, $params.max_time time per job" -summary['Config Profile'] = workflow.profile -if (params.config_profile_description) summary['Config Profile Description'] = params.config_profile_description -if (params.config_profile_contact) summary['Config Profile Contact'] = params.config_profile_contact -if (params.config_profile_url) summary['Config Profile URL'] = params.config_profile_url summary['Config Files'] = workflow.configFiles.join(', ') if (params.email || params.email_on_fail) { summary['E-mail Address'] = params.email @@ -308,10 +360,19 @@ process get_software_versions { bwa &> v_bwa.txt 2>&1 || true bwameth.py --version &> v_bwameth.txt picard MarkDuplicates --version &> v_picard_markdups.txt 2>&1 || true + picard CreateSequenceDictionary --version &> v_picard_createseqdict.txt 2>&1 || true + picard CollectInsertSizeMetrics --version &> v_picard_collectinssize.txt 2>&1 || true + picard CollectGcBiasMetrics --version &> v_picard_collectgcbias.txt 2>&1 || true MethylDackel --version &> v_methyldackel.txt qualimap --version &> v_qualimap.txt || true preseq &> v_preseq.txt multiqc --version &> v_multiqc.txt + samblaster --version &> v_samblaster.txt + biscuit &>v_biscuit.txt 2>&1 || true + bcftools --version &> v_bcftools.txt + bedtools --version &> v_bedtools.txt + parallel --version &> v_parallel.txt + gawk --version > v_gawk.txt scrape_software_versions.py &> software_versions_mqc.yaml """ } @@ -323,7 +384,6 @@ if( !params.bismark_index && params.aligner =~ /bismark/ ){ process makeBismarkIndex { publishDir path: { params.save_reference ? "${params.outdir}/reference_genome" : params.outdir }, saveAs: { params.save_reference ? 
it : null }, mode: params.publish_dir_mode - input: file fasta from ch_fasta_for_makeBismarkIndex @@ -363,10 +423,39 @@ } } +/* + * PREPROCESSING - Build bwa-biscuit + */ +if(!params.bwa_biscuit_index && params.aligner == 'biscuit' ){ + process makeBwaBISCUITIndex { + tag "$fasta" + publishDir path: "${params.outdir}/reference_genome", saveAs: { params.save_reference ? it : null }, mode: params.publish_dir_mode + + // input: + // file fasta from ch_fasta_for_makeBwaMemIndex + + // output: + // file "${fasta}*" into ch_bwa_index_for_biscuit + input: + path fasta, stageAs: 'fasta*' from ch_fasta_for_makeBwaMemIndex + + output: + file fasta into ch_bwa_index_for_biscuit + + script: + """ + mkdir BiscuitIndex + cp $fasta BiscuitIndex/ + biscuit index $fasta + cp ${fasta}* BiscuitIndex + """ + } +} + /* * PREPROCESSING - Index Fasta file */ -if( !params.fasta_index && params.aligner == 'bwameth' ){ +if( !params.fasta_index && params.aligner == 'bwameth' || !params.fasta_index && params.aligner == 'biscuit' ){ process makeFastaIndex { tag "$fasta" publishDir path: "${params.outdir}/reference_genome", saveAs: { params.save_reference ? it : null }, mode: params.publish_dir_mode @@ -375,7 +464,7 @@ if( !params.fasta_index && params.aligner == 'bwameth' ){ file fasta from ch_fasta_for_makeFastaIndex output: - file "${fasta}.fai" into ch_fasta_index_for_methyldackel + file "${fasta}.fai" into ch_fasta_index_for_methyldackel,ch_fasta_index_for_biscuitQC,ch_fasta_index_for_create_VCF,ch_fasta_for_create_whitelist,ch_fasta_index_for_epiread script: """ @@ -384,6 +473,30 @@ if( !params.fasta_index && params.aligner == 'bwameth' ){ } } +/* + * PREPROCESSING - Build Biscuit QC assets + */ +if( !params.assets_dir && params.aligner == 'biscuit' ) { + process build_biscuit_QC_assets { + tag "$fasta" + publishDir path: "${params.outdir}/reference_assets", saveAs: { params.save_reference ? 
it : null }, mode: params.publish_dir_mode + + input: + file fasta from ch_fasta_for_build_biscuit_QC_assets + + output: + file "*assets" into ch_assets_dir_for_biscuit_qc, ch_assets_dir_with_cpg_for_epiread + + + script: + assembly = fasta.toString().replaceAll(/\.\w+/,"") + + """ + build_biscuit_QC_assets.pl -r $fasta -o ${assembly}_assets + """ + } +} + /* * STEP 1 - FastQC @@ -485,7 +598,7 @@ if( params.aligner =~ /bismark/ ){ file knownsplices from ch_splicesites_for_bismark_hisat_align.collect().ifEmpty([]) output: - set val(name), file("*.bam") into ch_bam_for_bismark_deduplicate, ch_bam_for_bismark_summary, ch_bam_for_preseq + set val(name), file("*.bam") into ch_bam_for_bismark_deduplicate, ch_bam_for_bismark_summary, ch_bam_for_samtools_sort_index_flagstat set val(name), file("*report.txt") into ch_bismark_align_log_for_bismark_report, ch_bismark_align_log_for_bismark_summary, ch_bismark_align_log_for_multiqc file "*.fq.gz" optional true file "where_are_my_files.txt" @@ -499,6 +612,7 @@ if( params.aligner =~ /bismark/ ){ // Optional extra bismark parameters splicesites = params.aligner == "bismark_hisat" && params.known_splices ? "--known-splicesite-infile <(hisat2_extract_splice_sites.py ${knownsplices})" : '' + pbat = params.pbat ? "--pbat" : '' non_directional = params.single_cell || params.zymo || params.non_directional ? "--non_directional" : '' unmapped = params.unmapped ? 
"--unmapped" : '' @@ -553,10 +667,42 @@ if( params.aligner =~ /bismark/ ){ } /* - * STEP 4 - Bismark deduplicate + * STEP 4 - Samtools sort bismark + */ + process samtools_sort_index_flagstat_bismark { + tag "$name" + publishDir "${params.outdir}/samtools", mode: params.publish_dir_mode, + saveAs: {filename -> + if(filename.indexOf("report.txt") > 0) "logs/$filename" + else if( (!params.save_align_intermeds && !params.skip_deduplication && !params.rrbs).every() && filename == "where_are_my_files.txt") filename + else if( (params.save_align_intermeds || params.skip_deduplication || params.rrbs).any() && filename != "where_are_my_files.txt") filename + else null + } + + input: + set val(name), file(bam) from ch_bam_for_samtools_sort_index_flagstat + file wherearemyfiles from ch_wherearemyfiles_for_bismark_samtools_sort.collect() + + output: + set val(name), file("*.sorted.bam") into ch_bam_for_preseq,ch_bam_sorted_for_picard + file "where_are_my_files.txt" + + script: + def avail_mem = task.memory ? ((task.memory.toGiga() - 6) / task.cpus).trunc() : false + def sort_mem = avail_mem && avail_mem > 2 ? 
"-m ${avail_mem}G" : '' + """ + samtools sort $bam \\ + -@ ${task.cpus} $sort_mem \\ + -o ${bam.baseName}.sorted.bam + """ + } + + + /* + * STEP 5 - Bismark deduplicate */ if( params.skip_deduplication || params.rrbs ) { - ch_bam_for_bismark_deduplicate.into { ch_bam_dedup_for_bismark_methXtract; ch_bam_dedup_for_qualimap } + ch_bam_for_bismark_deduplicate.into { ch_bam_dedup_for_bismark_methXtract; ch_dedup_bam_for_samtools_sort_index_flagstat } ch_bismark_dedup_log_for_bismark_report = Channel.from(false) ch_bismark_dedup_log_for_bismark_summary = Channel.from(false) ch_bismark_dedup_log_for_multiqc = Channel.from(false) @@ -570,7 +716,7 @@ if( params.aligner =~ /bismark/ ){ set val(name), file(bam) from ch_bam_for_bismark_deduplicate output: - set val(name), file("*.deduplicated.bam") into ch_bam_dedup_for_bismark_methXtract, ch_bam_dedup_for_qualimap + set val(name), file("*.deduplicated.bam") into ch_bam_dedup_for_bismark_methXtract, ch_dedup_bam_for_samtools_sort_index_flagstat set val(name), file("*.deduplication_report.txt") into ch_bismark_dedup_log_for_bismark_report, ch_bismark_dedup_log_for_bismark_summary, ch_bismark_dedup_log_for_multiqc script: @@ -581,6 +727,37 @@ if( params.aligner =~ /bismark/ ){ } } + /* + * STEP 6 - Samtools sort bismark after dedup + */ + process samtools_sort_index_flagstat_dedup_bismark { + tag "$name" + publishDir "${params.outdir}/samtools", mode: params.publish_dir_mode, + saveAs: {filename -> + if(filename.indexOf("report.txt") > 0) "logs/$filename" + else if( (!params.save_align_intermeds && !params.skip_deduplication && !params.rrbs).every() && filename == "where_are_my_files.txt") filename + else if( (params.save_align_intermeds || params.skip_deduplication || params.rrbs).any() && filename != "where_are_my_files.txt") filename + else null + } + + input: + set val(name), file(bam) from ch_dedup_bam_for_samtools_sort_index_flagstat + file wherearemyfiles from ch_wherearemyfiles_for_bismark_dedup_samtools_sort.collect() 
+ + output: + set val(name), file("*.sorted.bam") into ch_bam_sorted_dedup_for_qualimap + file "where_are_my_files.txt" + + script: + def avail_mem = task.memory ? ((task.memory.toGiga() - 6) / task.cpus).trunc() : false + def sort_mem = avail_mem && avail_mem > 2 ? "-m ${avail_mem}G" : '' + """ + samtools sort $bam \\ + -@ ${task.cpus} $sort_mem \\ + -o ${bam.baseName}.sorted.bam + """ + } + /* * STEP 5 - Bismark methylation extraction */ @@ -712,6 +889,7 @@ else { ch_bismark_mbias_for_multiqc = Channel.from(false) ch_bismark_reports_results_for_multiqc = Channel.from(false) ch_bismark_summary_results_for_multiqc = Channel.from(false) + } @@ -734,7 +912,7 @@ if( params.aligner == 'bwameth' ){ file wherearemyfiles from ch_wherearemyfiles_for_bwamem_align.collect() output: - set val(name), file('*.bam') into ch_bam_for_samtools_sort_index_flagstat, ch_bam_for_preseq + set val(name), file('*.bam') into ch_bam_for_samtools_sort_index_flagstat file "where_are_my_files.txt" script: @@ -767,7 +945,7 @@ if( params.aligner == 'bwameth' ){ file wherearemyfiles from ch_wherearemyfiles_for_samtools_sort_index_flagstat.collect() output: - set val(name), file("${bam.baseName}.sorted.bam") into ch_bam_sorted_for_markDuplicates + set val(name), file("${bam.baseName}.sorted.bam") into ch_bam_sorted_for_markDuplicates,ch_bam_for_preseq, ch_bam_sorted_for_picard set val(name), file("${bam.baseName}.sorted.bam.bai") into ch_bam_index file "${bam.baseName}_flagstat_report.txt" into ch_flagstat_results_for_multiqc file "${bam.baseName}_stats_report.txt" into ch_samtools_stats_results_for_multiqc @@ -790,7 +968,7 @@ if( params.aligner == 'bwameth' ){ * STEP 5 - Mark duplicates */ if( params.skip_deduplication || params.rrbs ) { - ch_bam_sorted_for_markDuplicates.into { ch_bam_dedup_for_methyldackel; ch_bam_dedup_for_qualimap } + ch_bam_sorted_for_markDuplicates.into { ch_bam_dedup_for_methyldackel; ch_bam_sorted_dedup_for_qualimap } ch_bam_index.set { ch_bam_index_for_methyldackel } 
ch_markDups_results_for_multiqc = Channel.from(false) } else { @@ -803,7 +981,7 @@ if( params.aligner == 'bwameth' ){ set val(name), file(bam) from ch_bam_sorted_for_markDuplicates output: - set val(name), file("${bam.baseName}.markDups.bam") into ch_bam_dedup_for_methyldackel, ch_bam_dedup_for_qualimap + set val(name), file("${bam.baseName}.markDups.bam") into ch_bam_dedup_for_methyldackel, ch_bam_sorted_dedup_for_qualimap set val(name), file("${bam.baseName}.markDups.bam.bai") into ch_bam_index_for_methyldackel //ToDo check if this correctly overrides the original channel file "${bam.baseName}.markDups_metrics.txt" into ch_markDups_results_for_multiqc @@ -829,7 +1007,7 @@ if( params.aligner == 'bwameth' ){ } /* - * STEP 6 - extract methylation with MethylDackel + * STEP 6 - Extract methylation with MethylDackel */ process methyldackel { @@ -867,18 +1045,373 @@ else { ch_samtools_stats_results_for_multiqc = Channel.from(false) ch_markDups_results_for_multiqc = Channel.from(false) ch_methyldackel_results_for_multiqc = Channel.from(false) + } /* - * STEP 8 - Qualimap + * Process with BISCUIT and assorted tools (samblaster) + */ +if( params.aligner == 'biscuit' ){ + process biscuit_align { + tag "$name" + publishDir "${params.outdir}/biscuit_alignments", mode: params.publish_dir_mode, + saveAs: {filename -> + if( !params.save_align_intermeds && filename == "where_are_my_files.txt" ) filename + else if( params.save_align_intermeds && filename != "where_are_my_files.txt" ) filename + else null + } + + input: + set val(name), file(reads) from ch_trimmed_reads_for_alignment + file bwa_indices from ch_bwa_index_for_biscuit.collect() + file wherearemyfiles from ch_wherearemyfiles_for_biscuit_align.collect() + + output: + set val(name), file('*.bam') into ch_bam_for_markDuplicates, ch_bam_for_samtools_sort_index_flagstat + file "where_are_my_files.txt" + + script: + fasta = bwa_indices[0].toString() - '.bwameth' - '.c2t' - '.amb' - '.ann' - '.bwt' - '.pac' - '.sa' - '.fai' 
- '.par' - '.dau' -'.bis' + assembly = fasta.replaceAll(/\.\w+/,"") + prefix = reads[0].toString() - ~/(_R1)?(_trimmed)?(_val_1)?(\.fq)?(\.fastq)?(\.gz)?(\.bz2)?$/ + + non_directional = params.single_cell || params.zymo || params.non_directional ? 0 : 1 + // Paired-end or single-end input files and pbat or not + input = params.pbat ? params.single_end ? reads + " -b 3" : "${reads[1]} ${reads[0]} -b " + non_directional : reads.toString() +" -b " + non_directional + + """ + biscuit align -M -t ${task.cpus} $fasta $input | samtools view -Sb > ${name}.${assembly}.bam + """ + } + + /* + * STEP 4 - Mark duplicates + */ + if( params.skip_deduplication || params.rrbs ) { + ch_bam_for_markDuplicates.into { ch_samblaster_for_samtools_sort_index_flagstat } + ch_samblaster_for_multiqc = Channel.from(false) + } else { + process markDuplicates_samblaster { + tag "$name" + + publishDir "${params.outdir}", mode: params.publish_dir_mode, + saveAs: {filename -> + if( filename.indexOf("log") > 0 ) "biscuit_markDuplicates/$filename" + else null + } + + input: + set val(name), file(bam) from ch_bam_for_markDuplicates + file wherearemyfiles from ch_wherearemyfiles_for_samblaster.collect() + + output: + set val(name), file("${bam.baseName}.samblaster.bam") into ch_samblaster_for_samtools_sort_index_flagstat + file "*log" into ch_samblaster_for_multiqc + + script: + def avail_mem = task.memory ? ((task.memory.toGiga() - 6) / task.cpus).trunc() : false + def sort_mem = avail_mem && avail_mem > 2 ? "-m ${avail_mem}G" : '' + unmapped = params.single_end ? 
'--ignoreUnmated' : '' + + """ + samtools sort -n $bam \\ + -@ ${task.cpus} $sort_mem | \\ + samtools view -h | \\ + samblaster -M $unmapped \\ + -d "${bam.baseName}_discordant.sam" \\ + -s "${bam.baseName}_split.sam" \\ + -u "${bam.baseName}_.fastq" \\ + --excludeDups --addMateTags | \\ + samtools view -Sb > ${bam.baseName}.samblaster.bam + cp .command.log ${bam.baseName}.log + """ + } + } + + /* + * STEP 5.- Samtools flagstat on samples + */ + process samtools_sort_index_flagstat_biscuit { + tag "$name" + publishDir "${params.outdir}", mode: params.publish_dir_mode, + saveAs: {filename -> + if(filename.indexOf("report.txt") > 0) "biscuit_alignments/logs/$filename" + else if( (params.save_align_intermeds || params.skip_deduplication || params.rrbs).any() && filename.indexOf("sorted.bam") > 0) "biscuit_alignments/$filename" + else if( (!params.save_align_intermeds && !params.rrbs).every() && filename == "where_are_my_files.txt") filename + else if( (params.save_align_intermeds || params.skip_deduplication || params.rrbs).any() && filename != "where_are_my_files.txt") filename + else null + } + + input: + set val(name), file(samblaster_bam) from ch_samblaster_for_samtools_sort_index_flagstat + file wherearemyfiles from ch_wherearemyfiles_for_samtools_sort_index_flagstat.collect() + + output: + set val(name), file("*.sorted.bam") into ch_bam_sorted_dedup_for_qualimap,ch_bam_for_preseq,ch_bam_sorted_for_pileup, ch_bam_sorted_for_epiread, ch_bam_noDups_for_QC,ch_bam_sorted_for_picard + set val(name), file ("*.sorted.bam.bai") into ch_bam_index_sorted_for_pileup,ch_bam_index_for_epiread,ch_bam_index_noDups_for_QC + file "${samblaster_bam.baseName}_flagstat_report.txt" into ch_flagstat_results_biscuit_for_multiqc + file "${samblaster_bam.baseName}_stats_report.txt" into ch_samtools_stats_results_biscuit_for_multiqc + file "where_are_my_files.txt" + + + script: + def avail_mem = task.memory ? 
((task.memory.toGiga() - 6) / task.cpus).trunc() : false + def sort_mem = avail_mem && avail_mem > 2 ? "-m ${avail_mem}G" : '' + """ + samtools sort $samblaster_bam \\ + -@ ${task.cpus} $sort_mem -l 9 \\ + -o ${samblaster_bam.baseName}.sorted.bam + samtools index ${samblaster_bam.baseName}.sorted.bam + samtools flagstat ${samblaster_bam.baseName}.sorted.bam > ${samblaster_bam.baseName}_flagstat_report.txt + samtools stats ${samblaster_bam.baseName}.sorted.bam > ${samblaster_bam.baseName}_stats_report.txt + """ + } + + + /* + * STEP 6 - Create vcf file with pileup, to extract methylation + */ + process create_VCF { + tag "$name" + publishDir "${params.outdir}/methylation_extract", mode: params.publish_dir_mode, + saveAs: {filename -> + if( !params.save_align_intermeds && filename == "where_are_my_files.txt") filename + else if( filename.indexOf("vcf.gz") > 0 && params.save_align_intermeds && filename != "where_are_my_files.txt") filename + else null + } + + input: + set val(name), file(bam), file (bam_index) from ch_bam_sorted_for_pileup.join(ch_bam_index_sorted_for_pileup) + file fasta from ch_fasta_for_pileup.collect() + file fasta_index from ch_fasta_index_for_create_VCF.collect() + + output: + set val(name), file("${name}.vcf.gz*") into ch_vcf_biscuit_qc ,ch_vcf_for_bedgraph,ch_vcf_for_epiread + + script: + filter_duplication = params.skip_deduplication || params.rrbs ? '-u' : '' + """ + biscuit pileup -q ${task.cpus} $filter_duplication $fasta ${bam} -o ${name}.vcf + bgzip -@ ${task.cpus} -f ${name}.vcf + tabix -f -p vcf ${name}.vcf.gz + """ + } + + /* + * STEP 7 - Create bedgraph file from vcf + */ + process create_Bedgraph { + tag "$name" + publishDir "${params.outdir}/methylation_extract", mode: params.publish_dir_mode + + input: + set val(name), file(vcf) from ch_vcf_for_bedgraph + + output: + set val(name), file("*bedgraph" ) into ch_bedgraph_for_intersect_soloWCGW + + script: + min_depth = params.min_depth > 1 ? 
"${params.min_depth}" : '1' + all_contexts = params.comprehensive ? 'c, cg, ch, hcg, gch' : 'cg' + """ + biscuit vcf2bed -k $min_depth -t $all_contexts "${vcf[0]}" > "${name}.bedgraph" + """ + } + + if (params.epiread) { + if (params.common_dbsnp) { + /* + * STEP 7.1 - Reformat SNP table for SNP file generation + */ + process reformat_SNP { + + input: + file commonSNP_file from ch_commonSNP_for_SNP.collect() + + output: + file("reformattedSNP.snv.txt.gz*" ) into ch_reformattedSNP_for_SNP + + script: + """ + less $commonSNP_file | $projectDir/bin/processUcscDbsnp.pl | grep snv | bgzip > reformattedSNP.snv.txt.gz + tabix -s 1 -b 2 -e 3 reformattedSNP.snv.txt.gz + """ + } + } + else { + ch_reformattedSNP_for_SNP = Channel.empty() + } + + + /* + * STEP 7.2 - Create whitelist for SNP calling + */ + if ( !params.whitelist) { + process create_whitelist { + tag "$blacklist" + publishDir path: "${params.outdir}/reference_genome", saveAs: { params.save_reference ? it : null }, mode: params.publish_dir_mode + + input: + file blacklist from ch_blacklist_for_create_whitelist + file fasta_index from ch_fasta_for_create_whitelist + + output: + file("whitelist.${name}.bed.gz" ) into ch_whitelist_for_SNP, ch_whitelist_for_epiread + file "sizes.${name}" + script: + name = fasta_index.getSimpleName() // - '.fa' - '.fai' + + """ + cut -f1,2 $fasta_index > sizes.${name} + bedtools sort -g sizes.${name} -i $blacklist > ${blacklist.baseName}.sorted.bed + bedtools complement -i ${blacklist.baseName}.sorted.bed -g sizes.${name} | grep -v _ | bgzip > whitelist.${name}.bed.gz + """ + } + } + else { + ch_fasta_for_create_whitelist.close() + } + /* + * STEP 7.3 - SNP file generation for the epiread conversion + */ + process get_SNP_file { + tag "$name" + publishDir "${params.outdir}/epireads/snp", mode: params.publish_dir_mode, + saveAs: {filename -> + if( filename.indexOf("bed") > 0 && params.save_snp_file && filename != "where_are_my_files.txt") filename + else null + } + + input: + set 
val(name), file(vcf) from ch_vcf_for_epiread + file whitelist_file from ch_whitelist_for_SNP.collect() + file reformatted_SNP from ch_reformattedSNP_for_SNP.collect().ifEmpty([]) + + output: + set val(name), file ("${name}.snp.bed") into ch_snp_for_epiread + file "*gz" + + script: + whitelist = params.whitelist ? "-R $whitelist_file" : '' + snp_file = (reformatted_SNP.size()>0) ? "-a ${reformatted_SNP[0]}" : '' + """ + bcftools annotate $whitelist -O z ${snp_file} -h $projectDir/assets/common_dbsnp.hdr -c CHROM,FROM,TO,TYPE,COMMON_SOME,COMMON_ALL,REF_MIN,ALT_MIN,REF_DBSNP,ALT_DBSNP,REF_ALL,ALT_ALL,RSID,MAX_MAF "${vcf[0]}" > "${name}-whitelist-dbSNP.vcf.gz" + tabix -p vcf "${name}-whitelist-dbSNP.vcf.gz" + bcftools view -O z -i'ALT!="N" & ALT!="." & ( (COUNT(GT=="0/1")>=1 & COMMON_ALL==1 & MAX_MAF>=0.05) | (COUNT(GT=="0/1" & GQ>=60)>=1) )' "${name}-whitelist-dbSNP.vcf.gz" > "${name}-whitelist-dbSNP-HET60.vcf.gz" + tabix -p vcf "${name}-whitelist-dbSNP-HET60.vcf.gz" + bcftools query -u -i'GT="0/1" & GQ>=10' --format '%CHROM\t%POS\t%POS\t%REF\t%ALT[\t%GT\t%GQ\t%SP\t%AC\t%AF1]\t%RSID\t%COMMON_ALL\t%MAX_MAF\t%REF_MIN\t%ALT_MIN\n' "${name}-whitelist-dbSNP-HET60.vcf.gz" | awk -v OFS="\t" '{\$2 = \$2 - 1; print}' > "${name}.snp.bed" + """ + } + + /* + * STEP 7.4 - Convert bam to epiread file format + */ + process epiread_conversion { + tag "$name" + publishDir "${params.outdir}/epireads", mode: params.publish_dir_mode, + saveAs: {filename -> + if( params.debug_epiread && filename != "where_are_my_files.txt") filename + else if( filename.indexOf("original") < 0 ) filename + else null + } + + input: + set val(name), + file(bam), + file(bam_index), + file(snp), + file(fasta), + file(fasta_index), + file(whitelist) from ch_bam_sorted_for_epiread + .join(ch_bam_index_for_epiread) + .join(ch_snp_for_epiread) + .combine(ch_fasta_for_epiread) + .combine(ch_fasta_index_for_epiread) + .combine(ch_whitelist_for_epiread) + file (assets) from 
ch_assets_dir_with_cpg_for_epiread.collect() + + output: + file "*${name}.e*.gz*" + file "${name}.original.epiread.*" optional true + + script: + snp_file = (snp.size()>0) ? "-B " + snp.toString() : '' + cpg_file = assets.toString() + "/cpg.bed.gz" + debug_merging_epiread = (params.debug_epiread_merging || params.debug_epiread) ? "debug" : '' + no_filter_reverse = params.rrbs ? "-p" : '' + if (params.single_end) { + """ + bedtools intersect -abam $bam -b $whitelist -ubam -f 1.0 | samtools view -Sb - > ${name}.bam + samtools index ${name}.bam + biscuit epiread -q ${task.cpus} $snp_file $no_filter_reverse $fasta ${name}.bam |sort --parallel=${task.cpus} -T . -k1,1Vf -k5,5n | bgzip > ${name}.epiread.gz + tabix -0 -s 1 -b 5 -e 5 ${name}.epiread.gz + """ + } else { + """ + zcat $cpg_file > cpg.bed + + bedtools intersect -abam $bam -b $whitelist -ubam -f 1.0 | samtools view -Sb - > ${name}.bam + samtools index ${name}.bam + biscuit epiread -q ${task.cpus} $snp_file $fasta ${name}.bam | sort --parallel=${task.cpus} -T . -k2,2 -k1,1 -k4,4 -k3,3n > ${name}.original.epiread + less ${name}.original.epiread | $projectDir/bin/epiread_pairedEnd_conversion "cpg.bed" $snp ${name}.epiread $debug_merging_epiread > ${name}.err + sort -k1,1Vf -k 2,2n -k 3,3n --parallel=${task.cpus} -T . ${name}.epiread | bgzip > ${name}.epiread.gz + sort -k1,1Vf -k5,5n --parallel=${task.cpus} -T . ${name}.err | bgzip > ${name}.err.gz + sort -k1,1Vf -k5,5n --parallel=${task.cpus} -T . 
${name}.original.epiread | bgzip > ${name}.original.epiread.gz + tabix -0 -s 1 -b 5 -e 5 ${name}.original.epiread.gz + tabix -0 -p bed ${name}.epiread.gz + tabix -0 -s 1 -b 5 -e 5 ${name}.err.gz + """ + } + } + } + + /* + * STEP 8 - Running QC of samples + */ + process biscuit_QC { + tag "$name" + publishDir "${params.outdir}/biscuit_QC", mode: params.publish_dir_mode + + input: + set val(name), + file(vcf), + file(bam), + file(fasta), + file(fasta_index), + file(assets) from ch_vcf_biscuit_qc + .join(ch_bam_noDups_for_QC) + .combine(ch_fasta_for_biscuitQC) + .combine(ch_fasta_index_for_biscuitQC) + .combine(ch_assets_dir_for_biscuit_qc) + + output: + file "*_biscuitQC" into ch_QC_results_for_multiqc + + script: + assembly = fasta.toString().replaceAll(/\.\w+/,"") + """ + QC.sh -v ${vcf[0]} -o ${name}.${assembly}_biscuitQC $assets $fasta ${name}.${assembly} ${bam} + """ + } + +} // end of biscuit if block +else { + ch_flagstat_results_biscuit_for_multiqc = Channel.from(false) + ch_samtools_stats_results_biscuit_for_multiqc = Channel.from(false) + ch_QC_results_for_multiqc = Channel.from(false) + ch_samblaster_for_multiqc = Channel.from(false) +} + +/* + * STEP 9 - Qualimap */ process qualimap { tag "$name" publishDir "${params.outdir}/qualimap", mode: params.publish_dir_mode input: - set val(name), file(bam) from ch_bam_dedup_for_qualimap + set val(name), file(bam) from ch_bam_sorted_dedup_for_qualimap output: file "${bam.baseName}_qualimap" into ch_qualimap_results_for_multiqc @@ -886,14 +1419,9 @@ process qualimap { script: gcref = params.genome.toString().startsWith('GRCh') ? '-gd HUMAN' : '' gcref = params.genome.toString().startsWith('GRCm') ? '-gd MOUSE' : '' - def avail_mem = task.memory ? ((task.memory.toGiga() - 6) / task.cpus).trunc() : false - def sort_mem = avail_mem && avail_mem > 2 ? 
"-m ${avail_mem}G" : '' """ - samtools sort $bam \\ - -@ ${task.cpus} $sort_mem \\ - -o ${bam.baseName}.sorted.bam qualimap bamqc $gcref \\ - -bam ${bam.baseName}.sorted.bam \\ + -bam ${bam.baseName}.bam \\ -outdir ${bam.baseName}_qualimap \\ --collect-overlap-pairs \\ --java-mem-size=${task.memory.toGiga()}G \\ @@ -901,8 +1429,101 @@ process qualimap { """ } + + /* + * STEP 10 - Picard - Preparation step + */ +process prepare_genome_to_picard { + publishDir path: { params.save_reference ? "${params.outdir}/reference_genome" : params.outdir }, + saveAs: { (params.save_reference && it.indexOf("dict") >0) ? it : null }, mode: params.publish_dir_mode + + input: + file fasta from ch_fasta_for_picard + output: + file "${fasta.baseName}.picard.fa" into ch_fasta_picard_for_picard + file "${fasta.baseName}.picard.dict" into ch_fasta_picard_dict_for_picard + + script: + if( !task.memory ){ + log.info "[Picard MarkDuplicates] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this." 
+ avail_mem = 3 + } else { + avail_mem = task.memory.toGiga() + } + """ + mv ${fasta} ${fasta.baseName}.picard.fa + picard -Xmx${avail_mem}g CreateSequenceDictionary \\ + R=${fasta.baseName}.picard.fa \\ + O=${fasta.baseName}.picard.dict + """ +} + + + + /* + * STEP 11 - Picard InsertSizeMetrics and GcBiasMetrics + */ +process picard_metrics { + tag "$name" + publishDir "${params.outdir}/picardMetrics", mode: params.publish_dir_mode, + saveAs: { filename -> + if (filename.indexOf(".txt") > 0) filename + else if (filename.indexOf(".pdf") > 0) "pdf/$filename" + else null + } + input: + set val(name), file(bam) from ch_bam_sorted_for_picard + file fasta from ch_fasta_picard_for_picard.collect() + file dict from ch_fasta_picard_dict_for_picard.collect() + + output: + file "${name}.*.pdf" + file "${name}.*.txt" into ch_picard_results_for_multiqc + + script: + if( !task.memory ){ + log.info "[Picard MarkDuplicates] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this." + avail_mem = 3 + } else { + avail_mem = task.memory.toGiga() + } + """ + picard -Xmx${avail_mem}g CollectInsertSizeMetrics \\ + INPUT=$bam \\ + OUTPUT=${name}.insert_size_metrics.txt \\ + HISTOGRAM_FILE=${name}.insert_size_histogram.pdf \\ + ASSUME_SORTED=true \\ + VALIDATION_STRINGENCY=LENIENT + set +e + picard -Xmx${avail_mem}g CollectGcBiasMetrics \\ + INPUT=$bam \\ + OUTPUT=${name}.gc_bias_metrics.txt \\ + CHART=${name}.gc_bias_metrics.pdf \\ + SUMMARY_OUTPUT=${name}.summary_metrics.txt \\ + ASSUME_SORTED=true \\ + IS_BISULFITE_SEQUENCED=true \\ + REFERENCE_SEQUENCE=$fasta \\ + VALIDATION_STRINGENCY=LENIENT + [ ! "\$?" -eq "0" ] && picard -Xmx${avail_mem}g ReorderSam \\ + I=$bam O=${bam.baseName}.picard.bam \\ + SEQUENCE_DICTIONARY=$fasta \\ + VALIDATION_STRINGENCY=LENIENT \\ + TMP_DIR=. 
&& \\ + picard -Xmx${avail_mem}g CollectGcBiasMetrics \\ + INPUT=${bam.baseName}.picard.bam \\ + OUTPUT=${name}.gc_bias_metrics.txt \\ + CHART=${name}.gc_bias_metrics.pdf \\ + SUMMARY_OUTPUT=${name}.summary_metrics.txt \\ + ASSUME_SORTED=true \\ + IS_BISULFITE_SEQUENCED=true \\ + REFERENCE_SEQUENCE=$fasta \\ + VALIDATION_STRINGENCY=LENIENT + echo "fine" + """ +} + /* - * STEP 9 - preseq + * STEP 12 - preseq */ process preseq { tag "$name" @@ -915,18 +1536,14 @@ process preseq { file "${bam.baseName}.ccurve.txt" into preseq_results script: - def avail_mem = task.memory ? ((task.memory.toGiga() - 6) / task.cpus).trunc() : false - def sort_mem = avail_mem && avail_mem > 2 ? "-m ${avail_mem}G" : '' """ - samtools sort $bam \\ - -@ ${task.cpus} $sort_mem \\ - -o ${bam.baseName}.sorted.bam - preseq lc_extrap -v -B ${bam.baseName}.sorted.bam -o ${bam.baseName}.ccurve.txt + preseq lc_extrap -v -B ${bam.baseName}.bam -o ${bam.baseName}.ccurve.txt """ + } /* - * STEP 10 - MultiQC + * STEP 13 - MultiQC */ process multiqc { publishDir "${params.outdir}/MultiQC", mode: params.publish_dir_mode @@ -944,10 +1561,15 @@ process multiqc { file ('bismark/*') from ch_bismark_summary_results_for_multiqc.collect().ifEmpty([]) file ('samtools/*') from ch_flagstat_results_for_multiqc.flatten().collect().ifEmpty([]) file ('samtools/*') from ch_samtools_stats_results_for_multiqc.flatten().collect().ifEmpty([]) + file ('samtools/*') from ch_flagstat_results_biscuit_for_multiqc.flatten().collect().ifEmpty([]) + file ('samtools/*') from ch_samtools_stats_results_biscuit_for_multiqc.flatten().collect().ifEmpty([]) file ('picard/*') from ch_markDups_results_for_multiqc.flatten().collect().ifEmpty([]) file ('methyldackel/*') from ch_methyldackel_results_for_multiqc.flatten().collect().ifEmpty([]) file ('qualimap/*') from ch_qualimap_results_for_multiqc.collect().ifEmpty([]) file ('preseq/*') from preseq_results.collect().ifEmpty([]) + file ('biscuit_QC/*') from 
ch_QC_results_for_multiqc.collect().ifEmpty([]) + file ('biscuit_markDuplicates/*') from ch_samblaster_for_multiqc.collect().ifEmpty([]) + file ('picardMetrics/*') from ch_picard_results_for_multiqc.collect().ifEmpty([]) file ('software_versions/*') from ch_software_versions_yaml_for_multiqc.collect() file workflow_summary from ch_workflow_summary.collectFile(name: "workflow_summary_mqc.yaml") @@ -966,12 +1588,12 @@ process multiqc { custom_config_file = params.multiqc_config ? "--config $mqc_custom_config" : '' """ multiqc -f $rtitle $rfilename $custom_config_file . \\ - -m custom_content -m picard -m qualimap -m bismark -m samtools -m preseq -m cutadapt -m fastqc + -m custom_content -m picard -m qualimap -m bismark -m samtools -m preseq -m cutadapt -m fastqc -m biscuit -m samblaster """ } /* - * STEP 11 - Output Description HTML + * STEP 14 - Output Description HTML */ process output_documentation { publishDir "${params.outdir}/pipeline_info", mode: params.publish_dir_mode @@ -993,7 +1615,6 @@ process output_documentation { * Completion e-mail notification */ workflow.onComplete { - // Set up the e-mail variables def subject = "[nf-core/methylseq] Successful: $workflow.runName" if (!workflow.success) { @@ -1104,7 +1725,6 @@ workflow.onComplete { checkHostname() log.info "-${c_purple}[nf-core/methylseq]${c_red} Pipeline completed with errors${c_reset}-" } - } workflow.onError { diff --git a/nextflow.config b/nextflow.config index 683c2bc69..56b24b269 100644 --- a/nextflow.config +++ b/nextflow.config @@ -55,6 +55,16 @@ params { bismark_align_mem_per_multicore = null publish_dir_mode = 'copy' + bwa_biscuit_index = false + whitelist = false + blacklist = false + common_dbsnp = false + save_snp_file = false + epiread = false + debug_epiread = false + debug_epiread_merging = false + assets_dir = false + // Boilerplate options multiqc_config = false email = false diff --git a/nextflow_schema.json b/nextflow_schema.json index 125da8f55..b3b24b8fd 100644 --- 
a/nextflow_schema.json +++ b/nextflow_schema.json @@ -55,15 +55,16 @@ "enum": [ "bismark", "bismark_hisat", - "bwameth" + "bwameth", + "biscuit" ], - "help_text": "The nf-core/methylseq package is actually two pipelines in one. The default workflow uses [Bismark](http://www.bioinformatics.babraham.ac.uk/projects/bismark/) with [Bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml) as alignment tool: unless specified otherwise, nf-core/methylseq will run this pipeline.\n\nSince bismark v0.21.0 it is also possible to use [HISAT2](https://ccb.jhu.edu/software/hisat2/index.shtml) as alignment tool. To run this workflow, invoke the pipeline with the command line flag `--aligner bismark_hisat`. HISAT2 also supports splice-aware alignment if analysis of RNA is desired (e.g. [SLAMseq](https://science.sciencemag.org/content/360/6390/800) experiments), a file containing a list of known splicesites can be provided with `--known_splices`.\n\nThe second workflow uses [BWA-Meth](https://github.com/brentp/bwa-meth) and [MethylDackel](https://github.com/dpryan79/methyldackel) instead of Bismark. To run this workflow, run the pipeline with the command line flag `--aligner bwameth`." + "help_text": "The nf-core/methylseq package is actually three pipelines in one. The default workflow uses [Bismark](http://www.bioinformatics.babraham.ac.uk/projects/bismark/) with [Bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml) as alignment tool: unless specified otherwise, nf-core/methylseq will run this pipeline.\n\nSince bismark v0.21.0 it is also possible to use [HISAT2](https://ccb.jhu.edu/software/hisat2/index.shtml) as alignment tool. To run this workflow, invoke the pipeline with the command line flag `--aligner bismark_hisat`. HISAT2 also supports splice-aware alignment if analysis of RNA is desired (e.g. 
[SLAMseq](https://science.sciencemag.org/content/360/6390/800) experiments), a file containing a list of known splicesites can be provided with `--known_splices`.\n\nThe second workflow uses [BWA-Meth](https://github.com/brentp/bwa-meth) and [MethylDackel](https://github.com/dpryan79/methyldackel) instead of Bismark. To run this workflow, run the pipeline with the command line flag `--aligner bwameth`.\n\nThe third workflow uses [BISCUIT](https://github.com/huishenlab/biscuit) and [samblaster](https://github.com/GregoryFaust/samblaster) for mark duplicates instead of Bismark. To run this workflow, run the pipeline with the command line flag `--aligner biscuit`." }, "comprehensive": { "type": "boolean", "description": "Output information for all cytosine contexts.", "fa_icon": "fas fa-arrows-alt", - "help_text": "By default, the pipeline only produces data for cytosine methylation states in CpG context. Specifying `--comprehensive` makes the pipeline give results for all cytosine contexts. Note that for large genomes (e.g. Human), these can be massive files. This is only recommended for small genomes (especially those that don't exhibit strong CpG context methylation specificity).\n\nIf specified, this flag instructs the Bismark methylation extractor to use the `--comprehensive` and `--merge_non_CpG` flags. This produces coverage files with information from about all strands and cytosine contexts merged into two files - one for CpG context and one for non-CpG context.\n\nIf using the bwa-meth workflow, the flag makes MethylDackel report CHG and CHH contexts as well." + "help_text": "By default, the pipeline only produces data for cytosine methylation states in CpG context. Specifying `--comprehensive` makes the pipeline give results for all cytosine contexts. Note that for large genomes (e.g. Human), these can be massive files. 
This is only recommended for small genomes (especially those that don't exhibit strong CpG context methylation specificity).\n\nIf specified, this flag instructs the Bismark methylation extractor to use the `--comprehensive` and `--merge_non_CpG` flags. This produces coverage files with information from about all strands and cytosine contexts merged into two files - one for CpG context and one for non-CpG context.\n\nIf using the bwa-meth workflow, the flag makes MethylDackel report CHG and CHH contexts as well.\nIf using the biscuit workflow, the flag extracts all cytosine contexts (C, CG, CH, HCG, GCH) from the VCF to the bed file, instead of only CG" }, "save_align_intermeds": { "type": "boolean", @@ -87,7 +88,7 @@ "type": "boolean", "fa_icon": "fas fa-outdent", "description": "Preset for working with PBAT libraries.", - "help_text": "Specify this parameter when working with PBAT _(Post Bisulfite Adapter Tagging)_ libraries.\n\nUsing this parameter sets the `--pbat` flag when aligning with Bismark. This tells Bismark to align complementary strands (the opposite of `--directional`).\n\nAdditionally, this is a trimming preset equivalent to `--clip_r1 6` `--clip_r2 9` `--three_prime_clip_r1 6` `--three_prime_clip_r2 9`" + "help_text": "Specify this parameter when working with PBAT _(Post Bisulfite Adapter Tagging)_ libraries.\n\nUsing this parameter sets the `--pbat` flag when aligning with Bismark. This tells Bismark to align complementary strands (the opposite of `--directional`). 
\nWhen using the BISCUIT aligner, it reverses the reads in paired-end, or sets the read to be the synthesized strand for single-end.\n\nAdditionally, this is a trimming preset equivalent to `--clip_r1 6` `--clip_r2 9` `--three_prime_clip_r1 6` `--three_prime_clip_r2 9`" }, "rrbs": { "type": "boolean", @@ -111,7 +112,7 @@ "type": "boolean", "fa_icon": "fas fa-cut", "description": "Trimming preset for single-cell bisulfite libraries.", - "help_text": "Equivalent to `--clip_r1 6` `--clip_r2 6` `--three_prime_clip_r1 6` `--three_prime_clip_r2 6`.\n\nAlso sets the `--non_directional` flag for Bismark." + "help_text": "Equivalent to `--clip_r1 6` `--clip_r2 6` `--three_prime_clip_r1 6` `--three_prime_clip_r2 6`.\n\nAlso sets the `--non_directional` flag for Bismark or BISCUIT." }, "accel": { "type": "boolean", @@ -135,7 +136,7 @@ "type": "boolean", "fa_icon": "fas fa-cut", "description": "Trimming preset for the Zymo kit.", - "help_text": "Equivalent to `--clip_r1 10` `--clip_r2 15` `--three_prime_clip_r1 10` `--three_prime_clip_r2 10`.\n\nAlso sets the `--non_directional` flag for Bismark." + "help_text": "Equivalent to `--clip_r1 10` `--clip_r2 15` `--three_prime_clip_r1 10` `--three_prime_clip_r2 10`.\n\nAlso sets the `--non_directional` flag for Bismark or for BISCUIT." } }, "fa_icon": "fas fa-prescription-bottle" @@ -156,7 +157,7 @@ "type": "string", "fa_icon": "fas fa-font", "description": "Path to FASTA genome file.", - "help_text": "If you have no genome reference available, the pipeline can build one using a FASTA file. This requires additional time and resources, so it's better to use a pre-build index if possible. You can use the command line option `--save_reference` to keep the generated references so that they can be added to your config and used again in the future.\n\nNote that the `bwa-meth` workflow always needs a FASTA file, for methylation calling." 
+ "help_text": "If you have no genome reference available, the pipeline can build one using a FASTA file. This requires additional time and resources, so it's better to use a pre-built index if possible. You can use the command line option `--save_reference` to keep the generated references so that they can be added to your config and used again in the future.\n\nNote that a FASTA file is always needed, for the Picard metrics creation." }, "fasta_index": { "type": "string", @@ -175,6 +176,12 @@ "help_text": "The base filename for a bwa-meth genome reference index. Only used when using the bwa-meth aligner.\n\nNote that this is not a complete path, but rather a common filename _base_. For example, if you have file paths such as `/path/to/ref/genome.fa.bwameth.c2t.bwt`, you should specify `/path/to/ref/genome.fa`.", "fa_icon": "far fa-dot-circle" }, + "bwa_biscuit_index": { + "type": "string", + "description": "biscuit-bwa index filename base", + "help_text": "The base filename for a biscuit-bwa genome reference index. Only used when using the biscuit aligner.\n\nNote that this is not a complete path, but rather a common filename _base_. For example, if you have file paths such as `/path/to/ref/genome.fa.bis.amb`, you should specify `/path/to/ref/genome.fa`.", + "fa_icon": "far fa-dot-circle" + }, + "save_reference": { "type": "boolean", "description": "Save reference(s) to results directory", @@ -244,12 +251,6 @@ "default": "", "fa_icon": "fas fa-circle", "properties": { - "non_directional": { - "type": "boolean", - "description": "Run alignment against all four possible strands.", - "help_text": "By default, Bismark assumes that libraries are directional and does not align against complementary strands. 
If your library prep was not directional, use `--non_directional` to align against all four possible strands.\n\nNote that the `--single_cell` and `--zymo` parameters both set the `--non_directional` workflow flag automatically.", - "fa_icon": "fas fa-exchange-alt" - }, "cytosine_report": { "type": "boolean", "description": "Output stranded cytosine report during Bismark's bismark_methylation_extractor step.", @@ -324,15 +325,9 @@ "bwa_meth_options": { "title": "bwa-meth options", "type": "object", - "description": "", + "description": "Parameters specific to the bwa-meth workflow", "default": "", "properties": { - "min_depth": { - "type": "integer", - "description": "Specify a minimum read coverage for MethylDackel to report a methylation call.", - "default": 0, - "fa_icon": "fas fa-angle-double-down" - }, "ignore_flags": { "type": "boolean", "description": "MethylDackel - ignore SAM flags", @@ -348,6 +343,86 @@ }, "fa_icon": "far fa-circle" }, + "biscuit_options": { + "title": "biscuit options", + "type": "object", + "description": "Parameters specific to the BISCUIT workflow", + "default": "", + "properties": { + "epiread": { + "type": "boolean", + "description": "Generate a BISCUIT Epi-read file for each sample.", + "help_text": "[Epiread](https://huishenlab.github.io/biscuit/epiread_format/) is a compact way of storing CpG retention pattern on the same read. This option will tell the biscuit workflow to generate Epi-read file for the sample, as well as all needed steps.", + "fa_icon": "fas fa-clipboard-list" + }, + "common_dbsnp": { + "type": "string", + "description": "Common dbSNP table of the relevant genome, for SNP filtration", + "help_text": "Common-dbSNP table that contains at least the following fields: chrom, chromStart, chromEnd, name, ref, altCount, alts, shiftBases, freqSourceCount, minorAlleleFreq, majorAllele, minorAllele, maxFuncImpact, class, ucscNotes [can be downloaded from UCSC]. 
This table is used for SNP filtration in the Epi-read file. Relevant only if `--epiread` is set.", + "fa_icon": "fas fa-chess-board" + }, + "whitelist": { + "type": "string", + "description": "Path to the file that is the complement of blacklist.", + "help_text": "The whitelist is needed for SNP file generation.\nThe whitelist will be created automatically if a blacklist is provided with `--blacklist`\nRelevant only if `--epiread` is set.", + "fa_icon": "far fa-list-alt" + }, + "blacklist": { + "type": "string", + "description": "Path to the file with problematic regions of the genome, known as blacklist.", + "help_text": "The blacklist is needed for whitelist creation, which is needed for SNP file generation.\nRelevant only if `--epiread` is set.", + "fa_icon": "fas fa-list-alt" + }, + "debug_epiread": { + "type": "boolean", + "description": "Debug epiread merging for paired end reads.", + "fa_icon": "fas fa-check-square", + "help_text": "By default, merging two adjacent rows of the read mates in Epi-read format when running with paired-end mode will not output the debug data (about the reference allele, the alternative allele and the SP data), and the original Epi-read file will not be saved to the results directory. Specify this flag (or set to true in your config file) to run the merging with debug data and copy the original Epi-read file to the results directory when complete. \n If you don't want to keep the original files, check `debug_epiread_merging`.\nRelevant only if `--epiread` is set.", + "hidden": true + }, + "debug_epiread_merging": { + "type": "boolean", + "description": "Debug epiread merging. Output merged epiread in debug mode.", + "fa_icon": "far fa-check-square", + "help_text": "By default, merging two adjacent rows of the read mates in Epi-read format when running with paired-end mode will not output the debug data (about the reference allele, the alternative allele and the SP data). 
Specify this flag (or set to true in your config file) to run the merging with the debug data.\nIf you want to keep the original Epi-read files, check `debug_epiread`\nRelevant only if `--epiread` is set.", + "hidden": true + }, + "assets_dir": { + "type": "string", + "description": "Path to assets directory for biscuit_QC", + "help_text": "Path to a directory containing the files needed for the biscuit-QC step\n> **NB** If none provided, will be generated automatically.", + "fa_icon": "fab fa-buffer" + }, + "save_snp_file": { + "type": "boolean", + "description": "Save SNP bed-file to results directory", + "help_text": "By default, the bed file with SNP information about the sample will not be saved to the results directory. Specify this flag (or set to true in your config file) to copy these files to the results directory when complete. Relevant only if `--epiread` is set.", + "fa_icon": "fas fa-save" + } + }, + "fa_icon": "far fa-circle" + }, + "common_options": { + "title": "common options", + "type": "object", + "description": "Parameters that are common between pipelines", + "default": "", + "properties": { + "min_depth": { + "type": "integer", + "description": "Specify a minimum read coverage for MethylDackel to report a methylation call in bwa-meth workflow, or a minimum read coverage for information extraction from the VCF file to bed file in BISCUIT workflow.", + "default": 0, + "fa_icon": "fas fa-angle-double-down" + }, + "non_directional": { + "type": "boolean", + "description": "Run alignment against all four possible strands.", + "help_text": "By default, Bismark and BISCUIT assume that libraries are directional and do not align against complementary strands. 
If your library prep was not directional, use `--non_directional` to align against all four possible strands.\n\nNote that the `--single_cell` and `--zymo` parameters both set the `--non_directional` workflow flag automatically.", + "fa_icon": "fas fa-exchange-alt" + } + }, + "fa_icon": "far fa-circle" + }, "skip_pipeline_steps": { "title": "Skip pipeline steps", "type": "object", @@ -568,6 +643,12 @@ }, { "$ref": "#/definitions/bwa_meth_options" + }, + { + "$ref": "#/definitions/biscuit_options" + }, + { + "$ref": "#/definitions/common_options" }, { "$ref": "#/definitions/skip_pipeline_steps"