diff --git a/README.md b/README.md index 9f0c7dcb..3ccd32ee 100755 --- a/README.md +++ b/README.md @@ -110,6 +110,10 @@ nextflow run LabShengLi/nanome\ # Run NANOME for human data nextflow run LabShengLi/nanome\ -profile test_human,[docker/singularity] + +# Run NANOME for dorado call +nextflow run LabShengLi/nanome\ + -profile test_dorado,singularity ``` Please note that above commands are integrated in our **CI/CD test cases**. Our GitHub will automatically test and report results on every commit and PRs (https://github.com/LabShengLi/nanome/actions). diff --git a/conf/examples/test_dorado.config b/conf/examples/test_dorado.config new file mode 100644 index 00000000..c3c93ccc --- /dev/null +++ b/conf/examples/test_dorado.config @@ -0,0 +1,34 @@ +/* + * ------------------------------------------------- + * Nextflow config file for CI human test case + * ------------------------------------------------- + * Defines bundled input files and everything required + * to run a fast and simple test. Use as follows: + * nextflow run LabShengLi/nanome -profile test, + */ + +params{ + dsname = 'CIEcoli' + input = 'https://github.com/LabShengLi/nanome/raw/rev5/test_data/fast5_ecoli_v2.tar.gz' + genome = 'ecoli' + + dorado = true + dorado_basecall_model = 'dna_r9.4.1_e8_fast@v3.4' + dorado_methcall_model = 'dna_r9.4.1_e8_fast@v3.4_5mCG@v0.1' + + // processors = 2 + // maxRetries = 5 + // errorStrategy = 'ignore' +} + +process { + cpus = null + memory = null + time = null + maxRetries = params.maxRetries + + withName: 'ENVCHECK' { + // allow retry if download Rerio model failed + errorStrategy = {task.attempt >= process.maxRetries ? params.errorStrategy : 'retry' } + } +} diff --git a/main.nf b/main.nf index 4145d5ed..a661c565 100755 --- a/main.nf +++ b/main.nf @@ -303,7 +303,7 @@ include { EVAL } from './modules/EVAL' include { REPORT } from './modules/REPORT' -include { DORADO_UNTAR; DORADO_CALL; DORADO_QC } from './modules/DORADO' +include { DORADO_UNTAR; DORADO_CALL; DORADO_QC; DORADO_CALL_EXTRACT; UNIFY } from './modules/DORADO' // place holder channel, used for empty file of a channel null1 = Channel.fromPath("${projectDir}/utils/null1") @@ -340,6 +340,16 @@ workflow { DORADO_UNTAR(ch_inputs.collect()) DORADO_CALL(DORADO_UNTAR.out.untar, ENVCHECK.out.reference_genome) DORADO_QC(DORADO_CALL.out.dorado_call, ENVCHECK.out.reference_genome) + + bam_fn = "${params.dsname}.dorado_call/${params.dsname}.dorado_call.bam" + DORADO_CALL_EXTRACT("per_read", bam_fn, + DORADO_CALL.out.dorado_call, ENVCHECK.out.reference_genome, + ch_src, ch_utils) + + UNIFY("Dorado","NANOME", "all", + DORADO_CALL_EXTRACT.out.dorado_call_extract, + ENVCHECK.out.reference_genome, + ch_src, ch_utils) } else { // Guppy ecosystems if (params.runBasecall) { UNTAR(ch_inputs) diff --git a/modules/DORADO.nf b/modules/DORADO.nf index 94af91cc..8166bae1 100644 --- a/modules/DORADO.nf +++ b/modules/DORADO.nf @@ -89,7 +89,7 @@ process DORADO_CALL { echo untar_dir=!{untar_dir} echo reference_genome=!{reference_genome} - +`` dorado -vv ls /models @@ -143,3 +143,110 @@ process DORADO_QC { echo "### Dorado QC all DONE" ''' } + + +// extract BAM as per-read results +process DORADO_CALL_EXTRACT { + tag "${call_tagname}" + + publishDir "${params.outdir}/${params.dsname}-methylation-callings/Raw_Results-${params.dsname}", + pattern: "${params.dsname}.${call_tagname}.dorado_call_extract.tsv.gz", + mode: "copy" + + input: + val call_tagname // call's tagname + val bam_fn // BAM file location + path dorado_call // can be a folder contains files of BAM and BAM.BAI + path reference_genome + path ch_src + path ch_utils + + output: + path "${params.dsname}.${call_tagname}.dorado_call_extract.tsv.gz", emit: dorado_call_extract, optional: true + + when: + params.dorado + + shell: + cores = task.cpus * params.highProcTimes + ''' + date; hostname; pwd + + echo !{call_tagname} + echo !{bam_fn} + echo !{dorado_call} + + python utils/modbam2bed_extract_read_cpg.py \ + -r !{params.referenceGenome} \ + -i !{bam_fn} \ + -o !{params.dsname}.!{call_tagname}.dorado_call_extract.tsv \ + -a !{params.guppy_canon_threshold} -b !{params.guppy_mod_threshold} + + gzip -f !{params.dsname}.!{call_tagname}.dorado_call_extract.tsv + + echo "### Dorado call extract DONE" + ''' +} + + +// extract BAM as per-read results +process UNIFY { + tag "${call_tagname}" + + publishDir "${params.outdir}/${params.dsname}-methylation-callings", + mode: "copy" + + input: + val tool_name // e.g., Dorado + val encode_name // e.g., NANOME + val call_tagname // e.g., all, H1, H2, chr1_22,etc. + path per_read_fn // per-read file + path reference_genome + path ch_src + path ch_utils + + output: + path "Read_Level-${params.dsname}_${call_tagname}/*-perRead-score*.gz", emit: read_unify, optional: true + path "Site_Level-${params.dsname}_${call_tagname}/*-perSite-cov*.gz", emit: site_unify, optional: true + path "Site_Level-${params.dsname}_${call_tagname}/*.methylKit.CpG.txt.gz", emit: methylkit_unify, optional: true + + when: + params.dorado + + shell: + cores = task.cpus * params.highProcTimes + ''' + date; hostname; pwd + + echo !{per_read_fn} + + per_read_fn=!{per_read_fn} + if [[ !{params.deduplicate} == true ]] ; then + echo "### Deduplicate for read-level outputs" + ## sort order: Chr, Start, (End), ID, Strand + zcat !{per_read_fn} |\ + sort -V -u -k2,2 -k3,3n -k1,1 -k4,4 |\ + gzip -f > !{per_read_fn}.sort.tsv.gz + + per_read_fn=!{per_read_fn}.sort.tsv.gz + fi + + ## Unify format output + echo "### generate read/site level results" + bash utils/unify_format_for_calls.sh \ + !{params.dsname}_!{call_tagname} !{tool_name} !{encode_name} \ + !{per_read_fn} \ + . !{task.cpus} 12 !{params.sort ? true : false} "!{params.chrSet1.replaceAll(',', ' ')}" + + ## MethylKit format + site_fn="Site_Level-!{params.dsname}_!{call_tagname}/!{params.dsname}_!{call_tagname}_!{tool_name}-perSite-cov1.sort.bed.gz" + methylkit_fn="Site_Level-!{params.dsname}_!{call_tagname}/!{params.dsname}_!{call_tagname}_!{tool_name}.methylKit.CpG.txt.gz" + + python utils/nanome2methylkit.py \ + -i $site_fn \ + -o $methylkit_fn \ + --chrSet !{params.chrSet1.replaceAll(',', ' ')} + + echo "### Unify DONE" + ''' +} diff --git a/nextflow.config b/nextflow.config index 44857e3e..f228c094 100755 --- a/nextflow.config +++ b/nextflow.config @@ -260,6 +260,8 @@ profiles { test { includeConfig 'conf/examples/test.config' } + test_dorado { includeConfig 'conf/examples/test_dorado.config' } + test_human { includeConfig 'conf/examples/test_human.config' } jax { includeConfig 'conf/executors/jaxhpc_input.config' } @@ -289,7 +291,7 @@ profiles { withName: 'DEEPSIGNAL2' { container = params.deepsignal2_docker_name } - withName: 'Guppy6' { + withName: 'Guppy6|DORADO_CALL_EXTRACT' { container = params.guppy_stable_name } withName: 'DORADO_CALL' { @@ -340,7 +342,7 @@ profiles { container = params.deepsignal2_docker_name.startsWith("/") ? params.deepsignal2_docker_name : "docker://${params.deepsignal2_docker_name}" } - withName: 'Guppy6' { + withName: 'Guppy6|DORADO_CALL_EXTRACT' { container = params.guppy_stable_name.startsWith("/") ? params.guppy_stable_name : "docker://${params.guppy_stable_name}" } diff --git a/test_data/fast5_ecoli_v2.tar.gz b/test_data/fast5_ecoli_v2.tar.gz new file mode 100644 index 00000000..5e522beb Binary files /dev/null and b/test_data/fast5_ecoli_v2.tar.gz differ diff --git a/utils/nanome2methylkit.py b/utils/nanome2methylkit.py new file mode 100644 index 00000000..7f3dea5d --- /dev/null +++ b/utils/nanome2methylkit.py @@ -0,0 +1,78 @@ +#!/usr/bin/env python3 +""" +Convert nanome site level output to methylkit format + +Nanome site level input: + +zcat GT23-09911_Guppy-perSite-cov1.sort.bed.gz| head +chr1 2714 2715 . . + 1.0 1 +chr1 2715 2716 . . - 0.25 4 +chr1 2722 2723 . . + 1.0 1 + +Output is base 1-base, used for methylkit's input data format + +zcat GM24385_guppy_NanoMethPhase_HP1_methylkit_format.CpG.txt.gz| head +chrBase chr base strand coverage freqC freqT +chr1.2715 chr1 2715 F 1 100.0 0.0 +chr1.2723 chr1 2723 F 1 100.0 0.0 + +Note: methylkit prefer 1-based input, since the output DMC will be correct, default input tsv is no header +""" + +import argparse + +import pandas as pd +import logging + +# interested chrs +chrs = ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', + 'chr6', 'chr7', 'chr8', 'chr9', 'chr10', + 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', + 'chr16', 'chr17', 'chr18', 'chr19', 'chr20', + 'chr21', 'chr22', 'chrX', 'chrY'] + + +def parse_arguments(): + parser = argparse.ArgumentParser(prog='Convert nanome site level format to methylkit format') + parser.add_argument('-i', type=str, required=True, + help='input file') + parser.add_argument('-o', type=str, required=True, + help='output methylkit formated file') + parser.add_argument('--verbose', help="if output verbose info", action='store_true') + parser.add_argument('--chrSet', nargs='+', help='chromosome list, default is human chromosome chr1-22, X and Y', + default=chrs) + return parser.parse_args() + + +if __name__ == '__main__': + args = parse_arguments() + if args.verbose: + logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s') + + else: + logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') + + logging.debug(f"args={args}") + + df = pd.read_csv(args.i, sep='\t', index_col=None, header=None) + df.columns = ['chromosome', 'pos0', 'pos1', + 'place_hold1', 'place_hold2', 'strand', + 'meth_freq', 'coverage'] + df = df[df.chromosome.isin(args.chrSet)] + + logging.debug(f"df={df}") + + df['chrBase'] = df['chromosome'] + '.' + df['pos1'].astype(str) + df['chr'] = df['chromosome'] + df['base'] = df['pos1'] # output base column is 1-based + df['strand'] = df['strand'].apply(lambda x: 'F' if x == '+' else 'R') + df = df[df['coverage'] > 0] + + df['freqC'] = df['meth_freq'] * 100.0 + df['freqT'] = 100.0 - df['freqC'] + + df = df[['chrBase', 'chr', 'base', 'strand', 'coverage', 'freqC', 'freqT']] + logging.debug(f"new df={df}") + + df.to_csv(args.o, sep='\t', index=False) + logging.info(f"save to {args.o}")