Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,10 @@ nextflow run LabShengLi/nanome\
# Run NANOME for human data
nextflow run LabShengLi/nanome\
-profile test_human,[docker/singularity]

# Run NANOME for dorado call
nextflow run LabShengLi/nanome\
-profile test_dorado,singularity
```
Please note that above commands are integrated in our **CI/CD test cases**. Our GitHub will automatically test and report results on every commit and PRs (https://github.com/LabShengLi/nanome/actions).

Expand Down
34 changes: 34 additions & 0 deletions conf/examples/test_dorado.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
/*
* -------------------------------------------------
* Nextflow config file for CI human test case
* -------------------------------------------------
* Defines bundled input files and everything required
* to run a fast and simple test. Use as follows:
* nextflow run LabShengLi/nanome -profile test,<docker/singularity>
*/

params{
dsname = 'CIEcoli'
input = 'https://github.com/LabShengLi/nanome/raw/rev5/test_data/fast5_ecoli_v2.tar.gz'
genome = 'ecoli'

dorado = true
dorado_basecall_model = 'dna_r9.4.1_e8_fast@v3.4'
dorado_methcall_model = 'dna_r9.4.1_e8_fast@v3.4_5mCG@v0.1'

// processors = 2
// maxRetries = 5
// errorStrategy = 'ignore'
}

process {
cpus = null
memory = null
time = null
maxRetries = params.maxRetries

withName: 'ENVCHECK' {
// allow retry if download Rerio model failed
errorStrategy = {task.attempt >= process.maxRetries ? params.errorStrategy : 'retry' }
}
}
12 changes: 11 additions & 1 deletion main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -303,7 +303,7 @@ include { EVAL } from './modules/EVAL'

include { REPORT } from './modules/REPORT'

include { DORADO_UNTAR; DORADO_CALL; DORADO_QC } from './modules/DORADO'
include { DORADO_UNTAR; DORADO_CALL; DORADO_QC; DORADO_CALL_EXTRACT; UNIFY } from './modules/DORADO'

// place holder channel, used for empty file of a channel
null1 = Channel.fromPath("${projectDir}/utils/null1")
Expand Down Expand Up @@ -340,6 +340,16 @@ workflow {
DORADO_UNTAR(ch_inputs.collect())
DORADO_CALL(DORADO_UNTAR.out.untar, ENVCHECK.out.reference_genome)
DORADO_QC(DORADO_CALL.out.dorado_call, ENVCHECK.out.reference_genome)

bam_fn = "${params.dsname}.dorado_call/${params.dsname}.dorado_call.bam"
DORADO_CALL_EXTRACT("per_read", bam_fn,
DORADO_CALL.out.dorado_call, ENVCHECK.out.reference_genome,
ch_src, ch_utils)

UNIFY("Dorado","NANOME", "all",
DORADO_CALL_EXTRACT.out.dorado_call_extract,
ENVCHECK.out.reference_genome,
ch_src, ch_utils)
} else { // Guppy ecosystems
if (params.runBasecall) {
UNTAR(ch_inputs)
Expand Down
109 changes: 108 additions & 1 deletion modules/DORADO.nf
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ process DORADO_CALL {

echo untar_dir=!{untar_dir}
echo reference_genome=!{reference_genome}

``
dorado -vv

ls /models
Expand Down Expand Up @@ -143,3 +143,110 @@ process DORADO_QC {
echo "### Dorado QC all DONE"
'''
}


// extract BAM as per-read results
process DORADO_CALL_EXTRACT {
tag "${call_tagname}"

publishDir "${params.outdir}/${params.dsname}-methylation-callings/Raw_Results-${params.dsname}",
pattern: "${params.dsname}.${call_tagname}.dorado_call_extract.tsv.gz",
mode: "copy"

input:
val call_tagname // call's tagname
val bam_fn // BAM file location
path dorado_call // can be a folder contains files of BAM and BAM.BAI
path reference_genome
path ch_src
path ch_utils

output:
path "${params.dsname}.${call_tagname}.dorado_call_extract.tsv.gz", emit: dorado_call_extract, optional: true

when:
params.dorado

shell:
cores = task.cpus * params.highProcTimes
'''
date; hostname; pwd

echo !{call_tagname}
echo !{bam_fn}
echo !{dorado_call}

python utils/modbam2bed_extract_read_cpg.py \
-r !{params.referenceGenome} \
-i !{bam_fn} \
-o !{params.dsname}.!{call_tagname}.dorado_call_extract.tsv \
-a !{params.guppy_canon_threshold} -b !{params.guppy_mod_threshold}

gzip -f !{params.dsname}.!{call_tagname}.dorado_call_extract.tsv

echo "### Dorado call extract DONE"
'''
}


// extract BAM as per-read results
process UNIFY {
tag "${call_tagname}"

publishDir "${params.outdir}/${params.dsname}-methylation-callings",
mode: "copy"

input:
val tool_name // e.g., Dorado
val encode_name // e.g., NANOME
val call_tagname // e.g., all, H1, H2, chr1_22,etc.
path per_read_fn // per-read file
path reference_genome
path ch_src
path ch_utils

output:
path "Read_Level-${params.dsname}_${call_tagname}/*-perRead-score*.gz", emit: read_unify, optional: true
path "Site_Level-${params.dsname}_${call_tagname}/*-perSite-cov*.gz", emit: site_unify, optional: true
path "Site_Level-${params.dsname}_${call_tagname}/*.methylKit.CpG.txt.gz", emit: methylkit_unify, optional: true

when:
params.dorado

shell:
cores = task.cpus * params.highProcTimes
'''
date; hostname; pwd

echo !{per_read_fn}

per_read_fn=!{per_read_fn}
if [[ !{params.deduplicate} == true ]] ; then
echo "### Deduplicate for read-level outputs"
## sort order: Chr, Start, (End), ID, Strand
zcat !{per_read_fn} |\
sort -V -u -k2,2 -k3,3n -k1,1 -k4,4 |\
gzip -f > !{per_read_fn}.sort.tsv.gz

per_read_fn=!{per_read_fn}.sort.tsv.gz
fi

## Unify format output
echo "### generate read/site level results"
bash utils/unify_format_for_calls.sh \
!{params.dsname}_!{call_tagname} !{tool_name} !{encode_name} \
!{per_read_fn} \
. !{task.cpus} 12 !{params.sort ? true : false} "!{params.chrSet1.replaceAll(',', ' ')}"

## MethylKit format
site_fn="Site_Level-!{params.dsname}_!{call_tagname}/!{params.dsname}_!{call_tagname}_!{tool_name}-perSite-cov1.sort.bed.gz"
methylkit_fn="Site_Level-!{params.dsname}_!{call_tagname}/!{params.dsname}_!{call_tagname}_!{tool_name}.methylKit.CpG.txt.gz"

python utils/nanome2methylkit.py \
-i $site_fn \
-o $methylkit_fn \
--chrSet !{params.chrSet1.replaceAll(',', ' ')}

echo "### Unify DONE"
'''
}
6 changes: 4 additions & 2 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,8 @@ profiles {

test { includeConfig 'conf/examples/test.config' }

test_dorado { includeConfig 'conf/examples/test_dorado.config' }

test_human { includeConfig 'conf/examples/test_human.config' }

jax { includeConfig 'conf/executors/jaxhpc_input.config' }
Expand Down Expand Up @@ -289,7 +291,7 @@ profiles {
withName: 'DEEPSIGNAL2' {
container = params.deepsignal2_docker_name
}
withName: 'Guppy6' {
withName: 'Guppy6|DORADO_CALL_EXTRACT' {
container = params.guppy_stable_name
}
withName: 'DORADO_CALL' {
Expand Down Expand Up @@ -340,7 +342,7 @@ profiles {
container = params.deepsignal2_docker_name.startsWith("/") ?
params.deepsignal2_docker_name : "docker://${params.deepsignal2_docker_name}"
}
withName: 'Guppy6' {
withName: 'Guppy6|DORADO_CALL_EXTRACT' {
container = params.guppy_stable_name.startsWith("/") ?
params.guppy_stable_name : "docker://${params.guppy_stable_name}"
}
Expand Down
Binary file added test_data/fast5_ecoli_v2.tar.gz
Binary file not shown.
78 changes: 78 additions & 0 deletions utils/nanome2methylkit.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
#!/usr/bin/env python3
"""
Convert nanome site level output to methylkit format

Nanome site level input:

zcat GT23-09911_Guppy-perSite-cov1.sort.bed.gz| head
chr1 2714 2715 . . + 1.0 1
chr1 2715 2716 . . - 0.25 4
chr1 2722 2723 . . + 1.0 1

Output is base 1-base, used for methylkit's input data format

zcat GM24385_guppy_NanoMethPhase_HP1_methylkit_format.CpG.txt.gz| head
chrBase chr base strand coverage freqC freqT
chr1.2715 chr1 2715 F 1 100.0 0.0
chr1.2723 chr1 2723 F 1 100.0 0.0

Note: methylkit prefer 1-based input, since the output DMC will be correct, default input tsv is no header
"""

import argparse

import pandas as pd
import logging

# interested chrs
chrs = ['chr1', 'chr2', 'chr3', 'chr4', 'chr5',
'chr6', 'chr7', 'chr8', 'chr9', 'chr10',
'chr11', 'chr12', 'chr13', 'chr14', 'chr15',
'chr16', 'chr17', 'chr18', 'chr19', 'chr20',
'chr21', 'chr22', 'chrX', 'chrY']


def parse_arguments():
parser = argparse.ArgumentParser(prog='Convert nanome site level format to methylkit format')
parser.add_argument('-i', type=str, required=True,
help='input file')
parser.add_argument('-o', type=str, required=True,
help='output methylkit formated file')
parser.add_argument('--verbose', help="if output verbose info", action='store_true')
parser.add_argument('--chrSet', nargs='+', help='chromosome list, default is human chromosome chr1-22, X and Y',
default=chrs)
return parser.parse_args()


if __name__ == '__main__':
args = parse_arguments()
if args.verbose:
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')

else:
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

logging.debug(f"args={args}")

df = pd.read_csv(args.i, sep='\t', index_col=None, header=None)
df.columns = ['chromosome', 'pos0', 'pos1',
'place_hold1', 'place_hold2', 'strand',
'meth_freq', 'coverage']
df = df[df.chromosome.isin(args.chrSet)]

logging.debug(f"df={df}")

df['chrBase'] = df['chromosome'] + '.' + df['pos1'].astype(str)
df['chr'] = df['chromosome']
df['base'] = df['pos1'] # output base column is 1-based
df['strand'] = df['strand'].apply(lambda x: 'F' if x == '+' else 'R')
df = df[df['coverage'] > 0]

df['freqC'] = df['meth_freq'] * 100.0
df['freqT'] = 100.0 - df['freqC']

df = df[['chrBase', 'chr', 'base', 'strand', 'coverage', 'freqC', 'freqT']]
logging.debug(f"new df={df}")

df.to_csv(args.o, sep='\t', index=False)
logging.info(f"save to {args.o}")