From b2bc2e0b09e1f256b5a1184d5922c92493308a7c Mon Sep 17 00:00:00 2001 From: xiaoli-dong Date: Tue, 30 Jan 2024 08:17:36 -0700 Subject: [PATCH] update --- bin/GBS-SBG.pl | 501 ++++++++++++++++++ bin/combine_xml.py | 3 +- bin/xml2csv.py | 61 +++ conf/modules.config | 35 +- conf/modules_illumina.config | 2 + modules/local/csvtk/concat/main.nf | 40 ++ modules/local/csvtk/concat/meta.yml | 51 ++ modules/local/fastp/main.nf | 6 +- modules/local/gbs/sbg/main.nf | 43 ++ modules/local/hostile/main.nf | 8 +- .../pneumocat/environment.yml | 0 modules/local/pneumocat/main.nf | 73 +++ modules/{nf-core => local}/pneumocat/meta.yml | 0 modules/local/shovill/main.nf | 3 +- modules/local/tbprofiler/collate/main.nf | 3 +- modules/local/tbprofiler/profile/main.nf | 2 +- modules/nf-core/pneumocat/main.nf | 50 -- .../tbprofiler/profile/environment.yml | 7 - modules/nf-core/tbprofiler/profile/main.nf | 41 -- modules/nf-core/tbprofiler/profile/meta.yml | 58 -- nextflow.config | 7 +- nextflow_schema.json | 124 +++-- subworkflows/local/prepare_references.nf | 17 +- subworkflows/local/special_tools.nf | 56 +- workflows/illumina.nf | 53 +- workflows/nanopore.nf | 53 +- 26 files changed, 1053 insertions(+), 244 deletions(-) create mode 100644 bin/GBS-SBG.pl create mode 100644 bin/xml2csv.py create mode 100644 modules/local/csvtk/concat/main.nf create mode 100644 modules/local/csvtk/concat/meta.yml create mode 100644 modules/local/gbs/sbg/main.nf rename modules/{nf-core => local}/pneumocat/environment.yml (100%) create mode 100644 modules/local/pneumocat/main.nf rename modules/{nf-core => local}/pneumocat/meta.yml (100%) delete mode 100644 modules/nf-core/pneumocat/main.nf delete mode 100644 modules/nf-core/tbprofiler/profile/environment.yml delete mode 100644 modules/nf-core/tbprofiler/profile/main.nf delete mode 100644 modules/nf-core/tbprofiler/profile/meta.yml diff --git a/bin/GBS-SBG.pl b/bin/GBS-SBG.pl new file mode 100644 index 0000000..14f08ae --- /dev/null +++ b/bin/GBS-SBG.pl @@ -0,0 +1,501 @@ +#!/usr/bin/env perl +# +# Script for assembly-based calling of Group B Streptococcus serotypes for WGS data +# +# Source data should always be up-to-date at https://github.com/swainechen/GBS-SBG +# +use warnings; +use strict; +use Cwd; +use File::Copy; +use File::Spec; +use File::Fetch; +use File::Temp; +use File::Basename; +use IPC::Open3; +use Getopt::Long; +Getopt::Long::Configure("pass_through"); + +my $BLASTN_BIN = ""; +my $MIN_BLASTN_VERSION = "2.7.0"; +my $BLASTN_COMMAND; +my $GITHUB_REPO = "https://github.com/swainechen/GBS-SBG"; +my $GITHUB_REF_FASTA = "https://raw.githubusercontent.com/swainechen/GBS-SBG/main/GBS-SBG.fasta"; +my $REF_FASTA_BASE = "GBS-SBG.fasta"; +my $REF_FASTA_FULL = ""; +my $USE_STDIN = 0; +my $ORIGDIR = File::Spec->rel2abs(getcwd); +my $TEMPDIR; +my $INFILE; +my $BLASTOUT; +my $NAME = ""; +# this hash ref holds the cutoff values for reporting uncertainty +# these are different from the cutoffs used for making a serotype call, which +# are generally 90% identity over 90% coverage +my $UNCERTAINTY = { 'MAX_NEXT' => 0.95, + 'MIN_COV' => 0.99, + 'MIN_PID' => 99, # this is 0-100 as per BLASTN output + 'MAX_HITS' => 1, + 'MAX_CONTIGS' => 1, + 'MIN_NEXT_PRINT' => 0.9 # this is for when to not even + # bother printing the next best + }; +my ($d, $i, $t); +my (@f, @g); +my @index; +my $ff; +my (@start, @end, @id); +my (@start_c, @end_c, @id_c); +my $contig_count; +my $contig_sum; +my $serotype; + +# keyed on serotype, then should have SCORE, LENGTH, COVERAGE, FOOTPRINT, ID, HITS, NEXTBEST, COVERAGES_CONTIG +my $sdata; + +my @uncertainty; +my $contig; +my ($pid, $chld_in, $chld_out, $chld_err); +my ($max_cov, $max_id); +my $pass; +my $show_help = 0; +my $DEBUG = 0; +my $VERBOSE = 0; # show all hits +my $BEST_ONLY = 0; # better for automation if this is set + +# Some internal parameters +my $BLAST_MIN_PID = 90; # minimum percent ID to consider an individual hit +my $BLAST_MIN_LEN = 100; # minimum alignment length to consider an individual hit +my $TOTAL_MIN_COVERAGE = 0.9; # minimum fraction of the serotype to give a positive call + +GetOptions ( + 'help!' => \$show_help, + 'name=s' => \$NAME, + 'blast=s' => \$BLASTN_BIN, + 'ref=s' => \$REF_FASTA_FULL, + 'best!' => \$BEST_ONLY, + 'verbose!' => \$VERBOSE, + 'debug!' => \$DEBUG +); + +# figure out the input filename - mostly if it's on STDIN +if (defined $ARGV[0] && $ARGV[0] ne "" && $ARGV[0] ne "-") { + if (!-f $ARGV[0]) { + $show_help = 1; + print STDERR "Can't find input file $ARGV[0]\n\n"; + } else { + $INFILE = File::Spec->rel2abs($ARGV[0]); + if ($NAME eq "") { + $NAME = basename($INFILE); + } + } +} else { + # we'll try STDIN - set this here so we can check everything else first + $USE_STDIN = 1; + if ($NAME eq "") { + $NAME = "STDIN"; + } +} + +if ($show_help) { + &print_help; + exit; +} +$VERBOSE = 1 if $DEBUG; + +# make sure we have everything we need +if ($DEBUG) { + $TEMPDIR = File::Temp::tempdir( CLEANUP => 0 ); + print STDERR "[GBS-SBG INFO] Using temp directory $TEMPDIR\n"; +} else { + $TEMPDIR = File::Temp::tempdir( CLEANUP => 1 ); +} +$BLASTN_BIN = get_blastn($BLASTN_BIN); +$DEBUG && print STDERR "[GBS-SBG INFO] Found blastn $BLASTN_BIN\n"; +if (-f $REF_FASTA_BASE && $REF_FASTA_FULL eq "") { + $REF_FASTA_FULL = File::Spec->rel2abs($REF_FASTA_BASE); +} else { + if ($REF_FASTA_FULL eq "") { + # try a few guesses + if (eval "use FindBin; 1") { + use FindBin; + if (-f File::Spec->catfile($FindBin::Bin, $REF_FASTA_BASE)) { + $REF_FASTA_FULL = File::Spec->catfile($FindBin::Bin, $REF_FASTA_BASE); + } elsif (-f File::Spec->catfile($FindBin::Bin, "..", "lib", $REF_FASTA_BASE)) { + $REF_FASTA_FULL = File::Spec->catfile($FindBin::Bin, "..", "lib", $REF_FASTA_BASE); + } + } + } elsif (-d $REF_FASTA_FULL) { + if (-f File::Spec->catfile($REF_FASTA_FULL, $REF_FASTA_BASE)) { + $REF_FASTA_FULL = File::Spec->catfile($FindBin::Bin, $REF_FASTA_BASE); + } + } elsif (-f $REF_FASTA_FULL) { + $REF_FASTA_FULL = File::Spec->rel2abs($REF_FASTA_FULL); + } +} +if (-f $REF_FASTA_FULL) { + if (!-f "$REF_FASTA_FULL.nin") { + copy($REF_FASTA_FULL, File::Spec->catfile($TEMPDIR, $REF_FASTA_BASE)); + $REF_FASTA_FULL = File::Spec->catfile($TEMPDIR, $REF_FASTA_BASE); + chdir($TEMPDIR); + $pid = open3($chld_in, $chld_out, $chld_err, "makeblastdb -in $REF_FASTA_FULL -dbtype nucl"); + waitpid($pid, 0); + die("Some problem with running makeblastdb...exit $?\n") if $?; + } +} +if ($REF_FASTA_FULL eq "" || !-f $REF_FASTA_FULL) { + $ff = File::Fetch->new(uri => "$GITHUB_REF_FASTA"); + $REF_FASTA_FULL = $ff->fetch( to => $TEMPDIR ); + chdir($TEMPDIR); + $pid = open3($chld_in, $chld_out, $chld_err, "makeblastdb -in $REF_FASTA_FULL -dbtype nucl"); + waitpid($pid, 0); + die("Some problem with running makeblastdb...exit $?\n") if $?; +} +if (!-f $REF_FASTA_FULL) { + die("Couldn't find $REF_FASTA_BASE and couldn't download it; set the location with the -ref command line parameter\n"); +} +$DEBUG && print STDERR "[GBS-SBG INFO] Using reference file $REF_FASTA_FULL\n"; + +# run the blast +chdir($TEMPDIR); +if ($USE_STDIN) { + @f = <>; + open F, ">" . File::Spec->rel2abs(File::Spec->catfile($TEMPDIR, "STDIN_INPUT")); + print F @f; + close F; + $INFILE = File::Spec->rel2abs(File::Spec->catfile($TEMPDIR, "STDIN_INPUT")); +} +$BLASTOUT = File::Spec->rel2abs(File::Spec->catfile($TEMPDIR, "BLAST_OUTPUT")); +$BLASTN_COMMAND = "$BLASTN_BIN -db $REF_FASTA_FULL -query $INFILE -out $BLASTOUT -outfmt '6 qseqid sseqid pident length mismatch gapopen qlen qstart qend slen sstart send sstrand evalue bitscore'"; +$DEBUG && print STDERR "[GBS-SBG INFO] Running blast: $BLASTN_COMMAND\n"; +$pid = open3($chld_in, $chld_out, $chld_err, $BLASTN_COMMAND); +waitpid($pid, 0); +die ("Error $? with blast command...\n") if $?; + +# parse the results +open F, $BLASTOUT; +@f = ; +close F; +$d = (); +$DEBUG && print "== BLASTN Results ==\n", @f, "== END BLASTN Results ==\n"; + +# only take hits that are >90% identical +# data structure: $d->{Serotype}->{Query}->{i}->{SSTART} +# data structure: $d->{Serotype}->{Query}->{i}->{SEND} +# data structure: $d->{Serotype}->{Query}->{i}->{PID} +# data structure: $d->{Serotype}->{Query}->{i}->{LEN} +# data structure: $d->{Serotype}->{Query}->{i}->{BITSCORE} +# i is just a counter, i.e. line number in the blast output, can treat like a unique ID +# columns: +# 0 - qseqid +# 1 - sseqid +# 2 - pident (0-100) +# 3 - length +# 4 - mismatch +# 5 - gapopen +# 6 - qlen +# 7 - qstart +# 8 - qend +# 9 - slen +# 10 - sstart +# 11 - send +# 12 - sstrand +# 13 - evalue +# 14 - bitscore +foreach $i (0..$#f) { + chomp $f[$i]; + @g = split /\t/, $f[$i]; + next if ($g[2] <= $BLAST_MIN_PID); + next if ($g[3] <= $BLAST_MIN_LEN); + $serotype = parse_serotype($g[1]); + if (!defined $sdata->{$serotype}->{SCORE}) { + $sdata->{$serotype}->{SCORE} = 0; + $sdata->{$serotype}->{LENGTH} = $g[9]; + } + $sdata->{$serotype}->{SCORE} += $g[14]; # bitscore + $d->{$serotype}->{$g[0]}->{$i}->{PID} = $g[2]; + $d->{$serotype}->{$g[0]}->{$i}->{LEN} = $g[3]; + if ($g[12] eq "minus" || $g[12] eq "-" || $g[11] < $g[10]) { + $d->{$serotype}->{$g[0]}->{$i}->{SSTART} = $g[11]; + $d->{$serotype}->{$g[0]}->{$i}->{SEND} = $g[10]; + } else { + $d->{$serotype}->{$g[0]}->{$i}->{SSTART} = $g[10]; + $d->{$serotype}->{$g[0]}->{$i}->{SEND} = $g[11]; + } + $d->{$serotype}->{$g[0]}->{$i}->{BITSCORE} = $g[14]; +} +# main call is based on highest total bitscore, but have to check for >90% coverage +# collapse to highest ID per serotype to do some checking +foreach $serotype (keys %$sdata) { + $sdata->{$serotype}->{COVERAGE} = 0; + $sdata->{$serotype}->{CONTIGCOV} = (); + @start = (); + @end = (); + @id = (); + foreach $contig (keys %{$d->{$serotype}}) { + $t = $d->{$serotype}->{$contig}; + # we have to get per-contig data in this loop, then accumulate it for overall + @start_c = (); + @end_c = (); + @id_c = (); + foreach $i (sort {$t->{$a}->{SSTART} <=> $t->{$b}->{SSTART}} keys %{$t}) { + push @start_c, $t->{$i}->{SSTART}; + push @end_c, $t->{$i}->{SEND}; + push @id_c, $t->{$i}->{PID}; + push @start, $t->{$i}->{SSTART}; + push @end, $t->{$i}->{SEND}; + push @id, $t->{$i}->{PID}; + } + # merge from the back + foreach $i (reverse 1..$#start_c) { + # @start_c should already be sorted by start coordinates + if ($start_c[$i] <= $end_c[$i-1]) { + # aggregate the identity here + if ($id_c[$i-1] > $id_c[$i]) { + $id_c[$i-1] = ( ($end_c[$i-1] - $start_c[$i-1] + 1) * $id_c[$i-1] + + ($end_c[$i] - $end_c[$i-1]) * $id_c[$i] ) / + ( $end_c[$i] - $start_c[$i-1] + 1 ); + } else { + $id_c[$i-1] = ( ($start_c[$i] - $start_c[$i-1]) * $id_c[$i-1] + + ($end_c[$i] - $start_c[$i] + 1) * $id_c[$i] ) / + ( $end_c[$i] - $start_c[$i-1] + 1 ); + } + $end_c[$i-1] = $end_c[$i]; + splice @start_c, $i, 1; + splice @end_c, $i, 1; + splice @id_c, $i, 1; + } + } + # we should have nonoverlapping intervals now + foreach $i (0..$#start_c) { + $sdata->{$serotype}->{CONTIGCOV}->{$contig} += ($end_c[$i] - $start_c[$i] + 1) / $sdata->{$serotype}->{LENGTH}; + } + } + # need to re-sort the coordinates because we aggregated across contigs + @index = sort {$start[$a] <=> $start[$b]} 0..$#start; + @start = @start[@index]; + @end = @end[@index]; + @id = @id[@index]; + $sdata->{$serotype}->{HITS} = scalar @start; + # merge from the back + foreach $i (reverse 1..$#start) { + # @start should already be sorted by start coordinates + if ($start[$i] <= $end[$i-1]) { + if ($end[$i-1] < $end[$i]) { + # aggregate the identity here + if ($id[$i-1] > $id[$i]) { + $id[$i-1] = ( ($end[$i-1] - $start[$i-1] + 1) * $id[$i-1] + + ($end[$i] - $end[$i-1]) * $id[$i] ) / + ( $end[$i] - $start[$i-1] + 1 ); + } else { + $id[$i-1] = ( ($start[$i] - $start[$i-1]) * $id[$i-1] + + ($end[$i] - $start[$i] + 1) * $id[$i] ) / + ( $end[$i] - $start[$i-1] + 1 ); + } + $end[$i-1] = $end[$i]; + splice @start, $i, 1; + splice @end, $i, 1; + splice @id, $i, 1; + } else { + # interval $i is completely contained within interval $i-1 + # if $id[$i] > $id[$i-1] we could increase the overall identity...but + # this doesn't seem to make sense, since at this point interval $i-1 is + # a single blast hit, so instead just get rid of interval $i + splice @start, $i, 1; + splice @end, $i, 1; + splice @id, $i, 1; + } + } + } + # we should have nonoverlapping intervals now + foreach $i (0..$#start) { + $sdata->{$serotype}->{COVERAGE} += ($end[$i] - $start[$i] + 1); + $sdata->{$serotype}->{ID} += ($end[$i] - $start[$i] + 1) * $id[$i]; + } + $sdata->{$serotype}->{ID} /= $sdata->{$serotype}->{COVERAGE}; + $sdata->{$serotype}->{COVERAGE} /= $sdata->{$serotype}->{LENGTH}; +} +$DEBUG && print "[GBS-SBG INFO] Main output follows below:\n"; +if ($VERBOSE) { + print join ("\t", "# Name", "Serotype", "Total BitScore", "Total Coverage", "Percent ID", "Number of BLASTN Hits", "Number Contigs", "NextBest BitScore"), "\n"; +} else { + print join ("\t", "# Name", "Serotype", "Uncertainty"), "\n"; +} +# calculate the bitscore differential to the next best hit +$i = 0; +foreach $serotype (sort {$sdata->{$a}->{SCORE} <=> $sdata->{$b}->{SCORE}} keys %$sdata) { + $sdata->{$serotype}->{NEXTBEST} = $i/$sdata->{$serotype}->{SCORE}; + $i = $sdata->{$serotype}->{SCORE}; +} +# first ensure we don't have a nontypeable +$pass = 0; +$max_cov = 0; +$max_id = 0; +foreach $serotype (sort {$sdata->{$b}->{SCORE} <=> $sdata->{$a}->{SCORE}} keys %$sdata) { + if ($sdata->{$serotype}->{COVERAGE} >= $TOTAL_MIN_COVERAGE && + $sdata->{$serotype}->{ID} >= $BLAST_MIN_PID) { + $pass = 1; + } + $max_cov = $sdata->{$serotype}->{COVERAGE} if $max_cov < $sdata->{$serotype}->{COVERAGE}; + $max_id = $sdata->{$serotype}->{ID} if $max_id < $sdata->{$serotype}->{ID}; +} +if (!$pass) { + if ($VERBOSE) { + print join ("\t", $NAME, "NT", "0", "0", "0", "0", "0", "0"), "\n"; + } else { + print join ("\t", $NAME, "NT", join (";", "MaxCov:$max_cov", "MaxID:$max_id")), "\n"; + } +} + +# we will print in decreasing order of total bitscore +# $i here is used to count output lines if not debug (max 2) +$i = 0; +foreach $serotype (sort {$sdata->{$b}->{SCORE} <=> $sdata->{$a}->{SCORE}} keys %$sdata) { + # count # of contigs required + $contig_count = 0; + $contig_sum = 0; + foreach $contig (sort {$sdata->{$serotype}->{CONTIGCOV}->{$b} <=> $sdata->{$serotype}->{CONTIGCOV}->{$a}} keys %{$sdata->{$serotype}->{CONTIGCOV}}) { + $contig_sum += $sdata->{$serotype}->{CONTIGCOV}->{$contig}; + $contig_count++; + last if $contig_sum >= $sdata->{$serotype}->{COVERAGE}; + } + if ($VERBOSE) { + print join ("\t", $NAME, $serotype, $sdata->{$serotype}->{SCORE}, $sdata->{$serotype}->{COVERAGE}, $sdata->{$serotype}->{ID}, $sdata->{$serotype}->{HITS}, $contig_count, $sdata->{$serotype}->{NEXTBEST}), "\n"; + } else { + last if !$pass && $BEST_ONLY; + $i = 1 if !$pass && $i == 0; + print join ("\t", $NAME, $serotype); + @uncertainty = (); + if ($sdata->{$serotype}->{NEXTBEST} > $UNCERTAINTY->{MAX_NEXT}) { + push @uncertainty, "NextBitScore:$sdata->{$serotype}->{NEXTBEST}"; + } + if ($sdata->{$serotype}->{COVERAGE} < $UNCERTAINTY->{MIN_COV}) { + push @uncertainty, "Coverage:$sdata->{$serotype}->{COVERAGE}"; + } + if ($sdata->{$serotype}->{ID} < $UNCERTAINTY->{MIN_PID}) { + push @uncertainty, "Pidentity:$sdata->{$serotype}->{ID}"; + } + if ($sdata->{$serotype}->{HITS} > $UNCERTAINTY->{MAX_HITS}) { + push @uncertainty, "BLASTNHits:$sdata->{$serotype}->{HITS}"; + } + if ($contig_count > $UNCERTAINTY->{MAX_CONTIGS}) { + push @uncertainty, "Contigs:$contig_count"; + } + if (scalar @uncertainty) { + print "\t", join (";", @uncertainty), "\n"; + } else { + print "\n"; + } + $i++; + last if $BEST_ONLY; + last if ($i >= 2); + next if ($sdata->{$serotype}->{NEXTBEST} > $UNCERTAINTY->{MIN_NEXT_PRINT}); + last if ($sdata->{$serotype}->{COVERAGE} >= $TOTAL_MIN_COVERAGE && $sdata->{$serotype}->{ID} >= $BLAST_MIN_PID && $contig_count == 1); + } +} + +# +# subroutines below here +# +sub parse_serotype { + # assuming SRST2 format here + my ($s) = @_; + my @f = split /__/, $s; + if (defined $f[2]) { + return $f[2]; + } + return $s; +} + +sub get_blastn { + my ($init) = @_; + my $full = ""; + my $output = ""; + my $pid; + my ($chld_in, $chld_out, $chld_err); + my @h; + if ($init eq "") { + $full = "blastn"; + $pid = open3($chld_in, $chld_out, $chld_err, "$full -h"); + $output = join ("", <$chld_out>); + waitpid($pid, 0); + if ($?) { + die("Can't find blastn executable, can set this with the -blast command line parameter\n"); + } else { + if (check_blastn_version($output)) { + return($full); + } + } + } else { + $full = File::Spec->rel2abs($init); + if (-d $full) { + $full = File::Spec->catfile($full, "blastn"); + } + if (-f $full) { + $pid = open3($chld_in, $chld_out, $chld_err, "$full -h"); + $output = join ("", <$chld_out>); + waitpid($pid, 0); + if ($?) { + die ("Can't find blastn executable at $init, can set this with the -blast command line parameter"); + } + } + } + return($full); + + sub check_blastn_version { + my ($output) = @_; + my @h = split /\n/, $output; + my $i; + my $version_string; + my ($major, $minor, $patch); + my ($minmajor, $minminor, $minpatch) = split /\./, $MIN_BLASTN_VERSION; + for ($i=0; $i < scalar @h; $i++) { + last if $h[$i] =~ /^DESCRIPTION/; + } + if ($i < $#h) { + if ($h[$i+1] =~ /BLAST (\d+)\.(\d+)\.(\d+)\+?/) { + $version_string = $h[$i]; + $version_string =~ s/^.*BLAST //; + $major = $1; + $minor = $2; + $patch = $3; + if ($major > $minmajor) { + return (1); + } elsif ($major == $minmajor) { + if ($minor > $minminor) { + return (1); + } elsif ($minor == $minminor) { + if ($patch >= $minpatch) { + return (1); + } + } + } + die ("Need minimum blastn version $MIN_BLASTN_VERSION - could only find version $version_string - can set a custom blastn path with the -blast command line parameter\n"); + } + } + die ("Could not find blastn or couldn't determine the version - can set a custom blastn path with the -blastn command line parameter\n"); + } +} + +sub print_help { + print <<__HELP__; +Usage: $0 [ -name ] [ -best ] [ -blastn ] [ -ref ] [ -debug ] [ -verbose ] + + should be a regular multi-fasta file with assembled contigs or a complete genome. +You should specify the -name parameter, all output will be prefixed by that string. Defaults to the input filename. +The -best option only prints out one call (with possible uncertainty information). Default behavior is to also print the next best call if any of the following are true for the "best" serotype call: + - Overall coverage < $TOTAL_MIN_COVERAGE + - Overall percent ID < $BLAST_MIN_PID + - Number of contigs > 1 + - Number of BLASTN hits > 1 + - BitScore for next best serotype is >$UNCERTAINTY->{MIN_NEXT_PRINT}*BitScore for the best serotype + +Requires BLAST+ version $MIN_BLASTN_VERSION or above. Will look on your path, or you can specify the blastn binary with -blast. +Requires GBS-SBG references (typically $REF_FASTA_BASE), will try looking in a few places, otherwise will try to pull directly from $GITHUB_REPO. + +The -debug switch prints debugging info to STDERR. +The -verbose switch prints more information and outputs data for all hits, even if they don't pass the cutoffs. + +More complete documentation available at $GITHUB_REPO. +__HELP__ +} diff --git a/bin/combine_xml.py b/bin/combine_xml.py index 42dcd6d..98e4ba9 100644 --- a/bin/combine_xml.py +++ b/bin/combine_xml.py @@ -21,7 +21,8 @@ def main(): xml_element_tree = None for xml_file in xml_files: data = ET.tostring(ET.parse(xml_file).getroot()).decode("utf-8") - fout.write(data) + fout.write(data) + fout.write('\n') fout.close() diff --git a/bin/xml2csv.py b/bin/xml2csv.py new file mode 100644 index 0000000..dcaa511 --- /dev/null +++ b/bin/xml2csv.py @@ -0,0 +1,61 @@ +#!/usr/bin/env python + +import csv +import argparse +import xml.etree.ElementTree as ET + +def xml_to_csv(element, csv_writer): + # Extract element name and text + name = element.tag + text = element.text + + # Extract element attributes + attrib = element.attrib + + # Write element name and text as well as attributes to CSV file + row = [name, text] + list(attrib.values()) + csv_writer.writerow(row) + + # Recursively process child elements + for child in element: + xml_to_csv(child, csv_writer) + +def main(): + + description = "Combine multiple xml files into a single xml file" + parser = argparse.ArgumentParser(description=description) + + # help=f"Space seperated xml file list, for example: 'f1.xml f2.xml f3.xml'\n", + parser.add_argument('-i', "--input", required=True, help=f"space seperated xml file name list\n") + parser.add_argument("-o", "--output", required=True, default="combined.xml", help=f"Output file name\n") + + args = parser.parse_args() + + cols = ["name", "phone", "email", "date", "country"] + rows = [] + + tree = ET.parse(args.input) + root = tree.getroot() + + for result in root[1]: + sample_data = [] + #print(result.tag) + #print(result.attrib) + for detail in result: + #print(detail.tag) + #print(detail.attrib) + print(detail.attrib.get("type")) + print(detail.attrib.get("value")) + + + #Open CSV file for writing + with open(args.output, "w", newline="") as csv_file: + # Create CSV writer + csv_writer = csv.writer(csv_file) + + # Convert XML to CSV + xml_to_csv(root, csv_writer) + +if __name__ == "__main__": + main() + diff --git a/conf/modules.config b/conf/modules.config index 98e683a..ca7e4e3 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -72,6 +72,14 @@ if(!params.skip_illumina_reads_qc){ saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, ] } + withName: CSVTK_CONCAT_STATS_NOT_ASSEMBLED{ + publishDir = [ + path: { "${params.outdir}/report" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + + ] + } } if(params.illumina_reads_qc_tool == 'bbduk'){ @@ -276,7 +284,7 @@ if(! params.skip_tbprofiler){ if(! params.skip_pneumocat){ process{ withName: PNEUMOCAT { - ext.prefix = { "${meta.id}.pneumocat" } + //ext.prefix = { "${meta.id}.pneumocat" } ext.args = '--cleanup' publishDir = [ path: { "${params.outdir}/${meta.id}/pneumocat/illumina" }, @@ -628,7 +636,32 @@ if(! params.skip_emmtyper){ } } +if(! params.skip_gbssbg){ + process{ + withName: GBS_SBG { + ext.prefix = { "${meta.id}.gbssbg" } + + publishDir = [ + path: { "${params.outdir}/${meta.id}/gbssbg" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + enabled: true + ] + } + + withName: CSVTK_CONCAT_GBSSBG{ + ext.args = '-C \'$\' -I -E ' + publishDir = [ + path: { "${params.outdir}/report" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + enabled: true + ] + } + + } +} if (params.platform == 'nanopore') { includeConfig 'modules_nanopore.config' diff --git a/conf/modules_illumina.config b/conf/modules_illumina.config index f7111ff..7ecc0cb 100644 --- a/conf/modules_illumina.config +++ b/conf/modules_illumina.config @@ -12,6 +12,8 @@ if(!params.skip_illumina_reads_assembly){ process { + + withName: SKESA { publishDir = [ path: { "${params.outdir}/${meta.id}/assembly/illumina/skesa" }, diff --git a/modules/local/csvtk/concat/main.nf b/modules/local/csvtk/concat/main.nf new file mode 100644 index 0000000..7528c8f --- /dev/null +++ b/modules/local/csvtk/concat/main.nf @@ -0,0 +1,40 @@ +process CSVTK_CONCAT { + tag "$meta.id" + label 'process_low' + + conda "bioconda::csvtk=0.23.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/csvtk:0.23.0--h9ee0642_0' : + 'biocontainers/csvtk:0.23.0--h9ee0642_0' }" + + input: + tuple val(meta), path(csv) + val in_format + val out_format + + output: + tuple val(meta), path("${prefix}.${out_extension}"), emit: csv + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + + out_extension = out_format == "tsv" ? 'tsv' : 'csv' + """ + csvtk \\ + concat \\ + $args \\ + --num-cpus $task.cpus \\ + --out-file ${prefix}.${out_extension} \\ + $csv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + csvtk: \$(echo \$( csvtk version | sed -e "s/csvtk v//g" )) + END_VERSIONS + """ +} diff --git a/modules/local/csvtk/concat/meta.yml b/modules/local/csvtk/concat/meta.yml new file mode 100644 index 0000000..19e0e3b --- /dev/null +++ b/modules/local/csvtk/concat/meta.yml @@ -0,0 +1,51 @@ +name: csvtk_concat +description: Concatenate two or more CSV (or TSV) tables into a single table +keywords: + - concatenate + - tsv + - csv +tools: + - csvtk: + description: A cross-platform, efficient, practical CSV/TSV toolkit + homepage: http://bioinf.shenwei.me/csvtk + documentation: http://bioinf.shenwei.me/csvtk + tool_dev_url: https://github.com/shenwei356/csvtk + + licence: ["MIT"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - csv: + type: file + description: CSV/TSV formatted files + pattern: "*.{csv,tsv}" + - in_format: + type: string + description: Input format (csv, tab, or a delimiting character) + pattern: "*" + - out_format: + type: string + description: Output format (csv, tab, or a delimiting character) + pattern: "*" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "version.yml" + - csv: + type: file + description: Concatenated CSV/TSV file + pattern: "*.{csv,tsv}" + +authors: + - "@rpetit3" diff --git a/modules/local/fastp/main.nf b/modules/local/fastp/main.nf index 5e26b13..126e551 100644 --- a/modules/local/fastp/main.nf +++ b/modules/local/fastp/main.nf @@ -14,7 +14,7 @@ process FASTP { val save_merged output: - tuple val(meta), path('*fastp.*fastq.gz') , optional:true, emit: reads + tuple val(meta), path('*fastp*fastq.gz') , optional:true, emit: reads tuple val(meta), path('*.json') , emit: json tuple val(meta), path('*.html') , emit: html tuple val(meta), path('*.log') , emit: log @@ -81,8 +81,8 @@ process FASTP { fastp \\ --in1 ${prefix}_1.fastq.gz \\ --in2 ${prefix}_2.fastq.gz \\ - --out1 ${prefix}_fastp.R1.fastq.gz \\ - --out2 ${prefix}_fastp.R2.fastq.gz \\ + --out1 ${prefix}_fastp_1.fastq.gz \\ + --out2 ${prefix}_fastp_2.fastq.gz \\ --json ${prefix}.fastp.json \\ --html ${prefix}.fastp.html \\ $adapter_list \\ diff --git a/modules/local/gbs/sbg/main.nf b/modules/local/gbs/sbg/main.nf new file mode 100644 index 0000000..f73f7e3 --- /dev/null +++ b/modules/local/gbs/sbg/main.nf @@ -0,0 +1,43 @@ +process GBS_SBG { + tag "$meta.id" + label 'process_low' + + conda "bioconda::blast=2.15.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/blast%3A2.15.0--pl5321h6f7f691_1': + 'biocontainers/blast%3A2.15.0--pl5321h6f7f691_1' }" + + + input: + tuple val(meta), path(fasta) //contigs + path(ref) + + output: + //# Name Serotype Uncertainty + //S18 NT MaxCov:0;MaxID:0 + //S17 GBS-SBG:Ia + tuple val(meta), path("*.tsv"), emit: tsv + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def gzipped = fasta.toString().endsWith('.gz') + def cmd_input = gzipped ? "zcat ${fasta}" : "cat ${fasta}" + def cmd_refdb = ref ? "-ref ${ref}" : "" + + """ + ${cmd_input} | GBS-SBG.pl \\ + -name ${meta.id} \\ + ${cmd_refdb} \\ + > ${prefix}.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + blast: \$(blastn -version 2>&1 | sed 's/^.*blastn: //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/local/hostile/main.nf b/modules/local/hostile/main.nf index a4439fd..595add5 100644 --- a/modules/local/hostile/main.nf +++ b/modules/local/hostile/main.nf @@ -46,8 +46,8 @@ process HOSTILE { then mv ${simplename}*.clean.fastq.gz ${prefix}.dehost.fastq.gz else - mv ${simplename}*.clean_1.fastq.gz ${prefix}.dehost.R1.fastq.gz - mv ${simplename}*.clean_2.fastq.gz ${prefix}.dehost.R2.fastq.gz + mv *clean_1.fastq.gz ${prefix}.dehost_1.fastq.gz + mv *clean_2.fastq.gz ${prefix}.dehost_2.fastq.gz fi cat <<-END_VERSIONS > versions.yml @@ -71,8 +71,8 @@ process HOSTILE { then mv ${simplename}*.clean.fastq.gz ${prefix}.dehost.fastq.gz else - mv ${simplename}*.clean_1.fastq.gz ${prefix}.dehost.R1.fastq.gz - mv ${simplename}*.clean_2.fastq.gz ${prefix}.dehost.R2.fastq.gz + mv ${simplename}*.clean_1.fastq.gz ${prefix}.dehost_1.fastq.gz + mv ${simplename}*.clean_2.fastq.gz ${prefix}.dehost_2.fastq.gz fi cat <<-END_VERSIONS > versions.yml diff --git a/modules/nf-core/pneumocat/environment.yml b/modules/local/pneumocat/environment.yml similarity index 100% rename from modules/nf-core/pneumocat/environment.yml rename to modules/local/pneumocat/environment.yml diff --git a/modules/local/pneumocat/main.nf b/modules/local/pneumocat/main.nf new file mode 100644 index 0000000..0990e7a --- /dev/null +++ b/modules/local/pneumocat/main.nf @@ -0,0 +1,73 @@ +VERSION = '1.2.1' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. +process PNEUMOCAT { + tag "$meta.id" + label 'process_low' + errorStrategy 'ignore' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/pneumocat:1.2.1--0': + 'biocontainers/pneumocat:1.2.1--0' }" + + input: + tuple val(meta), path(reads) + + output: + tuple val(meta), path("*.final_results.xml"), emit: results + tuple val(meta), path("coverage_summary.txt"), emit: coverage + path "versions.yml" , emit: versions + /* + If only one capsular type is matched with more than 90% coverage + then the report from step 1 contained in this xml file is considered + the final result (result type="Serotype") and no further folders + will appear within the PneumoCaT output folder. If more than one + capsular type are matched with more than 90% coverage then the + software moves to step two and a SNP_based_serotyping folder is + created containing a second XML file with the final result + - see STEP 2- VARIANT-BASED APPROACH. + Note that the output XML file from step 1 only reports two capsular types, + when actually more could be matched and all will pass to step 2 for + further distinction. Further information on mapped serotypes in + stage 1 can be found in "Coverage_summary.txt". If the top hit + coverage is < 90% then no serotypes are reported and 'Failed' + appears instead. + */ + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + PneumoCaT.py \\ + --input_directory ./ \\ + $args \\ + --threads $task.cpus \\ + --output_dir ./ + + if [ -d "SNP_based_serotyping" ] + then + cp SNP_based_serotyping/${prefix}.results.xml ${prefix}.final_results.xml + else + cp ${prefix}.results.xml ${prefix}.final_results.xml + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + pneumocat: $VERSION + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.results.xml + touch ${prefix}.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + pneumocat: $VERSION + END_VERSIONS + """ +} diff --git a/modules/nf-core/pneumocat/meta.yml b/modules/local/pneumocat/meta.yml similarity index 100% rename from modules/nf-core/pneumocat/meta.yml rename to modules/local/pneumocat/meta.yml diff --git a/modules/local/shovill/main.nf b/modules/local/shovill/main.nf index a89e09b..a1f6ab3 100644 --- a/modules/local/shovill/main.nf +++ b/modules/local/shovill/main.nf @@ -1,7 +1,8 @@ process SHOVILL { tag "$meta.id" label 'process_medium' - + errorStrategy 'ignore' + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/shovill:1.1.0--0' : diff --git a/modules/local/tbprofiler/collate/main.nf b/modules/local/tbprofiler/collate/main.nf index c361600..b5fc463 100644 --- a/modules/local/tbprofiler/collate/main.nf +++ b/modules/local/tbprofiler/collate/main.nf @@ -1,7 +1,8 @@ process TBPROFILER_COLLATE { tag "$meta.id" label 'process_medium' - + errorStrategy 'ignore' + conda "${moduleDir}/environment.yml" conda "bioconda::tb-profiler=5.0.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? diff --git a/modules/local/tbprofiler/profile/main.nf b/modules/local/tbprofiler/profile/main.nf index 19ea114..5bc0205 100644 --- a/modules/local/tbprofiler/profile/main.nf +++ b/modules/local/tbprofiler/profile/main.nf @@ -1,7 +1,7 @@ process TBPROFILER_PROFILE { tag "$meta.id" label 'process_medium' - + errorStrategy 'ignore' conda "${moduleDir}/environment.yml" conda "bioconda::tb-profiler=5.0.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? diff --git a/modules/nf-core/pneumocat/main.nf b/modules/nf-core/pneumocat/main.nf deleted file mode 100644 index 603a8d2..0000000 --- a/modules/nf-core/pneumocat/main.nf +++ /dev/null @@ -1,50 +0,0 @@ -VERSION = '1.2.1' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. -process PNEUMOCAT { - tag "$meta.id" - label 'process_low' - - conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/pneumocat:1.2.1--0': - 'biocontainers/pneumocat:1.2.1--0' }" - - input: - tuple val(meta), path(reads) - - output: - tuple val(meta), path("*.xml"), emit: xml - tuple val(meta), path("*.txt"), emit: txt - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - """ - PneumoCaT.py \\ - --input_directory ./ \\ - $args \\ - --threads $task.cpus \\ - --output_dir ./ - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - pneumocat: $VERSION - END_VERSIONS - """ - - stub: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - """ - touch ${prefix}.results.xml - touch ${prefix}.txt - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - pneumocat: $VERSION - END_VERSIONS - """ -} diff --git a/modules/nf-core/tbprofiler/profile/environment.yml b/modules/nf-core/tbprofiler/profile/environment.yml deleted file mode 100644 index ae2d3b9..0000000 --- a/modules/nf-core/tbprofiler/profile/environment.yml +++ /dev/null @@ -1,7 +0,0 @@ -name: tbprofiler_profile -channels: - - conda-forge - - bioconda - - defaults -dependencies: - - bioconda::tb-profiler=3.0.8 diff --git a/modules/nf-core/tbprofiler/profile/main.nf b/modules/nf-core/tbprofiler/profile/main.nf deleted file mode 100644 index 4d5f432..0000000 --- a/modules/nf-core/tbprofiler/profile/main.nf +++ /dev/null @@ -1,41 +0,0 @@ -process TBPROFILER_PROFILE { - tag "$meta.id" - label 'process_medium' - - conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/tb-profiler:3.0.8--pypyh5e36f6f_0' : - 'biocontainers/tb-profiler:3.0.8--pypyh5e36f6f_0' }" - - input: - tuple val(meta), path(reads) - - output: - tuple val(meta), path("bam/*.bam") , emit: bam - tuple val(meta), path("results/*.csv") , emit: csv, optional: true - tuple val(meta), path("results/*.json"), emit: json - tuple val(meta), path("results/*.txt") , emit: txt, optional: true - tuple val(meta), path("vcf/*.vcf.gz") , emit: vcf - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - prefix = task.ext.prefix ?: "${meta.id}" - def input_reads = meta.single_end ? "--read1 $reads" : "--read1 ${reads[0]} --read2 ${reads[1]}" - """ - tb-profiler \\ - profile \\ - $args \\ - --prefix ${prefix} \\ - --threads $task.cpus \\ - $input_reads - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - tbprofiler: \$( echo \$(tb-profiler --version 2>&1) | sed 's/TBProfiler version //') - END_VERSIONS - """ -} diff --git a/modules/nf-core/tbprofiler/profile/meta.yml b/modules/nf-core/tbprofiler/profile/meta.yml deleted file mode 100644 index a53aa3e..0000000 --- a/modules/nf-core/tbprofiler/profile/meta.yml +++ /dev/null @@ -1,58 +0,0 @@ -name: tbprofiler_profile -description: A tool to detect resistance and lineages of M. tuberculosis genomes -keywords: - - Mycobacterium tuberculosis - - resistance - - serotype -tools: - - tbprofiler: - description: Profiling tool for Mycobacterium tuberculosis to detect drug resistance and lineage from WGS data - homepage: https://github.com/jodyphelan/TBProfiler - documentation: https://jodyphelan.gitbook.io/tb-profiler/ - tool_dev_url: https://github.com/jodyphelan/TBProfiler - doi: "10.1186/s13073-019-0650-x" - licence: ["GPL v3"] -input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - reads: - type: file - description: FASTQ file - pattern: "*.{fastq.gz,fq.gz}" -output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" - - bam: - type: file - description: BAM file with alignment details - pattern: "*.bam" - - csv: - type: file - description: Optional CSV formated result file of resistance and strain type - pattern: "*.csv" - - json: - type: file - description: JSON formated result file of resistance and strain type - pattern: "*.json" - - txt: - type: file - description: Optional text file of resistance and strain type - pattern: "*.txt" - - vcf: - type: file - description: VCF with variant info again refernce genomes - pattern: "*.vcf" -authors: - - "@rpetit3" -maintainers: - - "@rpetit3" diff --git a/nextflow.config b/nextflow.config index 9a19c92..c77d16c 100644 --- a/nextflow.config +++ b/nextflow.config @@ -24,6 +24,7 @@ params { checkm2_db = "/nfs/APL_Genomics/db/prod/CheckM2_database/uniref100.KO.1.dmnd" amrfinderplus_db = null gambit_db = "/nfs/APL_Genomics/db/prod/gambit" + //illumina qc parameters skip_illumina_reads_qc = false @@ -33,6 +34,7 @@ params { //illumina assembly skip_illumina_reads_assembly = false + min_tbp_for_assembly_illumina = 1000000 illumina_reads_assembler = 'unicycler' //megahit|skesa|spades|unicycler|shovill // nanopre qc parameters @@ -43,6 +45,7 @@ params { //nanopore assembly skip_nanopore_reads_assembly = false + min_tbp_for_assembly_nanopore = 1000000 //illumina read polish skip_illumina_reads_polish = false skip_polypolish = false @@ -66,7 +69,9 @@ params { skip_emmtyper = true //capsular type to Streptococcus pneumoniae skip_pneumocat = true - + //serotype for GBS: s_agalactiae + skip_gbssbg = true + gbssbg_db = "/nfs/APL_Genomics/db/prod/gbs-sbg/GBS-SBG.fasta" // MultiQC options skip_multiqc = false multiqc_config = null diff --git a/nextflow_schema.json b/nextflow_schema.json index 4432338..badce86 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -93,6 +93,12 @@ "shovill" ] }, + "min_tbp_for_assembly_illumina": { + "type": "integer", + "default": 1000000, + "description": "Required total basepairs of the reads to get into assembly stage.", + "fa_icon": "fas fa-cut" + }, "skip_illumina_reads_qc": { "type": "boolean", "default": false, @@ -111,12 +117,6 @@ "description": "Skip illumina read assembly step.", "fa_icon": "fas fa-cut" }, - "skip_illumina_kraken2": { - "type": "boolean", - "default": false, - "description": "Skip kraken2 with illumina reads step.", - "fa_icon": "fas fa-cut" - }, "hostile_human_ref_bowtie2": { "type": "string", "description": "hostile human genome index file", @@ -141,18 +141,18 @@ "description": "Skip illumina read quality control step.", "fa_icon": "fas fa-cut" }, - "skip_nanopore_kraken2": { - "type": "boolean", - "default": true, - "description": "Skip kraken2 with nanopore reads step.", - "fa_icon": "fas fa-cut" - }, "nanopore_reads_assembler": { "type": "string", "default": "flye+medaka", "description": "Specifies the nanopore reads assembly tool to use - available options are 'flye+medaka'.", "fa_icon": "fas fa-cut" }, + "min_tbp_for_assembly_nanopore": { + "type": "integer", + "default": 1000000, + "description": "Required total basepairs of the reads to get into assembly stage.", + "fa_icon": "fas fa-cut" + }, "skip_polypolish": { "type": "boolean", "default": false, @@ -190,12 +190,6 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": [ - "bakta_db", - "checkm2_db", - "gambit_db", - "kraken2_db" - ], "properties": { "bakta_db": { "type": "string", @@ -213,30 +207,12 @@ "type": "string", "default": null }, - "kraken2_db": { - "type": "string", - "default": "/nfs/APL_Genomics/db/prod/kraken2/k2_standard_08gb_20220926", - "description": "Specify path to kraken2 database", - "fa_icon": "fas fa-cut" - }, - "gambit_db": { - "type": "string", - "default": "/nfs/APL_Genomics/db/prod/gambit", - "description": "Path to gambit database.", - "fa_icon": "fas fa-cut" - }, "skip_checkm2": { "type": "boolean", "default": false, "description": "Skip checkm2 step.", "fa_icon": "fas fa-cut" }, - "skip_gambit": { - "type": "boolean", - "default": false, - "description": "Skip gambit step.", - "fa_icon": "fas fa-cut" - }, "skip_bakta": { "type": "boolean", "default": false, @@ -276,6 +252,63 @@ "default": false, "description": "Skip assembly depth calculation step.", "fa_icon": "fas fa-cut" + } + } + }, + "taxonomic_tool_options": { + "title": "taxonomic_tool_options", + "type": "object", + "fa_icon": "fas fa-terminal", + "description": "Define where the pipeline should find input data and save output data.", + "required": [ + "gambit_db", + "kraken2_db" + ], + "properties": { + "kraken2_db": { + "type": "string", + "default": "/nfs/APL_Genomics/db/prod/kraken2/k2_standard_08gb_20220926", + "description": "Specify path to kraken2 database", + "fa_icon": "fas fa-cut" + }, + "gambit_db": { + "type": "string", + "default": "/nfs/APL_Genomics/db/prod/gambit", + "description": "Path to gambit database.", + "fa_icon": "fas fa-cut" + }, + "skip_illumina_kraken2": { + "type": "boolean", + "default": false, + "description": "Skip kraken2 with illumina reads step.", + "fa_icon": "fas fa-cut" + }, + "skip_nanopore_kraken2": { + "type": "boolean", + "default": true, + "description": "Skip kraken2 with nanopore reads step.", + "fa_icon": "fas fa-cut" + }, + "skip_gambit": { + "type": "boolean", + "default": false, + "description": "Skip gambit step.", + "fa_icon": "fas fa-cut" + } + } + }, + "special_tool_options": { + "title": "special_tool_options", + "type": "object", + "fa_icon": "fas fa-terminal", + "help_text": "By default, those tools are all turned off.", + "description": "Define organism specific tools: Mycobacterium tuberculosis,Streptococcus pneumoniae, Streptococcus pyogenes, Streptococcus agalactiae gas.", + "properties": { + "gbssbg_db": { + "type": "string", + "default": "/nfs/APL_Genomics/db/prod/gbs-sbg/GBS-SBG.fasta", + "description": "Path to GBS-SBG database.", + "fa_icon": "fas fa-cut" }, "skip_tbprofiler": { "type": "boolean", @@ -294,6 +327,12 @@ "default": true, "description": "Skip capsular typing of Streptococcus pneumoniae using pneumocat.", "fa_icon": "fas fa-cut" + }, + "skip_gbssbg": { + "type": "boolean", + "default": true, + "description": "Skip serotyping of Streptococcus agalactiae using GBS-SBG.", + "fa_icon": "fas fa-cut" } } }, @@ -355,7 +394,7 @@ "max_cpus": { "type": "integer", "description": "Maximum number of CPUs that can be requested for any single job.", - "default": 16, + "default": 8, "fa_icon": "fas fa-microchip", "hidden": true, "help_text": "Use to set an upper-limit for the CPU requirement for each process. Should be an integer e.g. `--max_cpus 1`" @@ -363,7 +402,7 @@ "max_memory": { "type": "string", "description": "Maximum amount of memory that can be requested for any single job.", - "default": "128.GB", + "default": "64.GB", "fa_icon": "fas fa-memory", "pattern": "^\\d+(\\.\\d+)?\\.?\\s*(K|M|G|T)?B$", "hidden": true, @@ -496,6 +535,15 @@ { "$ref": "#/definitions/nanopore_options" }, + { + "$ref": "#/definitions/annotation_options" + }, + { + "$ref": "#/definitions/taxonomic_tool_options" + }, + { + "$ref": "#/definitions/special_tool_options" + }, { "$ref": "#/definitions/institutional_config_options" }, diff --git a/subworkflows/local/prepare_references.nf b/subworkflows/local/prepare_references.nf index 65e7935..fa6c6a8 100644 --- a/subworkflows/local/prepare_references.nf +++ b/subworkflows/local/prepare_references.nf @@ -101,7 +101,20 @@ workflow PREPARE_REFERENCES { } } - + // + // Prepare GBS-SBG database + // + ch_gbssbg_db = Channel.empty() + if (!params.skip_gbssbg) { + if (params.gbssbg_db) { + ch_gbssbg_db = Channel.value(file(params.gbssbg_db)) + } + else { + log.error "Please specify a valid gbs-sbg database." + System.exit(1) + } + } + emit: ch_hostile_ref_bowtie2 ch_hostile_ref_minimap2 @@ -110,6 +123,6 @@ workflow PREPARE_REFERENCES { ch_bakta_db // path to bakta database ch_amrfinderplus_db // path: path to amrfinderplus_db ch_gambit_db // path: path to gambit db - + ch_gbssbg_db // path: path to gbs-sbg database versions = ch_versions // channel: [ versions.yml ] } diff --git a/subworkflows/local/special_tools.nf b/subworkflows/local/special_tools.nf index 6f12e09..c0a545c 100644 --- a/subworkflows/local/special_tools.nf +++ b/subworkflows/local/special_tools.nf @@ -9,7 +9,7 @@ include { include { PNEUMOCAT -} from '../../modules/nf-core/pneumocat/main.nf' +} from '../../modules/local/pneumocat/main.nf' include { COMBINE_XML as COMBINE_XML_PNEUMOCAT; } from '../../modules/local/combine/xml' @@ -18,16 +18,29 @@ include { } from '../../modules/local/emmtyper/main.nf' include { CSVTK_CONCAT as CSVTK_CONCAT_EMMTYPER; + } from '../../modules/nf-core/csvtk/concat/main' +include { + CSVTK_CONCAT as CSVTK_CONCAT_GBSSBG; +} from '../../modules/local/csvtk/concat/main' + include { REFORMATEMMTYPERCSV; } from '../../modules/local/misc' -workflow SPECIAL_TOOLS { + +include { + GBS_SBG; +} from '../../modules/local/gbs/sbg' + +include { + PREPARE_REFERENCES +} from '../../subworkflows/local/prepare_references' + +workflow SPECIAL_TOOLS_BASED_ON_READS { take: illumina_reads nanopore_reads - contigs main: ch_software_versions = Channel.empty() @@ -49,16 +62,31 @@ workflow SPECIAL_TOOLS { ) } } - + //capsular type to Streptococcus pneumoniae if(! params.skip_pneumocat){ PNEUMOCAT(illumina_reads) ch_software_versions = ch_software_versions.mix(PNEUMOCAT.out.versions) - //PNEUMOCAT.out.xml.map{meta, tsv -> tsv }.collect().map { files -> tuple([id:"pneumocat"], files)}.view() - COMBINE_XML_PNEUMOCAT(PNEUMOCAT.out.xml.map{meta, tsv -> tsv }.collect().map { files -> tuple([id:"pneumocat"], files)}) + COMBINE_XML_PNEUMOCAT(PNEUMOCAT.out.results.map{meta, tsv -> tsv }.collect().map { files -> tuple([id:"pneumocat"], files)}) ch_software_versions = ch_software_versions.mix(COMBINE_XML_PNEUMOCAT.out.versions) } + + emit: + versions = ch_software_versions + +} + +workflow SPECIAL_TOOLS_BASED_ON_CONTIGS { + take: + contigs + main: + + ch_software_versions = Channel.empty() + + PREPARE_REFERENCES () + ch_software_versions = ch_software_versions.mix(PREPARE_REFERENCES.out.versions) + //emm-typing of Streptococcus pyogenes if(! params.skip_emmtyper){ EMMTYPER(contigs) @@ -72,9 +100,23 @@ workflow SPECIAL_TOOLS { ) ch_software_versions = ch_software_versions.mix(CSVTK_CONCAT_EMMTYPER.out.versions) } + + // serotyping GBS: Streptococcus agalactiae + if(! params.skip_gbssbg){ + GBS_SBG(contigs, PREPARE_REFERENCES.out.ch_gbssbg_db) + ch_software_versions = ch_software_versions.mix(GBS_SBG.out.versions) + GBS_SBG.out.tsv.map {meta, tsv -> tsv }.collect().view() + CSVTK_CONCAT_GBSSBG( + GBS_SBG.out.tsv.map {meta, tsv -> tsv }.collect().map { files -> tuple([id:"gbs-sbg"], files)}, + "tsv", + "tsv" + ) + ch_software_versions = ch_software_versions.mix(CSVTK_CONCAT_GBSSBG.out.versions) + } emit: - + //emm-typing = REFORMATEMMTYPERCSV.out.csv //csv format emmtyper verbose format versions = ch_software_versions } + diff --git a/workflows/illumina.nf b/workflows/illumina.nf index 431c53e..956ca00 100644 --- a/workflows/illumina.nf +++ b/workflows/illumina.nf @@ -59,7 +59,8 @@ include { } from '../subworkflows/local/depth_illumina' include { - SPECIAL_TOOLS + SPECIAL_TOOLS_BASED_ON_READS; + SPECIAL_TOOLS_BASED_ON_CONTIGS; } from '../subworkflows/local/special_tools' /* @@ -85,6 +86,7 @@ include { include { CSVTK_CONCAT as CSVTK_CONCAT_STATS_ASM; CSVTK_CONCAT as CSVTK_CONCAT_DEPTH_ILLUMINA; + CSVTK_CONCAT as CSVTK_CONCAT_STATS_NOT_ASSEMBLED; } from '../modules/nf-core/csvtk/concat/main' /* include { @@ -141,6 +143,7 @@ workflow ILLUMINA { out_format = "tsv" contig_file_ext = ".fa.gz" + //illumina_reads.view() if(!params.skip_illumina_reads_qc){ QC_ILLUMINA( @@ -171,9 +174,40 @@ workflow ILLUMINA { ch_software_versions = ch_software_versions.mix(KRAKEN2_KRAKEN2_ILLUMINA.out.versions) } + //tools for special organism + SPECIAL_TOOLS_BASED_ON_READS(illumina_reads, []) + ch_software_versions = ch_software_versions.mix(SPECIAL_TOOLS_BASED_ON_READS.out.versions) + // assembly if(!params.skip_illumina_reads_assembly){ - + //illumina_reads.view() + + illumina_reads.join(QC_ILLUMINA.out.qc_stats).map{ + meta, reads, stats -> [meta, reads, stats.splitCsv(header: true, sep:'\t', strip:true)] + }.map{ + meta, reads, row -> [meta, reads, row.sum_len[0]] + }.branch{ + pass_for_assembly: it[2].toInteger() >= params.min_tbp_for_assembly_illumina + fail_for_assembly: it[2].toInteger() < params.min_tbp_for_assembly_illumina + }.set{ + ch_input + } + + CSVTK_CONCAT_STATS_NOT_ASSEMBLED( + ch_input.fail_for_assembly.map{ + meta, reads, row -> [meta, reads] + }.join(QC_ILLUMINA.out.qc_stats).map{ + cfg, reads, stats -> stats + }.collect().map { files -> tuple([id: "samples_not_assembled"], files)}, + in_format, + out_format + ) + + illumina_reads = ch_input.pass_for_assembly.map{ + meta, reads, row -> [meta, reads] + } + //illumina_reads.view() + ASSEMBLE_ILLUMINA ( illumina_reads) // zero size contig can cause some of the program such as bakta, mobsuite file ASSEMBLE_ILLUMINA.out.contigs @@ -240,21 +274,12 @@ workflow ILLUMINA { ) ch_software_versions = ch_software_versions.mix(ANNOTATION.out.versions) + SPECIAL_TOOLS_BASED_ON_CONTIGS(contigs) + ch_software_versions = ch_software_versions.mix(SPECIAL_TOOLS_BASED_ON_CONTIGS.out.versions) } - illumina_reads.join(contigs).multiMap{ - - it -> - illumina_reads: [it[0], it[1]] - contigs: [it[0], it[2]] - - }.set{ - ch_input - } - SPECIAL_TOOLS(ch_input.illumina_reads, [], ch_input.contigs) - ch_software_versions = ch_software_versions.mix(SPECIAL_TOOLS.out.versions) - + CUSTOM_DUMPSOFTWAREVERSIONS ( ch_software_versions.unique().collectFile(name: 'collated_versions.yml') diff --git a/workflows/nanopore.nf b/workflows/nanopore.nf index ac56f84..48d8a6d 100644 --- a/workflows/nanopore.nf +++ b/workflows/nanopore.nf @@ -41,7 +41,10 @@ include { ASSEMBLE_NANOPORE } from '../subworkflows/local/assembly_n include { RUN_POLYPOLISH; RUN_POLCA; } from '../subworkflows/local/polisher_illumina' include { DEPTH_ILLUMINA } from '../subworkflows/local/depth_illumina' include { DEPTH_NANOPORE } from '../subworkflows/local/depth_nanopore' -include { SPECIAL_TOOLS } from '../subworkflows/local/special_tools' +include { + SPECIAL_TOOLS_BASED_ON_READS; + SPECIAL_TOOLS_BASED_ON_CONTIGS; +} from '../subworkflows/local/special_tools' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ IMPORT NF-CORE MODULES/SUBWORKFLOWS @@ -56,6 +59,7 @@ include { CSVTK_CONCAT as CSVTK_CONCAT_STATS_ASM; CSVTK_CONCAT as CSVTK_CONCAT_DEPTH_NANOPORE; CSVTK_CONCAT as CSVTK_CONCAT_DEPTH_ILLUMINA; + CSVTK_CONCAT as CSVTK_CONCAT_STATS_NOT_ASSEMBLED; } from '../modules/nf-core/csvtk/concat/main' include { KRAKEN2_KRAKEN2 as KRAKEN2_KRAKEN2_ILLUMINA } from '../modules/nf-core/kraken2/kraken2/main' @@ -134,9 +138,40 @@ workflow NANOPORE { ch_software_versions = ch_software_versions.mix(QC_NANOPORE.out.versions) } + //tools for special organism + SPECIAL_TOOLS_BASED_ON_READS(illumina_reads, nanopore_reads) + ch_software_versions = ch_software_versions.mix(SPECIAL_TOOLS_BASED_ON_READS.out.versions) + // assembly if(!params.skip_nanopore_reads_assembly){ + nanopore_reads.join(QC_NANOPORE.out.qc_stats).map{ + meta, reads, stats -> [meta, reads, stats.splitCsv(header: true, sep:'\t', strip:true)] + }.map{ + meta, reads, row -> [meta, reads, row.sum_len[0]] + }.branch{ + pass_for_assembly: it[2].toInteger() >= params.min_tbp_for_assembly_nanopore + fail_for_assembly: it[2].toInteger() < params.min_tbp_for_assembly_nanopore + }.set{ + ch_input + } + + CSVTK_CONCAT_STATS_NOT_ASSEMBLED( + ch_input.fail_for_assembly.map{ + meta, reads, row -> [meta, reads] + }.join(QC_NANOPORE.out.qc_stats).map{ + cfg, reads, stats -> stats + }.collect().map { files -> tuple([id: "samples_not_assembled"], files)}, + in_format, + out_format + ) + + nanopore_reads = ch_input.pass_for_assembly.map{ + meta, reads, row -> [meta, reads] + } + nanopore_reads.view() + + //flye with 4x iteration and 1x medaka ASSEMBLE_NANOPORE ( nanopore_reads) ASSEMBLE_NANOPORE.out.contigs @@ -253,21 +288,11 @@ workflow NANOPORE { } ANNOTATION(contigs, PREPARE_REFERENCES.out.ch_bakta_db, PREPARE_REFERENCES.out.ch_amrfinderplus_db) - } - - illumina_reads.join(nanopore_reads).join(contigs).multiMap{ - - it -> - illumina_reads: [it[0], it[1]] - nanopore_reads: [it[0], it[2]] - contigs: [it[0], it[3]] - }.set{ - ch_input - } + SPECIAL_TOOLS_BASED_ON_CONTIGS(contigs) + ch_software_versions = ch_software_versions.mix(SPECIAL_TOOLS_BASED_ON_CONTIGS.out.versions) + } - SPECIAL_TOOLS(ch_input.illumina_reads, ch_input.nanopore_reads, ch_input.contigs) - CUSTOM_DUMPSOFTWAREVERSIONS ( ch_software_versions.unique().collectFile(name: 'collated_versions.yml') )