From 2e10931c8deb711837531ee8845355f58be18359 Mon Sep 17 00:00:00 2001 From: Lionel Guy Date: Tue, 26 Aug 2014 13:03:31 +0200 Subject: [PATCH 1/3] Added intial support for prodigal annotating genes running off contig edges. --- bin/prokka | 125 ++++++++++++++++++++++++++++++++++------------------- 1 file changed, 80 insertions(+), 45 deletions(-) diff --git a/bin/prokka b/bin/prokka index ea46a9c..5fd8272 100755 --- a/bin/prokka +++ b/bin/prokka @@ -26,6 +26,7 @@ use XML::Simple; use List::Util qw(min max sum); use Scalar::Util qw(openhandle); use Data::Dumper; +use Bio::Location::Fuzzy; use Bio::Root::Version; use Bio::SeqIO; use Bio::SearchIO; @@ -642,58 +643,78 @@ if ($tools{'minced'}->{HAVE}) { # CDS msg("Predicting coding sequences"); +my $tool = "Prodigal:".$tools{prodigal}->{VERSION}; my $totalbp = sum( map { $seq{$_}{DNA}->length } keys %seq); my $prodigal_mode = ($totalbp >= 100000 && !$metagenome) ? 'single' : 'meta'; msg("Contigs total $totalbp bp, so using $prodigal_mode mode"); my $num_cds=0; -my $cmd = "prodigal -i \Q$outdir/$prefix.fna\E -c -m -g $gcode -p $prodigal_mode -f sco -q"; +#my $cmd = "prodigal -i \Q$outdir/$prefix.fna\E -c -m -g $gcode -p $prodigal_mode -f gff -q"; +my $cmd = "prodigal -i \Q$outdir/$prefix.fna\E -m -g $gcode -p $prodigal_mode -f gff -q"; msg("Running: $cmd"); open my $PRODIGAL, '-|', $cmd; -my $sid; -while (<$PRODIGAL>) { - if (m/seqhdr="([^\s\"]+)"/) { - $sid = $1; -# msg("CDS $sid"); - next; - } - elsif (m/^>\d+_(\d+)_(\d+)_([+-])$/) { - my $tool = "Prodigal:".$tools{prodigal}->{VERSION}; # FIXME: why inner loop? - my $cds = Bio::SeqFeature::Generic->new( - -primary => 'CDS', - -seq_id => $sid, - -source => $tool, - -start => $1, - -end => $2, - -strand => ($3 eq '+' ? +1 : -1), - -score => undef, - -frame => 0, - -tag => { - 'inference' => "ab initio prediction:$tool", - } - ); +my $gff_in = Bio::Tools::GFF->new(-fh => $PRODIGAL, + -gff_version => 3); +while (my $cds = $gff_in->next_feature){ + my $sid = $cds->seq_id; + # Determine whether the CDS is partial, i.e. it is running off + # contig edges. Add the appropriate /codon_start attribute if necessary + # Assuming prodigal only returns one single 'partial' tag + # Modify start & end attributes, because gff output is wrong + my ($partial) = $cds->get_tag_values('partial'); + my ($codon_start); + my ($start, $end) = ($cds->start, $cds->end); + my $ctg_length = $seq{$sid}{DNA}->length; + if ($partial ne '00') { + if ($partial =~ /^1/){ + $codon_start = $cds->start if ($cds->strand == 1); + $start = '<1'; + } + if ($partial =~ /1$/){ + $codon_start = ($ctg_length - $cds->end + 1) + if ($cds->strand == -1); + $end = ">$ctg_length"; + } + my $fuzzyloc = Bio::Location::Fuzzy->new( + -start => $start, + -end => $end, + -location_type => '..', + -strand => $cds->strand); + $cds->location($fuzzyloc); + } + # Removing all tags + foreach my $tag ($cds->get_all_tags) { + $cds->remove_tag($tag); + } + # Adding only codon_start, if applicable, and the inference tag + if ($codon_start){ + $cds->add_tag_value('codon_start', $codon_start); + $cds->frame($codon_start-1); + } + $cds->add_tag_value('inference', "ab initio prediction:$tool"); my $overlap; for my $rna (@allrna) { - # same contig, overlapping (could check same strand too? not sure) - if ($rna->seq_id eq $sid and $cds->overlaps($rna)) { - $overlap = $rna; - last; - } + # same contig, overlapping (could check same strand too? not sure) + if ($rna->seq_id eq $sid and $cds->overlaps($rna)) { + $overlap = $rna; + last; + } } # mitochondria are highly packed, so don't exclude as CDS/tRNA often overlap. if ($overlap and $kingdom ne 'Mitochondria') { - my $type = $overlap->primary_tag; - msg("Excluding CDS which overlaps existing RNA ($type) at $sid:$1..$2 on $3 strand"); + my $type = $overlap->primary_tag; + msg("Excluding CDS which overlaps existing RNA ($type) at $sid:$1..$2 on $3 strand"); } else { - $num_cds++; - push @{$seq{$sid}{FEATURE}}, $cds; - ## BUG James Doonan - ensure no odd features extending beyond contig - if ($cds->end > $seq{$cds->seq_id}{DNA}->length ) { - err("CDS end", $cds->end, "is beyond length", $seq{$sid}{DNA}->length, "in contig $sid") - } - + $num_cds++; + push @{$seq{$sid}{FEATURE}}, $cds; + ## BUG James Doonan - ensure no odd features extending beyond contig + #(my $cds_end = $cds->end) =~ s/^>//; + my $cds_end = $cds->end; + if ($cds_end > $seq{$cds->seq_id}{DNA}->length ) { + err("CDS end", $cds_end, "is beyond length", + $seq{$sid}{DNA}->length, "in contig $sid") + } } - } } msg("Found $num_cds CDS"); @@ -890,7 +911,10 @@ else { next unless $f->primary_tag eq 'CDS'; next if $f->has_tag('product'); $cds{++$count} = $f; - print $faa ">$count\n",$f->seq->translate->seq,"\n"; + my $frame = 1; + ($frame) = $f->get_tag_values('codon_start') + if $f->has_tag('codon_start'); + print $faa ">$count\n",$f->seq->translate(-frame => $frame-1)->seq,"\n"; } } close $faa; @@ -1035,12 +1059,13 @@ for my $sid (sort keys %seq) { my $g = Bio::SeqFeature::Generic->new( -primary => 'gene', -seq_id => $f->seq_id, - -start => $f->start, - -end => $f->end, - -strand => $f->strand, + # -start => $f->start, + # -end => $f->end, + # -strand => $f->strand, -source_tag => $EXE, -tag => { 'locus_tag'=> $ID }, ); + $g->location($f->location); if (my $gENE = TAG($f, 'gene')) { $g->add_tag_value('gene', $gENE); } @@ -1079,10 +1104,20 @@ for my $sid (sort keys %seq) { if ($f->primary_tag eq 'CDS' and not $f->has_tag('product')) { $f->add_tag_value('product', $HYPO); } - + ## DEBUG ## + if ($f->seq_id eq "contig000005"){ + print "HERE"; + } + ### END DEBUG ## + print $gff_fh $f->gff_string($gff_factory),"\n"; - - my ($L,$R) = ($f->strand >= 0) ? ($f->start,$f->end) : ($f->end,$f->start); + + my ($start, $end) = ($f->start, $f->end); + $start = (($f->strand >= 0) ? '<' : '>') . $start + if ($f->location->start_pos_type eq 'BEFORE'); + $end = (($f->strand >= 0) ? '>' : '<') . $end + if ($f->location->end_pos_type eq 'AFTER'); + my ($L, $R) = ($f->strand >= 0) ? ($start, $end) : ($end, $start); print {$tbl_fh} "$L\t$R\t",$f->primary_tag,"\n"; for my $tag ($f->get_all_tags) { next if $tag =~ m/^(ID|Alias)$/; # remove GFF specific tags From a1e22dbfa8c1b7e45dbddff04b53b1ddb68a2bec Mon Sep 17 00:00:00 2001 From: Lionel Guy Date: Tue, 26 Aug 2014 15:27:19 +0200 Subject: [PATCH 2/3] Cleanin up debug code; removing commented code --- bin/prokka | 28 +++++++++------------------- 1 file changed, 9 insertions(+), 19 deletions(-) diff --git a/bin/prokka b/bin/prokka index 5fd8272..41c80c5 100755 --- a/bin/prokka +++ b/bin/prokka @@ -222,7 +222,7 @@ my(@Options, $quiet, $kingdom, $fast, $force, $outdir, $prefix, $cpus, $addgenes $genus, $species, $strain, $plasmid, $usegenus, $proteins, $centre, $scaffolds, $rfam, $norrna, $notrna, $rnammer, $rawproduct, - $metagenome, $compliant, $listdb, $citation); + $metagenome, $partialgenes, $compliant, $listdb, $citation); setOptions(); # . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . @@ -646,10 +646,11 @@ msg("Predicting coding sequences"); my $tool = "Prodigal:".$tools{prodigal}->{VERSION}; my $totalbp = sum( map { $seq{$_}{DNA}->length } keys %seq); my $prodigal_mode = ($totalbp >= 100000 && !$metagenome) ? 'single' : 'meta'; -msg("Contigs total $totalbp bp, so using $prodigal_mode mode"); +my $prodigal_closedends = $partialgenes ? '' : '-c'; +msg("Contigs total $totalbp bp, so using $prodigal_mode mode, " . + ($partialgenes ? 'not' : '') . " allowing genes to run off contig edges."); my $num_cds=0; -#my $cmd = "prodigal -i \Q$outdir/$prefix.fna\E -c -m -g $gcode -p $prodigal_mode -f gff -q"; -my $cmd = "prodigal -i \Q$outdir/$prefix.fna\E -m -g $gcode -p $prodigal_mode -f gff -q"; +my $cmd = "prodigal -i \Q$outdir/$prefix.fna\E $prodigal_closedends -m -g $gcode -p $prodigal_mode -f gff -q"; msg("Running: $cmd"); open my $PRODIGAL, '-|', $cmd; my $gff_in = Bio::Tools::GFF->new(-fh => $PRODIGAL, @@ -709,9 +710,8 @@ while (my $cds = $gff_in->next_feature){ push @{$seq{$sid}{FEATURE}}, $cds; ## BUG James Doonan - ensure no odd features extending beyond contig #(my $cds_end = $cds->end) =~ s/^>//; - my $cds_end = $cds->end; - if ($cds_end > $seq{$cds->seq_id}{DNA}->length ) { - err("CDS end", $cds_end, "is beyond length", + if ($cds->end > $seq{$cds->seq_id}{DNA}->length ) { + err("CDS end", $cds->end, "is beyond length", $seq{$sid}{DNA}->length, "in contig $sid") } } @@ -911,10 +911,7 @@ else { next unless $f->primary_tag eq 'CDS'; next if $f->has_tag('product'); $cds{++$count} = $f; - my $frame = 1; - ($frame) = $f->get_tag_values('codon_start') - if $f->has_tag('codon_start'); - print $faa ">$count\n",$f->seq->translate(-frame => $frame-1)->seq,"\n"; + print $faa ">$count\n",$f->seq->translate->seq,"\n"; } } close $faa; @@ -1059,9 +1056,6 @@ for my $sid (sort keys %seq) { my $g = Bio::SeqFeature::Generic->new( -primary => 'gene', -seq_id => $f->seq_id, - # -start => $f->start, - # -end => $f->end, - # -strand => $f->strand, -source_tag => $EXE, -tag => { 'locus_tag'=> $ID }, ); @@ -1104,11 +1098,6 @@ for my $sid (sort keys %seq) { if ($f->primary_tag eq 'CDS' and not $f->has_tag('product')) { $f->add_tag_value('product', $HYPO); } - ## DEBUG ## - if ($f->seq_id eq "contig000005"){ - print "HERE"; - } - ### END DEBUG ## print $gff_fh $f->gff_string($gff_factory),"\n"; @@ -1461,6 +1450,7 @@ sub setOptions { {OPT=>"usegenus!", VAR=>\$usegenus, DEFAULT=>0, DESC=>"Use genus-specific BLAST databases (needs --genus)"}, {OPT=>"proteins=s", VAR=>\$proteins, DEFAULT=>'', DESC=>"Fasta file of trusted proteins to first annotate from"}, {OPT=>"metagenome!", VAR=>\$metagenome, DEFAULT=>0, DESC=>"Improve gene predictions for highly fragmented genomes"}, + {OPT=>"partialgenes!", VAR=>\$partialgenes, DEFAULT=>0, DESC=>"Allow genes to run off edges, yielding incomplete genes (no closed ends option in prodigal)"}, {OPT=>"rawproduct!", VAR=>\$rawproduct, DEFAULT=>0, DESC=>"Do not clean up /product annotation"}, 'Computation:', {OPT=>"fast!", VAR=>\$fast, DEFAULT=>0, DESC=>"Fast mode - skip CDS /product searching"}, From 8799308446c87e149ac0734dd757de9abc222d2b Mon Sep 17 00:00:00 2001 From: Lionel Guy Date: Thu, 28 Aug 2014 08:55:07 +0200 Subject: [PATCH 3/3] Added --partialgenes to README.md --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index e3c376a..33babd9 100644 --- a/README.md +++ b/README.md @@ -114,7 +114,7 @@ If you are using Mac OS X, you'll also have to change the `"Linux"` to `"Darwin" ###Wizard # Watch and learn - % prokka --outdir mydir --locustag EHEC --proteins NewToxins.faa --evalue 0.001 --gram neg --addgenes contigs.fa + % prokka --outdir mydir --locustag EHEC --proteins NewToxins.faa --evalue 0.001 --partialgenes --gram neg --addgenes contigs.fa # Check to see if anything went really wrong % less mydir/EHEC_06072012.err @@ -198,6 +198,7 @@ If you are using Mac OS X, you'll also have to change the `"Linux"` to `"Darwin" --usegenus Use genus-specific BLAST databases (needs --genus) (default OFF) --proteins [X] Fasta file of trusted proteins to first annotate from (default '') --metagenome Improve gene predictions for highly fragmented genomes (default OFF) + --partialgenes Allow genes to run off edges, yielding incomplete genes (no closed ends option in prodigal) (default OFF) --rawproduct Do not clean up /product annotation (default OFF) Computation: --fast Fast mode - skip CDS /product searching (default OFF)