Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding support for partial genes #37

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ If you are using Mac OS X, you'll also have to change the `"Linux"` to `"Darwin"
###Wizard

# Watch and learn
% prokka --outdir mydir --locustag EHEC --proteins NewToxins.faa --evalue 0.001 --gram neg --addgenes contigs.fa
% prokka --outdir mydir --locustag EHEC --proteins NewToxins.faa --evalue 0.001 --partialgenes --gram neg --addgenes contigs.fa

# Check to see if anything went really wrong
% less mydir/EHEC_06072012.err
Expand Down Expand Up @@ -198,6 +198,7 @@ If you are using Mac OS X, you'll also have to change the `"Linux"` to `"Darwin"
--usegenus Use genus-specific BLAST databases (needs --genus) (default OFF)
--proteins [X] Fasta file of trusted proteins to first annotate from (default '')
--metagenome Improve gene predictions for highly fragmented genomes (default OFF)
--partialgenes Allow genes to run off edges, yielding incomplete genes (no closed ends option in prodigal) (default OFF)
--rawproduct Do not clean up /product annotation (default OFF)
Computation:
--fast Fast mode - skip CDS /product searching (default OFF)
Expand Down
117 changes: 71 additions & 46 deletions bin/prokka
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ use XML::Simple;
use List::Util qw(min max sum);
use Scalar::Util qw(openhandle);
use Data::Dumper;
use Bio::Location::Fuzzy;
use Bio::Root::Version;
use Bio::SeqIO;
use Bio::SearchIO;
Expand Down Expand Up @@ -221,7 +222,7 @@ my(@Options, $quiet, $kingdom, $fast, $force, $outdir, $prefix, $cpus, $addgenes
$genus, $species, $strain, $plasmid,
$usegenus, $proteins, $centre, $scaffolds,
$rfam, $norrna, $notrna, $rnammer, $rawproduct,
$metagenome, $compliant, $listdb, $citation);
$metagenome, $partialgenes, $compliant, $listdb, $citation);
setOptions();

# . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
Expand Down Expand Up @@ -642,58 +643,78 @@ if ($tools{'minced'}->{HAVE}) {
# CDS

msg("Predicting coding sequences");
my $tool = "Prodigal:".$tools{prodigal}->{VERSION};
my $totalbp = sum( map { $seq{$_}{DNA}->length } keys %seq);
my $prodigal_mode = ($totalbp >= 100000 && !$metagenome) ? 'single' : 'meta';
msg("Contigs total $totalbp bp, so using $prodigal_mode mode");
my $prodigal_closedends = $partialgenes ? '' : '-c';
msg("Contigs total $totalbp bp, so using $prodigal_mode mode, " .
($partialgenes ? 'not' : '') . " allowing genes to run off contig edges.");
my $num_cds=0;
my $cmd = "prodigal -i \Q$outdir/$prefix.fna\E -c -m -g $gcode -p $prodigal_mode -f sco -q";
my $cmd = "prodigal -i \Q$outdir/$prefix.fna\E $prodigal_closedends -m -g $gcode -p $prodigal_mode -f gff -q";
msg("Running: $cmd");
open my $PRODIGAL, '-|', $cmd;
my $sid;
while (<$PRODIGAL>) {
if (m/seqhdr="([^\s\"]+)"/) {
$sid = $1;
# msg("CDS $sid");
next;
}
elsif (m/^>\d+_(\d+)_(\d+)_([+-])$/) {
my $tool = "Prodigal:".$tools{prodigal}->{VERSION}; # FIXME: why inner loop?
my $cds = Bio::SeqFeature::Generic->new(
-primary => 'CDS',
-seq_id => $sid,
-source => $tool,
-start => $1,
-end => $2,
-strand => ($3 eq '+' ? +1 : -1),
-score => undef,
-frame => 0,
-tag => {
'inference' => "ab initio prediction:$tool",
}
);
my $gff_in = Bio::Tools::GFF->new(-fh => $PRODIGAL,
-gff_version => 3);
while (my $cds = $gff_in->next_feature){
my $sid = $cds->seq_id;
# Determine whether the CDS is partial, i.e. it is running off
# contig edges. Add the appropriate /codon_start attribute if necessary
# Assuming prodigal only returns one single 'partial' tag
# Modify start & end attributes, because gff output is wrong
my ($partial) = $cds->get_tag_values('partial');
my ($codon_start);
my ($start, $end) = ($cds->start, $cds->end);
my $ctg_length = $seq{$sid}{DNA}->length;
if ($partial ne '00') {
if ($partial =~ /^1/){
$codon_start = $cds->start if ($cds->strand == 1);
$start = '<1';
}
if ($partial =~ /1$/){
$codon_start = ($ctg_length - $cds->end + 1)
if ($cds->strand == -1);
$end = ">$ctg_length";
}
my $fuzzyloc = Bio::Location::Fuzzy->new(
-start => $start,
-end => $end,
-location_type => '..',
-strand => $cds->strand);
$cds->location($fuzzyloc);
}
# Removing all tags
foreach my $tag ($cds->get_all_tags) {
$cds->remove_tag($tag);
}
# Adding only codon_start, if applicable, and the inference tag
if ($codon_start){
$cds->add_tag_value('codon_start', $codon_start);
$cds->frame($codon_start-1);
}
$cds->add_tag_value('inference', "ab initio prediction:$tool");
my $overlap;
for my $rna (@allrna) {
# same contig, overlapping (could check same strand too? not sure)
if ($rna->seq_id eq $sid and $cds->overlaps($rna)) {
$overlap = $rna;
last;
}
# same contig, overlapping (could check same strand too? not sure)
if ($rna->seq_id eq $sid and $cds->overlaps($rna)) {
$overlap = $rna;
last;
}
}
# mitochondria are highly packed, so don't exclude as CDS/tRNA often overlap.
if ($overlap and $kingdom ne 'Mitochondria') {
my $type = $overlap->primary_tag;
msg("Excluding CDS which overlaps existing RNA ($type) at $sid:$1..$2 on $3 strand");
my $type = $overlap->primary_tag;
msg("Excluding CDS which overlaps existing RNA ($type) at $sid:$1..$2 on $3 strand");
}
else {
$num_cds++;
push @{$seq{$sid}{FEATURE}}, $cds;
## BUG James Doonan - ensure no odd features extending beyond contig
if ($cds->end > $seq{$cds->seq_id}{DNA}->length ) {
err("CDS end", $cds->end, "is beyond length", $seq{$sid}{DNA}->length, "in contig $sid")
}

$num_cds++;
push @{$seq{$sid}{FEATURE}}, $cds;
## BUG James Doonan - ensure no odd features extending beyond contig
#(my $cds_end = $cds->end) =~ s/^>//;
if ($cds->end > $seq{$cds->seq_id}{DNA}->length ) {
err("CDS end", $cds->end, "is beyond length",
$seq{$sid}{DNA}->length, "in contig $sid")
}
}
}
}
msg("Found $num_cds CDS");

Expand Down Expand Up @@ -1035,12 +1056,10 @@ for my $sid (sort keys %seq) {
my $g = Bio::SeqFeature::Generic->new(
-primary => 'gene',
-seq_id => $f->seq_id,
-start => $f->start,
-end => $f->end,
-strand => $f->strand,
-source_tag => $EXE,
-tag => { 'locus_tag'=> $ID },
);
$g->location($f->location);
if (my $gENE = TAG($f, 'gene')) {
$g->add_tag_value('gene', $gENE);
}
Expand Down Expand Up @@ -1079,10 +1098,15 @@ for my $sid (sort keys %seq) {
if ($f->primary_tag eq 'CDS' and not $f->has_tag('product')) {
$f->add_tag_value('product', $HYPO);
}

print $gff_fh $f->gff_string($gff_factory),"\n";

my ($L,$R) = ($f->strand >= 0) ? ($f->start,$f->end) : ($f->end,$f->start);

my ($start, $end) = ($f->start, $f->end);
$start = (($f->strand >= 0) ? '<' : '>') . $start
if ($f->location->start_pos_type eq 'BEFORE');
$end = (($f->strand >= 0) ? '>' : '<') . $end
if ($f->location->end_pos_type eq 'AFTER');
my ($L, $R) = ($f->strand >= 0) ? ($start, $end) : ($end, $start);
print {$tbl_fh} "$L\t$R\t",$f->primary_tag,"\n";
for my $tag ($f->get_all_tags) {
next if $tag =~ m/^(ID|Alias)$/; # remove GFF specific tags
Expand Down Expand Up @@ -1426,6 +1450,7 @@ sub setOptions {
{OPT=>"usegenus!", VAR=>\$usegenus, DEFAULT=>0, DESC=>"Use genus-specific BLAST databases (needs --genus)"},
{OPT=>"proteins=s", VAR=>\$proteins, DEFAULT=>'', DESC=>"Fasta file of trusted proteins to first annotate from"},
{OPT=>"metagenome!", VAR=>\$metagenome, DEFAULT=>0, DESC=>"Improve gene predictions for highly fragmented genomes"},
{OPT=>"partialgenes!", VAR=>\$partialgenes, DEFAULT=>0, DESC=>"Allow genes to run off edges, yielding incomplete genes (no closed ends option in prodigal)"},
{OPT=>"rawproduct!", VAR=>\$rawproduct, DEFAULT=>0, DESC=>"Do not clean up /product annotation"},
'Computation:',
{OPT=>"fast!", VAR=>\$fast, DEFAULT=>0, DESC=>"Fast mode - skip CDS /product searching"},
Expand Down