Genbankxmlparser.pm

#!/usr/local/bin/perl

package Genbankxmlparser;

use strict;
use Gene_obj;
use Xml_parser_facilitator;
use Data::Dumper;

## tree properties
#         tree node: (node_name, node_ref)
#         array_ref: (attribute_hash, (next_node_name, next_node_ref) ......)
#             if a node_name == 0, the node_ref is pcdata

my $SEE; #single file-scoped lexical

sub new {
    shift;
    my $self = {
	tree => 0,     #XML::Parser (Style=>Tree)
	genes => 0,    #Gene_obj's created to store gene info
	sequence => 0, #nucleotide sequence of genbank accession
        locus => 0,    #locus field
	accession => 0, # genbank accession
	version => 0,   #submission version
	title  => 0, #title
	update_date => 0, #date the submission was last updated.
	affiliation => 0, #seq_group information.
	SEE => 0,       #Verbose flag 
	chromosome=>0
    };
    while (@_) {
	my $att = shift;
	my $value = shift;
	if (exists ($self->{$att})) {
	    $self->{$att} = $value;
	}
    }
    unless (ref ($self->{tree}) eq "ARRAY") {
	die "I require an XML::Parser Style=Tree object\n";
    }
    $SEE = $self->{SEE};
    my $tree = $self->{tree};

    ## Specify Nodes for Desired Component Branches
    my $gene_components = "Seq-feat";
    my $sequence_data = "Seq-data";
    my $locus_info = "Textseq-id";
    my $title_info = "Seqdesc";
    my $update_date_info = "Seqdesc_update-date";
    my $affiliation = "Affil";
    my $chromosome = "SubSource_name";

    ## %gene_data holds the branch references for the Node classes specified above
    my %gene_data = ( ## all handlers are referenced here
		      $gene_components => [],
		      $sequence_data => [],
		      $locus_info => [],
		      $title_info =>[],
		      $affiliation =>[],
		      $chromosome =>[]
		      );

    # start tree parsing from the tip
    # all branches are retrieved for post-processing.
    &Xml_parser_facilitator::tree_parser (@{$tree}, \%gene_data);


    ## ------------ Pt. 1 ------------------
    ## Process the gene component branches
    my $x = 1;
    my @allcomponents;
    foreach my $ref (@{$gene_data{$gene_components}}) {
	print "$x\n" if ($SEE);
	$x++;

        # traverse the branch
	my $obj = { "Gene-ref_locus" => 0, #pub_locus
		    "Gene-ref_syn_E" => 0, #locus
		    "Seq-feat_comment" => 0, #pub_comment
		    "Seq-interval_from" => [],
		    "Seq-interval_to" => [], 
		    "RNA-ref_type" => 0, #flag to identify mRNA component (all mRNA exons)
		    "Seq-id_gi" => 0,
		    "Prot-ref_name_E" => 0, #flag to identify CDSinfo (com_names, etc.)
		    "Cdregion" => 0, #flag to identify CDS component (all CDS exons)
		    "Na-strand" => 0, #nucleic acid strand (default = "plus")
		    "Prot-ref_desc" => 0, #additional pub_comment information.
		    "Seq-feat_pseudo" => 0  #indicates pseudogene.
		    };
	&Xml_parser_facilitator::branch_parser ("start", $ref, $obj);
	push (@allcomponents, $obj);
	## print out parsed gene component data
	if ($SEE) {
	    print "--------------------Parsing gene-related info from XML---------------\n";
	    foreach my $key (keys %$obj) {
		if (! ref ($obj->{$key})) {
		    if ($obj->{$key}) {
			print "$key\t$obj->{$key} (not ref)\n";
		    }
		} else {
		    if (ref ($obj->{$key}) eq "ARRAY") {
			print "$key\t@{$obj->{$key}} (array ref)\n";
		    } elsif (ref ($obj->{$key}) eq "HASH") {
			my @stuff = %{$obj->{$key}};
			print "$key\t@stuff (hash ref)\n";
		    }
		}
	    }
	    print "\n";
	}
    }

    # separate all gene components into categories and then link them.
    # use CDS's as the primary keys because they're the most informative and most
    # probable to be present to represent the gene.

    my (@CDSs, @mRNAs, @genes, @CDS_info); #store the separated gene component references.

    ## Newly created components have the following data structure
    #$new_obj = { DIR => <+|->,
    #             F_SPAN => [end5, end3],, #only relative to forward strand
    #             COORDS => { end5 => end3, ....},
    #             locus  => "locus", #only for gene
    #             pub_locus => "pub_locus", #only for gene
    #             com_name => "com_name", #for CDS_info
    #             pub_comment => "pub_comment", #for gene
    #             gi => "gi-A / gi-B ....", #could use to track features.
    #             type => WHAT AM I
    #           }


    my @attributes = qw (DIR F_SPAN COORDS locus com_name pub_comment gi type);


    foreach my $component (@allcomponents) {
	if ($component->{"Cdregion"}) {
	    push (@CDSs, &create_exon_component($component));
	}  elsif ($component->{"Prot-ref_name_E"} || $component->{"Prot-ref_desc"}) {
	    push (@CDS_info, &create_CDSinfo_component($component));
	}  elsif ($component->{"RNA-ref_type"}) {
	    push (@mRNAs, &create_exon_component($component));
	}  elsif ($component->{"Gene-ref_locus"} || $component->{"Gene-ref_syn_E"}) { # gene
	    push (@genes, &create_gene_component($component));
	}
    }

    # Here, just print out the obj info to make sure I'm doing this right.


    if ($SEE) {
	print "\n----------------Separate Data Structs------------------\n";
	my @objects;
	push (@objects, "GENES", \@genes, "mRNAs", \@mRNAs, "CDSs", \@CDSs, "CDS_info", \@CDS_info);

	foreach my $object (@objects) {
	    if (!ref($object)) {
		print "\n\nPARSING $object !!!!!!!!!!!!!!!\n";
		next;
	    }
	    foreach my $obj (@$object) {
		print "\n";
		foreach my $att (@attributes) {
		    if (exists ($obj->{$att})) {
			if (ref ($obj->{$att}) eq "ARRAY") {
			    print "$att\t@{$obj->{$att}}\n";
			} elsif (ref ($obj->{$att}) eq "HASH") {
			    my @temp = %{$obj->{$att}};
			    print "$att\t@temp\n";
			} else {
			    print "$att\t$obj->{$att}\n";
			}
		    }
		}
	    }
	}
    }
    ## create Gene_obj based on CDS coordinates
    ## look for overlapping gene and mRNA components
    ## store info in Gene_obj's
    ## after identifying overlapping object, add "DELETE" flag to obj.

    my @gene_storage;


    foreach my $CDS_obj (@CDSs) {
	my $curr_Gene_obj = new Gene_obj ();
	my $CDS_dir = $CDS_obj->{"DIR"}; #if a gene is found, use gene direction (more reliable)
	my $CDS_gi = $CDS_obj->{"gi"};
	$curr_Gene_obj->{"strand"} = $CDS_dir;
	$curr_Gene_obj->{"is_pseudogene"} = ($CDS_obj->{"is_pseudogene"});
	## pubcomment info stored in many diff places.  Append or create.
	if ($curr_Gene_obj->{"pub_comment"}) {
	    unless ($CDS_obj->{"pub_comment"} == 0) {
		$curr_Gene_obj->{"pub_comment"} .= $CDS_obj->{"pub_comment"};
	    }
	} else {
	    $curr_Gene_obj->{"pub_comment"} = $CDS_obj->{"pub_comment"};
	}
	my ($CDS_span_left, $CDS_span_right) = @{$CDS_obj->{"F_SPAN"}};
	$curr_Gene_obj->{"CDS_coords"} = $CDS_obj->{"COORDS"};
	
	## find overlapping gene
	my $found = &find_overlapping_element ($CDS_span_left, $CDS_span_right, $CDS_dir, $CDS_gi, \@genes);
	if ($found) {
	    print "**** Found a gene, direction is $found->{DIR}\n" if $SEE;
	    $curr_Gene_obj->{"locus"} = $found->{"locus"};
	    $curr_Gene_obj->{"pub_locus"} = $found->{"pub_locus"};
	    if ($curr_Gene_obj->{"pub_comment"}) {
		unless ($found->{"pub_comment"} == 0) {
		    $curr_Gene_obj->{"pub_comment"} .= $found->{"pub_comment"};
		}
	    } else {
		$curr_Gene_obj->{"pub_comment"} = $found->{"pub_comment"};
	    }
	    $curr_Gene_obj->{"strand"} = $found->{"DIR"};
	}
	
	## find overlapping mRNA
	my $found = &find_overlapping_element ($CDS_span_left, $CDS_span_right, $CDS_dir, $CDS_gi, \@mRNAs);
	if ($found) {
	    $curr_Gene_obj->{"mRNA_coords"} = $found->{"COORDS"};
	} else {
	    # use CDS coords for mRNA if mRNA isn't specified.
	    $curr_Gene_obj->{"mRNA_coords"} = $CDS_obj->{"COORDS"};
	}
	
	## gather CDS_info
	my  $found = &find_overlapping_element ($CDS_span_left, $CDS_span_right, $CDS_dir, $CDS_gi, \@CDS_info);
	if ($found) {
	    $curr_Gene_obj->{"com_name"} = $found->{"com_name"};
	    $curr_Gene_obj->{"gi"} = $found->{"gi"};
	    if ($curr_Gene_obj->{"pub_comment"}) {
		unless ($found->{"pub_comment"} == 0) {
		    $curr_Gene_obj->{"pub_comment"} .= $found->{"pub_comment"};
		}
	    } else {
		$curr_Gene_obj->{"pub_comment"} = $found->{"pub_comment"};
	    }
	}
	
	## add gene to gene_storage
	$curr_Gene_obj->refine_gene_object();
	push (@gene_storage, $curr_Gene_obj);
	
    }
    
    if ($SEE) {
	print "\n------------Gene Objects----------------\n";
	foreach my $gene (@gene_storage) {
	    print "\n\nGENE\n";
	    print $gene->toString();
	}
    }
    
    ## store Gene_obj references as an instance member component.
    $self->{genes} = \@gene_storage;


    ####################################################
    #-------Pt 2: Parse Sequence Data ------------------
    ####################################################

    my $x = 1;
    foreach my $ref (@{$gene_data{$sequence_data}}) {
	print "\nFound Seq-data component $x\n" if ($SEE);
	$x++;

        # traverse the branch
	my $obj = { "IUPACna" => 0
		    };
	&Xml_parser_facilitator::branch_parser ("start", $ref, $obj);
	
	if ($obj->{IUPACna}) {
	    $self->{sequence} = $obj->{IUPACna};
	    last;
	}
    }
    
    #######################################################
    #-------Pt 3: Parse Locus Information -----------------
    #######################################################

    my $obj = { "Textseq-id_name" => 0,      #locus for genbank record
		"Textseq-id_accession" => 0, #genbank accession for record
		"Textseq-id_version" => 0    #apparently version info
		};
    
    ## Info should be in first entry.
    foreach my $ref (@{$gene_data{$locus_info}}) {
	
	&Xml_parser_facilitator::branch_parser ("start", $ref, $obj);
	
	my ($locus, $accession, $version) = ($obj->{"Textseq-id_name"},
					     $obj->{"Textseq-id_accession"},
					     $obj->{"Textseq-id_version"});
	if ($accession) {
	    $self->{locus} = $locus;
	    $self->{accession} = $accession;
	    $self->{version} = $version;
	    last;
	}
    }

    ###################################################
    #-------Pt 4: Parse title information ------------
    ##################################################
   
    my $obj = { "Seqdesc_title" => 0};
    foreach my $ref (@{$gene_data{$title_info}}) {
	&Xml_parser_facilitator::branch_parser ("start", $ref, $obj);
	my ($title) = $obj->{"Seqdesc_title"};
	if ($title) {
	    $self->{title} = $title;
	    last;
	}
    }

    ###################################################
    #-------Pt 5: Parse update_date  information -----
    ##################################################

   
    # higher node already retrieved within title_info.
    # must retree parse this section to gather the update_date_info node.
    my %date_parser = ( $update_date_info => [] );
    #search thru all title_info nodes for update_date_info node.
    foreach my $date_ref (@{$gene_data{$title_info}}) {
	&Xml_parser_facilitator::tree_parser ("start", $date_ref, \%date_parser);
    }

    my $obj = { "Date-std_year" => 0,
		"Date-std_month" => 0,
		"Date-std_day" => 0
		};
    
    foreach my $ref (@{$date_parser{$update_date_info}}) {
	&Xml_parser_facilitator::branch_parser ("start", $ref, $obj);
	if ($obj->{"Date-std_year"}) {
	    my ($date) = $obj->{"Date-std_month"} . "-" . $obj->{"Date-std_day"} . "-" . $obj->{"Date-std_year"};
	    $self->{update_date} = $date;
	    last;
	}
    }


    ####################################################
    #-------- Pt 6: Parse Affiliate Information -------
    ####################################################
    
    my $obj = { "Affil_std_affil" => 0,
	    "Affil_str" => 0};
    my $affil;
    foreach my $ref (@{$gene_data{$affiliation}}) {
	&Xml_parser_facilitator::branch_parser ("start", $ref, $obj);
	foreach my $key (%$obj) {
	    if ($obj->{$key}) {
		$affil .= $obj->{$key} . "\n\n";
	    }
	}
    }
    if ($affil) {
	$self->{affiliation} = $affil;
    }

    ###################################################
    #----------- Pt 7: Parse Chromosome Information ---
    ###################################################

    my $obj = { "SubSource_name" => 0};
    foreach my $ref (@{$gene_data{$chromosome}}) {
	&Xml_parser_facilitator::branch_parser ("SubSource_name", $ref, $obj);
	if ($obj->{SubSource_name}) {
	    my $chromosome = $obj->{SubSource_name};
	    $self->{chromosome} = $chromosome;
	    last;
	}
    }
    

    #END OF TREE PARSING


    #########################################################################
    # Finess the genes (shift pub_comment to com_name if no com_name specified 
    #########################################################################

    my @genes = &get_genes($self);
    foreach my $gene (@genes) {
	if (!$gene->{'com_name'} && $gene->{'pub_comment'}) {
	    $gene->{'com_name'} = $gene->{'pub_comment'};
	    $gene->{'pub_comment'} = 0;
	}
    }


    ## return Genbankxmlparser object
    bless($self);
    return ($self);
}

sub get_genes {
    my $self = shift;
    return (@{$self->{genes}});
}

sub get_sequence {
    my $self = shift;
    return ($self->{sequence});
}


####
sub find_overlapping_element {
    my ($input_left, $input_right, $input_dir, $input_gi, $collect_ref) = @_;
    foreach my $compare_obj (@{$collect_ref}) {
	if ($compare_obj->{"DELETE"}) { next;} #ignore previously chosen ones.
	my ($temp_left, $temp_right) = @{$compare_obj->{"F_SPAN"}};
	my ($temp_dir, $found_gi, $overlap);
	if ($temp_left && $temp_right) {
	    $temp_dir = $compare_obj->{"DIR"};
	    #test overlap
	    print "$input_left < $temp_right) && ($input_right > $temp_left\t" if ($SEE);
	    if (($input_left < $temp_right) && ($input_right > $temp_left)) {
		$overlap = 1;
		print "YES\n" if ($SEE);
	    } else {
		print "NO\n" if ($SEE);
	    }
	} else {
	    #no coords, so must examine gi numbers for matches.
	    my $temp_gi = $compare_obj->{"gi"};
	    if ($input_gi =~ /\b$temp_gi\b/) {
		$found_gi = 1;
	    }
	}

	## Determine if match found
	if ( (($input_dir eq $temp_dir) && $overlap) || $found_gi) {
	    $compare_obj->{"DELETE"} = 1;
	    return ($compare_obj);
	}
    }
    return (0);
}


####
sub create_CDSinfo_component {
    my ($obj) = @_;
    my $ret = {};
#    my $info = $obj->{"Prot-ref_name_E"};
#    if ($info =~ /^(.*);\s+(\d+)-(\d+)\s*$/) { #try reg-exp grabbing
#	$ret->{"com_name"} = $1;
#	my @coords = ($2, $3);
#	$ret->{"F_SPAN"} = &get_forward_span (@coords);
#	$ret->{"DIR"} = &determine_orientation(@coords);
#    } else {
    $ret->{"com_name"} = $obj->{"Prot-ref_name_E"};
    $ret->{"pub_comment"} = $obj->{"Prot-ref_desc"};
    $ret->{"F_SPAN"} = []; #empty ...no useful coords here.
#    }
    $ret->{"type"} = "CDSinfo";
    $ret->{"gi"} = $obj->{"Seq-id_gi"};
    return ($ret);
}


####
sub create_exon_component {
    my ($obj) = @_;
    my $ret = {};
    my $coord1 = $obj->{"Seq-interval_from"};
    my $coord2 = $obj->{"Seq-interval_to"};
    $ret->{"DIR"} = ($obj->{"Na-strand_atts"}->{"value"} eq "minus") ? '-' : '+';
    $ret->{"is_pseudogene"} = ($obj->{"Seq-feat_pseudo_atts"}->{"value"} eq "true") ? 1:0;
    my @coords;
    for (my $i = 0; $i <= $#{$coord1}; $i++) {
	push (@coords, ++$coord1->[$i], ++$coord2->[$i]);
	if ($ret->{"DIR"} eq '+') {
	    $ret->{"COORDS"}->{$coord1->[$i]} = $coord2->[$i];
	} else {
	    $ret->{"COORDS"}->{$coord2->[$i]} = $coord1->[$i];
	}  
    }
    $ret->{"F_SPAN"} = &get_forward_span (@coords);
    $ret->{"type"} = "EXONS (CDS's or mRNA's)";
    $ret->{"gi"} = $obj->{"Seq-id_gi"};
    $ret->{"pub_comment"} = $obj->{"Seq-feat_comment"};
    return ($ret);
}

####
sub create_gene_component {
    my ($obj) = @_;
    my $ret = {};
    if ($obj->{"Gene-ref_locus"}) {
	$ret->{"pub_locus"} = $obj->{"Gene-ref_locus"};
    }
    if ($obj->{"Gene-ref_syn_E"}) {
	$ret->{"locus"} = $obj->{"Gene-ref_syn_E"};
    } else {
	$ret->{"locus"} = $ret->{"pub_locus"};
    }
    $ret->{"pub_comment"} = $obj->{"Seq-feat_comment"};
    $ret->{"DIR"} = ($obj->{"Na-strand_atts"}->{"value"} eq "minus") ? '-' : '+';
    my $coord1 = $obj->{"Seq-interval_from"}->[0];
    my $coord2 = $obj->{"Seq-interval_to"}->[0];
    if ($ret->{"DIR"} eq '+') {    
	$ret->{"COORDS"}->{$coord1} = ++$coord2;
    } else {
	$ret->{"COORDS"}->{$coord2} = ++$coord1;
    }
    $ret->{"F_SPAN"} = &get_forward_span ($coord1, $coord2);
    $ret->{"type"} = "GENE";
    $ret->{"gi"} = $obj->{"Seq-id_gi"};
    return ($ret);
}


####
sub get_forward_span {
    my (@coords) = @_;
    @coords = sort {$a<=>$b} @coords;
    my @ret = ($coords[0], $coords[$#coords]);
    return (\@ret);
}


####
sub determine_orientation {
    my ($end5, $end3) = @_;
    my $direction;
    if ($end5 < $end3) {
	$direction = "+";
    } elsif ($end5 > $end3) {
	$direction = "-";
    }
    return ($direction);
}

sub toString {
    my ($self) = shift;
    my $output;

    # print submission information
    my ($locus, $accession, $version, $title, $update_date, $affiliation, $chromosome) = ($self->{locus},
											  $self->{accession},
											  $self->{version},
											  $self->{title},
											  $self->{update_date},
											  $self->{affiliation},
											  $self->{chromosome});
    
    $output .= "Locus: $locus\nAccession: $accession\nVersion: $version\nTitle: $title\nUpdate date: $update_date\naffiliation: $affiliation\n"
	. "chromosome: $chromosome\n";

    #print gene information
    my @genes = $self->get_genes();
    my $x = 0;
    foreach my $gene (@genes) {
	$x++;
	$output .=  "\nGENE $x\n";
	$output .=  $gene->toString();
    }

    #print submission's sequence.
    my $seq = $self->get_sequence();    
    #$output .=  "SEQUENCE:\n$seq\n";
    return ($output);
}

1;