Skip to content

Commit

Permalink
safer GTF vs GFF3 recognition; ignore lines without ID/Parent/transcript_id
Browse files Browse the repository at this point in the history
  • Loading branch information
gpertea committed Apr 5, 2018
1 parent fd1c8ba commit 9d007ba
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 10 deletions.
4 changes: 2 additions & 2 deletions GStr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,10 +49,10 @@ GStr::Data* GStr::new_data(const char* str, uint addcap) {
}

void GStr::prep_data(uint len, uint addcap) {
int newcap=len+addcap;
uint newcap=len+addcap;
if (newcap > 0 && my_data->ref_count <= 1 &&
my_data->cap>=newcap && my_data->cap-newcap<(newcap>>1)+2) {
//no need to shrink the already allocated space
//no need to shrink/reallocate the already allocated space
my_data->length = len;
my_data->chars[len]=0;
return;
Expand Down
26 changes: 18 additions & 8 deletions gff.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -361,22 +361,32 @@ GffLine::GffLine(GffReader* reader, const char* l): _parents(NULL), _parents_len
/*
Rejecting non-transcript lines early if only transcripts are requested ?!
It would be faster to do this here but there are GFF cases when we reject a parent feature here
(e.g. protein with 2 CDS children) and then their exon/CDS children show up and
(e.g. protein with 2 CDS children) and then their exon/CDS children show up and
get assigned to an implicit parent mRNA
The solution is to still load this parent as GffObj for now and BAN it later
so its children get dismissed/discarded as well.
*/
char *gtf_tid=NULL;
char *gtf_gid=NULL;
if (reader->is_gff3 || reader->gff_type==0) {
ID=extractAttr("ID=",true);
Parent=extractAttr("Parent=",true);
if (reader->gff_type==0) {
if (ID!=NULL || Parent!=NULL) reader->is_gff3=true;
else reader->is_gtf=true;
else { //check if it looks like a GTF
gtf_tid=extractAttr("transcript_id", true, true);
if (gtf_tid==NULL) {
gtf_gid=extractAttr("gene_id", true, true);
if (gtf_gid==NULL) return; //cannot determine file type yet
}
reader->is_gtf=true;
}
}
}

if (reader->is_gff3) {
//parse as GFF3
//if (ID==NULL && Parent==NULL) return; //silently ignore unidentified/unlinked features
if (ID!=NULL) {
//has ID attr so it's likely to be a parent feature
//look for explicit gene name
Expand Down Expand Up @@ -480,8 +490,8 @@ GffLine::GffLine(GffReader* reader, const char* l): _parents(NULL), _parents_len
}
if (is_gene) {
reader->gtf_gene=true;
ID=extractAttr("transcript_id", true, true); //Ensemble GTF might lack this
gene_id=extractAttr("gene_id");
ID = (gtf_tid!=NULL) ? gtf_tid : extractAttr("transcript_id", true, true); //Ensemble GTF might lack this
gene_id = (gtf_gid!=NULL) ? gtf_gid : extractAttr("gene_id", true, true);
if (ID==NULL) {
// no transcript_id -- this is not valid GTF2, but Ensembl
// is known to emit "gene" features carrying only a gene_id in its GTF files
Expand All @@ -492,21 +502,21 @@ GffLine::GffLine(GffReader* reader, const char* l): _parents(NULL), _parents_len
// else if (strcmp(gene_id, ID)==0) //GENCODE v20 gene feature ?
}
else if (is_transcript) {
ID=extractAttr("transcript_id", true, true);
ID = (gtf_tid!=NULL) ? gtf_tid : extractAttr("transcript_id", true, true);
//gene_id=extractAttr("gene_id"); // for GTF this is the only attribute accepted as geneID
if (ID==NULL) {
//something is wrong here, cannot parse the GTF ID
GMessage("Warning: invalid GTF record, transcript_id not found:\n%s\n", l);
return;
}
gene_id=extractAttr("gene_id");
gene_id = (gtf_gid!=NULL) ? gtf_gid : extractAttr("gene_id", true, true);
if (gene_id!=NULL)
Parent=Gstrdup(gene_id);
reader->gtf_transcript=true;
is_gtf_transcript=1;
} else { //must be an exon type
Parent=extractAttr("transcript_id", true, true);
gene_id=extractAttr("gene_id"); // for GTF this is the only attribute accepted as geneID
Parent = (gtf_tid!=NULL) ? gtf_tid : extractAttr("transcript_id", true, true);
gene_id = (gtf_gid!=NULL) ? gtf_gid : extractAttr("gene_id", true, true); // for GTF this is the only attribute accepted as geneID
//old pre-GTF2 formats like Jigsaw's (legacy)
if (Parent==NULL && exontype==exgffExon) {
if (startsWith(track,"jigsaw")) {
Expand Down

0 comments on commit 9d007ba

Please sign in to comment.