Skip to content

Commit

Permalink
Add GFF support (#111)
Browse files Browse the repository at this point in the history
* feat: GFF input

currently only the GFF files from MGnify are supported

* fix: socialgene object genbank output

Was only saving the last locus due to loop

* art: black

* art: linting
  • Loading branch information
chasemc authored Aug 13, 2024
1 parent a557e9c commit 7930f8c
Show file tree
Hide file tree
Showing 4 changed files with 106 additions and 9 deletions.
14 changes: 7 additions & 7 deletions socialgene/base/socialgene.py
Original file line number Diff line number Diff line change
Expand Up @@ -544,7 +544,7 @@ def __add__(self, sg_object):
self._merge_assemblies(sg_object)
self.protein_comparison.extend(sg_object.protein_comparison)

def write_genbank(self, outpath, mode="wt", compression=None):
def write_genbank(self, outpath, compression=None):
for assembly in self.assemblies.values():
for locus in assembly.loci.values():
record = SeqRecord(
Expand Down Expand Up @@ -572,12 +572,12 @@ def write_genbank(self, outpath, mode="wt", compression=None):
)
record.features.append(biofeat)
record.annotations["molecule_type"] = "DNA"
with open_write(outpath, mode, compression) as h:
SeqIO.write(
record,
h,
"genbank",
)
with open_write(outpath, "a", compression) as h:
SeqIO.write(
record,
h,
"genbank",
)

def drop_all_non_protein_features(self, **kwargs):
"""Drop features from all assembly/loci that aren't proteins/pseudo-proteins
Expand Down
92 changes: 92 additions & 0 deletions socialgene/parsers/gff.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
# parse gff file into socialgene object
from pathlib import Path

from Bio import SeqIO

import socialgene.utils.file_handling as fh


class GFFParserMixin:
def __init__(self):
pass

def _check_is_gff(self, input_path: str):
if fh.guess_filetype(input_path) != "gff":
raise ValueError("Input file is not a GFF file")

def _has_fasta(self, input_path: str):
with fh.open_read(input_path) as f:
for line in f:
if line.startswith("##FASTA"):
return True
return False

def _fasta_biopython(self, input_path: str):
with fh.open_read(input_path) as file:
for line in file:
if line.startswith("##FASTA"):
break
# read the remaining lines into fasta dictionary using biopython
fasta_dict = SeqIO.to_dict(SeqIO.parse(file, "fasta"))
if not fasta_dict:
raise ValueError("No sequences found in FASTA section")
return fasta_dict

def parse_gff_file(self, input_path: str, keep_sequence: bool = True):
# name of file without extension
assembly_uid = Path(input_path).name
assembly_uid = assembly_uid.split(".gff")[0]
self._check_is_gff(input_path)
self._has_fasta(input_path)
nucleotide_sequence_dict = self._fasta_biopython(input_path)
self.add_assembly(assembly_uid)
with fh.open_read(input_path) as file:
for line in file:
if line.startswith("#") or line.startswith(">"):
continue
parts = line.strip().split("\t")
if len(parts) != 9:
continue
(
seq_id,
source,
feature_type,
start,
end,
score,
strand,
phase,
attributes,
) = parts
if feature_type not in ["protein", "CDS", "pseudogene"]:
continue
# get the translated sequence using biopython's translate after extracting the sequence
# from the fasta file
sequence = nucleotide_sequence_dict[seq_id][int(start) - 1 : int(end)]
if strand == "-":
sequence = sequence.reverse_complement()
sequence = sequence.translate()
uid = self.add_protein(
description=None,
external_id=None,
sequence=str(sequence.seq),
return_uid=True,
)
if not keep_sequence:
self.proteins[uid].sequence = None
if strand == "-":
strand = -1
elif strand == "+":
strand = 1
else:
strand = 0
self.assemblies[assembly_uid].add_locus(external_id=seq_id)
self.assemblies[assembly_uid].loci[seq_id].add_feature(
type=feature_type,
description=None,
external_id=None,
uid=uid,
start=int(start),
end=int(end),
strand=strand,
)
6 changes: 5 additions & 1 deletion socialgene/parsers/sequence_parser.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import socialgene.utils.file_handling as fh
from socialgene.parsers.fasta import FastaParserMixin
from socialgene.parsers.genbank import GenbankParserMixin
from socialgene.parsers.gff import GFFParserMixin


class SequenceParser(GenbankParserMixin, FastaParserMixin):
class SequenceParser(GenbankParserMixin, FastaParserMixin, GFFParserMixin):
def __init__(self):
super().__init__()

Expand All @@ -25,6 +26,9 @@ def parse(self, filepath, **kwargs):
elif filetype == "fasta":
self.parse_fasta_file(input=filepath, **kwargs)
return
elif filetype == "gff":
self.parse_gff_file(input_path=filepath, **kwargs)
return

raise NotImplementedError(
"May not be implemented, or you need to use the genbank/fasta parser directly. (e.g. for tar archives)"
Expand Down
3 changes: 2 additions & 1 deletion socialgene/utils/file_handling.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,6 @@ def guess_filetype(filepath):
"""
with open_read(filepath) as f:
l1 = f.readline()

if l1.startswith("LOCUS "):
return "genbank"
if l1.startswith(">"):
Expand All @@ -147,3 +146,5 @@ def guess_filetype(filepath):
== "#---fullsequence-----------------thisdomain-------------hmmcoordalicoordenvcoord\n"
):
return "domtblout"
if l1.startswith("##gff-version 3"):
return "gff"

0 comments on commit 7930f8c

Please sign in to comment.