Add GFF support (#111)

* feat: GFF input currently only the GFF files from MGnify are supported * fix: socialgene object genbank output Was only saving the last locus due to loop * art: black * art: linting
socialgene · Aug 13, 2024 · 7930f8c · 7930f8c
1 parent a557e9c
commit 7930f8c
Show file tree

Hide file tree

Showing 4 changed files with 106 additions and 9 deletions.
diff --git a/socialgene/base/socialgene.py b/socialgene/base/socialgene.py
@@ -544,7 +544,7 @@ def __add__(self, sg_object):
         self._merge_assemblies(sg_object)
         self.protein_comparison.extend(sg_object.protein_comparison)
 
-    def write_genbank(self, outpath, mode="wt", compression=None):
+    def write_genbank(self, outpath, compression=None):
         for assembly in self.assemblies.values():
             for locus in assembly.loci.values():
                 record = SeqRecord(
@@ -572,12 +572,12 @@ def write_genbank(self, outpath, mode="wt", compression=None):
                     )
                     record.features.append(biofeat)
                 record.annotations["molecule_type"] = "DNA"
-            with open_write(outpath, mode, compression) as h:
-                SeqIO.write(
-                    record,
-                    h,
-                    "genbank",
-                )
+                with open_write(outpath, "a", compression) as h:
+                    SeqIO.write(
+                        record,
+                        h,
+                        "genbank",
+                    )
 
     def drop_all_non_protein_features(self, **kwargs):
         """Drop features from all assembly/loci that aren't proteins/pseudo-proteins

diff --git a/socialgene/parsers/gff.py b/socialgene/parsers/gff.py
@@ -0,0 +1,92 @@
+# parse gff file into socialgene object
+from pathlib import Path
+
+from Bio import SeqIO
+
+import socialgene.utils.file_handling as fh
+
+
+class GFFParserMixin:
+    def __init__(self):
+        pass
+
+    def _check_is_gff(self, input_path: str):
+        if fh.guess_filetype(input_path) != "gff":
+            raise ValueError("Input file is not a GFF file")
+
+    def _has_fasta(self, input_path: str):
+        with fh.open_read(input_path) as f:
+            for line in f:
+                if line.startswith("##FASTA"):
+                    return True
+        return False
+
+    def _fasta_biopython(self, input_path: str):
+        with fh.open_read(input_path) as file:
+            for line in file:
+                if line.startswith("##FASTA"):
+                    break
+            # read the remaining lines into fasta dictionary using biopython
+            fasta_dict = SeqIO.to_dict(SeqIO.parse(file, "fasta"))
+            if not fasta_dict:
+                raise ValueError("No sequences found in FASTA section")
+        return fasta_dict
+
+    def parse_gff_file(self, input_path: str, keep_sequence: bool = True):
+        # name of file without extension
+        assembly_uid = Path(input_path).name
+        assembly_uid = assembly_uid.split(".gff")[0]
+        self._check_is_gff(input_path)
+        self._has_fasta(input_path)
+        nucleotide_sequence_dict = self._fasta_biopython(input_path)
+        self.add_assembly(assembly_uid)
+        with fh.open_read(input_path) as file:
+            for line in file:
+                if line.startswith("#") or line.startswith(">"):
+                    continue
+                parts = line.strip().split("\t")
+                if len(parts) != 9:
+                    continue
+                (
+                    seq_id,
+                    source,
+                    feature_type,
+                    start,
+                    end,
+                    score,
+                    strand,
+                    phase,
+                    attributes,
+                ) = parts
+                if feature_type not in ["protein", "CDS", "pseudogene"]:
+                    continue
+                # get the translated sequence using biopython's translate after extracting the sequence
+                # from the fasta file
+                sequence = nucleotide_sequence_dict[seq_id][int(start) - 1 : int(end)]
+                if strand == "-":
+                    sequence = sequence.reverse_complement()
+                sequence = sequence.translate()
+                uid = self.add_protein(
+                    description=None,
+                    external_id=None,
+                    sequence=str(sequence.seq),
+                    return_uid=True,
+                )
+                if not keep_sequence:
+                    self.proteins[uid].sequence = None
+                if strand == "-":
+                    strand = -1
+                elif strand == "+":
+                    strand = 1
+                else:
+                    strand = 0
+                self.assemblies[assembly_uid].add_locus(external_id=seq_id)
+                self.assemblies[assembly_uid].loci[seq_id].add_feature(
+                    type=feature_type,
+                    description=None,
+                    external_id=None,
+                    uid=uid,
+                    start=int(start),
+                    end=int(end),
+                    strand=strand,
+                )
diff --git a/socialgene/parsers/sequence_parser.py b/socialgene/parsers/sequence_parser.py
@@ -1,9 +1,10 @@
 import socialgene.utils.file_handling as fh
 from socialgene.parsers.fasta import FastaParserMixin
 from socialgene.parsers.genbank import GenbankParserMixin
+from socialgene.parsers.gff import GFFParserMixin
 
 
-class SequenceParser(GenbankParserMixin, FastaParserMixin):
+class SequenceParser(GenbankParserMixin, FastaParserMixin, GFFParserMixin):
     def __init__(self):
         super().__init__()
 
@@ -25,6 +26,9 @@ def parse(self, filepath, **kwargs):
             elif filetype == "fasta":
                 self.parse_fasta_file(input=filepath, **kwargs)
                 return
+            elif filetype == "gff":
+                self.parse_gff_file(input_path=filepath, **kwargs)
+                return
 
         raise NotImplementedError(
             "May not be implemented, or you need to use the genbank/fasta parser directly. (e.g. for tar archives)"

diff --git a/socialgene/utils/file_handling.py b/socialgene/utils/file_handling.py
@@ -135,7 +135,6 @@ def guess_filetype(filepath):
     """
     with open_read(filepath) as f:
         l1 = f.readline()
-
     if l1.startswith("LOCUS "):
         return "genbank"
     if l1.startswith(">"):
@@ -147,3 +146,5 @@ def guess_filetype(filepath):
         == "#---fullsequence-----------------thisdomain-------------hmmcoordalicoordenvcoord\n"
     ):
         return "domtblout"
+    if l1.startswith("##gff-version 3"):
+        return "gff"