From 61cf106cafc195e59c1f55e7ff67c4e43132dec5 Mon Sep 17 00:00:00 2001
From: danielsf
Date: Tue, 24 Jun 2025 11:04:02 -0700
Subject: [PATCH 1/6] factor URL parsing out into a separate method

This will allow us to ingest GFF files like

https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.40_GRCh38.p14/GCF_000001405.40_GRCh38.p14_genomic.gff.gz

that do not encode the species, annotation release, etc. in the URL,
provided the user supplies this metadata by hand to the Gff3
constructor.
---
 .../genome_annotation_translator.py           | 159 +++++++++++-------
 1 file changed, 98 insertions(+), 61 deletions(-)

diff --git a/bkbit/data_translators/genome_annotation_translator.py b/bkbit/data_translators/genome_annotation_translator.py
index 4c47be9..b6dfea5 100644
--- a/bkbit/data_translators/genome_annotation_translator.py
+++ b/bkbit/data_translators/genome_annotation_translator.py
@@ -112,9 +112,6 @@ class Gff3:
     __init__(content_url, assembly_accession=None, assembly_strain=None, log_level="WARNING", log_to_file=False):
         Initializes the Gff3 class with the provided parameters.
 
-    parse_url():
-        Parses the content URL and extracts information about the genome annotation.
-
     __download_gff_file():
         Downloads a GFF file from a given URL and calculates the MD5, SHA256, and SHA1 hashes.
 
@@ -161,6 +158,7 @@ class Gff3:
     def __init__(
         self,
         content_url,
+        url_metadata,
         assembly_accession=None,
         assembly_strain=None,
         log_level="WARNING",
@@ -171,13 +169,13 @@ def __init__(
 
         Parameters:
         - content_url (str): The URL of the GFF file.
+        - url_metadata (dict): A dict containing metadata about the genome annotation, as returned by parse_url().
         - assembly_id (str): The ID of the genome assembly.
         - assembly_strain (str, optional): The strain of the genome assembly. Defaults to None.
         - hash_functions (tuple[str]): A tuple of hash functions to use for generating checksums. Defaults to ('MD5').
         """
         self.logger = setup_logger(LOG_FILE_NAME, log_level, log_to_file)
         try:
-            self.scientific_name_to_taxonid = load_json(SCIENTIFIC_NAME_TO_TAXONID_PATH)
             self.taxon_scientific_name = load_json(TAXON_SCIENTIFIC_NAME_PATH)
             self.taxon_common_name = load_json(TAXON_COMMON_NAME_PATH)
         except FileNotFoundError as e:
@@ -189,7 +187,6 @@ def __init__(
 
         ## STEP 1: Parse the content URL to get metadata
         # Parse content_url to get metadata
-        url_metadata = self.parse_url()
         if url_metadata is None:
             self.logger.critical(
                 "The provided content URL is not supported. Please provide a valid URL."
@@ -212,13 +209,10 @@ def __init__(
         self.authority = url_metadata.get("authority")
 
         # Assign the taxon_id and assembly_id based on the authority
+        taxon_id = url_metadata.get("taxonid")
         if self.authority.value == ga.AuthorityType.NCBI.value:
-            taxon_id = url_metadata.get("taxonid")
             assembly_id = url_metadata.get("assembly_accession")
         elif self.authority.value == ga.AuthorityType.ENSEMBL.value:
-            taxon_id = self.scientific_name_to_taxonid.get(
-                url_metadata.get("scientific_name").replace("_", " ")
-            )
             if assembly_accession is None:
                 self.logger.critical(
                     "The assembly ID is required for Ensembl URLs. Please provide the assembly ID."
@@ -253,60 +247,35 @@ def __init__(
 
         self.gene_annotations = {}
 
-    def parse_url(self):
+    @classmethod
+    def from_url(
+        cls,
+        content_url,
+        assembly_accession=None,
+        assembly_strain=None,
+        log_level="WARNING",
+        log_to_file=False
+    ):
         """
-        Parses the content URL and extracts information about the genome annotation.
+        Initializes an instance of the GFFTranslator class, gleaning
+        metadata about the annotation from the URL of the GFF file.
-
-        Returns:
-            A dictionary containing the following information:
-            - 'authority': The authority type (NCBI or ENSEMBL).
-            - 'taxonid': The taxon ID of the genome.
-            - 'release_version': The release version of the genome annotation.
-            - 'assembly_accession': The assembly accession of the genome.
-            - 'assembly_name': The name of the assembly.
-            - 'species': The species name (only for ENSEMBL URLs).
-        """
-        # Define regex patterns for NCBI and Ensembl URLs
-        # NCBI : [assembly accession.version]_[assembly name]_[content type].[optional format]
-        # ENSEMBL : <organism>.<assembly>.<_version>.gff3.gz -> organism full name, assembly name, genome version
-        ncbi_pattern = r"/genomes/all/annotation_releases/(\d+)(?:/(\d+))?/(GCF_\d+\.\d+)[_-]([^/]+)/(GCF_\d+\.\d+)[_-]([^/]+)_genomic\.gff\.gz"
-        ensembl_pattern = (
-            r"/pub/release-(\d+)/gff3/([^/]+)/([^/.]+)\.([^/.]+)\.([^/.]+)\.gff3\.gz"
+
+        Parameters:
+        - content_url (str): The URL of the GFF file.
+        - assembly_accession (str, optional): The accession of the genome assembly. Required for Ensembl URLs. Defaults to None.
+        - assembly_strain (str, optional): The strain of the genome assembly. Defaults to None.
+        - log_level (str, optional): The logging level. Defaults to "WARNING".
+        - log_to_file (bool, optional): Whether to write logs to a file. Defaults to False.
+        """
+        url_metadata = parse_url(content_url=content_url)
+        return cls(
+            content_url=content_url,
+            url_metadata=url_metadata,
+            assembly_accession=assembly_accession,
+            assembly_strain=assembly_strain,
+            log_level=log_level,
+            log_to_file=log_to_file
         )
 
-        # Parse the URL to get the path
-        parsed_url = urlparse(self.content_url)
-        path = parsed_url.path
-
-        # Determine if the URL is from NCBI or Ensembl and extract information
-        if "ncbi" in parsed_url.netloc:
-            ncbi_match = re.search(ncbi_pattern, path)
-            if ncbi_match:
-                return {
-                    "authority": ga.AuthorityType.NCBI,
-                    "taxonid": ncbi_match.group(1),
-                    "release_version": (
-                        ncbi_match.group(2)
-                        if ncbi_match.group(2)
-                        else ncbi_match.group(4)
-                    ),
-                    "assembly_accession": ncbi_match.group(3),
-                    "assembly_name": ncbi_match.group(6),
-                }
-
-        elif "ensembl" in parsed_url.netloc:
-            ensembl_match = re.search(ensembl_pattern, path)
-            if ensembl_match:
-                return {
-                    "authority": ga.AuthorityType.ENSEMBL,
-                    "release_version": ensembl_match.group(1),
-                    "scientific_name": ensembl_match.group(3),
-                    "assembly_name": ensembl_match.group(4),
-                }
-
-        # If no match is found, return None
-        return None
-
     def __download_gff_file(self):
         """
         Downloads a GFF file from a given URL and calculates the MD5, SHA256, and SHA1 hashes.
@@ -896,6 +865,70 @@ def serialize_to_jsonld(
     print(json.dumps(output_data, indent=2))
 
 
+def parse_url(content_url):
+    """
+    Parses the content URL and extracts information about the genome annotation.
+
+    Parameters:
+    content_url (str): The URL of the GFF file being parsed for metadata.
+
+    Returns:
+    A dictionary containing the following information:
+    - 'authority': The authority type (NCBI or ENSEMBL).
+    - 'taxonid': The taxon ID of the genome.
+    - 'release_version': The release version of the genome annotation.
+    - 'assembly_accession': The assembly accession of the genome.
+    - 'assembly_name': The name of the assembly.
+    - 'scientific_name': The scientific name of the species (only for ENSEMBL URLs).
+ """ + # Define regex patterns for NCBI and Ensembl URLs + # NCBI : [assembly accession.version]_[assembly name]_[content type].[optional format] + # ENSEMBL : ..<_version>.gff3.gz -> organism full name, assembly name, genome version + ncbi_pattern = r"/genomes/all/annotation_releases/(\d+)(?:/(\d+))?/(GCF_\d+\.\d+)[_-]([^/]+)/(GCF_\d+\.\d+)[_-]([^/]+)_genomic\.gff\.gz" + ensembl_pattern = ( + r"/pub/release-(\d+)/gff3/([^/]+)/([^/.]+)\.([^/.]+)\.([^/.]+)\.gff3\.gz" + ) + + # Parse the URL to get the path + parsed_url = urlparse(content_url) + path = parsed_url.path + + # Determine if the URL is from NCBI or Ensembl and extract information + if "ncbi" in parsed_url.netloc: + ncbi_match = re.search(ncbi_pattern, path) + if ncbi_match: + return { + "authority": ga.AuthorityType.NCBI, + "taxonid": ncbi_match.group(1), + "release_version": ( + ncbi_match.group(2) + if ncbi_match.group(2) + else ncbi_match.group(4) + ), + "assembly_accession": ncbi_match.group(3), + "assembly_name": ncbi_match.group(6), + } + + elif "ensembl" in parsed_url.netloc: + ensembl_match = re.search(ensembl_pattern, path) + if ensembl_match: + scientific_name_to_taxonid = load_json(SCIENTIFIC_NAME_TO_TAXONID_PATH) + result = { + "authority": ga.AuthorityType.ENSEMBL, + "release_version": ensembl_match.group(1), + "scientific_name": ensembl_match.group(3), + "assembly_name": ensembl_match.group(4), + } + taxon_id = scientific_name_to_taxonid.get( + result.get("scientific_name").replace("_", " ") + ) + result['taxonid'] = taxon_id + return result + + # If no match is found, return None + return None + + @click.command() ##ARGUEMENTS## # Argument #1: The URL of the GFF file @@ -932,8 +965,12 @@ def gff2jsonld(content_url, assembly_accession, assembly_strain, log_level, log_ ''' Creates GeneAnnotation objects from a GFF3 file and serializes them to JSON-LD format. ''' - gff3 = Gff3( - content_url, assembly_accession, assembly_strain, log_level, log_to_file + gff3 = Gff3.from_url( + content_url=content_url, + assembly_accession=assembly_accession, + assembly_strain=assembly_strain, + log_level=log_level, + log_to_file=log_to_file ) gff3.parse() gff3.serialize_to_jsonld() From 38fbc91413051f05857c108d63efdfb673f274a0 Mon Sep 17 00:00:00 2001 From: danielsf Date: Tue, 24 Jun 2025 11:40:26 -0700 Subject: [PATCH 2/6] Gff3.serialize_to_jsonld returns a dict this gives users the option of keeping the dict in memory, in case they do not want the output written to stdout --- .../genome_annotation_translator.py | 42 ++++++++++++++----- 1 file changed, 32 insertions(+), 10 deletions(-) diff --git a/bkbit/data_translators/genome_annotation_translator.py b/bkbit/data_translators/genome_annotation_translator.py index b6dfea5..62678bf 100644 --- a/bkbit/data_translators/genome_annotation_translator.py +++ b/bkbit/data_translators/genome_annotation_translator.py @@ -831,14 +831,22 @@ def serialize_to_jsonld( self, exclude_none: bool = True, exclude_unset: bool = False ): """ - Serialize the object and either write it to the specified output file or print it to the CLI. + Serialize the object and either write it + to the specified output file or print it to the CLI. Parameters: - exclude_none (bool): Whether to exclude None values in the output. - exclude_unset (bool): Whether to exclude unset values in the output. + exclude_none (bool): + Whether to exclude None values in the output. + exclude_unset (bool): + Whether to exclude unset values in the output. 
         Returns:
-            None
+            dict
+
+        Notes
+        -----
+        Rather than printing the JSON-LD graph, this implementation
+        returns it as a dict so that callers can keep it in memory.
         """
 
         data = [
@@ -853,16 +861,29 @@ def serialize_to_jsonld(
             ),
         ]
         for ck in self.checksums:
-            data.append(ck.dict(exclude_none=exclude_none, exclude_unset=exclude_unset))
+            data.append(
+                ck.dict(
+                    exclude_none=exclude_none,
+                    exclude_unset=exclude_unset
+                )
+            )
         for ga in self.gene_annotations.values():
-            data.append(ga.dict(exclude_none=exclude_none, exclude_unset=exclude_unset))
+            data.append(
+                ga.dict(
+                    exclude_none=exclude_none,
+                    exclude_unset=exclude_unset
+                )
+            )
         output_data = {
-            "@context": "https://raw.githubusercontent.com/brain-bican/models/main/jsonld-context-autogen/genome_annotation.context.jsonld",
+            "@context": (
+                "https://raw.githubusercontent.com/brain-bican/"
+                "models/main/jsonld-context-autogen/"
+                "genome_annotation.context.jsonld"
+            ),
             "@graph": data,
         }
-
-        print(json.dumps(output_data, indent=2))
+        return output_data
 
 
 def parse_url(content_url):
@@ -973,7 +994,8 @@ def gff2jsonld(content_url, assembly_accession, assembly_strain, log_level, log_
         log_to_file=log_to_file
     )
     gff3.parse()
-    gff3.serialize_to_jsonld()
+    result = gff3.serialize_to_jsonld()
+    print(json.dumps(result, indent=2))
 
 
 if __name__ == "__main__":

From b89b8f3c35ca704263274ffa8b0c640d3963af7a Mon Sep 17 00:00:00 2001
From: danielsf
Date: Tue, 24 Jun 2025 11:37:41 -0700
Subject: [PATCH 3/6] glean metadata from ensembl URLs based on filename

---
 .../genome_annotation_translator.py           | 39 +++++++++++--------
 1 file changed, 22 insertions(+), 17 deletions(-)

diff --git a/bkbit/data_translators/genome_annotation_translator.py b/bkbit/data_translators/genome_annotation_translator.py
index 62678bf..6b26870 100644
--- a/bkbit/data_translators/genome_annotation_translator.py
+++ b/bkbit/data_translators/genome_annotation_translator.py
@@ -65,6 +65,7 @@
 from tqdm import tqdm
 import click
 import pkg_resources
+import pathlib
 from bkbit.models import genome_annotation as ga
 from bkbit.utils.setup_logger import setup_logger
 from bkbit.utils.load_json import load_json
@@ -906,9 +907,6 @@ def parse_url(content_url):
     # NCBI : [assembly accession.version]_[assembly name]_[content type].[optional format]
     # ENSEMBL : <organism>.<assembly>.<_version>.gff3.gz -> organism full name, assembly name, genome version
     ncbi_pattern = r"/genomes/all/annotation_releases/(\d+)(?:/(\d+))?/(GCF_\d+\.\d+)[_-]([^/]+)/(GCF_\d+\.\d+)[_-]([^/]+)_genomic\.gff\.gz"
-    ensembl_pattern = (
-        r"/pub/release-(\d+)/gff3/([^/]+)/([^/.]+)\.([^/.]+)\.([^/.]+)\.gff3\.gz"
-    )
 
     # Parse the URL to get the path
     parsed_url = urlparse(content_url)
     path = parsed_url.path
@@ -931,20 +929,27 @@ def parse_url(content_url):
             }
 
     elif "ensembl" in parsed_url.netloc:
-        ensembl_match = re.search(ensembl_pattern, path)
-        if ensembl_match:
-            scientific_name_to_taxonid = load_json(SCIENTIFIC_NAME_TO_TAXONID_PATH)
-            result = {
-                "authority": ga.AuthorityType.ENSEMBL,
-                "release_version": ensembl_match.group(1),
-                "scientific_name": ensembl_match.group(3),
-                "assembly_name": ensembl_match.group(4),
-            }
-            taxon_id = scientific_name_to_taxonid.get(
-                result.get("scientific_name").replace("_", " ")
-            )
-            result['taxonid'] = taxon_id
-            return result
+        file_path = pathlib.Path(parsed_url.path).name
+        file_path = file_path.replace('.gff3.gz', '')
+
+        scientific_name = file_path.split('.')[0]
+        file_path = file_path.replace(scientific_name+'.', '')
+        release_version = file_path.split('.')[-1]
+        file_path = file_path.replace('.'
+ release_version, '') + assembly_name = file_path + + scientific_name_to_taxonid = load_json(SCIENTIFIC_NAME_TO_TAXONID_PATH) + result = { + "authority": ga.AuthorityType.ENSEMBL, + "release_version": release_version, + "scientific_name": scientific_name, + "assembly_name": assembly_name, + } + taxon_id = scientific_name_to_taxonid.get( + result.get("scientific_name").replace("_", " ") + ) + result['taxonid'] = taxon_id + return result # If no match is found, return None return None From 54ef7579e353dcea3941ae238788f7aa07bf56a8 Mon Sep 17 00:00:00 2001 From: danielsf Date: Thu, 3 Jul 2025 15:48:15 -0700 Subject: [PATCH 4/6] make progress bar optional (this is a quality of life improvement for processing large numbers of Gff3 files in a script or notebook) --- .../genome_annotation_translator.py | 48 +++++++++++++------ 1 file changed, 33 insertions(+), 15 deletions(-) diff --git a/bkbit/data_translators/genome_annotation_translator.py b/bkbit/data_translators/genome_annotation_translator.py index 6b26870..04e9d1c 100644 --- a/bkbit/data_translators/genome_annotation_translator.py +++ b/bkbit/data_translators/genome_annotation_translator.py @@ -164,6 +164,7 @@ def __init__( assembly_strain=None, log_level="WARNING", log_to_file=False, + use_tqdm=True ): """ Initializes an instance of the GFFTranslator class. @@ -175,6 +176,7 @@ def __init__( - assembly_strain (str, optional): The strain of the genome assembly. Defaults to None. - hash_functions (tuple[str]): A tuple of hash functions to use for generating checksums. Defaults to ('MD5'). """ + self.use_tqdm = use_tqdm self.logger = setup_logger(LOG_FILE_NAME, log_level, log_to_file) try: self.taxon_scientific_name = load_json(TAXON_SCIENTIFIC_NAME_PATH) @@ -255,7 +257,8 @@ def from_url( assembly_accession=None, assembly_strain=None, log_level="WARNING", - log_to_file=False + log_to_file=False, + use_tqdm=True ): """ Initializes an instance of the GFFTranslator class, gleaning @@ -274,7 +277,8 @@ def from_url( assembly_accession=assembly_accession, assembly_strain=assembly_strain, log_level=log_level, - log_to_file=log_to_file + log_to_file=log_to_file, + use_tqdm=use_tqdm ) def __download_gff_file(self): @@ -299,12 +303,15 @@ def __download_gff_file(self): gzip_file_path = f_gzip.name # Create a progress bar - progress_bar = tqdm( - total=total_size, - unit="iB", - unit_scale=True, - desc="Downloading GFF file", - ) + if self.use_tqdm: + progress_bar = tqdm( + total=total_size, + unit="iB", + unit_scale=True, + desc="Downloading GFF file", + ) + else: + progress_bar = None # Read the file in chunks, write to the temporary file, and update the hash while True: @@ -315,9 +322,11 @@ def __download_gff_file(self): md5_hash.update(data) sha256_hash.update(data) sha1_hash.update(data) - progress_bar.update(len(data)) + if progress_bar is not None: + progress_bar.update(len(data)) - progress_bar.close() + if progress_bar is not None: + progress_bar.close() # Return the path to the temporary file and the md5 hash return gzip_file_path, { @@ -528,9 +537,14 @@ def parse(self, feature_filter: tuple[str] = DEFAULT_FEATURE_FILTER): with open(gff_file, "r", encoding="utf-8") as file: curr_line_num = 1 - progress_bar = tqdm( - total=self.__get_line_count(gff_file), desc="Parsing GFF3 File" - ) + + if self.use_tqdm: + progress_bar = tqdm( + total=self.__get_line_count(gff_file), desc="Parsing GFF3 File" + ) + else: + progress_bar = None + for line_raw in file: line_strip = line_raw.strip() if curr_line_num == 1 and not line_strip.startswith("##gff-version 
3"): @@ -572,9 +586,13 @@ def parse(self, feature_filter: tuple[str] = DEFAULT_FEATURE_FILTER): self.gene_annotations[gene_annotation.id] = ( gene_annotation ) - progress_bar.update(1) + if progress_bar is not None: + progress_bar.update(1) + curr_line_num += 1 - progress_bar.close() + + if progress_bar is not None: + progress_bar.close() def generate_ensembl_gene_annotation(self, attributes, curr_line_num): """ From 8b7f2ec517c77f3ac4e7279ec8d48084fe67c213 Mon Sep 17 00:00:00 2001 From: danielsf Date: Mon, 14 Jul 2025 15:20:38 -0700 Subject: [PATCH 5/6] can specify tmp_dir in Gff3 class This will allow users who are downloading and processing a large number of gff3 files to clean up after themselves by creating a parent tmp_dir and deleting all of its contents as needed. --- .../genome_annotation_translator.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/bkbit/data_translators/genome_annotation_translator.py b/bkbit/data_translators/genome_annotation_translator.py index 04e9d1c..84e239d 100644 --- a/bkbit/data_translators/genome_annotation_translator.py +++ b/bkbit/data_translators/genome_annotation_translator.py @@ -164,7 +164,8 @@ def __init__( assembly_strain=None, log_level="WARNING", log_to_file=False, - use_tqdm=True + use_tqdm=True, + tmp_dir=None ): """ Initializes an instance of the GFFTranslator class. @@ -176,6 +177,7 @@ def __init__( - assembly_strain (str, optional): The strain of the genome assembly. Defaults to None. - hash_functions (tuple[str]): A tuple of hash functions to use for generating checksums. Defaults to ('MD5'). """ + self._tmp_dir = tmp_dir self.use_tqdm = use_tqdm self.logger = setup_logger(LOG_FILE_NAME, log_level, log_to_file) try: @@ -258,7 +260,8 @@ def from_url( assembly_strain=None, log_level="WARNING", log_to_file=False, - use_tqdm=True + use_tqdm=True, + tmp_dir=None ): """ Initializes an instance of the GFFTranslator class, gleaning @@ -278,7 +281,8 @@ def from_url( assembly_strain=assembly_strain, log_level=log_level, log_to_file=log_to_file, - use_tqdm=use_tqdm + use_tqdm=use_tqdm, + tmp_dir=tmp_dir ) def __download_gff_file(self): @@ -299,7 +303,11 @@ def __download_gff_file(self): sha1_hash = hashlib.sha1() # Create a temporary file for the gzip data - with tempfile.NamedTemporaryFile(suffix=".gz", delete=False) as f_gzip: + with tempfile.NamedTemporaryFile( + suffix=".gz", + delete=False, + dir=self._tmp_dir) as f_gzip: + gzip_file_path = f_gzip.name # Create a progress bar @@ -527,7 +535,7 @@ def parse(self, feature_filter: tuple[str] = DEFAULT_FEATURE_FILTER): # Decompress the gzip file with gzip.open(self.gff_file, "rb") as f_in: # Create a temporary file to save the decompressed data - with tempfile.NamedTemporaryFile(delete=False) as f_out: + with tempfile.NamedTemporaryFile(delete=False, dir=self._tmp_dir) as f_out: # Copy the decompressed data to the temporary file f_out.write(f_in.read()) gff_file = f_out.name From e0ac0009c555a6e916c8377df981ae18f5d32b22 Mon Sep 17 00:00:00 2001 From: danielsf Date: Thu, 31 Jul 2025 11:40:50 -0700 Subject: [PATCH 6/6] factor out code detecting missing NCBI model also, if the model is missing, throw a custom error Previous commits in this branch made it possible for users to run data_translators/genome_annotation_translator.py without triggering the code that tells them how to download the NCBI model. This should fix that. 
Throwing a custom error will also allow users of this code to handle
different failure modes (maybe ENSEMBL is publishing a non-standard
GFF3 file) in different ways.
---
 bkbit/data_translators/error_classes.py       |  5 ++++
 .../genome_annotation_translator.py           | 30 ++++++++++++++-----
 2 files changed, 28 insertions(+), 7 deletions(-)
 create mode 100644 bkbit/data_translators/error_classes.py

diff --git a/bkbit/data_translators/error_classes.py b/bkbit/data_translators/error_classes.py
new file mode 100644
index 0000000..a653bf5
--- /dev/null
+++ b/bkbit/data_translators/error_classes.py
@@ -0,0 +1,5 @@
+class MissingModelError(Exception):
+    """
+    Custom error class to denote that a model is missing
+    """
+    pass
diff --git a/bkbit/data_translators/genome_annotation_translator.py b/bkbit/data_translators/genome_annotation_translator.py
index 84e239d..3893c4b 100644
--- a/bkbit/data_translators/genome_annotation_translator.py
+++ b/bkbit/data_translators/genome_annotation_translator.py
@@ -70,6 +70,7 @@
 from bkbit.utils.setup_logger import setup_logger
 from bkbit.utils.load_json import load_json
 
+import bkbit.data_translators.error_classes as error_classes
 
 ## CONSTANTS ##
@@ -180,13 +181,9 @@ def __init__(
         self._tmp_dir = tmp_dir
         self.use_tqdm = use_tqdm
         self.logger = setup_logger(LOG_FILE_NAME, log_level, log_to_file)
-        try:
-            self.taxon_scientific_name = load_json(TAXON_SCIENTIFIC_NAME_PATH)
-            self.taxon_common_name = load_json(TAXON_COMMON_NAME_PATH)
-        except FileNotFoundError as e:
-            self.logger.critical("NCBI Taxonomy not downloaded. Run 'bkbit download-ncbi-taxonomy' command first." )
-            print(e)
-            sys.exit(2)
+        detect_model_presence(logger=self.logger)
+        self.taxon_scientific_name = load_json(TAXON_SCIENTIFIC_NAME_PATH)
+        self.taxon_common_name = load_json(TAXON_COMMON_NAME_PATH)
 
         self.content_url = content_url
 
@@ -273,6 +270,7 @@ def from_url(
         - log_level (str, optional): The logging level. Defaults to "WARNING".
         - log_to_file (bool, optional): Whether to write logs to a file. Defaults to False.
         """
+        detect_model_presence()
         url_metadata = parse_url(content_url=content_url)
         return cls(
             content_url=content_url,
@@ -981,6 +979,24 @@ def parse_url(content_url):
     return None
 
 
+def detect_model_presence(logger=None):
+    """
+    Detect whether the NCBI taxonomy model is present. If not, raise a
+    custom error.
+
+    Parameters
+    ----------
+    logger:
+        Optional logger to which the error message is written.
+    """
+    scientific_name_path = pathlib.Path(TAXON_SCIENTIFIC_NAME_PATH)
+    taxon_common_name_path = pathlib.Path(TAXON_COMMON_NAME_PATH)
+    if not scientific_name_path.is_file() or not taxon_common_name_path.is_file():
+        msg = "NCBI Taxonomy not downloaded. Run 'bkbit download-ncbi-taxonomy' command first."
+        if logger is not None:
+            logger.critical(msg)
+        raise error_classes.MissingModelError(msg)
+
+
 @click.command()
 ##ARGUEMENTS##
 # Argument #1: The URL of the GFF file
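
Taken together, patches 1-3 leave two ways to build a translator. The
sketch below shows both paths. It assumes the bkbit package layout in
the diffs above; the NCBI annotation-release URL and the release_version
value are illustrative placeholders rather than verified releases
(taxon ID 9606 is human).

    from bkbit.models import genome_annotation as ga
    from bkbit.data_translators.genome_annotation_translator import Gff3

    # Path 1: a URL that encodes its own metadata. from_url() calls
    # parse_url() and forwards the resulting dict to the constructor.
    gff3 = Gff3.from_url(
        content_url=(
            "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/"
            "9606/110/GCF_000001405.40_GRCh38.p14/"
            "GCF_000001405.40_GRCh38.p14_genomic.gff.gz"
        )
    )

    # Path 2: the URL from the patch-1 commit message encodes no taxon
    # or release, so the metadata dict is assembled by hand instead.
    url_metadata = {
        "authority": ga.AuthorityType.NCBI,
        "taxonid": "9606",
        "release_version": "110",  # illustrative value
        "assembly_accession": "GCF_000001405.40",
        "assembly_name": "GRCh38.p14",
    }
    gff3 = Gff3(
        content_url=(
            "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/"
            "GCF_000001405.40_GRCh38.p14/"
            "GCF_000001405.40_GRCh38.p14_genomic.gff.gz"
        ),
        url_metadata=url_metadata,
    )

Either way, gff3.parse() and gff3.serialize_to_jsonld() behave the same
afterwards; the constructor no longer cares where the metadata came from.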
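
Patches 2, 4, and 5 all serve the same batch-processing use case: no
progress bars, JSON-LD kept in memory rather than printed, and every
download confined to one disposable directory. A sketch of that
workflow under the same assumptions (the URL list is a placeholder):

    import json
    import pathlib
    import shutil
    import tempfile

    from bkbit.data_translators.genome_annotation_translator import Gff3

    urls = [
        # ... GFF3 URLs to ingest (placeholder)
    ]

    scratch = tempfile.mkdtemp(prefix="gff3_scratch_")
    try:
        for url in urls:
            gff3 = Gff3.from_url(
                content_url=url,
                use_tqdm=False,   # patch 4: silence the progress bars
                tmp_dir=scratch,  # patch 5: downloads land under scratch
            )
            gff3.parse()
            graph = gff3.serialize_to_jsonld()  # patch 2: a dict, nothing printed
            out_name = pathlib.Path(url).name.split(".gff")[0] + ".jsonld"
            with open(out_name, "w", encoding="utf-8") as dst:
                json.dump(graph, dst, indent=2)
    finally:
        shutil.rmtree(scratch)  # remove every temporary download at once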
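
Finally, once patch 6 is applied, both construction paths raise
error_classes.MissingModelError instead of calling sys.exit(2) when the
NCBI taxonomy files are absent, so callers can decide how to recover.
A sketch (the URL is a placeholder):

    from bkbit.data_translators.error_classes import MissingModelError
    from bkbit.data_translators.genome_annotation_translator import Gff3

    url = "https://example.org/some_annotation.gff3.gz"  # placeholder

    try:
        gff3 = Gff3.from_url(content_url=url)
    except MissingModelError:
        # The taxonomy lookup tables are not on disk yet; surface the
        # documented fix instead of dying inside the library.
        raise SystemExit("Run 'bkbit download-ncbi-taxonomy' and retry.")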