From 61cf106cafc195e59c1f55e7ff67c4e43132dec5 Mon Sep 17 00:00:00 2001
From: danielsf
Date: Tue, 24 Jun 2025 11:04:02 -0700
Subject: [PATCH 1/6] factor URL parsing out into a separate method

This will allow us to ingest GFF files like

https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.40_GRCh38.p14/GCF_000001405.40_GRCh38.p14_genomic.gff.gz

that do not encode the species, annotation release, etc. in the URL,
provided the user supplies this metadata by hand to the Gff3
constructor.
---
 .../genome_annotation_translator.py           | 159 +++++++++++-------
 1 file changed, 98 insertions(+), 61 deletions(-)

diff --git a/bkbit/data_translators/genome_annotation_translator.py b/bkbit/data_translators/genome_annotation_translator.py
index 4c47be9..b6dfea5 100644
--- a/bkbit/data_translators/genome_annotation_translator.py
+++ b/bkbit/data_translators/genome_annotation_translator.py
@@ -112,9 +112,6 @@ class Gff3:
     __init__(content_url, assembly_accession=None, assembly_strain=None, log_level="WARNING", log_to_file=False):
         Initializes the Gff3 class with the provided parameters.
 
-    parse_url():
-        Parses the content URL and extracts information about the genome annotation.
-
     __download_gff_file():
         Downloads a GFF file from a given URL and calculates the MD5, SHA256, and SHA1 hashes.
 
@@ -161,6 +158,7 @@ class Gff3:
     def __init__(
         self,
         content_url,
+        url_metadata,
         assembly_accession=None,
         assembly_strain=None,
         log_level="WARNING",
@@ -171,13 +169,13 @@ def __init__(
 
         Parameters:
         - content_url (str): The URL of the GFF file.
+        - url_metadata (dict): A dict containing metadata about the genome annotation, as returned by parse_url().
         - assembly_id (str): The ID of the genome assembly.
         - assembly_strain (str, optional): The strain of the genome assembly. Defaults to None.
         - hash_functions (tuple[str]): A tuple of hash functions to use for generating checksums. Defaults to ('MD5').
         """
         self.logger = setup_logger(LOG_FILE_NAME, log_level, log_to_file)
         try:
-            self.scientific_name_to_taxonid = load_json(SCIENTIFIC_NAME_TO_TAXONID_PATH)
             self.taxon_scientific_name = load_json(TAXON_SCIENTIFIC_NAME_PATH)
             self.taxon_common_name = load_json(TAXON_COMMON_NAME_PATH)
         except FileNotFoundError as e:
@@ -189,7 +187,6 @@ def __init__(
 
         ## STEP 1: Parse the content URL to get metadata
         # Parse content_url to get metadata
-        url_metadata = self.parse_url()
         if url_metadata is None:
             self.logger.critical(
                 "The provided content URL is not supported. Please provide a valid URL."
@@ -212,13 +209,10 @@ def __init__(
         self.authority = url_metadata.get("authority")
 
         # Assign the taxon_id and assembly_id based on the authority
+        taxon_id = url_metadata.get("taxonid")
         if self.authority.value == ga.AuthorityType.NCBI.value:
-            taxon_id = url_metadata.get("taxonid")
             assembly_id = url_metadata.get("assembly_accession")
         elif self.authority.value == ga.AuthorityType.ENSEMBL.value:
-            taxon_id = self.scientific_name_to_taxonid.get(
-                url_metadata.get("scientific_name").replace("_", " ")
-            )
             if assembly_accession is None:
                 self.logger.critical(
                     "The assembly ID is required for Ensembl URLs. Please provide the assembly ID."
@@ -253,60 +247,35 @@ def __init__(
 
         self.gene_annotations = {}
 
-    def parse_url(self):
+    @classmethod
+    def from_url(
+        cls,
+        content_url,
+        assembly_accession=None,
+        assembly_strain=None,
+        log_level="WARNING",
+        log_to_file=False
+    ):
         """
-        Parses the content URL and extracts information about the genome annotation.
+        Initializes an instance of the GFFTranslator class, gleaning
+        metadata about the annotation from the URL of the GFF file.
-
-        Returns:
-            A dictionary containing the following information:
-            - 'authority': The authority type (NCBI or ENSEMBL).
-            - 'taxonid': The taxon ID of the genome.
-            - 'release_version': The release version of the genome annotation.
-            - 'assembly_accession': The assembly accession of the genome.
-            - 'assembly_name': The name of the assembly.
-            - 'species': The species name (only for ENSEMBL URLs).
-        """
-        # Define regex patterns for NCBI and Ensembl URLs
-        # NCBI : [assembly accession.version]_[assembly name]_[content type].[optional format]
-        # ENSEMBL : <organism>.<assembly>.<_version>.gff3.gz -> organism full name, assembly name, genome version
-        ncbi_pattern = r"/genomes/all/annotation_releases/(\d+)(?:/(\d+))?/(GCF_\d+\.\d+)[_-]([^/]+)/(GCF_\d+\.\d+)[_-]([^/]+)_genomic\.gff\.gz"
-        ensembl_pattern = (
-            r"/pub/release-(\d+)/gff3/([^/]+)/([^/.]+)\.([^/.]+)\.([^/.]+)\.gff3\.gz"
+
+        Parameters:
+        - content_url (str): The URL of the GFF file.
+        - assembly_accession (str, optional): The accession of the genome assembly. Required for Ensembl URLs. Defaults to None.
+        - assembly_strain (str, optional): The strain of the genome assembly. Defaults to None.
+        - log_level (str, optional): The logging level. Defaults to "WARNING".
+        - log_to_file (bool, optional): Whether to write logs to a file. Defaults to False.
+        """
+        url_metadata = parse_url(content_url=content_url)
+        return cls(
+            content_url=content_url,
+            url_metadata=url_metadata,
+            assembly_accession=assembly_accession,
+            assembly_strain=assembly_strain,
+            log_level=log_level,
+            log_to_file=log_to_file
         )
 
-        # Parse the URL to get the path
-        parsed_url = urlparse(self.content_url)
-        path = parsed_url.path
-
-        # Determine if the URL is from NCBI or Ensembl and extract information
-        if "ncbi" in parsed_url.netloc:
-            ncbi_match = re.search(ncbi_pattern, path)
-            if ncbi_match:
-                return {
-                    "authority": ga.AuthorityType.NCBI,
-                    "taxonid": ncbi_match.group(1),
-                    "release_version": (
-                        ncbi_match.group(2)
-                        if ncbi_match.group(2)
-                        else ncbi_match.group(4)
-                    ),
-                    "assembly_accession": ncbi_match.group(3),
-                    "assembly_name": ncbi_match.group(6),
-                }
-
-        elif "ensembl" in parsed_url.netloc:
-            ensembl_match = re.search(ensembl_pattern, path)
-            if ensembl_match:
-                return {
-                    "authority": ga.AuthorityType.ENSEMBL,
-                    "release_version": ensembl_match.group(1),
-                    "scientific_name": ensembl_match.group(3),
-                    "assembly_name": ensembl_match.group(4),
-                }
-
-        # If no match is found, return None
-        return None
-
     def __download_gff_file(self):
         """
         Downloads a GFF file from a given URL and calculates the MD5, SHA256, and SHA1 hashes.
@@ -896,6 +865,70 @@ def serialize_to_jsonld(
     print(json.dumps(output_data, indent=2))
 
 
+def parse_url(content_url):
+    """
+    Parses the content URL and extracts information about the genome annotation.
+
+    Parameters:
+    content_url (str): The URL of the GFF file being parsed for metadata.
+
+    Returns:
+    A dictionary containing the following information:
+    - 'authority': The authority type (NCBI or ENSEMBL).
+    - 'taxonid': The taxon ID of the genome.
+    - 'release_version': The release version of the genome annotation.
+    - 'assembly_accession': The assembly accession of the genome.
+    - 'assembly_name': The name of the assembly.
+    - 'scientific_name': The scientific name of the species (only for ENSEMBL URLs).
+ """ + # Define regex patterns for NCBI and Ensembl URLs + # NCBI : [assembly accession.version]_[assembly name]_[content type].[optional format] + # ENSEMBL : ..<_version>.gff3.gz -> organism full name, assembly name, genome version + ncbi_pattern = r"/genomes/all/annotation_releases/(\d+)(?:/(\d+))?/(GCF_\d+\.\d+)[_-]([^/]+)/(GCF_\d+\.\d+)[_-]([^/]+)_genomic\.gff\.gz" + ensembl_pattern = ( + r"/pub/release-(\d+)/gff3/([^/]+)/([^/.]+)\.([^/.]+)\.([^/.]+)\.gff3\.gz" + ) + + # Parse the URL to get the path + parsed_url = urlparse(content_url) + path = parsed_url.path + + # Determine if the URL is from NCBI or Ensembl and extract information + if "ncbi" in parsed_url.netloc: + ncbi_match = re.search(ncbi_pattern, path) + if ncbi_match: + return { + "authority": ga.AuthorityType.NCBI, + "taxonid": ncbi_match.group(1), + "release_version": ( + ncbi_match.group(2) + if ncbi_match.group(2) + else ncbi_match.group(4) + ), + "assembly_accession": ncbi_match.group(3), + "assembly_name": ncbi_match.group(6), + } + + elif "ensembl" in parsed_url.netloc: + ensembl_match = re.search(ensembl_pattern, path) + if ensembl_match: + scientific_name_to_taxonid = load_json(SCIENTIFIC_NAME_TO_TAXONID_PATH) + result = { + "authority": ga.AuthorityType.ENSEMBL, + "release_version": ensembl_match.group(1), + "scientific_name": ensembl_match.group(3), + "assembly_name": ensembl_match.group(4), + } + taxon_id = scientific_name_to_taxonid.get( + result.get("scientific_name").replace("_", " ") + ) + result['taxonid'] = taxon_id + return result + + # If no match is found, return None + return None + + @click.command() ##ARGUEMENTS## # Argument #1: The URL of the GFF file @@ -932,8 +965,12 @@ def gff2jsonld(content_url, assembly_accession, assembly_strain, log_level, log_ ''' Creates GeneAnnotation objects from a GFF3 file and serializes them to JSON-LD format. ''' - gff3 = Gff3( - content_url, assembly_accession, assembly_strain, log_level, log_to_file + gff3 = Gff3.from_url( + content_url=content_url, + assembly_accession=assembly_accession, + assembly_strain=assembly_strain, + log_level=log_level, + log_to_file=log_to_file ) gff3.parse() gff3.serialize_to_jsonld() From 38fbc91413051f05857c108d63efdfb673f274a0 Mon Sep 17 00:00:00 2001 From: danielsf Date: Tue, 24 Jun 2025 11:40:26 -0700 Subject: [PATCH 2/6] Gff3.serialize_to_jsonld returns a dict this gives users the option of keeping the dict in memory, in case they do not want the output written to stdout --- .../genome_annotation_translator.py | 42 ++++++++++++++----- 1 file changed, 32 insertions(+), 10 deletions(-) diff --git a/bkbit/data_translators/genome_annotation_translator.py b/bkbit/data_translators/genome_annotation_translator.py index b6dfea5..62678bf 100644 --- a/bkbit/data_translators/genome_annotation_translator.py +++ b/bkbit/data_translators/genome_annotation_translator.py @@ -831,14 +831,22 @@ def serialize_to_jsonld( self, exclude_none: bool = True, exclude_unset: bool = False ): """ - Serialize the object and either write it to the specified output file or print it to the CLI. + Serialize the object and either write it + to the specified output file or print it to the CLI. Parameters: - exclude_none (bool): Whether to exclude None values in the output. - exclude_unset (bool): Whether to exclude unset values in the output. + exclude_none (bool): + Whether to exclude None values in the output. + exclude_unset (bool): + Whether to exclude unset values in the output. 
         Returns:
-            None
+            dict
+
+        Notes
+        -----
+        Rather than printing the JSON-LD graph, this implementation
+        returns it as a dict so that callers can keep it in memory.
         """
 
         data = [
@@ -853,16 +861,29 @@ def serialize_to_jsonld(
             ),
         ]
         for ck in self.checksums:
-            data.append(ck.dict(exclude_none=exclude_none, exclude_unset=exclude_unset))
+            data.append(
+                ck.dict(
+                    exclude_none=exclude_none,
+                    exclude_unset=exclude_unset
+                )
+            )
         for ga in self.gene_annotations.values():
-            data.append(ga.dict(exclude_none=exclude_none, exclude_unset=exclude_unset))
+            data.append(
+                ga.dict(
+                    exclude_none=exclude_none,
+                    exclude_unset=exclude_unset
+                )
+            )
         output_data = {
-            "@context": "https://raw.githubusercontent.com/brain-bican/models/main/jsonld-context-autogen/genome_annotation.context.jsonld",
+            "@context": (
+                "https://raw.githubusercontent.com/brain-bican/"
+                "models/main/jsonld-context-autogen/"
+                "genome_annotation.context.jsonld"
+            ),
             "@graph": data,
         }
-
-        print(json.dumps(output_data, indent=2))
+        return output_data
 
 
 def parse_url(content_url):
@@ -973,7 +994,8 @@ def gff2jsonld(content_url, assembly_accession, assembly_strain, log_level, log_
         log_to_file=log_to_file
     )
     gff3.parse()
-    gff3.serialize_to_jsonld()
+    result = gff3.serialize_to_jsonld()
+    print(json.dumps(result, indent=2))
 
 
 if __name__ == "__main__":

From b89b8f3c35ca704263274ffa8b0c640d3963af7a Mon Sep 17 00:00:00 2001
From: danielsf
Date: Tue, 24 Jun 2025 11:37:41 -0700
Subject: [PATCH 3/6] glean metadata from ensembl URLs based on filename

---
 .../genome_annotation_translator.py           | 39 +++++++++++--------
 1 file changed, 22 insertions(+), 17 deletions(-)

diff --git a/bkbit/data_translators/genome_annotation_translator.py b/bkbit/data_translators/genome_annotation_translator.py
index 62678bf..6b26870 100644
--- a/bkbit/data_translators/genome_annotation_translator.py
+++ b/bkbit/data_translators/genome_annotation_translator.py
@@ -65,6 +65,7 @@
 from tqdm import tqdm
 import click
 import pkg_resources
+import pathlib
 from bkbit.models import genome_annotation as ga
 from bkbit.utils.setup_logger import setup_logger
 from bkbit.utils.load_json import load_json
@@ -906,9 +907,6 @@ def parse_url(content_url):
     # NCBI : [assembly accession.version]_[assembly name]_[content type].[optional format]
     # ENSEMBL : <organism>.<assembly>.<_version>.gff3.gz -> organism full name, assembly name, genome version
     ncbi_pattern = r"/genomes/all/annotation_releases/(\d+)(?:/(\d+))?/(GCF_\d+\.\d+)[_-]([^/]+)/(GCF_\d+\.\d+)[_-]([^/]+)_genomic\.gff\.gz"
-    ensembl_pattern = (
-        r"/pub/release-(\d+)/gff3/([^/]+)/([^/.]+)\.([^/.]+)\.([^/.]+)\.gff3\.gz"
-    )
 
     # Parse the URL to get the path
     parsed_url = urlparse(content_url)
     path = parsed_url.path
@@ -931,20 +929,27 @@ def parse_url(content_url):
             }
 
     elif "ensembl" in parsed_url.netloc:
-        ensembl_match = re.search(ensembl_pattern, path)
-        if ensembl_match:
-            scientific_name_to_taxonid = load_json(SCIENTIFIC_NAME_TO_TAXONID_PATH)
-            result = {
-                "authority": ga.AuthorityType.ENSEMBL,
-                "release_version": ensembl_match.group(1),
-                "scientific_name": ensembl_match.group(3),
-                "assembly_name": ensembl_match.group(4),
-            }
-            taxon_id = scientific_name_to_taxonid.get(
-                result.get("scientific_name").replace("_", " ")
-            )
-            result['taxonid'] = taxon_id
-            return result
+        file_path = pathlib.Path(parsed_url.path).name
+        file_path = file_path.replace('.gff3.gz', '')
+
+        scientific_name = file_path.split('.')[0]
+        file_path = file_path.replace(scientific_name+'.', '')
+        release_version = file_path.split('.')[-1]
+        file_path = file_path.replace('.'
+ release_version, '') + assembly_name = file_path + + scientific_name_to_taxonid = load_json(SCIENTIFIC_NAME_TO_TAXONID_PATH) + result = { + "authority": ga.AuthorityType.ENSEMBL, + "release_version": release_version, + "scientific_name": scientific_name, + "assembly_name": assembly_name, + } + taxon_id = scientific_name_to_taxonid.get( + result.get("scientific_name").replace("_", " ") + ) + result['taxonid'] = taxon_id + return result # If no match is found, return None return None From 54ef7579e353dcea3941ae238788f7aa07bf56a8 Mon Sep 17 00:00:00 2001 From: danielsf Date: Thu, 3 Jul 2025 15:48:15 -0700 Subject: [PATCH 4/6] make progress bar optional (this is a quality of life improvement for processing large numbers of Gff3 files in a script or notebook) --- .../genome_annotation_translator.py | 48 +++++++++++++------ 1 file changed, 33 insertions(+), 15 deletions(-) diff --git a/bkbit/data_translators/genome_annotation_translator.py b/bkbit/data_translators/genome_annotation_translator.py index 6b26870..04e9d1c 100644 --- a/bkbit/data_translators/genome_annotation_translator.py +++ b/bkbit/data_translators/genome_annotation_translator.py @@ -164,6 +164,7 @@ def __init__( assembly_strain=None, log_level="WARNING", log_to_file=False, + use_tqdm=True ): """ Initializes an instance of the GFFTranslator class. @@ -175,6 +176,7 @@ def __init__( - assembly_strain (str, optional): The strain of the genome assembly. Defaults to None. - hash_functions (tuple[str]): A tuple of hash functions to use for generating checksums. Defaults to ('MD5'). """ + self.use_tqdm = use_tqdm self.logger = setup_logger(LOG_FILE_NAME, log_level, log_to_file) try: self.taxon_scientific_name = load_json(TAXON_SCIENTIFIC_NAME_PATH) @@ -255,7 +257,8 @@ def from_url( assembly_accession=None, assembly_strain=None, log_level="WARNING", - log_to_file=False + log_to_file=False, + use_tqdm=True ): """ Initializes an instance of the GFFTranslator class, gleaning @@ -274,7 +277,8 @@ def from_url( assembly_accession=assembly_accession, assembly_strain=assembly_strain, log_level=log_level, - log_to_file=log_to_file + log_to_file=log_to_file, + use_tqdm=use_tqdm ) def __download_gff_file(self): @@ -299,12 +303,15 @@ def __download_gff_file(self): gzip_file_path = f_gzip.name # Create a progress bar - progress_bar = tqdm( - total=total_size, - unit="iB", - unit_scale=True, - desc="Downloading GFF file", - ) + if self.use_tqdm: + progress_bar = tqdm( + total=total_size, + unit="iB", + unit_scale=True, + desc="Downloading GFF file", + ) + else: + progress_bar = None # Read the file in chunks, write to the temporary file, and update the hash while True: @@ -315,9 +322,11 @@ def __download_gff_file(self): md5_hash.update(data) sha256_hash.update(data) sha1_hash.update(data) - progress_bar.update(len(data)) + if progress_bar is not None: + progress_bar.update(len(data)) - progress_bar.close() + if progress_bar is not None: + progress_bar.close() # Return the path to the temporary file and the md5 hash return gzip_file_path, { @@ -528,9 +537,14 @@ def parse(self, feature_filter: tuple[str] = DEFAULT_FEATURE_FILTER): with open(gff_file, "r", encoding="utf-8") as file: curr_line_num = 1 - progress_bar = tqdm( - total=self.__get_line_count(gff_file), desc="Parsing GFF3 File" - ) + + if self.use_tqdm: + progress_bar = tqdm( + total=self.__get_line_count(gff_file), desc="Parsing GFF3 File" + ) + else: + progress_bar = None + for line_raw in file: line_strip = line_raw.strip() if curr_line_num == 1 and not line_strip.startswith("##gff-version 
3"): @@ -572,9 +586,13 @@ def parse(self, feature_filter: tuple[str] = DEFAULT_FEATURE_FILTER): self.gene_annotations[gene_annotation.id] = ( gene_annotation ) - progress_bar.update(1) + if progress_bar is not None: + progress_bar.update(1) + curr_line_num += 1 - progress_bar.close() + + if progress_bar is not None: + progress_bar.close() def generate_ensembl_gene_annotation(self, attributes, curr_line_num): """ From 8b7f2ec517c77f3ac4e7279ec8d48084fe67c213 Mon Sep 17 00:00:00 2001 From: danielsf Date: Mon, 14 Jul 2025 15:20:38 -0700 Subject: [PATCH 5/6] can specify tmp_dir in Gff3 class This will allow users who are downloading and processing a large number of gff3 files to clean up after themselves by creating a parent tmp_dir and deleting all of its contents as needed. --- .../genome_annotation_translator.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/bkbit/data_translators/genome_annotation_translator.py b/bkbit/data_translators/genome_annotation_translator.py index 04e9d1c..84e239d 100644 --- a/bkbit/data_translators/genome_annotation_translator.py +++ b/bkbit/data_translators/genome_annotation_translator.py @@ -164,7 +164,8 @@ def __init__( assembly_strain=None, log_level="WARNING", log_to_file=False, - use_tqdm=True + use_tqdm=True, + tmp_dir=None ): """ Initializes an instance of the GFFTranslator class. @@ -176,6 +177,7 @@ def __init__( - assembly_strain (str, optional): The strain of the genome assembly. Defaults to None. - hash_functions (tuple[str]): A tuple of hash functions to use for generating checksums. Defaults to ('MD5'). """ + self._tmp_dir = tmp_dir self.use_tqdm = use_tqdm self.logger = setup_logger(LOG_FILE_NAME, log_level, log_to_file) try: @@ -258,7 +260,8 @@ def from_url( assembly_strain=None, log_level="WARNING", log_to_file=False, - use_tqdm=True + use_tqdm=True, + tmp_dir=None ): """ Initializes an instance of the GFFTranslator class, gleaning @@ -278,7 +281,8 @@ def from_url( assembly_strain=assembly_strain, log_level=log_level, log_to_file=log_to_file, - use_tqdm=use_tqdm + use_tqdm=use_tqdm, + tmp_dir=tmp_dir ) def __download_gff_file(self): @@ -299,7 +303,11 @@ def __download_gff_file(self): sha1_hash = hashlib.sha1() # Create a temporary file for the gzip data - with tempfile.NamedTemporaryFile(suffix=".gz", delete=False) as f_gzip: + with tempfile.NamedTemporaryFile( + suffix=".gz", + delete=False, + dir=self._tmp_dir) as f_gzip: + gzip_file_path = f_gzip.name # Create a progress bar @@ -527,7 +535,7 @@ def parse(self, feature_filter: tuple[str] = DEFAULT_FEATURE_FILTER): # Decompress the gzip file with gzip.open(self.gff_file, "rb") as f_in: # Create a temporary file to save the decompressed data - with tempfile.NamedTemporaryFile(delete=False) as f_out: + with tempfile.NamedTemporaryFile(delete=False, dir=self._tmp_dir) as f_out: # Copy the decompressed data to the temporary file f_out.write(f_in.read()) gff_file = f_out.name From e0ac0009c555a6e916c8377df981ae18f5d32b22 Mon Sep 17 00:00:00 2001 From: danielsf Date: Thu, 31 Jul 2025 11:40:50 -0700 Subject: [PATCH 6/6] factor out code detecting missing NCBI model also, if the model is missing, throw a custom error Previous commits in this branch made it possible for users to run data_translators/genome_annotation_translator.py without triggering the code that tells them how to download the NCBI model. This should fix that. 
Throwing a custom error will also allow users of this code to handle
different failure modes (maybe ENSEMBL is publishing a non-standard
GFF3 file) in different ways.
---
 bkbit/data_translators/error_classes.py       |  5 ++++
 .../genome_annotation_translator.py           | 30 ++++++++++++++-----
 2 files changed, 28 insertions(+), 7 deletions(-)
 create mode 100644 bkbit/data_translators/error_classes.py

diff --git a/bkbit/data_translators/error_classes.py b/bkbit/data_translators/error_classes.py
new file mode 100644
index 0000000..a653bf5
--- /dev/null
+++ b/bkbit/data_translators/error_classes.py
@@ -0,0 +1,5 @@
+class MissingModelError(Exception):
+    """
+    Custom error class to denote that a model is missing
+    """
+    pass
diff --git a/bkbit/data_translators/genome_annotation_translator.py b/bkbit/data_translators/genome_annotation_translator.py
index 84e239d..3893c4b 100644
--- a/bkbit/data_translators/genome_annotation_translator.py
+++ b/bkbit/data_translators/genome_annotation_translator.py
@@ -70,6 +70,7 @@
 from bkbit.utils.setup_logger import setup_logger
 from bkbit.utils.load_json import load_json
 
+import bkbit.data_translators.error_classes as error_classes
 
 ## CONSTANTS ##
@@ -180,13 +181,9 @@ def __init__(
         self._tmp_dir = tmp_dir
         self.use_tqdm = use_tqdm
         self.logger = setup_logger(LOG_FILE_NAME, log_level, log_to_file)
-        try:
-            self.taxon_scientific_name = load_json(TAXON_SCIENTIFIC_NAME_PATH)
-            self.taxon_common_name = load_json(TAXON_COMMON_NAME_PATH)
-        except FileNotFoundError as e:
-            self.logger.critical("NCBI Taxonomy not downloaded. Run 'bkbit download-ncbi-taxonomy' command first." )
-            print(e)
-            sys.exit(2)
+        detect_model_presence(logger=self.logger)
+        self.taxon_scientific_name = load_json(TAXON_SCIENTIFIC_NAME_PATH)
+        self.taxon_common_name = load_json(TAXON_COMMON_NAME_PATH)
 
         self.content_url = content_url
 
@@ -273,6 +270,7 @@ def from_url(
         - log_level (str, optional): The logging level. Defaults to "WARNING".
         - log_to_file (bool, optional): Whether to write logs to a file. Defaults to False.
         """
+        detect_model_presence()
         url_metadata = parse_url(content_url=content_url)
         return cls(
             content_url=content_url,
@@ -981,6 +979,24 @@ def parse_url(content_url):
     return None
 
 
+def detect_model_presence(logger=None):
+    """
+    Detect whether the NCBI taxonomy model is present. If not, raise a
+    custom error.
+
+    Parameters
+    ----------
+    logger:
+        Optional logger to which the error message is written.
+    """
+    scientific_name_path = pathlib.Path(TAXON_SCIENTIFIC_NAME_PATH)
+    taxon_common_name_path = pathlib.Path(TAXON_COMMON_NAME_PATH)
+    if not scientific_name_path.is_file() or not taxon_common_name_path.is_file():
+        msg = "NCBI Taxonomy not downloaded. Run 'bkbit download-ncbi-taxonomy' command first."
+        if logger is not None:
+            logger.critical(msg)
+        raise error_classes.MissingModelError(msg)
+
+
 @click.command()
 ##ARGUEMENTS##
 # Argument #1: The URL of the GFF file
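
Taken together, patches 1-3 leave two ways to build a translator. The
sketch below shows both paths. It assumes the bkbit package layout in
the diffs above; the NCBI annotation-release URL and the release_version
value are illustrative placeholders rather than verified releases
(taxon ID 9606 is human).

    from bkbit.models import genome_annotation as ga
    from bkbit.data_translators.genome_annotation_translator import Gff3

    # Path 1: a URL that encodes its own metadata. from_url() calls
    # parse_url() and forwards the resulting dict to the constructor.
    gff3 = Gff3.from_url(
        content_url=(
            "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/"
            "9606/110/GCF_000001405.40_GRCh38.p14/"
            "GCF_000001405.40_GRCh38.p14_genomic.gff.gz"
        )
    )

    # Path 2: the URL from the patch-1 commit message encodes no taxon
    # or release, so the metadata dict is assembled by hand instead.
    url_metadata = {
        "authority": ga.AuthorityType.NCBI,
        "taxonid": "9606",
        "release_version": "110",  # illustrative value
        "assembly_accession": "GCF_000001405.40",
        "assembly_name": "GRCh38.p14",
    }
    gff3 = Gff3(
        content_url=(
            "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/"
            "GCF_000001405.40_GRCh38.p14/"
            "GCF_000001405.40_GRCh38.p14_genomic.gff.gz"
        ),
        url_metadata=url_metadata,
    )

Either way, gff3.parse() and gff3.serialize_to_jsonld() behave the same
afterwards; the constructor no longer cares where the metadata came from.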
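
Patches 2, 4, and 5 all serve the same batch-processing use case: no
progress bars, JSON-LD kept in memory rather than printed, and every
download confined to one disposable directory. A sketch of that
workflow under the same assumptions (the URL list is a placeholder):

    import json
    import pathlib
    import shutil
    import tempfile

    from bkbit.data_translators.genome_annotation_translator import Gff3

    urls = [
        # ... GFF3 URLs to ingest (placeholder)
    ]

    scratch = tempfile.mkdtemp(prefix="gff3_scratch_")
    try:
        for url in urls:
            gff3 = Gff3.from_url(
                content_url=url,
                use_tqdm=False,   # patch 4: silence the progress bars
                tmp_dir=scratch,  # patch 5: downloads land under scratch
            )
            gff3.parse()
            graph = gff3.serialize_to_jsonld()  # patch 2: a dict, nothing printed
            out_name = pathlib.Path(url).name.split(".gff")[0] + ".jsonld"
            with open(out_name, "w", encoding="utf-8") as dst:
                json.dump(graph, dst, indent=2)
    finally:
        shutil.rmtree(scratch)  # remove every temporary download at once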
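
Finally, once patch 6 is applied, both construction paths raise
error_classes.MissingModelError instead of calling sys.exit(2) when the
NCBI taxonomy files are absent, so callers can decide how to recover.
A sketch (the URL is a placeholder):

    from bkbit.data_translators.error_classes import MissingModelError
    from bkbit.data_translators.genome_annotation_translator import Gff3

    url = "https://example.org/some_annotation.gff3.gz"  # placeholder

    try:
        gff3 = Gff3.from_url(content_url=url)
    except MissingModelError:
        # The taxonomy lookup tables are not on disk yet; surface the
        # documented fix instead of dying inside the library.
        raise SystemExit("Run 'bkbit download-ncbi-taxonomy' and retry.")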