From 721e125bb12ec3039797b8c5ebda0aeda3069460 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 4 Jul 2023 15:45:25 +0200 Subject: [PATCH] 153 bgc mappings generation (#155) * change BGC attributes type from list to tuple The following BGC attributes are updated: - product_prediction - mibig_bgc_class - smiles * use positional-only parameter in BGC and GCF Parameters before "/" are positional-only parameters, see https://docs.python.org/3/glossary.html#term-parameter. * update BGC's `__eq__` and `__hash__` * update GCF's `__eq__` and `__hash__` * Update gcf.py * update Spectrum's `__eq__` and `__hash__` * update MolecularFamily `__eq__` and `__hash__` * Update molecular_family.py * update Strain `__eq__` and `__hash__` * update StrainCollection `__eq__` * add TODO comments to ObjectLink * add parameter type check for `add_alias` * add `__contains__` to Strain class * update `lookup` method of StrainCollection * update `__contains__` in StrainCollection * remove from __eq__ * update `__eq__` logic for Strain * rename `_strain_dict_id` to `_strain_dict_name` in StrainCollection * add comments to `get_common_strains` * add comments and rename variables for DataLinks * add comment about `met_only` parameter * add todo comments to LinkFinder * add comments to GNPSSpectrumLoader to figure out how `spectrum_id` is set * change Spectrum.spectrum_id from type int to str * update spec_dict * Update tests * update `__eq__` in MolecularFamily * change `MolecularFamily.family_id` from type int to str * add method `has_strain` to MolecularFamily * Update metcalf_scoring.py * change array to dataframe in DataLinks 1. Change array to dataframe: - self.M_gcf_strain -> self.gcf_strain_occurrence - self. M_spec_strain -> self.spec_strain_occurrence - self. M_fam_strain -> mf_strain_occurrence 2. update relevant methods to get the new dataframes 3. update logics of method `common_strains` using the new dataframes * update references of the new dataframes from DataLinks * update logics of `get_links` in NPLinker class * Update test_nplinker.py - add code to remove cached results * move SCORING_METHODS to LinkFinder * update method name to `get_common_strains` * refactor mapping dataframes in DataLinks * add TODOs and deprecation to LinkFinder * refactor cooccurrence in DataLinks * merge `load_data` and `find_correlations` to init in DataLinks * refactor DataLinks attributes - Move assignment of attributes to `__init__` - Rename attributes - Replace `fam` or `molfam` with `mf` to refer to molecular family - Add docstrings * Delete test_data_links.py * update get_common_strains methods - update parameters to be more clear and specific - change strain id in returned dict to strain objects - update docstrings * remove lookup_index method from StrainCollection (#90) - remove method `lookup_index` - remove attribute `_strain_dict_index` * Remove integer id from GCF * update lookup methods and attributes in NPLikner class * change cooccurrence from array to DataFrame in DataLinks * format link_finder.py * temp replace array with dataframe in LinkFinder for metcalf scoring * refactor `LinkFinder.get_scores` method * refactor `LinkFinder.metcalf_scoring` method - rename parameter name - wrap parameters for weights to one parameter - extract private method `_cal_mean_std` * refactor get_links * remove unused methods and scorings from LinkFinder - remove unused `likescore` and `hg` scoring types - remove all unused methods * refactor returned type of `LinkFinder.get_links` method * add `lookup_mf` method in NPLinker class * refactor MetcalfScoring class * add deprecation to LinkLikelihood class * add `__init__.py` to linking module * rename `data_linking.py` to `data_links.py` * rename `data_linking_functions.py` to `utils.py` * rename `test_data_linking_functions.py` to `test_linking_utils.py`.py * Delete test_scoring.py * add dtype to DataLinks dataframes * remove mapping dataframes and relevant method from DataLinks Removed: - self.mapping_spec - self.mapping_gcf - self.mapping_fam -self.mapping_strain - _get_mappings_from_occurrence() method * Create test_data_links.py * add `conftest.py` for scoring tests * update LinkFinder's attribute and private method - refactor method `_cal_mean_std` - rename attribute `raw_score_fam_gcf` to `raw_score_mf_gcf` * Create test_link_finder.py * Update vscode plugin autodocstring template - fix indentation bug in autodocsting - remove `Examples:` section * add scope for fixtures * Create test_metcalf_scoring.py * add docstrings and type hints to `MetcalfScoring` class * add util func `isinstance_all` * replace `_isinstance` with util func `isinstance_all` * update validation of args for `DataLinks` * Update test_data_links.py - add docstrings - add more tests * add type hints for returned values to unit tests * update exception types for invalid input * add docstrings and type hints to `LinkFinder` class * add more unit tests for `LinkFinder` * fix input type bug for `DataLinks.get_common_strains` * Create test_nplinker_scoring.py * add todo comments to `NPLinker` class * remove local integration tests for scoring part of `NPLinker` - rename `test_nplinker.py` to `test_nplinker_local.py` * remove unused imports * Fix mypy warnings as much as possible * check strain existence using strain dict * change calculate abbrevation from "cal" to "calc" * remove resolved TODO comment * move shared fixtures to conftest.py * remove unnecessary type hints * update docstrings for cooccurrences * use uuid for singleton molecular families #144 * add TODO comment for GNPSLoader * fix typos * remove useless parameter `met_only` The `met_only` is useless. NPlinker will stop working if met_only=True. * update exception type * refactor the usage of PODPDownloader 1. create instance in the private method, only when it's needed 2. change the scope of the instance from global to local * rename private config attributes in class DatasetLoader - add prefix `_config` for all config attributes - add comments to restructure `__init__` code * change the variable of app data dir to be global this variable is independent of DatasetLoader and other classes, so it should be a global variable * change two public methods to variables * change one public method to attribute for DatasetLoader * add value validation to Config - move the validation of antismash format config in DatasetLoader to Config class - refactor the config data validations into a private method * add TODO comments about init and validate paths * remove unused attribute `growth_media` * remove commented code * add TODO comments * remove unused imports * format the code * reorder methods in loader.py * add function `generate_genome_bgc_mappings_file` * Update __init__.py * add tests for `generate_genome_bgc_mappings_file` * update strain test when 113 issue is closed --- src/nplinker/genomics/__init__.py | 6 +++- src/nplinker/genomics/genomics.py | 49 ++++++++++++++++++++++++++++++- tests/genomics/test_genomics.py | 33 +++++++++++++++++++-- 3 files changed, 83 insertions(+), 5 deletions(-) diff --git a/src/nplinker/genomics/__init__.py b/src/nplinker/genomics/__init__.py index 7f16a744..525c3015 100644 --- a/src/nplinker/genomics/__init__.py +++ b/src/nplinker/genomics/__init__.py @@ -1,15 +1,17 @@ import logging - from .abc import BGCLoaderBase from .bgc import BGC from .gcf import GCF from .genomics import filter_mibig_only_gcf +from .genomics import generate_genome_bgc_mappings_file +from .genomics import GENOME_BGC_MAPPINGS_FILENAME from .genomics import get_bgcs_from_gcfs from .genomics import get_strains_from_bgcs from .genomics import load_gcfs from .genomics import map_bgc_to_gcf from .genomics import map_strain_to_bgc + logging.getLogger(__name__).addHandler(logging.NullHandler()) __all__ = [ @@ -17,6 +19,8 @@ "BGC", "GCF", "filter_mibig_only_gcf", + "generate_genome_bgc_mappings_file", + "GENOME_BGC_MAPPINGS_FILENAME", "get_bgcs_from_gcfs", "get_strains_from_bgcs", "load_gcfs", diff --git a/src/nplinker/genomics/genomics.py b/src/nplinker/genomics/genomics.py index 2ed1ab62..751dfd29 100644 --- a/src/nplinker/genomics/genomics.py +++ b/src/nplinker/genomics/genomics.py @@ -1,16 +1,62 @@ from __future__ import annotations import csv +import json from os import PathLike from pathlib import Path from deprecated import deprecated from nplinker.logconfig import LogConfig from nplinker.strain_collection import StrainCollection +from nplinker.utils import list_dirs +from nplinker.utils import list_files from .bgc import BGC from .gcf import GCF logger = LogConfig.getLogger(__name__) +GENOME_BGC_MAPPINGS_FILENAME = "genome_bgc_mappings.json" + + +def generate_genome_bgc_mappings_file(bgc_dir: str | PathLike) -> None: + """Generate a file that maps genome id to BGC id. + + The output file is named in variable `GENOME_BGC_MAPPINGS_FILENAME` and + is placed in the same directory as `bgc_dir`. The file will be overwritten + if it already exists. + + Args: + bgc_dir(str | PathLike): The directory has one-layer of subfolders and + each subfolder contains BGC files in `.gbk` format. + It assumes that + - the subfolder name is the genome id (e.g. refseq), + - the BGC file name is the BGC id. + """ + bgc_dir = Path(bgc_dir) + genome_bgc_mappings = {} + + for subdir in list_dirs(bgc_dir): + genome_id = Path(subdir).name + bgc_files = list_files(subdir, suffix=(".gbk"), keep_parent=False) + bgc_ids = [ + bgc_id for f in bgc_files if (bgc_id := Path(f).stem) != genome_id + ] + genome_bgc_mappings[genome_id] = bgc_ids + + # sort mappings by genome_id and construct json data + genome_bgc_mappings = dict(sorted(genome_bgc_mappings.items())) + json_data = [{ + "genome_ID": k, + "BGC_ID": v + } for k, v in genome_bgc_mappings.items()] + json_data = { + "mappings": json_data, + "count": len(json_data), + "version": "1.0" + } + + with open(bgc_dir / GENOME_BGC_MAPPINGS_FILENAME, "w") as f: + json.dump(json_data, f) + def map_strain_to_bgc(strains: StrainCollection, bgcs: list[BGC], bgc_genome_mapping: dict[str, str]): @@ -36,7 +82,8 @@ def map_strain_to_bgc(strains: StrainCollection, bgcs: list[BGC], genome_id = bgc_genome_mapping[bgc.bgc_id] except KeyError as e: raise KeyError( - f"Not found BGC id {bgc.bgc_id} in BGC-genome mappings.") from e + f"Not found BGC id {bgc.bgc_id} in BGC-genome mappings." + ) from e try: strain = strains.lookup(genome_id) except KeyError as e: diff --git a/tests/genomics/test_genomics.py b/tests/genomics/test_genomics.py index dfde9e6c..2abc3dc6 100644 --- a/tests/genomics/test_genomics.py +++ b/tests/genomics/test_genomics.py @@ -1,14 +1,43 @@ from __future__ import annotations +import json import pytest from nplinker.genomics import BGC from nplinker.genomics import filter_mibig_only_gcf from nplinker.genomics import GCF +from nplinker.genomics import generate_genome_bgc_mappings_file +from nplinker.genomics import GENOME_BGC_MAPPINGS_FILENAME from nplinker.genomics import get_bgcs_from_gcfs from nplinker.genomics import get_strains_from_bgcs from nplinker.genomics import map_bgc_to_gcf from nplinker.genomics import map_strain_to_bgc from nplinker.strain_collection import StrainCollection from nplinker.strains import Strain +from .. import DATA_DIR + + +def test_generate_genome_bgc_mappings_file(): + bgc_dir = DATA_DIR / "antismash" + + generate_genome_bgc_mappings_file(bgc_dir) + + with open(bgc_dir / GENOME_BGC_MAPPINGS_FILENAME) as f: + mappings = json.load(f) + + assert mappings["count"] == 2 + + assert mappings["mappings"][0]["genome_ID"] == "GCF_000514515.1" + assert len(mappings["mappings"][0]["BGC_ID"]) == 20 + for bgc_id in ["NZ_AZWB01000005.region001", "NZ_KI911412.1.region001"]: + assert bgc_id in mappings["mappings"][0]["BGC_ID"] + assert "GCF_000514515.1" not in mappings["mappings"][0]["BGC_ID"] + + assert mappings["mappings"][1]["genome_ID"] == "GCF_000514855.1" + assert len(mappings["mappings"][1]["BGC_ID"]) == 24 + for bgc_id in ["NZ_AZWS01000001.region001", "NZ_KI911483.1.region001"]: + assert bgc_id in mappings["mappings"][1]["BGC_ID"] + assert "GCF_000514855.1" not in mappings["mappings"][1]["BGC_ID"] + + (bgc_dir / GENOME_BGC_MAPPINGS_FILENAME).unlink() @pytest.fixture @@ -131,6 +160,4 @@ def test_get_strains_from_bgcs(strain_collection, bgc_list, map_strain_to_bgc(strain_collection, bgc_list, bgc_genome_mapping) strains = get_strains_from_bgcs(bgc_list) assert isinstance(strains, StrainCollection) - for strain in strain_collection: - assert strain in strains - # assert strains == strain_collection # use it when issue #113 is solved + assert strains == strain_collection