From 8e4dcd85c4285e0cb8d7d9d66b5168bc9a622c62 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Fri, 11 Aug 2023 13:57:31 +0200 Subject: [PATCH 01/21] remove useless step --- src/nplinker/nplinker.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/src/nplinker/nplinker.py b/src/nplinker/nplinker.py index 7c73804d..7b07ec9f 100644 --- a/src/nplinker/nplinker.py +++ b/src/nplinker/nplinker.py @@ -263,14 +263,8 @@ def load_data(self, new_bigscape_cutoff=None): return False else: # CG: only reload genomics data when changing bigscape cutoff - # TODO: this part should be removed, reload everything if bigscape data changes. - # 1. change the cutoff (which by itself doesn't do anything) self._loader._bigscape_cutoff = new_bigscape_cutoff - # 2. reload the strain mappings (MiBIG filtering may have removed strains - # that were originally present, need to restore them all so the filtering - # won't break when it runs again in next stage) - self._loader._load_strain_mappings() - # 3. reload the genomics data with the new cutoff applied + # TODO: only need to reload gcfs using load_gcfs() self._loader._load_genomics() self._spectra = self._loader.spectra From 49171613d8d3d68cabe5acad23e425560731b0b6 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Fri, 11 Aug 2023 13:58:30 +0200 Subject: [PATCH 02/21] remove completed TODO comment --- src/nplinker/genomics/gcf.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/nplinker/genomics/gcf.py b/src/nplinker/genomics/gcf.py index 1bf413f1..c1b0d9d9 100644 --- a/src/nplinker/genomics/gcf.py +++ b/src/nplinker/genomics/gcf.py @@ -3,6 +3,7 @@ from nplinker.logconfig import LogConfig from nplinker.strain_collection import StrainCollection + if TYPE_CHECKING: from nplinker.strains import Strain from .bgc import BGC @@ -33,8 +34,6 @@ def __init__(self, gcf_id: str, /) -> None: self.gcf_id = gcf_id self._bgcs: set[BGC] = set() self.bigscape_class: str | None = None - # CG TODO: remove attribute id, see issue 103 - # https://github.com/NPLinker/nplinker/issues/103 self.bgc_ids: set[str] = set() self.strains: StrainCollection = StrainCollection() From afd81e5d31fa237e00c4d9bd60d17835cae0b1c7 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Fri, 11 Aug 2023 13:58:45 +0200 Subject: [PATCH 03/21] remove param `bgc_genome_mapping` from func `map_strain_to_bgc` The StrainCollection should contains the bgc id, so genome id is not required. --- src/nplinker/genomics/genomics.py | 27 +++------- tests/genomics/test_genomics.py | 87 +++++++++++++------------------ 2 files changed, 44 insertions(+), 70 deletions(-) diff --git a/src/nplinker/genomics/genomics.py b/src/nplinker/genomics/genomics.py index 3c1ec2ee..42b8001a 100644 --- a/src/nplinker/genomics/genomics.py +++ b/src/nplinker/genomics/genomics.py @@ -69,38 +69,25 @@ def generate_mappings_genome_id_bgc_id( logger.info("Generated genome-BGC mappings file: %s", output_file) -def map_strain_to_bgc(strains: StrainCollection, bgcs: list[BGC], - bgc_genome_mapping: dict[str, str]): +def map_strain_to_bgc(strains: StrainCollection, bgcs: list[BGC]): """To set BGC object's strain with representative strain object. This method changes the list `bgcs` in place. - It's assumed that BGC's genome id is used as strain's name or alias, and - the genome id is used to lookup the representative strain. - Args: strains(StrainCollection): A collection of all strain objects. bgcs(list[BGC]): A list of BGC objects. 
- bgc_genome_mapping(dict[str, str]): The mappings from BGC id (key) to - genome id (value). Raises: - KeyError: BGC id not found in the `bgc_genome_mapping` dict. KeyError: Strain id not found in the strain collection. """ for bgc in bgcs: try: - genome_id = bgc_genome_mapping[bgc.bgc_id] + strain = strains.lookup(bgc.bgc_id) except KeyError as e: raise KeyError( - f"Not found BGC id {bgc.bgc_id} in BGC-genome mappings." - ) from e - try: - strain = strains.lookup(genome_id) - except KeyError as e: - raise KeyError( - f"Strain id {genome_id} from BGC object {bgc.bgc_id} " - "not found in the StrainCollection object.") from e + f"Strain id '{bgc.bgc_id}' from BGC object '{bgc.bgc_id}' " + "not found in the strain collection.") from e bgc.strain = strain @@ -122,8 +109,9 @@ def map_bgc_to_gcf(bgcs: list[BGC], gcfs: list[GCF]): try: bgc = bgc_dict[bgc_id] except KeyError as e: - raise KeyError(f"BGC id {bgc_id} from GCF object {gcf.gcf_id} " - "not found in the list of BGC objects.") from e + raise KeyError( + f"BGC id '{bgc_id}' from GCF object '{gcf.gcf_id}' " + "not found in the list of BGC objects.") from e gcf.add_bgc(bgc) @@ -155,6 +143,7 @@ def get_strains_from_bgcs(bgcs: list[BGC]) -> StrainCollection: return sc + @deprecated(version="1.3.3", reason="It is split to separate functions: " \ "map_strain_to_bgc, map_bgc_to_gcf, filter_mibig_only_gcf, " \ "get_bgcs_from_gcfs and get_strains_from_bgcs.") diff --git a/tests/genomics/test_genomics.py b/tests/genomics/test_genomics.py index 8b566b90..37fdef36 100644 --- a/tests/genomics/test_genomics.py +++ b/tests/genomics/test_genomics.py @@ -73,121 +73,106 @@ def test_generate_mappings_genome_id_bgc_id_empty_dir(tmp_path, caplog): @pytest.fixture def strain_collection() -> StrainCollection: sc = StrainCollection() - sc.add(Strain("G-SAMPLE0001")) - strain = Strain("S-EXAMPLE002") - strain.add_alias("G-SAMPLE0002") + strain = Strain("STRAIN_01") + strain.add_alias("BGC_01") sc.add(strain) - sc.add(Strain("BGC0000001")) - sc.add(Strain("BGC0000002")) - return sc + strain = Strain("STRAIN_02") + strain.add_alias("BGC_02") + strain.add_alias("BGC_02_1") + sc.add(strain) + strain = Strain("SAMPLE_BGC_03") + sc.add(strain) -@pytest.fixture -def bgc_genome_mapping() -> dict[str, str]: - return { - "SAMPLE0001": "G-SAMPLE0001", - "SAMPLE0002": "G-SAMPLE0002", - "BGC0000001": "BGC0000001", - "BGC0000002": "BGC0000002" - } + return sc @pytest.fixture def bgc_list() -> list[BGC]: return [ - BGC("SAMPLE0001", "NPR"), - BGC("SAMPLE0002", "Alkaloid"), - BGC("BGC0000001", "Polyketide"), - BGC("BGC0000002", "Terpene") + BGC("BGC_01", "NPR"), + BGC("BGC_02", "Alkaloid"), + BGC("SAMPLE_BGC_03", "Polyketide") ] @pytest.fixture def gcf_list() -> list[GCF]: gcf1 = GCF("1") - gcf1.bgc_ids |= set(("SAMPLE0001", "SAMPLE0002")) + gcf1.bgc_ids |= {"BGC_01"} gcf2 = GCF("2") - gcf2.bgc_ids |= set(("BGC0000001", "BGC0000002")) + gcf2.bgc_ids |= {"BGC_02", "SAMPLE_BGC_03"} return [gcf1, gcf2] @pytest.fixture def gcf_list_error() -> list[GCF]: gcf1 = GCF("1") - gcf1.bgc_ids |= set(("SAMPLE0001", "SAMPLE0003")) + gcf1.bgc_ids |= {"SAMPLE_BGC_03", "BGC_04"} return [gcf1] -def test_map_strain_to_bgc(strain_collection, bgc_list, bgc_genome_mapping): +def test_map_strain_to_bgc(strain_collection, bgc_list): for bgc in bgc_list: assert bgc.strain is None - map_strain_to_bgc(strain_collection, bgc_list, bgc_genome_mapping) + map_strain_to_bgc(strain_collection, bgc_list) for bgc in bgc_list: assert bgc.strain is not None - assert bgc_list[0].strain.id == "G-SAMPLE0001" - 
assert bgc_list[1].strain.id == "S-EXAMPLE002" - assert bgc_list[2].strain.id == "BGC0000001" - assert bgc_list[3].strain.id == "BGC0000002" + assert bgc_list[0].strain.id == "STRAIN_01" + assert bgc_list[1].strain.id == "STRAIN_02" + assert bgc_list[2].strain.id == "SAMPLE_BGC_03" def test_map_strain_to_bgc_error(strain_collection): - bgc_genome_mapping = {"SAMPLE0003": "G-SAMPLE0003"} - bgcs = [BGC("BGC0000003", ["Polyketide"])] - with pytest.raises(KeyError) as e: - map_strain_to_bgc(strain_collection, bgcs, bgc_genome_mapping) - assert "Not found BGC id BGC0000003 in BGC-genome mappings." in e.value.args[ - 0] - - bgcs = [BGC("SAMPLE0003", ["NPR"])] + bgcs = [BGC("BGC_04", "NPR")] with pytest.raises(KeyError) as e: - map_strain_to_bgc(strain_collection, bgcs, bgc_genome_mapping) - assert "Strain id G-SAMPLE0003 from BGC object SAMPLE0003 not found" in e.value.args[ + map_strain_to_bgc(strain_collection, bgcs) + assert "Strain id 'BGC_04' from BGC object 'BGC_04' not found" in e.value.args[ 0] def test_map_bgc_to_gcf(bgc_list, gcf_list): - assert gcf_list[0].bgc_ids == set(("SAMPLE0001", "SAMPLE0002")) - assert gcf_list[1].bgc_ids == set(("BGC0000001", "BGC0000002")) + assert gcf_list[0].bgc_ids == {"BGC_01"} + assert gcf_list[1].bgc_ids == {"BGC_02", "SAMPLE_BGC_03"} assert len(gcf_list[0].bgcs) == 0 assert len(gcf_list[1].bgcs) == 0 map_bgc_to_gcf(bgc_list, gcf_list) - assert gcf_list[0].bgc_ids == set(("SAMPLE0001", "SAMPLE0002")) - assert gcf_list[1].bgc_ids == set(("BGC0000001", "BGC0000002")) - assert len(gcf_list[0].bgcs) == 2 + assert gcf_list[0].bgc_ids == {"BGC_01"} + assert gcf_list[1].bgc_ids == {"BGC_02", "SAMPLE_BGC_03"} + assert len(gcf_list[0].bgcs) == 1 assert len(gcf_list[1].bgcs) == 2 - assert gcf_list[0].bgcs == set(bgc_list[:2]) - assert gcf_list[1].bgcs == set(bgc_list[2:]) + assert gcf_list[0].bgcs == set(bgc_list[:1]) + assert gcf_list[1].bgcs == set(bgc_list[1:]) def test_map_bgc_to_gcf_error(bgc_list, gcf_list_error): - assert gcf_list_error[0].bgc_ids == set(("SAMPLE0001", "SAMPLE0003")) + assert gcf_list_error[0].bgc_ids == {"SAMPLE_BGC_03", "BGC_04"} assert len(gcf_list_error[0].bgcs) == 0 with pytest.raises(KeyError) as e: map_bgc_to_gcf(bgc_list, gcf_list_error) - assert "BGC id SAMPLE0003 from GCF object 1 not found" in e.value.args[0] + assert "BGC id 'BGC_04' from GCF object '1' not found" in e.value.args[0] def test_filter_mibig_only_gcf(bgc_list, gcf_list): map_bgc_to_gcf(bgc_list, gcf_list) gcfs = filter_mibig_only_gcf(gcf_list) assert len(gcfs) == 1 - assert gcfs[0].gcf_id == "1" + assert gcfs[0].gcf_id == "2" def test_get_bgcs_from_gcfs(bgc_list, gcf_list): map_bgc_to_gcf(bgc_list, gcf_list) bgcs = get_bgcs_from_gcfs(gcf_list) assert isinstance(bgcs, list) - assert len(bgcs) == 4 + assert len(bgcs) == 3 for i in bgcs: assert isinstance(i, BGC) -def test_get_strains_from_bgcs(strain_collection, bgc_list, - bgc_genome_mapping): - map_strain_to_bgc(strain_collection, bgc_list, bgc_genome_mapping) +def test_get_strains_from_bgcs(strain_collection, bgc_list): + map_strain_to_bgc(strain_collection, bgc_list) strains = get_strains_from_bgcs(bgc_list) assert isinstance(strains, StrainCollection) assert strains == strain_collection From 21cd0fc83cdd6ed6ca99d16065db660f80e098c2 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Fri, 11 Aug 2023 16:34:44 +0200 Subject: [PATCH 04/21] add property names to class `Strain` --- src/nplinker/strains.py | 9 +++++++++ tests/test_strain.py | 6 ++++++ 2 files changed, 15 insertions(+) diff --git 
a/src/nplinker/strains.py b/src/nplinker/strains.py index 7b07a588..b1afcf70 100644 --- a/src/nplinker/strains.py +++ b/src/nplinker/strains.py @@ -43,6 +43,15 @@ def __contains__(self, alias: str) -> bool: raise TypeError(f'Expected str, got {type(alias)}') return alias in self._aliases + @property + def names(self) -> set[str]: + """Get the set of strain names including id and aliases. + + Returns: + set[str]: A set of names associated with the strain. + """ + return self._aliases | {self.id} + @property def aliases(self) -> set[str]: """Get the set of known aliases. diff --git a/tests/test_strain.py b/tests/test_strain.py index c8ed0a56..34540bec 100644 --- a/tests/test_strain.py +++ b/tests/test_strain.py @@ -26,10 +26,16 @@ def test_hash(strain: Strain): assert hash(strain) == hash("strain_1") +def test_names(strain: Strain): + assert strain.names == {"strain_1", "strain_1_a"} + + def test_alias(strain: Strain): assert len(strain.aliases) == 1 + assert "strain_1_a" in strain.aliases def test_add_alias(strain: Strain): strain.add_alias("strain_1_b") assert len(strain.aliases) == 2 + assert "strain_1_b" in strain.aliases From 3485155e921208c3f979b2cb28541449fab7de31 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 15 Aug 2023 14:37:12 +0200 Subject: [PATCH 05/21] update the logic of method `__contains__` in StrainCollection --- src/nplinker/strain_collection.py | 21 ++++++--------------- tests/test_strain_collection.py | 25 +++++++++---------------- 2 files changed, 15 insertions(+), 31 deletions(-) diff --git a/src/nplinker/strain_collection.py b/src/nplinker/strain_collection.py index d221ce86..b9369bb9 100644 --- a/src/nplinker/strain_collection.py +++ b/src/nplinker/strain_collection.py @@ -1,14 +1,10 @@ import json from os import PathLike -from pathlib import Path from typing import Iterator -from deprecated import deprecated from jsonschema import validate from nplinker.schemas import STRAIN_MAPPINGS_SCHEMA from .logconfig import LogConfig from .strains import Strain -from .utils import list_dirs -from .utils import list_files logger = LogConfig.getLogger(__name__) @@ -41,16 +37,12 @@ def __eq__(self, other) -> bool: and self._strain_dict_name == other._strain_dict_name) return NotImplemented - def __contains__(self, item: str | Strain) -> bool: - """Check if the strain collection contains the given strain. - - The given strain could be a Strain object, or a strain id or alias. + def __contains__(self, item: Strain) -> bool: + """Check if the strain collection contains the given Strain object. """ - if isinstance(item, str): - return item in self._strain_dict_name if isinstance(item, Strain): return item.id in self._strain_dict_name - raise TypeError(f"Expected Strain or str, got {type(item)}") + raise TypeError(f"Expected Strain, got {type(item)}") def __iter__(self) -> Iterator[Strain]: return iter(self._strains) @@ -83,9 +75,8 @@ def remove(self, strain: Strain): """ if strain.id in self._strain_dict_name: self._strains.remove(strain) - del self._strain_dict_name[strain.id] - for alias in strain.aliases: - del self._strain_dict_name[alias] + for name in strain.names: + del self._strain_dict_name[name] def filter(self, strain_set: set[Strain]): """ @@ -109,7 +100,7 @@ def lookup(self, name: str) -> Strain: Raises: KeyError: If the strain name is not found. 
""" - if name in self: + if name in self._strain_dict_name: return self._strain_dict_name[name] raise KeyError(f"Strain {name} not found in strain collection.") diff --git a/tests/test_strain_collection.py b/tests/test_strain_collection.py index c809759c..944464e4 100644 --- a/tests/test_strain_collection.py +++ b/tests/test_strain_collection.py @@ -31,10 +31,8 @@ def test_eq(collection: StrainCollection, strain: Strain): def test_contains(collection: StrainCollection, strain: Strain): assert strain in collection - assert strain.id in collection - for alias in strain.aliases: - assert alias in collection - assert "strain_not_exist" not in collection + strain2 = Strain("strain_2") + assert strain2 not in collection def test_iter(collection: StrainCollection, strain: Strain): @@ -46,31 +44,25 @@ def test_add(strain: Strain): sut = StrainCollection() sut.add(strain) assert strain in sut - for alias in strain.aliases: - assert alias in sut + for name in strain.names: + assert name in sut._strain_dict_name def test_remove(collection: StrainCollection, strain: Strain): - assert strain in collection collection.remove(strain) - with pytest.raises(KeyError): - _ = collection._strain_dict_name[strain.id] assert strain not in collection def test_filter(collection: StrainCollection, strain: Strain): - assert strain in collection collection.add(Strain("strain_2")) collection.filter({strain}) - assert strain in collection - assert "strain_2" not in collection + assert "strain_2" not in collection._strain_dict_name assert len(collection) == 1 def test_lookup(collection: StrainCollection, strain: Strain): - assert collection.lookup(strain.id) == strain - for alias in strain.aliases: - assert collection.lookup(alias) == strain + for name in strain.names: + assert collection.lookup(name) == strain with pytest.raises(KeyError): collection.lookup("strain_not_exist") @@ -85,7 +77,8 @@ def json_file(tmp_path): "strain_id": "strain_2", "strain_alias": ["alias_3", "alias_4"] }], - "version": "1.0" + "version": + "1.0" } file_path = tmp_path / "test.json" with open(file_path, "w") as f: From 1211657206275d45ee27c7b43a96eb24fdfbc486 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Mon, 14 Aug 2023 11:11:56 +0200 Subject: [PATCH 06/21] add method `has_name` to class `StrainCollection` --- src/nplinker/metabolomics/load_gnps.py | 18 +++++++++--------- .../pairedomics/strain_mappings_generator.py | 2 +- src/nplinker/strain_collection.py | 11 +++++++++++ tests/test_strain_collection.py | 6 ++++++ 4 files changed, 27 insertions(+), 10 deletions(-) diff --git a/src/nplinker/metabolomics/load_gnps.py b/src/nplinker/metabolomics/load_gnps.py index 14a5ec2c..a725e9f4 100644 --- a/src/nplinker/metabolomics/load_gnps.py +++ b/src/nplinker/metabolomics/load_gnps.py @@ -30,14 +30,14 @@ def _messy_strain_naming_lookup(mzxml: str, strains: StrainCollection) -> Strain Returns: Strain or None: The strain identified to be matching or None. """ - if mzxml in strains: + if strains.has_name(mzxml): # life is simple! return strains.lookup(mzxml) # 1. knock off the .mzXML and try again (using splitext should also handle # other extensions like .mzML here) mzxml_no_ext = os.path.splitext(mzxml)[0] - if mzxml_no_ext in strains: + if strains.has_name(mzxml_no_ext): return strains.lookup(mzxml_no_ext) # 2. 
if that doesn't work, try using everything up to the first -/_ @@ -46,15 +46,15 @@ def _messy_strain_naming_lookup(mzxml: str, strains: StrainCollection) -> Strain mzxml_trunc_underscore = mzxml_no_ext if underscore_index == -1 else mzxml_no_ext[:underscore_index] mzxml_trunc_hyphen = mzxml_no_ext if hyphen_index == -1 else mzxml_no_ext[:hyphen_index] - if underscore_index != -1 and mzxml_trunc_underscore in strains: + if underscore_index != -1 and strains.has_name(mzxml_trunc_underscore): return strains.lookup(mzxml_trunc_underscore) - if hyphen_index != -1 and mzxml_trunc_hyphen in strains: + if hyphen_index != -1 and strains.has_name(mzxml_trunc_hyphen): return strains.lookup(mzxml_trunc_hyphen) # 3. in the case of original Crusemann dataset, many of the filenames seem to # match up to real strains with the initial "CN" missing ??? for mzxml_trunc in [mzxml_trunc_hyphen, mzxml_trunc_underscore]: - if 'CN' + mzxml_trunc in strains: + if strains.has_name('CN' + mzxml_trunc): return strains.lookup('CN' + mzxml_trunc) # give up @@ -113,7 +113,7 @@ def _parse_mzxml_header(hdr: str, strains: StrainCollection, md_table: dict[str, # check if the strain label exists in the set of strain mappings the user # has provided - if strain_name not in strains: + if not strains.has_name(strain_name): # if this check fails, it could mean a missing strain ID mapping. this isn't # fatal but will produce a warning and an entry added to the file # unknown_strains_met.csv in the dataset folder. @@ -139,7 +139,7 @@ def _parse_mzxml_header(hdr: str, strains: StrainCollection, md_table: dict[str, growth_medium = md_table[strain_name]['growthmedium'] strain_col_content = md_table[strain_name]['strain'] - if strain_col_content in strains: + if strains.has_name(strain_col_content): strain = strains.lookup(strain_col_content) strain.add_alias(strain_name) # this merges the set of aliases back into the internal @@ -158,7 +158,7 @@ def _parse_mzxml_header(hdr: str, strains: StrainCollection, md_table: dict[str, 'Unknown strain identifier: {} (parsed from {})'.format( strain_name, hdr)) - return (strain_name, growth_medium, strain_name not in strains) + return (strain_name, growth_medium, not strains.has_name(strain_name)) def _load_clusterinfo_old(gnps_format: str, strains: StrainCollection, file: str, spec_dict: dict[str, Spectrum]) -> dict[str, int]: @@ -422,7 +422,7 @@ def _load_clusterinfo_fbmn(strains: StrainCollection, nodes_file: str, extra_nod # create a new strain object if the intensity value is a float > 0 v = _md_convert(v) - if strain_name in strains and isinstance(v, float) and float(v) > 0: + if strains.has_name(strain_name) and isinstance(v, float) and float(v) > 0: # find the strain object, and add the growth medium + intensity to it. 
the # growth medium will only be set if extended_metadata_table_parsing is # enabled in the config file and the metadata table file contains that info diff --git a/src/nplinker/pairedomics/strain_mappings_generator.py b/src/nplinker/pairedomics/strain_mappings_generator.py index 091a37e0..8f9ec568 100644 --- a/src/nplinker/pairedomics/strain_mappings_generator.py +++ b/src/nplinker/pairedomics/strain_mappings_generator.py @@ -97,7 +97,7 @@ def podp_generate_strain_mappings( # Create StrainCollection sc = StrainCollection() for strain_id, bgc_ids in mappings.items(): - if strain_id not in sc: + if not sc.has_name(strain_id): strain = Strain(strain_id) for bgc_id in bgc_ids: strain.add_alias(bgc_id) diff --git a/src/nplinker/strain_collection.py b/src/nplinker/strain_collection.py index b9369bb9..fb5e0697 100644 --- a/src/nplinker/strain_collection.py +++ b/src/nplinker/strain_collection.py @@ -87,6 +87,17 @@ def filter(self, strain_set: set[Strain]): if strain not in strain_set: self.remove(strain) + def has_name(self, name: str) -> bool: + """Check if the strain collection contains the given strain name (id or alias). + + Args: + name(str): Strain name (id or alias) to check. + + Returns: + bool: True if the strain name is in the collection, False otherwise. + """ + return name in self._strain_dict_name + def lookup(self, name: str) -> Strain: """Lookup a strain by name (id or alias). diff --git a/tests/test_strain_collection.py b/tests/test_strain_collection.py index 944464e4..b560b022 100644 --- a/tests/test_strain_collection.py +++ b/tests/test_strain_collection.py @@ -60,6 +60,12 @@ def test_filter(collection: StrainCollection, strain: Strain): assert len(collection) == 1 +def test_has_name(collection: StrainCollection): + assert collection.has_name("strain_1") + assert collection.has_name("strain_1_a") + assert not collection.has_name("strain_2") + + def test_lookup(collection: StrainCollection, strain: Strain): for name in strain.names: assert collection.lookup(name) == strain From 14b3453da76db88e802a2cc963ec5b677b1cf422 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Mon, 14 Aug 2023 11:11:12 +0200 Subject: [PATCH 07/21] update method `has_strain` in classes --- src/nplinker/genomics/gcf.py | 4 ++-- src/nplinker/metabolomics/molecular_family.py | 4 ++-- src/nplinker/metabolomics/spectrum.py | 3 ++- tests/genomics/test_gcf.py | 4 ++-- 4 files changed, 8 insertions(+), 7 deletions(-) diff --git a/src/nplinker/genomics/gcf.py b/src/nplinker/genomics/gcf.py index c1b0d9d9..0af3e407 100644 --- a/src/nplinker/genomics/gcf.py +++ b/src/nplinker/genomics/gcf.py @@ -82,11 +82,11 @@ def detach_bgc(self, bgc: BGC) -> None: return self.strains.remove(bgc.strain) - def has_strain(self, strain: str | Strain) -> bool: + def has_strain(self, strain: Strain) -> bool: """Check if the given strain exists. Args: - strain(str | Strain): strain id or `Strain` object. + strain(Strain): `Strain` object. Returns: bool: True when the given strain exist. diff --git a/src/nplinker/metabolomics/molecular_family.py b/src/nplinker/metabolomics/molecular_family.py index c4e584c5..8a5f9753 100644 --- a/src/nplinker/metabolomics/molecular_family.py +++ b/src/nplinker/metabolomics/molecular_family.py @@ -31,11 +31,11 @@ def strains(self) -> StrainCollection: strains.add(strain) return strains - def has_strain(self, strain: str | Strain) -> bool: + def has_strain(self, strain: Strain) -> bool: """Check if the given strain exists. Args: - strain(str | Strain): strain id or `Strain` object. 
+ strain(Strain): `Strain` object. Returns: bool: True when the given strain exist. diff --git a/src/nplinker/metabolomics/spectrum.py b/src/nplinker/metabolomics/spectrum.py index 12663871..6895d198 100644 --- a/src/nplinker/metabolomics/spectrum.py +++ b/src/nplinker/metabolomics/spectrum.py @@ -1,5 +1,6 @@ from nplinker.strain_collection import StrainCollection +from nplinker.strains import Strain from nplinker.utils import sqrt_normalise @@ -97,7 +98,7 @@ def get_metadata_value(self, key): val = self.metadata.get(key, None) return val - def has_strain(self, strain): + def has_strain(self, strain: Strain): return strain in self.strains def get_growth_medium(self, strain): diff --git a/tests/genomics/test_gcf.py b/tests/genomics/test_gcf.py index a40f4728..c24a6952 100644 --- a/tests/genomics/test_gcf.py +++ b/tests/genomics/test_gcf.py @@ -55,8 +55,8 @@ def test_add_bgc_wo_strain(bgc_without_strain, caplog): def test_has_strain(bgc_with_strain): gcf = GCF("1") gcf.add_bgc(bgc_with_strain) - assert gcf.has_strain("strain001") is True - assert gcf.has_strain("strain002") is False + assert gcf.has_strain(Strain("strain001")) is True + assert gcf.has_strain(Strain("strain002")) is False def test_has_mibig_only(): mibig_bgc = BGC("BGC0000001", "NPR") From ab226d5683fffd8de1e52b46503a561653b033eb Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Mon, 14 Aug 2023 13:19:15 +0200 Subject: [PATCH 08/21] rename MibigBGCLoader to MibigLoader --- src/nplinker/genomics/mibig/__init__.py | 4 +++- src/nplinker/genomics/mibig/mibig_loader.py | 4 ++-- src/nplinker/loader.py | 4 ++-- tests/genomics/test_mibig_loader.py | 8 ++++---- 4 files changed, 11 insertions(+), 9 deletions(-) diff --git a/src/nplinker/genomics/mibig/__init__.py b/src/nplinker/genomics/mibig/__init__.py index d35b7a21..753cfdf7 100644 --- a/src/nplinker/genomics/mibig/__init__.py +++ b/src/nplinker/genomics/mibig/__init__.py @@ -1,6 +1,8 @@ import logging from .mibig_downloader import download_and_extract_mibig_metadata -from .mibig_loader import MibigBGCLoader, parse_bgc_metadata_json +from .mibig_loader import MibigLoader +from .mibig_loader import parse_bgc_metadata_json from .mibig_metadata import MibigMetadata + logging.getLogger(__name__).addHandler(logging.NullHandler()) diff --git a/src/nplinker/genomics/mibig/mibig_loader.py b/src/nplinker/genomics/mibig/mibig_loader.py index 8ca30bd6..82226431 100644 --- a/src/nplinker/genomics/mibig/mibig_loader.py +++ b/src/nplinker/genomics/mibig/mibig_loader.py @@ -10,7 +10,7 @@ logger = LogConfig.getLogger(__name__) -class MibigBGCLoader: +class MibigLoader: def __init__(self, data_dir: str): """Parse MIBiG metadata files and return BGC objects @@ -129,4 +129,4 @@ def parse_bgc_metadata_json(file: str) -> BGC: # register as virtual class to prevent metaclass conflicts -BGCLoaderBase.register(MibigBGCLoader) +BGCLoaderBase.register(MibigLoader) diff --git a/src/nplinker/loader.py b/src/nplinker/loader.py index 6fd067af..f06024b5 100644 --- a/src/nplinker/loader.py +++ b/src/nplinker/loader.py @@ -9,7 +9,7 @@ from nplinker.genomics import load_gcfs from nplinker.genomics.antismash import AntismashBGCLoader from nplinker.genomics.mibig import download_and_extract_mibig_metadata -from nplinker.genomics.mibig import MibigBGCLoader +from nplinker.genomics.mibig import MibigLoader from nplinker.globals import GENOME_BGC_MAPPINGS_FILENAME from nplinker.globals import GENOME_STATUS_FILENAME from nplinker.globals import GNPS_FILE_MAPPINGS_FILENAME @@ -426,7 +426,7 @@ def _load_genomics(self): # 
and update self.strains with mibig strains #---------------------------------------------------------------------- logger.debug(f'MibigBGCLoader({self.mibig_json_dir})') - mibig_bgc_loader = MibigBGCLoader(self.mibig_json_dir) + mibig_bgc_loader = MibigLoader(self.mibig_json_dir) self.mibig_bgc_dict = mibig_bgc_loader.get_bgcs() # add mibig bgc strains diff --git a/tests/genomics/test_mibig_loader.py b/tests/genomics/test_mibig_loader.py index c122c56e..21caf3c2 100644 --- a/tests/genomics/test_mibig_loader.py +++ b/tests/genomics/test_mibig_loader.py @@ -3,7 +3,7 @@ from nplinker.genomics import BGC from nplinker.genomics import BGCLoaderBase from nplinker.genomics.mibig import download_and_extract_mibig_metadata -from nplinker.genomics.mibig import MibigBGCLoader +from nplinker.genomics.mibig import MibigLoader from nplinker.genomics.mibig import parse_bgc_metadata_json from nplinker.genomics.mibig.mibig_metadata import MibigMetadata from .. import DATA_DIR @@ -22,11 +22,11 @@ def data_dir(self, tmp_path): @pytest.fixture def loader(self, data_dir): - loader = MibigBGCLoader(data_dir) + loader = MibigLoader(data_dir) yield loader def test_abc(self, loader): - assert issubclass(MibigBGCLoader, BGCLoaderBase) + assert issubclass(MibigLoader, BGCLoaderBase) assert isinstance(loader, BGCLoaderBase) def test_init(self, loader, data_dir): @@ -49,7 +49,7 @@ def test_get_files(self, loader): assert os.path.exists(files["BGC0000001"]) def test_parse_data_dir(self, data_dir): - files = MibigBGCLoader.parse_data_dir(data_dir) + files = MibigLoader.parse_data_dir(data_dir) assert isinstance(files, dict) assert len(files) == 2502 # MIBiG v3.1 has 2502 BGCs assert "BGC0000001" in files From 1058928583af85c61d602c4c87ca49562a48f2a3 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Mon, 14 Aug 2023 13:19:46 +0200 Subject: [PATCH 09/21] rename method `get_bgc_genome_mapping` to `get_strain_bgc_mapping` --- src/nplinker/genomics/mibig/mibig_loader.py | 11 ++++------- tests/genomics/test_mibig_loader.py | 4 ++-- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/src/nplinker/genomics/mibig/mibig_loader.py b/src/nplinker/genomics/mibig/mibig_loader.py index 82226431..c3bf7424 100644 --- a/src/nplinker/genomics/mibig/mibig_loader.py +++ b/src/nplinker/genomics/mibig/mibig_loader.py @@ -26,16 +26,13 @@ def __init__(self, data_dir: str): self._metadata_dict = self._parse_metadatas() self._bgc_dict = self._parse_bgcs() - def get_bgc_genome_mapping(self) -> dict[str, str]: - """Get the mapping from BGC to genome. + def get_strain_bgc_mapping(self) -> dict[str, str]: + """Get the mapping from strain to BGC. - Note that for MIBiG BGC, same value is used for BGC id and genome id. - Users don't have to provide genome id for MIBiG BGCs in the - `strain_mappings.json` file. + Note that for MIBiG BGC, same value is used for strain name and BGC id. Returns: - dict[str, str]: key is BGC id/accession, value is - genome id that uses the value of BGC accession. + dict[str, str]: key is strain name, value is BGC id. 
""" return {bid: bid for bid in self._file_dict} diff --git a/tests/genomics/test_mibig_loader.py b/tests/genomics/test_mibig_loader.py index 21caf3c2..6fac43c0 100644 --- a/tests/genomics/test_mibig_loader.py +++ b/tests/genomics/test_mibig_loader.py @@ -32,8 +32,8 @@ def test_abc(self, loader): def test_init(self, loader, data_dir): assert loader.data_dir == data_dir - def test_get_bgc_genome_mapping(self, loader): - mapping = loader.get_bgc_genome_mapping() + def test_get_strain_bgc_mapping(self, loader): + mapping = loader.get_strain_bgc_mapping() assert isinstance(mapping, dict) assert len(mapping) == 2502 for bid in mapping: From 36fda27e927be4cb8a62fda6c9cf8af4394ece4f Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Mon, 14 Aug 2023 14:30:42 +0200 Subject: [PATCH 10/21] refactor mibig loading related code to `_load_mibig` method --- src/nplinker/loader.py | 57 +++++++++++++++++++++--------------------- 1 file changed, 29 insertions(+), 28 deletions(-) diff --git a/src/nplinker/loader.py b/src/nplinker/loader.py index f06024b5..02594677 100644 --- a/src/nplinker/loader.py +++ b/src/nplinker/loader.py @@ -22,6 +22,7 @@ from nplinker.pairedomics.strain_mappings_generator import \ podp_generate_strain_mappings from nplinker.strain_collection import StrainCollection +from nplinker.strains import Strain try: @@ -112,6 +113,7 @@ def __init__(self, config_data): self._root)[-1] if not self._remote_loading else self._platform_id self.bgcs, self.gcfs, self.spectra, self.molfams = [], [], [], [] self.mibig_bgc_dict = {} + self._mibig_strain_bgc_mapping = {} self.product_types = [] self.strains = StrainCollection() self.webapp_scoring_cutoff = self._config_webapp.get( @@ -149,7 +151,8 @@ def generate_strain_mappings(self): generate_mappings_genome_id_bgc_id(self._root / "antismash") - podp_project_json_file = self._root.parent.parent / (self._platform_id + ".json") + podp_project_json_file = self._root.parent.parent / ( + self._platform_id + ".json") genome_status_json_file = self._root.parent.parent / "downloads" / self._platform_id / GENOME_STATUS_FILENAME genome_bgc_mappings_file = self._root / "antismash" / GENOME_BGC_MAPPINGS_FILENAME gnps_file_mapping_tsv_file = self._root / GNPS_FILE_MAPPINGS_FILENAME @@ -161,7 +164,10 @@ def generate_strain_mappings(self): self.strain_mappings_file) def load(self): - # load strain mappings first + if self._use_mibig: + if not self._load_mibig(): + return False + if not self._load_strain_mappings(): return False @@ -345,12 +351,28 @@ def _validate_paths(self): logger.warning('Optional file/directory "%s" does not exist', f) - # TODO: this function should be refactored to Loader class + def _load_mibig(self): + mibig_bgc_loader = MibigLoader(self.mibig_json_dir) + self.mibig_bgc_dict = mibig_bgc_loader.get_bgcs() + self._mibig_strain_bgc_mapping = mibig_bgc_loader.get_strain_bgc_mapping( + ) + return True + def _load_strain_mappings(self): + # First load user's strain mappings sc = StrainCollection.read_json(self.strain_mappings_file) for strain in sc: self.strains.add(strain) - logger.info('Loaded dataset strain IDs ({} total)'.format( + logger.info('Loaded {} non-MiBIG Strain objects'.format( + len(self.strains))) + + # Then load MiBIG strain mappings + if self._mibig_strain_bgc_mapping: + for k, v in self._mibig_strain_bgc_mapping.items(): + strain = Strain(k) + strain.add_alias(v) + self.strains.add(strain) + logger.info('Loaded {} Strain objects in total'.format( len(self.strains))) return True @@ -412,35 +434,14 @@ def _load_genomics(self): 
.format(d, d.replace(' ', '_'))) #---------------------------------------------------------------------- - # CG: download mibig metadata and run bigscape + # CG: run bigscape #---------------------------------------------------------------------- # both the bigscape and mibig_json dirs expected by nplinker may not exist at this point. in some # cases this will cause an error later in the process, but this can also potentially be # resolved automatically: - # mibig_json => download and extract the JSON database # bigscape => run BiG-SCAPE before continuing (if using the Docker image) self._load_genomics_extra() - #---------------------------------------------------------------------- - # CG: load mibig metadata to BGC object - # and update self.strains with mibig strains - #---------------------------------------------------------------------- - logger.debug(f'MibigBGCLoader({self.mibig_json_dir})') - mibig_bgc_loader = MibigLoader(self.mibig_json_dir) - self.mibig_bgc_dict = mibig_bgc_loader.get_bgcs() - - # add mibig bgc strains - # CG TODO: update strain assignment logics, - # see issue 104 https://github.com/NPLinker/nplinker/issues/104 - for bgc in self.mibig_bgc_dict.values(): - if bgc.strain is not None: - self.strains.add(bgc.strain) - else: - logger.warning("No strain specified for BGC %s", bgc.bgc_id) - - logger.debug('mibig_bgc_dict has {} entries'.format( - len(self.mibig_bgc_dict))) - #---------------------------------------------------------------------- # CG: Parse AntiSMASH dir #---------------------------------------------------------------------- @@ -684,8 +685,8 @@ def _filter_user_strains(self): # get the list of spectra which have at least one strain in the set spectra_to_retain = { spec - for spec in self.spectra for sstrain in spec.strains - if sstrain in self.include_only_strains + for spec in self.spectra + for sstrain in spec.strains if sstrain in self.include_only_strains } logger.info('Current / filtered BGC counts: {} / {}'.format( From c668ca8c9cff1feff98ba537d921c6308754aa48 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Mon, 14 Aug 2023 14:31:45 +0200 Subject: [PATCH 11/21] remove mibig downloading in class `DatasetLoader` --- src/nplinker/loader.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/src/nplinker/loader.py b/src/nplinker/loader.py index 02594677..080cf45c 100644 --- a/src/nplinker/loader.py +++ b/src/nplinker/loader.py @@ -484,19 +484,6 @@ def _load_genomics(self): return True def _load_genomics_extra(self): - if not os.path.exists(self.mibig_json_dir): - if self._use_mibig: - logger.info( - 'Attempting to download MiBIG JSON database (v{})...'. 
- format(self._mibig_version)) - download_and_extract_mibig_metadata(self._root, - self.mibig_json_dir, - self._mibig_version) - else: - logger.warning( - 'Not downloading MiBIG database automatically, use_mibig = false' - ) - if not os.path.exists(self.bigscape_dir): should_run_bigscape = self._config_docker.get( 'run_bigscape', self.RUN_BIGSCAPE_DEFAULT) From a9b808f50d74340039c504d77570064b85676c56 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Mon, 14 Aug 2023 14:44:27 +0200 Subject: [PATCH 12/21] refactor bigscape running code to `_run_bigscape` --- src/nplinker/loader.py | 61 +++++++++++------------------------------- 1 file changed, 15 insertions(+), 46 deletions(-) diff --git a/src/nplinker/loader.py b/src/nplinker/loader.py index 080cf45c..103a4f1f 100644 --- a/src/nplinker/loader.py +++ b/src/nplinker/loader.py @@ -403,56 +403,12 @@ def _load_metabolomics(self): # CG: load gemonics data into memory def _load_genomics(self): - # TODO split this method up a bit - - # hmmscan apparently can't cope with filenames that have spaces in them, and so if you try - # to run bigscape on a dataset like that it will just refuse. so, to try and avoid anyone - # having to manually fix this stupid problem, we will attempt to rename every .gbk file here - # TODO is this something nplinker should do, or dataset authors?? logger.debug("\nLoading genomics starts...") - #---------------------------------------------------------------------- - # CG: replace space with underscore for antismash folder and file names - # TODO: - # 1.[] move this part to antismash downloader as post-download process - # 2.[x] separete listing gbk files and renaming - #---------------------------------------------------------------------- - logger.debug('Collecting .gbk files (and possibly renaming)') - - # if the option is enabled, check for spaces in the folder names under - # /antismash and rename them, replacing spaces with underscores - ignore_spaces = self._antismash_ignore_spaces - if not ignore_spaces: - logger.debug('Checking for spaces in antiSMASH folder names...') - for root, dirs, files in os.walk(self.antismash_dir): - for d in dirs: - if d.find(' ') != -1: - os.rename(os.path.join(root, d), - os.path.join(root, d.replace(' ', '_'))) - logger.warn( - 'Renaming antiSMASH folder "{}" to "{}" to remove spaces! (suppress with ignore_spaces = true in config file)' - .format(d, d.replace(' ', '_'))) - - #---------------------------------------------------------------------- - # CG: run bigscape - #---------------------------------------------------------------------- - # both the bigscape and mibig_json dirs expected by nplinker may not exist at this point. 
in some - # cases this will cause an error later in the process, but this can also potentially be - # resolved automatically: - # bigscape => run BiG-SCAPE before continuing (if using the Docker image) - self._load_genomics_extra() - - #---------------------------------------------------------------------- - # CG: Parse AntiSMASH dir - #---------------------------------------------------------------------- logger.debug('Parsing AntiSMASH directory...') antismash_bgc_loader = AntismashBGCLoader(self.antismash_dir) - #---------------------------------------------------------------------- - # CG: load all bgcs and gcfs - #---------------------------------------------------------------------- logger.debug('Loading GCFs...') - # TODO: refactor load_gcfs to independent steps # Step 1: load all strains # Step 2: load all bgcs @@ -483,7 +439,21 @@ def _load_genomics(self): return True - def _load_genomics_extra(self): + # TODO CG: run bigscape before loading and after downloading + def _run_bigscape(self): + # Check for spaces in the folder names under /antismash and + # rename them by replacing spaces with underscores + if not self._antismash_ignore_spaces: + logger.debug('Checking for spaces in antiSMASH folder names...') + for root, dirs, _ in os.walk(self.antismash_dir): + for d in dirs: + if d.find(' ') != -1: + os.rename(os.path.join(root, d), + os.path.join(root, d.replace(' ', '_'))) + logger.warn( + 'Renaming antiSMASH folder "{}" to "{}" to remove spaces! (suppress with ignore_spaces = true in config file)' + .format(d, d.replace(' ', '_'))) + if not os.path.exists(self.bigscape_dir): should_run_bigscape = self._config_docker.get( 'run_bigscape', self.RUN_BIGSCAPE_DEFAULT) @@ -491,7 +461,6 @@ def _load_genomics_extra(self): 'extra_bigscape_parameters', self.EXTRA_BIGSCAPE_PARAMS_DEFAULT) if should_run_bigscape: - # TODO this should not be attempted if not in Docker env logger.info( 'Running BiG-SCAPE! 
extra_bigscape_parameters="{}"'.format( extra_bigscape_parameters)) From 682a0a5d9c0e8598fe146c3d68f4607ad8a8776c Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Mon, 14 Aug 2023 15:56:05 +0200 Subject: [PATCH 13/21] refactor method `_load_genomics` --- src/nplinker/loader.py | 66 ++++++++++++++++++++---------------------- 1 file changed, 32 insertions(+), 34 deletions(-) diff --git a/src/nplinker/loader.py b/src/nplinker/loader.py index 103a4f1f..8cae746c 100644 --- a/src/nplinker/loader.py +++ b/src/nplinker/loader.py @@ -6,9 +6,13 @@ from nplinker.class_info.class_matches import ClassMatches from nplinker.class_info.runcanopus import run_canopus from nplinker.genomics import generate_mappings_genome_id_bgc_id -from nplinker.genomics import load_gcfs from nplinker.genomics.antismash import AntismashBGCLoader -from nplinker.genomics.mibig import download_and_extract_mibig_metadata +from nplinker.genomics.bigscape import BigscapeGCFLoader +from nplinker.genomics.genomics import filter_mibig_only_gcf +from nplinker.genomics.genomics import get_bgcs_from_gcfs +from nplinker.genomics.genomics import get_strains_from_bgcs +from nplinker.genomics.genomics import map_bgc_to_gcf +from nplinker.genomics.genomics import map_strain_to_bgc from nplinker.genomics.mibig import MibigLoader from nplinker.globals import GENOME_BGC_MAPPINGS_FILENAME from nplinker.globals import GENOME_STATUS_FILENAME @@ -24,7 +28,6 @@ from nplinker.strain_collection import StrainCollection from nplinker.strains import Strain - try: from importlib.resources import files except ImportError: @@ -401,42 +404,37 @@ def _load_metabolomics(self): self.spectra, spec_dict) return True - # CG: load gemonics data into memory + # TODO CG: self.strains will be overwritten by this method, rename it? def _load_genomics(self): - logger.debug("\nLoading genomics starts...") + """Loads all genomics data (BGCs and GCFs) into the object. 
+ """ + logger.debug("\nLoading genomics data starts...") + # Step 1: load all BGC objects logger.debug('Parsing AntiSMASH directory...') - antismash_bgc_loader = AntismashBGCLoader(self.antismash_dir) - - logger.debug('Loading GCFs...') - # TODO: refactor load_gcfs to independent steps - # Step 1: load all strains - # Step 2: load all bgcs - # Step 3: load all gcfs - # Step 4: connect bgc to strain with `map_strain_to_bgc` - # Step 5: connect gcf to bgcs with `map_bgc_to_gcf` - # Step 6: `filter_mibig_only_gcf` - # Step 7: get clean gcfs, bgcs and strains with `get_bgcs_from_gcfs` - # and `get_strains_from_bgcs` - - self.gcfs, self.bgcs, self.strains, unknown_strains = load_gcfs( - self.bigscape_dir, self.strains, self.mibig_bgc_dict, - antismash_bgc_loader.get_bgcs(), antismash_bgc_loader.get_files(), - self._bigscape_cutoff) - - #---------------------------------------------------------------------- - # CG: write unknown strains in genomics to file - #---------------------------------------------------------------------- - us_path = os.path.join(self._root, 'unknown_strains_gen.csv') - logger.warning( - f'Writing unknown strains from GENOMICS data to {us_path}') - with open(us_path, 'w') as us: - us.write('# unknown strain label, filename\n') - for strain, filename in unknown_strains.items(): - us.write(f'{strain}, {filename}\n') + antismash_bgc_dict = AntismashBGCLoader(self.antismash_dir).get_bgcs() + raw_bgcs = list(antismash_bgc_dict.values()) + list( + self.mibig_bgc_dict.values()) + + # Step 2: load all GCF objects + bigscape_cluster_file = Path( + self.bigscape_dir + ) / "mix" / f"mix_clustering_c0.{self._bigscape_cutoff:02d}.tsv" + bigscape_gcf_list = BigscapeGCFLoader(bigscape_cluster_file).get_gcfs() + raw_gcfs = bigscape_gcf_list + + # Step 3: assign Strain object to BGC.strain + map_strain_to_bgc(self.strains, raw_bgcs) + + # Step 4: assign BGC objects to GCF.bgcs + map_bgc_to_gcf(raw_bgcs, raw_gcfs) - logger.debug("Loading genomics completed\n") + # Step 5: get clean GCF objects, BGC objects and Strain objects + self.gcfs = filter_mibig_only_gcf(raw_gcfs) + self.bgcs = get_bgcs_from_gcfs(self.gcfs) + self.strains = get_strains_from_bgcs(self.bgcs) + logger.debug("Loading genomics data completed\n") return True # TODO CG: run bigscape before loading and after downloading From 0991a47aeff167858b98b0282be463d084780163 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 15 Aug 2023 11:14:41 +0200 Subject: [PATCH 14/21] add TODO comments --- src/nplinker/loader.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/nplinker/loader.py b/src/nplinker/loader.py index 8cae746c..f0e77362 100644 --- a/src/nplinker/loader.py +++ b/src/nplinker/loader.py @@ -205,6 +205,9 @@ def load(self): def _start_downloads(self): downloader = PODPDownloader(self._platform_id) + # TODO CG: this step generates the real path for _root. Should generate + # it before loading process starts. Otherwise, npl.root_dir will get + # wrong value if loading from local data or not using download. 
self._root = Path(downloader.project_results_dir) logger.debug('remote loading mode, configuring root=%s', self._root) # CG: to download both MET and GEN data From 70e16401a4a63e3cd8946ad2e47d34919dcd643c Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 15 Aug 2023 14:24:56 +0200 Subject: [PATCH 15/21] add TODO comment --- src/nplinker/metabolomics/gnps/gnps_file_mapping_loader.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/nplinker/metabolomics/gnps/gnps_file_mapping_loader.py b/src/nplinker/metabolomics/gnps/gnps_file_mapping_loader.py index 766944ad..2c2e0c9c 100644 --- a/src/nplinker/metabolomics/gnps/gnps_file_mapping_loader.py +++ b/src/nplinker/metabolomics/gnps/gnps_file_mapping_loader.py @@ -26,6 +26,7 @@ def __init__(self, file: str | PathLike): NotImplementedError: Raises NotImplementedError if the GNPS format is not recognized. """ self._file: Path = Path(file) + # TODO CG: change list to set to avoid duplicates of spectra self._mapping: dict[str, list[str]] = {} self._gnps_format = gnps_format_from_file_mapping(file, False) From 10364efa011d279d6c8589a221363137a116ac68 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 16 Aug 2023 10:17:23 +0200 Subject: [PATCH 16/21] change `StrainCollection._strain_dict_name` dict value type to list - Updated methods `add`, `remove` and `lookup` accordingly - Updated and added unit tests for updated methods --- src/nplinker/strain_collection.py | 50 ++++++++++++++------- tests/test_strain_collection.py | 74 ++++++++++++++++++++++++++++++- 2 files changed, 106 insertions(+), 18 deletions(-) diff --git a/src/nplinker/strain_collection.py b/src/nplinker/strain_collection.py index fb5e0697..9b492556 100644 --- a/src/nplinker/strain_collection.py +++ b/src/nplinker/strain_collection.py @@ -14,9 +14,9 @@ class StrainCollection(): def __init__(self): """A collection of Strain objects.""" + # the order of strains is needed for scoring part, so use a list self._strains: list[Strain] = [] - # dict of strain name (id and alias) to primary strain object - self._strain_dict_name: dict[str, Strain] = {} + self._strain_dict_name: dict[str, list[Strain]] = {} def __repr__(self) -> str: return str(self) @@ -55,28 +55,46 @@ def add(self, strain: Strain) -> None: Args: strain(Strain): The strain to add. """ - # if the strain exists, merge the aliases - if strain.id in self._strain_dict_name: - existing: Strain = self.lookup(strain.id) - for alias in strain.aliases: - existing.add_alias(alias) - self._strain_dict_name[alias] = existing + if strain in self._strains: + # only one strain object per id + strain_ref = self._strain_dict_name[strain.id][0] + new_aliases = [alias for alias in strain.aliases if alias not in strain_ref.aliases] + for alias in new_aliases: + strain_ref.add_alias(alias) + if alias not in self._strain_dict_name: + self._strain_dict_name[alias] = [strain_ref] + else: + self._strain_dict_name[alias].append(strain_ref) else: self._strains.append(strain) - self._strain_dict_name[strain.id] = strain - for alias in strain.aliases: - self._strain_dict_name[alias] = strain + for name in strain.names: + if name not in self._strain_dict_name: + self._strain_dict_name[name] = [strain] + else: + self._strain_dict_name[name].append(strain) def remove(self, strain: Strain): """Remove a strain from the collection. + It removes the given strain object from the collection by strain id. + If the strain id is not found, it does nothing. + Args: strain(Strain): The strain to remove. 
""" - if strain.id in self._strain_dict_name: + if strain in self._strains: self._strains.remove(strain) - for name in strain.names: - del self._strain_dict_name[name] + # only one strain object per id + strain_ref = self._strain_dict_name[strain.id][0] + for name in strain_ref.names: + if name in self._strain_dict_name: + new_strain_list = [ + s for s in self._strain_dict_name[name] if s.id != strain.id + ] + if not new_strain_list: + del self._strain_dict_name[name] + else: + self._strain_dict_name[name] = new_strain_list def filter(self, strain_set: set[Strain]): """ @@ -98,7 +116,7 @@ def has_name(self, name: str) -> bool: """ return name in self._strain_dict_name - def lookup(self, name: str) -> Strain: + def lookup(self, name: str) -> list[Strain]: """Lookup a strain by name (id or alias). Args: @@ -106,7 +124,7 @@ def lookup(self, name: str) -> Strain: name(str): Strain name (id or alias) to lookup. Returns: - Strain: Strain identified by the given name. + list[Strain]: List of Strain objects with the given name. Raises: KeyError: If the strain name is not found. diff --git a/tests/test_strain_collection.py b/tests/test_strain_collection.py index b560b022..0d12ad5b 100644 --- a/tests/test_strain_collection.py +++ b/tests/test_strain_collection.py @@ -43,14 +43,84 @@ def test_iter(collection: StrainCollection, strain: Strain): def test_add(strain: Strain): sut = StrainCollection() sut.add(strain) + assert len(sut) == 1 assert strain in sut for name in strain.names: assert name in sut._strain_dict_name -def test_remove(collection: StrainCollection, strain: Strain): +def test_add_same_id_same_alias(strain: Strain, collection: StrainCollection): + collection.add(strain) + assert strain in collection + assert len(collection) == 1 + assert len(collection._strain_dict_name) == 2 + + +def test_add_same_id_different_alias(collection: StrainCollection): + strain = Strain("strain_1") + strain.add_alias("strain_1_b") + collection.add(strain) + assert len(collection) == 1 + assert strain in collection + assert len(collection._strain_dict_name) == 3 + assert collection._strain_dict_name["strain_1"] == [strain] + assert collection._strain_dict_name["strain_1_b"] == [strain] + + +def test_add_different_id_same_alias(strain: Strain, + collection: StrainCollection): + strain2 = Strain("strain_2") + strain2.add_alias("strain_1_a") + collection.add(strain2) + assert len(collection) == 2 + assert strain2 in collection + assert len(collection._strain_dict_name) == 3 + assert collection._strain_dict_name["strain_1"] == [strain] + assert collection._strain_dict_name["strain_2"] == [strain2] + assert collection._strain_dict_name["strain_1_a"] == [strain, strain2] + + +def test_add_different_id_different_alias(strain: Strain, + collection: StrainCollection): + strain2 = Strain("strain_2") + strain2.add_alias("strain_2_a") + collection.add(strain2) + assert len(collection) == 2 + assert strain2 in collection + assert len(collection._strain_dict_name) == 4 + assert collection._strain_dict_name["strain_1"] == [strain] + assert collection._strain_dict_name["strain_1_a"] == [strain] + assert collection._strain_dict_name["strain_2"] == [strain2] + assert collection._strain_dict_name["strain_2_a"] == [strain2] + + +def test_remove(strain: Strain): + sc = StrainCollection() + sc.remove(strain) + assert strain not in sc + + +def test_remove_same_id_same_alias(collection: StrainCollection, + strain: Strain): + collection.remove(strain) + assert strain not in collection + + +def 
test_remove_same_id_different_alias(collection: StrainCollection): + strain = Strain("strain_1") + strain.add_alias("strain_1_b") collection.remove(strain) + assert len(collection) == 0 + assert strain not in collection + assert len(collection._strain_dict_name) == 0 + + +def test_remove_different_id(collection: StrainCollection): + strain = Strain("strain_2") + collection.remove(strain) + assert len(collection) == 1 assert strain not in collection + assert len(collection._strain_dict_name) == 2 def test_filter(collection: StrainCollection, strain: Strain): @@ -68,7 +138,7 @@ def test_has_name(collection: StrainCollection): def test_lookup(collection: StrainCollection, strain: Strain): for name in strain.names: - assert collection.lookup(name) == strain + assert collection.lookup(name) == [strain] with pytest.raises(KeyError): collection.lookup("strain_not_exist") From 43d7f233c733c5f9df2cb3b47b91a55ca1c4595e Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 16 Aug 2023 12:05:37 +0200 Subject: [PATCH 17/21] update references of method `StrainCollection.lookup` --- src/nplinker/genomics/genomics.py | 9 +++++++-- src/nplinker/loader.py | 6 ++++-- src/nplinker/pairedomics/strain_mappings_generator.py | 5 +++-- src/nplinker/scoring/linking/data_links.py | 4 ++-- 4 files changed, 16 insertions(+), 8 deletions(-) diff --git a/src/nplinker/genomics/genomics.py b/src/nplinker/genomics/genomics.py index 42b8001a..e57bc32c 100644 --- a/src/nplinker/genomics/genomics.py +++ b/src/nplinker/genomics/genomics.py @@ -83,12 +83,17 @@ def map_strain_to_bgc(strains: StrainCollection, bgcs: list[BGC]): """ for bgc in bgcs: try: - strain = strains.lookup(bgc.bgc_id) + strain_list = strains.lookup(bgc.bgc_id) + if len(strain_list) > 1: + raise KeyError( + f"Multiple strain objects found for BGC id '{bgc.bgc_id}'." + f"BGC object accept only one strain." 
+ ) except KeyError as e: raise KeyError( f"Strain id '{bgc.bgc_id}' from BGC object '{bgc.bgc_id}' " "not found in the strain collection.") from e - bgc.strain = strain + bgc.strain = strain_list[0] def map_bgc_to_gcf(bgcs: list[BGC], gcfs: list[GCF]): diff --git a/src/nplinker/loader.py b/src/nplinker/loader.py index f0e77362..3613bab1 100644 --- a/src/nplinker/loader.py +++ b/src/nplinker/loader.py @@ -28,6 +28,7 @@ from nplinker.strain_collection import StrainCollection from nplinker.strains import Strain + try: from importlib.resources import files except ImportError: @@ -564,13 +565,14 @@ def _load_optional(self): for line_num, sid in enumerate(strain_list): sid = sid.strip() # get rid of newline try: - strain_obj = self.strains.lookup(sid) + strain_ref_list = self.strains.lookup(sid) except KeyError: logger.warning( 'Line {} of {}: invalid/unknown strain ID "{}"'.format( line_num + 1, self.include_strains_file, sid)) continue - self.include_only_strains.add(strain_obj) + for strain in strain_ref_list: + self.include_only_strains.add(strain) logger.debug('Found {} strain IDs in include_strains'.format( len(self.include_only_strains))) diff --git a/src/nplinker/pairedomics/strain_mappings_generator.py b/src/nplinker/pairedomics/strain_mappings_generator.py index 8f9ec568..4587f1b2 100644 --- a/src/nplinker/pairedomics/strain_mappings_generator.py +++ b/src/nplinker/pairedomics/strain_mappings_generator.py @@ -103,9 +103,10 @@ def podp_generate_strain_mappings( strain.add_alias(bgc_id) sc.add(strain) else: - strain = sc.lookup(strain_id) + # strain_list has only one element + strain_list = sc.lookup(strain_id) for bgc_id in bgc_ids: - strain.add_alias(bgc_id) + strain_list[0].add_alias(bgc_id) # Write strain mappings JSON file sc.to_json(output_json_file) diff --git a/src/nplinker/scoring/linking/data_links.py b/src/nplinker/scoring/linking/data_links.py index 30e7c1b8..76a2c088 100644 --- a/src/nplinker/scoring/linking/data_links.py +++ b/src/nplinker/scoring/linking/data_links.py @@ -149,8 +149,8 @@ def get_common_strains( if filter_no_shared and len(shared_strains) == 0: continue results[(obj, gcf)] = [ - self._strains.lookup(strain_id) - for strain_id in shared_strains + strain for strain_id in shared_strains + for strain in self._strains.lookup(strain_id) ] return results From c4899868394583d6d712874d19ef93621ecdc5cc Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Thu, 24 Aug 2023 15:53:04 +0200 Subject: [PATCH 18/21] change KeyError to ValueError in `StrainCollection.lookup` method --- src/nplinker/strain_collection.py | 4 ++-- tests/test_strain_collection.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/nplinker/strain_collection.py b/src/nplinker/strain_collection.py index 9b492556..5c1c8eaa 100644 --- a/src/nplinker/strain_collection.py +++ b/src/nplinker/strain_collection.py @@ -127,11 +127,11 @@ def lookup(self, name: str) -> list[Strain]: list[Strain]: List of Strain objects with the given name. Raises: - KeyError: If the strain name is not found. + ValueError: If the strain name is not found. 
""" if name in self._strain_dict_name: return self._strain_dict_name[name] - raise KeyError(f"Strain {name} not found in strain collection.") + raise ValueError(f"Strain {name} not found in the strain collection.") @staticmethod def read_json(file: str | PathLike) -> 'StrainCollection': diff --git a/tests/test_strain_collection.py b/tests/test_strain_collection.py index 0d12ad5b..e6fcacb9 100644 --- a/tests/test_strain_collection.py +++ b/tests/test_strain_collection.py @@ -139,7 +139,7 @@ def test_has_name(collection: StrainCollection): def test_lookup(collection: StrainCollection, strain: Strain): for name in strain.names: assert collection.lookup(name) == [strain] - with pytest.raises(KeyError): + with pytest.raises(ValueError): collection.lookup("strain_not_exist") From f73c683fcb75ea42e3797fe03d826d109bc117df Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Thu, 24 Aug 2023 15:56:20 +0200 Subject: [PATCH 19/21] add ValueError to `StrainCollection.remove` method --- src/nplinker/strain_collection.py | 7 ++++++- tests/test_strain_collection.py | 6 ++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/src/nplinker/strain_collection.py b/src/nplinker/strain_collection.py index 5c1c8eaa..f2f83c3c 100644 --- a/src/nplinker/strain_collection.py +++ b/src/nplinker/strain_collection.py @@ -77,10 +77,13 @@ def remove(self, strain: Strain): """Remove a strain from the collection. It removes the given strain object from the collection by strain id. - If the strain id is not found, it does nothing. + If the strain id is not found, raise ValueError. Args: strain(Strain): The strain to remove. + + Raises: + ValueError: If the strain is not found in the collection. """ if strain in self._strains: self._strains.remove(strain) @@ -95,6 +98,8 @@ def remove(self, strain: Strain): del self._strain_dict_name[name] else: self._strain_dict_name[name] = new_strain_list + else: + raise ValueError(f"Strain {strain} not found in strain collection.") def filter(self, strain_set: set[Strain]): """ diff --git a/tests/test_strain_collection.py b/tests/test_strain_collection.py index e6fcacb9..01d5134d 100644 --- a/tests/test_strain_collection.py +++ b/tests/test_strain_collection.py @@ -96,7 +96,8 @@ def test_add_different_id_different_alias(strain: Strain, def test_remove(strain: Strain): sc = StrainCollection() - sc.remove(strain) + with pytest.raises(ValueError): + sc.remove(strain) assert strain not in sc @@ -117,7 +118,8 @@ def test_remove_same_id_different_alias(collection: StrainCollection): def test_remove_different_id(collection: StrainCollection): strain = Strain("strain_2") - collection.remove(strain) + with pytest.raises(ValueError): + collection.remove(strain) assert len(collection) == 1 assert strain not in collection assert len(collection._strain_dict_name) == 2 From 11c956321bf4cdea6001b875a41867023b273550 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Thu, 24 Aug 2023 16:15:53 +0200 Subject: [PATCH 20/21] change KeyError to ValueError for function `map_strain_to_bgc` --- src/nplinker/genomics/genomics.py | 13 ++++++------- tests/genomics/test_genomics.py | 2 +- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/src/nplinker/genomics/genomics.py b/src/nplinker/genomics/genomics.py index e57bc32c..a9234b8f 100644 --- a/src/nplinker/genomics/genomics.py +++ b/src/nplinker/genomics/genomics.py @@ -84,15 +84,14 @@ def map_strain_to_bgc(strains: StrainCollection, bgcs: list[BGC]): for bgc in bgcs: try: strain_list = strains.lookup(bgc.bgc_id) - if len(strain_list) > 1: - 
raise KeyError( - f"Multiple strain objects found for BGC id '{bgc.bgc_id}'." - f"BGC object accept only one strain." - ) - except KeyError as e: - raise KeyError( + except ValueError as e: + raise ValueError( f"Strain id '{bgc.bgc_id}' from BGC object '{bgc.bgc_id}' " "not found in the strain collection.") from e + if len(strain_list) > 1: + raise ValueError( + f"Multiple strain objects found for BGC id '{bgc.bgc_id}'." + f"BGC object accept only one strain.") bgc.strain = strain_list[0] diff --git a/tests/genomics/test_genomics.py b/tests/genomics/test_genomics.py index 37fdef36..3c3ff5ac 100644 --- a/tests/genomics/test_genomics.py +++ b/tests/genomics/test_genomics.py @@ -127,7 +127,7 @@ def test_map_strain_to_bgc(strain_collection, bgc_list): def test_map_strain_to_bgc_error(strain_collection): bgcs = [BGC("BGC_04", "NPR")] - with pytest.raises(KeyError) as e: + with pytest.raises(ValueError) as e: map_strain_to_bgc(strain_collection, bgcs) assert "Strain id 'BGC_04' from BGC object 'BGC_04' not found" in e.value.args[ 0] From bb9b7d64dbe8ae21214b807d65885b3a0cb527ee Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Thu, 24 Aug 2023 16:17:43 +0200 Subject: [PATCH 21/21] update docstring for function `map_strain_to_bgc` --- src/nplinker/genomics/genomics.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/nplinker/genomics/genomics.py b/src/nplinker/genomics/genomics.py index a9234b8f..9d8feade 100644 --- a/src/nplinker/genomics/genomics.py +++ b/src/nplinker/genomics/genomics.py @@ -79,7 +79,8 @@ def map_strain_to_bgc(strains: StrainCollection, bgcs: list[BGC]): bgcs(list[BGC]): A list of BGC objects. Raises: - KeyError: Strain id not found in the strain collection. + ValueError: Strain id not found in the strain collection. + ValueError: Multiple strain objects found for a BGC id. """ for bgc in bgcs: try:
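
Note (not part of the patches above): a minimal usage sketch of the Strain / StrainCollection API as it stands after this series. It is illustrative only — the strain and BGC ids below are made up, and the expected values in the comments restate behaviour exercised by the updated unit tests in patches 04, 05, 06, 16, 18 and 19 rather than anything new.

    from nplinker.strains import Strain
    from nplinker.strain_collection import StrainCollection

    # Hypothetical ids, used only for illustration.
    strain = Strain("STRAIN_01")
    strain.add_alias("BGC_01")

    sc = StrainCollection()
    sc.add(strain)

    # Strain.names combines the strain id and all aliases (patch 04).
    assert strain.names == {"STRAIN_01", "BGC_01"}

    # __contains__ now expects a Strain object rather than a plain string (patch 05);
    # string-based checks go through has_name() instead (patch 06).
    assert strain in sc
    assert sc.has_name("BGC_01")

    # lookup() now returns a list of Strain objects sharing the given name (patch 16)
    # and raises ValueError for unknown names (patch 18).
    assert sc.lookup("BGC_01") == [strain]

    # remove() raises ValueError when the strain is not in the collection (patch 19).
    try:
        sc.remove(Strain("STRAIN_02"))
    except ValueError:
        pass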