From 8e4dcd85c4285e0cb8d7d9d66b5168bc9a622c62 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Fri, 11 Aug 2023 13:57:31 +0200 Subject: [PATCH 01/21] remove useless step --- src/nplinker/nplinker.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/src/nplinker/nplinker.py b/src/nplinker/nplinker.py index 7c73804d..7b07ec9f 100644 --- a/src/nplinker/nplinker.py +++ b/src/nplinker/nplinker.py @@ -263,14 +263,8 @@ def load_data(self, new_bigscape_cutoff=None): return False else: # CG: only reload genomics data when changing bigscape cutoff - # TODO: this part should be removed, reload everything if bigscape data changes. - # 1. change the cutoff (which by itself doesn't do anything) self._loader._bigscape_cutoff = new_bigscape_cutoff - # 2. reload the strain mappings (MiBIG filtering may have removed strains - # that were originally present, need to restore them all so the filtering - # won't break when it runs again in next stage) - self._loader._load_strain_mappings() - # 3. reload the genomics data with the new cutoff applied + # TODO: only need to reload gcfs using load_gcfs() self._loader._load_genomics() self._spectra = self._loader.spectra From 49171613d8d3d68cabe5acad23e425560731b0b6 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Fri, 11 Aug 2023 13:58:30 +0200 Subject: [PATCH 02/21] remove completed TODO comment --- src/nplinker/genomics/gcf.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/nplinker/genomics/gcf.py b/src/nplinker/genomics/gcf.py index 1bf413f1..c1b0d9d9 100644 --- a/src/nplinker/genomics/gcf.py +++ b/src/nplinker/genomics/gcf.py @@ -3,6 +3,7 @@ from nplinker.logconfig import LogConfig from nplinker.strain_collection import StrainCollection + if TYPE_CHECKING: from nplinker.strains import Strain from .bgc import BGC @@ -33,8 +34,6 @@ def __init__(self, gcf_id: str, /) -> None: self.gcf_id = gcf_id self._bgcs: set[BGC] = set() self.bigscape_class: str | None = None - # CG TODO: remove attribute id, see issue 103 - # https://github.com/NPLinker/nplinker/issues/103 self.bgc_ids: set[str] = set() self.strains: StrainCollection = StrainCollection() From afd81e5d31fa237e00c4d9bd60d17835cae0b1c7 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Fri, 11 Aug 2023 13:58:45 +0200 Subject: [PATCH 03/21] remove param `bgc_genome_mapping` from func `map_strain_to_bgc` The StrainCollection should contains the bgc id, so genome id is not required. --- src/nplinker/genomics/genomics.py | 27 +++------- tests/genomics/test_genomics.py | 87 +++++++++++++------------------ 2 files changed, 44 insertions(+), 70 deletions(-) diff --git a/src/nplinker/genomics/genomics.py b/src/nplinker/genomics/genomics.py index 3c1ec2ee..42b8001a 100644 --- a/src/nplinker/genomics/genomics.py +++ b/src/nplinker/genomics/genomics.py @@ -69,38 +69,25 @@ def generate_mappings_genome_id_bgc_id( logger.info("Generated genome-BGC mappings file: %s", output_file) -def map_strain_to_bgc(strains: StrainCollection, bgcs: list[BGC], - bgc_genome_mapping: dict[str, str]): +def map_strain_to_bgc(strains: StrainCollection, bgcs: list[BGC]): """To set BGC object's strain with representative strain object. This method changes the list `bgcs` in place. - It's assumed that BGC's genome id is used as strain's name or alias, and - the genome id is used to lookup the representative strain. - Args: strains(StrainCollection): A collection of all strain objects. bgcs(list[BGC]): A list of BGC objects. 
- bgc_genome_mapping(dict[str, str]): The mappings from BGC id (key) to - genome id (value). Raises: - KeyError: BGC id not found in the `bgc_genome_mapping` dict. KeyError: Strain id not found in the strain collection. """ for bgc in bgcs: try: - genome_id = bgc_genome_mapping[bgc.bgc_id] + strain = strains.lookup(bgc.bgc_id) except KeyError as e: raise KeyError( - f"Not found BGC id {bgc.bgc_id} in BGC-genome mappings." - ) from e - try: - strain = strains.lookup(genome_id) - except KeyError as e: - raise KeyError( - f"Strain id {genome_id} from BGC object {bgc.bgc_id} " - "not found in the StrainCollection object.") from e + f"Strain id '{bgc.bgc_id}' from BGC object '{bgc.bgc_id}' " + "not found in the strain collection.") from e bgc.strain = strain @@ -122,8 +109,9 @@ def map_bgc_to_gcf(bgcs: list[BGC], gcfs: list[GCF]): try: bgc = bgc_dict[bgc_id] except KeyError as e: - raise KeyError(f"BGC id {bgc_id} from GCF object {gcf.gcf_id} " - "not found in the list of BGC objects.") from e + raise KeyError( + f"BGC id '{bgc_id}' from GCF object '{gcf.gcf_id}' " + "not found in the list of BGC objects.") from e gcf.add_bgc(bgc) @@ -155,6 +143,7 @@ def get_strains_from_bgcs(bgcs: list[BGC]) -> StrainCollection: return sc + @deprecated(version="1.3.3", reason="It is split to separate functions: " \ "map_strain_to_bgc, map_bgc_to_gcf, filter_mibig_only_gcf, " \ "get_bgcs_from_gcfs and get_strains_from_bgcs.") diff --git a/tests/genomics/test_genomics.py b/tests/genomics/test_genomics.py index 8b566b90..37fdef36 100644 --- a/tests/genomics/test_genomics.py +++ b/tests/genomics/test_genomics.py @@ -73,121 +73,106 @@ def test_generate_mappings_genome_id_bgc_id_empty_dir(tmp_path, caplog): @pytest.fixture def strain_collection() -> StrainCollection: sc = StrainCollection() - sc.add(Strain("G-SAMPLE0001")) - strain = Strain("S-EXAMPLE002") - strain.add_alias("G-SAMPLE0002") + strain = Strain("STRAIN_01") + strain.add_alias("BGC_01") sc.add(strain) - sc.add(Strain("BGC0000001")) - sc.add(Strain("BGC0000002")) - return sc + strain = Strain("STRAIN_02") + strain.add_alias("BGC_02") + strain.add_alias("BGC_02_1") + sc.add(strain) + strain = Strain("SAMPLE_BGC_03") + sc.add(strain) -@pytest.fixture -def bgc_genome_mapping() -> dict[str, str]: - return { - "SAMPLE0001": "G-SAMPLE0001", - "SAMPLE0002": "G-SAMPLE0002", - "BGC0000001": "BGC0000001", - "BGC0000002": "BGC0000002" - } + return sc @pytest.fixture def bgc_list() -> list[BGC]: return [ - BGC("SAMPLE0001", "NPR"), - BGC("SAMPLE0002", "Alkaloid"), - BGC("BGC0000001", "Polyketide"), - BGC("BGC0000002", "Terpene") + BGC("BGC_01", "NPR"), + BGC("BGC_02", "Alkaloid"), + BGC("SAMPLE_BGC_03", "Polyketide") ] @pytest.fixture def gcf_list() -> list[GCF]: gcf1 = GCF("1") - gcf1.bgc_ids |= set(("SAMPLE0001", "SAMPLE0002")) + gcf1.bgc_ids |= {"BGC_01"} gcf2 = GCF("2") - gcf2.bgc_ids |= set(("BGC0000001", "BGC0000002")) + gcf2.bgc_ids |= {"BGC_02", "SAMPLE_BGC_03"} return [gcf1, gcf2] @pytest.fixture def gcf_list_error() -> list[GCF]: gcf1 = GCF("1") - gcf1.bgc_ids |= set(("SAMPLE0001", "SAMPLE0003")) + gcf1.bgc_ids |= {"SAMPLE_BGC_03", "BGC_04"} return [gcf1] -def test_map_strain_to_bgc(strain_collection, bgc_list, bgc_genome_mapping): +def test_map_strain_to_bgc(strain_collection, bgc_list): for bgc in bgc_list: assert bgc.strain is None - map_strain_to_bgc(strain_collection, bgc_list, bgc_genome_mapping) + map_strain_to_bgc(strain_collection, bgc_list) for bgc in bgc_list: assert bgc.strain is not None - assert bgc_list[0].strain.id == "G-SAMPLE0001" - 
assert bgc_list[1].strain.id == "S-EXAMPLE002" - assert bgc_list[2].strain.id == "BGC0000001" - assert bgc_list[3].strain.id == "BGC0000002" + assert bgc_list[0].strain.id == "STRAIN_01" + assert bgc_list[1].strain.id == "STRAIN_02" + assert bgc_list[2].strain.id == "SAMPLE_BGC_03" def test_map_strain_to_bgc_error(strain_collection): - bgc_genome_mapping = {"SAMPLE0003": "G-SAMPLE0003"} - bgcs = [BGC("BGC0000003", ["Polyketide"])] - with pytest.raises(KeyError) as e: - map_strain_to_bgc(strain_collection, bgcs, bgc_genome_mapping) - assert "Not found BGC id BGC0000003 in BGC-genome mappings." in e.value.args[ - 0] - - bgcs = [BGC("SAMPLE0003", ["NPR"])] + bgcs = [BGC("BGC_04", "NPR")] with pytest.raises(KeyError) as e: - map_strain_to_bgc(strain_collection, bgcs, bgc_genome_mapping) - assert "Strain id G-SAMPLE0003 from BGC object SAMPLE0003 not found" in e.value.args[ + map_strain_to_bgc(strain_collection, bgcs) + assert "Strain id 'BGC_04' from BGC object 'BGC_04' not found" in e.value.args[ 0] def test_map_bgc_to_gcf(bgc_list, gcf_list): - assert gcf_list[0].bgc_ids == set(("SAMPLE0001", "SAMPLE0002")) - assert gcf_list[1].bgc_ids == set(("BGC0000001", "BGC0000002")) + assert gcf_list[0].bgc_ids == {"BGC_01"} + assert gcf_list[1].bgc_ids == {"BGC_02", "SAMPLE_BGC_03"} assert len(gcf_list[0].bgcs) == 0 assert len(gcf_list[1].bgcs) == 0 map_bgc_to_gcf(bgc_list, gcf_list) - assert gcf_list[0].bgc_ids == set(("SAMPLE0001", "SAMPLE0002")) - assert gcf_list[1].bgc_ids == set(("BGC0000001", "BGC0000002")) - assert len(gcf_list[0].bgcs) == 2 + assert gcf_list[0].bgc_ids == {"BGC_01"} + assert gcf_list[1].bgc_ids == {"BGC_02", "SAMPLE_BGC_03"} + assert len(gcf_list[0].bgcs) == 1 assert len(gcf_list[1].bgcs) == 2 - assert gcf_list[0].bgcs == set(bgc_list[:2]) - assert gcf_list[1].bgcs == set(bgc_list[2:]) + assert gcf_list[0].bgcs == set(bgc_list[:1]) + assert gcf_list[1].bgcs == set(bgc_list[1:]) def test_map_bgc_to_gcf_error(bgc_list, gcf_list_error): - assert gcf_list_error[0].bgc_ids == set(("SAMPLE0001", "SAMPLE0003")) + assert gcf_list_error[0].bgc_ids == {"SAMPLE_BGC_03", "BGC_04"} assert len(gcf_list_error[0].bgcs) == 0 with pytest.raises(KeyError) as e: map_bgc_to_gcf(bgc_list, gcf_list_error) - assert "BGC id SAMPLE0003 from GCF object 1 not found" in e.value.args[0] + assert "BGC id 'BGC_04' from GCF object '1' not found" in e.value.args[0] def test_filter_mibig_only_gcf(bgc_list, gcf_list): map_bgc_to_gcf(bgc_list, gcf_list) gcfs = filter_mibig_only_gcf(gcf_list) assert len(gcfs) == 1 - assert gcfs[0].gcf_id == "1" + assert gcfs[0].gcf_id == "2" def test_get_bgcs_from_gcfs(bgc_list, gcf_list): map_bgc_to_gcf(bgc_list, gcf_list) bgcs = get_bgcs_from_gcfs(gcf_list) assert isinstance(bgcs, list) - assert len(bgcs) == 4 + assert len(bgcs) == 3 for i in bgcs: assert isinstance(i, BGC) -def test_get_strains_from_bgcs(strain_collection, bgc_list, - bgc_genome_mapping): - map_strain_to_bgc(strain_collection, bgc_list, bgc_genome_mapping) +def test_get_strains_from_bgcs(strain_collection, bgc_list): + map_strain_to_bgc(strain_collection, bgc_list) strains = get_strains_from_bgcs(bgc_list) assert isinstance(strains, StrainCollection) assert strains == strain_collection From 21cd0fc83cdd6ed6ca99d16065db660f80e098c2 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Fri, 11 Aug 2023 16:34:44 +0200 Subject: [PATCH 04/21] add property names to class `Strain` --- src/nplinker/strains.py | 9 +++++++++ tests/test_strain.py | 6 ++++++ 2 files changed, 15 insertions(+) diff --git 
a/src/nplinker/strains.py b/src/nplinker/strains.py index 7b07a588..b1afcf70 100644 --- a/src/nplinker/strains.py +++ b/src/nplinker/strains.py @@ -43,6 +43,15 @@ def __contains__(self, alias: str) -> bool: raise TypeError(f'Expected str, got {type(alias)}') return alias in self._aliases + @property + def names(self) -> set[str]: + """Get the set of strain names including id and aliases. + + Returns: + set[str]: A set of names associated with the strain. + """ + return self._aliases | {self.id} + @property def aliases(self) -> set[str]: """Get the set of known aliases. diff --git a/tests/test_strain.py b/tests/test_strain.py index c8ed0a56..34540bec 100644 --- a/tests/test_strain.py +++ b/tests/test_strain.py @@ -26,10 +26,16 @@ def test_hash(strain: Strain): assert hash(strain) == hash("strain_1") +def test_names(strain: Strain): + assert strain.names == {"strain_1", "strain_1_a"} + + def test_alias(strain: Strain): assert len(strain.aliases) == 1 + assert "strain_1_a" in strain.aliases def test_add_alias(strain: Strain): strain.add_alias("strain_1_b") assert len(strain.aliases) == 2 + assert "strain_1_b" in strain.aliases From 3485155e921208c3f979b2cb28541449fab7de31 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 15 Aug 2023 14:37:12 +0200 Subject: [PATCH 05/21] update the logic of method `__contains__` in StrainCollection --- src/nplinker/strain_collection.py | 21 ++++++--------------- tests/test_strain_collection.py | 25 +++++++++---------------- 2 files changed, 15 insertions(+), 31 deletions(-) diff --git a/src/nplinker/strain_collection.py b/src/nplinker/strain_collection.py index d221ce86..b9369bb9 100644 --- a/src/nplinker/strain_collection.py +++ b/src/nplinker/strain_collection.py @@ -1,14 +1,10 @@ import json from os import PathLike -from pathlib import Path from typing import Iterator -from deprecated import deprecated from jsonschema import validate from nplinker.schemas import STRAIN_MAPPINGS_SCHEMA from .logconfig import LogConfig from .strains import Strain -from .utils import list_dirs -from .utils import list_files logger = LogConfig.getLogger(__name__) @@ -41,16 +37,12 @@ def __eq__(self, other) -> bool: and self._strain_dict_name == other._strain_dict_name) return NotImplemented - def __contains__(self, item: str | Strain) -> bool: - """Check if the strain collection contains the given strain. - - The given strain could be a Strain object, or a strain id or alias. + def __contains__(self, item: Strain) -> bool: + """Check if the strain collection contains the given Strain object. """ - if isinstance(item, str): - return item in self._strain_dict_name if isinstance(item, Strain): return item.id in self._strain_dict_name - raise TypeError(f"Expected Strain or str, got {type(item)}") + raise TypeError(f"Expected Strain, got {type(item)}") def __iter__(self) -> Iterator[Strain]: return iter(self._strains) @@ -83,9 +75,8 @@ def remove(self, strain: Strain): """ if strain.id in self._strain_dict_name: self._strains.remove(strain) - del self._strain_dict_name[strain.id] - for alias in strain.aliases: - del self._strain_dict_name[alias] + for name in strain.names: + del self._strain_dict_name[name] def filter(self, strain_set: set[Strain]): """ @@ -109,7 +100,7 @@ def lookup(self, name: str) -> Strain: Raises: KeyError: If the strain name is not found. 
""" - if name in self: + if name in self._strain_dict_name: return self._strain_dict_name[name] raise KeyError(f"Strain {name} not found in strain collection.") diff --git a/tests/test_strain_collection.py b/tests/test_strain_collection.py index c809759c..944464e4 100644 --- a/tests/test_strain_collection.py +++ b/tests/test_strain_collection.py @@ -31,10 +31,8 @@ def test_eq(collection: StrainCollection, strain: Strain): def test_contains(collection: StrainCollection, strain: Strain): assert strain in collection - assert strain.id in collection - for alias in strain.aliases: - assert alias in collection - assert "strain_not_exist" not in collection + strain2 = Strain("strain_2") + assert strain2 not in collection def test_iter(collection: StrainCollection, strain: Strain): @@ -46,31 +44,25 @@ def test_add(strain: Strain): sut = StrainCollection() sut.add(strain) assert strain in sut - for alias in strain.aliases: - assert alias in sut + for name in strain.names: + assert name in sut._strain_dict_name def test_remove(collection: StrainCollection, strain: Strain): - assert strain in collection collection.remove(strain) - with pytest.raises(KeyError): - _ = collection._strain_dict_name[strain.id] assert strain not in collection def test_filter(collection: StrainCollection, strain: Strain): - assert strain in collection collection.add(Strain("strain_2")) collection.filter({strain}) - assert strain in collection - assert "strain_2" not in collection + assert "strain_2" not in collection._strain_dict_name assert len(collection) == 1 def test_lookup(collection: StrainCollection, strain: Strain): - assert collection.lookup(strain.id) == strain - for alias in strain.aliases: - assert collection.lookup(alias) == strain + for name in strain.names: + assert collection.lookup(name) == strain with pytest.raises(KeyError): collection.lookup("strain_not_exist") @@ -85,7 +77,8 @@ def json_file(tmp_path): "strain_id": "strain_2", "strain_alias": ["alias_3", "alias_4"] }], - "version": "1.0" + "version": + "1.0" } file_path = tmp_path / "test.json" with open(file_path, "w") as f: From 1211657206275d45ee27c7b43a96eb24fdfbc486 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Mon, 14 Aug 2023 11:11:56 +0200 Subject: [PATCH 06/21] add method `has_name` to class `StrainCollection` --- src/nplinker/metabolomics/load_gnps.py | 18 +++++++++--------- .../pairedomics/strain_mappings_generator.py | 2 +- src/nplinker/strain_collection.py | 11 +++++++++++ tests/test_strain_collection.py | 6 ++++++ 4 files changed, 27 insertions(+), 10 deletions(-) diff --git a/src/nplinker/metabolomics/load_gnps.py b/src/nplinker/metabolomics/load_gnps.py index 14a5ec2c..a725e9f4 100644 --- a/src/nplinker/metabolomics/load_gnps.py +++ b/src/nplinker/metabolomics/load_gnps.py @@ -30,14 +30,14 @@ def _messy_strain_naming_lookup(mzxml: str, strains: StrainCollection) -> Strain Returns: Strain or None: The strain identified to be matching or None. """ - if mzxml in strains: + if strains.has_name(mzxml): # life is simple! return strains.lookup(mzxml) # 1. knock off the .mzXML and try again (using splitext should also handle # other extensions like .mzML here) mzxml_no_ext = os.path.splitext(mzxml)[0] - if mzxml_no_ext in strains: + if strains.has_name(mzxml_no_ext): return strains.lookup(mzxml_no_ext) # 2. 
if that doesn't work, try using everything up to the first -/_ @@ -46,15 +46,15 @@ def _messy_strain_naming_lookup(mzxml: str, strains: StrainCollection) -> Strain mzxml_trunc_underscore = mzxml_no_ext if underscore_index == -1 else mzxml_no_ext[:underscore_index] mzxml_trunc_hyphen = mzxml_no_ext if hyphen_index == -1 else mzxml_no_ext[:hyphen_index] - if underscore_index != -1 and mzxml_trunc_underscore in strains: + if underscore_index != -1 and strains.has_name(mzxml_trunc_underscore): return strains.lookup(mzxml_trunc_underscore) - if hyphen_index != -1 and mzxml_trunc_hyphen in strains: + if hyphen_index != -1 and strains.has_name(mzxml_trunc_hyphen): return strains.lookup(mzxml_trunc_hyphen) # 3. in the case of original Crusemann dataset, many of the filenames seem to # match up to real strains with the initial "CN" missing ??? for mzxml_trunc in [mzxml_trunc_hyphen, mzxml_trunc_underscore]: - if 'CN' + mzxml_trunc in strains: + if strains.has_name('CN' + mzxml_trunc): return strains.lookup('CN' + mzxml_trunc) # give up @@ -113,7 +113,7 @@ def _parse_mzxml_header(hdr: str, strains: StrainCollection, md_table: dict[str, # check if the strain label exists in the set of strain mappings the user # has provided - if strain_name not in strains: + if not strains.has_name(strain_name): # if this check fails, it could mean a missing strain ID mapping. this isn't # fatal but will produce a warning and an entry added to the file # unknown_strains_met.csv in the dataset folder. @@ -139,7 +139,7 @@ def _parse_mzxml_header(hdr: str, strains: StrainCollection, md_table: dict[str, growth_medium = md_table[strain_name]['growthmedium'] strain_col_content = md_table[strain_name]['strain'] - if strain_col_content in strains: + if strains.has_name(strain_col_content): strain = strains.lookup(strain_col_content) strain.add_alias(strain_name) # this merges the set of aliases back into the internal @@ -158,7 +158,7 @@ def _parse_mzxml_header(hdr: str, strains: StrainCollection, md_table: dict[str, 'Unknown strain identifier: {} (parsed from {})'.format( strain_name, hdr)) - return (strain_name, growth_medium, strain_name not in strains) + return (strain_name, growth_medium, not strains.has_name(strain_name)) def _load_clusterinfo_old(gnps_format: str, strains: StrainCollection, file: str, spec_dict: dict[str, Spectrum]) -> dict[str, int]: @@ -422,7 +422,7 @@ def _load_clusterinfo_fbmn(strains: StrainCollection, nodes_file: str, extra_nod # create a new strain object if the intensity value is a float > 0 v = _md_convert(v) - if strain_name in strains and isinstance(v, float) and float(v) > 0: + if strains.has_name(strain_name) and isinstance(v, float) and float(v) > 0: # find the strain object, and add the growth medium + intensity to it. 
the # growth medium will only be set if extended_metadata_table_parsing is # enabled in the config file and the metadata table file contains that info diff --git a/src/nplinker/pairedomics/strain_mappings_generator.py b/src/nplinker/pairedomics/strain_mappings_generator.py index 091a37e0..8f9ec568 100644 --- a/src/nplinker/pairedomics/strain_mappings_generator.py +++ b/src/nplinker/pairedomics/strain_mappings_generator.py @@ -97,7 +97,7 @@ def podp_generate_strain_mappings( # Create StrainCollection sc = StrainCollection() for strain_id, bgc_ids in mappings.items(): - if strain_id not in sc: + if not sc.has_name(strain_id): strain = Strain(strain_id) for bgc_id in bgc_ids: strain.add_alias(bgc_id) diff --git a/src/nplinker/strain_collection.py b/src/nplinker/strain_collection.py index b9369bb9..fb5e0697 100644 --- a/src/nplinker/strain_collection.py +++ b/src/nplinker/strain_collection.py @@ -87,6 +87,17 @@ def filter(self, strain_set: set[Strain]): if strain not in strain_set: self.remove(strain) + def has_name(self, name: str) -> bool: + """Check if the strain collection contains the given strain name (id or alias). + + Args: + name(str): Strain name (id or alias) to check. + + Returns: + bool: True if the strain name is in the collection, False otherwise. + """ + return name in self._strain_dict_name + def lookup(self, name: str) -> Strain: """Lookup a strain by name (id or alias). diff --git a/tests/test_strain_collection.py b/tests/test_strain_collection.py index 944464e4..b560b022 100644 --- a/tests/test_strain_collection.py +++ b/tests/test_strain_collection.py @@ -60,6 +60,12 @@ def test_filter(collection: StrainCollection, strain: Strain): assert len(collection) == 1 +def test_has_name(collection: StrainCollection): + assert collection.has_name("strain_1") + assert collection.has_name("strain_1_a") + assert not collection.has_name("strain_2") + + def test_lookup(collection: StrainCollection, strain: Strain): for name in strain.names: assert collection.lookup(name) == strain From 14b3453da76db88e802a2cc963ec5b677b1cf422 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Mon, 14 Aug 2023 11:11:12 +0200 Subject: [PATCH 07/21] update method `has_strain` in classes --- src/nplinker/genomics/gcf.py | 4 ++-- src/nplinker/metabolomics/molecular_family.py | 4 ++-- src/nplinker/metabolomics/spectrum.py | 3 ++- tests/genomics/test_gcf.py | 4 ++-- 4 files changed, 8 insertions(+), 7 deletions(-) diff --git a/src/nplinker/genomics/gcf.py b/src/nplinker/genomics/gcf.py index c1b0d9d9..0af3e407 100644 --- a/src/nplinker/genomics/gcf.py +++ b/src/nplinker/genomics/gcf.py @@ -82,11 +82,11 @@ def detach_bgc(self, bgc: BGC) -> None: return self.strains.remove(bgc.strain) - def has_strain(self, strain: str | Strain) -> bool: + def has_strain(self, strain: Strain) -> bool: """Check if the given strain exists. Args: - strain(str | Strain): strain id or `Strain` object. + strain(Strain): `Strain` object. Returns: bool: True when the given strain exist. diff --git a/src/nplinker/metabolomics/molecular_family.py b/src/nplinker/metabolomics/molecular_family.py index c4e584c5..8a5f9753 100644 --- a/src/nplinker/metabolomics/molecular_family.py +++ b/src/nplinker/metabolomics/molecular_family.py @@ -31,11 +31,11 @@ def strains(self) -> StrainCollection: strains.add(strain) return strains - def has_strain(self, strain: str | Strain) -> bool: + def has_strain(self, strain: Strain) -> bool: """Check if the given strain exists. Args: - strain(str | Strain): strain id or `Strain` object. 
+ strain(Strain): `Strain` object. Returns: bool: True when the given strain exist. diff --git a/src/nplinker/metabolomics/spectrum.py b/src/nplinker/metabolomics/spectrum.py index 12663871..6895d198 100644 --- a/src/nplinker/metabolomics/spectrum.py +++ b/src/nplinker/metabolomics/spectrum.py @@ -1,5 +1,6 @@ from nplinker.strain_collection import StrainCollection +from nplinker.strains import Strain from nplinker.utils import sqrt_normalise @@ -97,7 +98,7 @@ def get_metadata_value(self, key): val = self.metadata.get(key, None) return val - def has_strain(self, strain): + def has_strain(self, strain: Strain): return strain in self.strains def get_growth_medium(self, strain): diff --git a/tests/genomics/test_gcf.py b/tests/genomics/test_gcf.py index a40f4728..c24a6952 100644 --- a/tests/genomics/test_gcf.py +++ b/tests/genomics/test_gcf.py @@ -55,8 +55,8 @@ def test_add_bgc_wo_strain(bgc_without_strain, caplog): def test_has_strain(bgc_with_strain): gcf = GCF("1") gcf.add_bgc(bgc_with_strain) - assert gcf.has_strain("strain001") is True - assert gcf.has_strain("strain002") is False + assert gcf.has_strain(Strain("strain001")) is True + assert gcf.has_strain(Strain("strain002")) is False def test_has_mibig_only(): mibig_bgc = BGC("BGC0000001", "NPR") From ab226d5683fffd8de1e52b46503a561653b033eb Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Mon, 14 Aug 2023 13:19:15 +0200 Subject: [PATCH 08/21] rename MibigBGCLoader to MibigLoader --- src/nplinker/genomics/mibig/__init__.py | 4 +++- src/nplinker/genomics/mibig/mibig_loader.py | 4 ++-- src/nplinker/loader.py | 4 ++-- tests/genomics/test_mibig_loader.py | 8 ++++---- 4 files changed, 11 insertions(+), 9 deletions(-) diff --git a/src/nplinker/genomics/mibig/__init__.py b/src/nplinker/genomics/mibig/__init__.py index d35b7a21..753cfdf7 100644 --- a/src/nplinker/genomics/mibig/__init__.py +++ b/src/nplinker/genomics/mibig/__init__.py @@ -1,6 +1,8 @@ import logging from .mibig_downloader import download_and_extract_mibig_metadata -from .mibig_loader import MibigBGCLoader, parse_bgc_metadata_json +from .mibig_loader import MibigLoader +from .mibig_loader import parse_bgc_metadata_json from .mibig_metadata import MibigMetadata + logging.getLogger(__name__).addHandler(logging.NullHandler()) diff --git a/src/nplinker/genomics/mibig/mibig_loader.py b/src/nplinker/genomics/mibig/mibig_loader.py index 8ca30bd6..82226431 100644 --- a/src/nplinker/genomics/mibig/mibig_loader.py +++ b/src/nplinker/genomics/mibig/mibig_loader.py @@ -10,7 +10,7 @@ logger = LogConfig.getLogger(__name__) -class MibigBGCLoader: +class MibigLoader: def __init__(self, data_dir: str): """Parse MIBiG metadata files and return BGC objects @@ -129,4 +129,4 @@ def parse_bgc_metadata_json(file: str) -> BGC: # register as virtual class to prevent metaclass conflicts -BGCLoaderBase.register(MibigBGCLoader) +BGCLoaderBase.register(MibigLoader) diff --git a/src/nplinker/loader.py b/src/nplinker/loader.py index 6fd067af..f06024b5 100644 --- a/src/nplinker/loader.py +++ b/src/nplinker/loader.py @@ -9,7 +9,7 @@ from nplinker.genomics import load_gcfs from nplinker.genomics.antismash import AntismashBGCLoader from nplinker.genomics.mibig import download_and_extract_mibig_metadata -from nplinker.genomics.mibig import MibigBGCLoader +from nplinker.genomics.mibig import MibigLoader from nplinker.globals import GENOME_BGC_MAPPINGS_FILENAME from nplinker.globals import GENOME_STATUS_FILENAME from nplinker.globals import GNPS_FILE_MAPPINGS_FILENAME @@ -426,7 +426,7 @@ def _load_genomics(self): # 
and update self.strains with mibig strains #---------------------------------------------------------------------- logger.debug(f'MibigBGCLoader({self.mibig_json_dir})') - mibig_bgc_loader = MibigBGCLoader(self.mibig_json_dir) + mibig_bgc_loader = MibigLoader(self.mibig_json_dir) self.mibig_bgc_dict = mibig_bgc_loader.get_bgcs() # add mibig bgc strains diff --git a/tests/genomics/test_mibig_loader.py b/tests/genomics/test_mibig_loader.py index c122c56e..21caf3c2 100644 --- a/tests/genomics/test_mibig_loader.py +++ b/tests/genomics/test_mibig_loader.py @@ -3,7 +3,7 @@ from nplinker.genomics import BGC from nplinker.genomics import BGCLoaderBase from nplinker.genomics.mibig import download_and_extract_mibig_metadata -from nplinker.genomics.mibig import MibigBGCLoader +from nplinker.genomics.mibig import MibigLoader from nplinker.genomics.mibig import parse_bgc_metadata_json from nplinker.genomics.mibig.mibig_metadata import MibigMetadata from .. import DATA_DIR @@ -22,11 +22,11 @@ def data_dir(self, tmp_path): @pytest.fixture def loader(self, data_dir): - loader = MibigBGCLoader(data_dir) + loader = MibigLoader(data_dir) yield loader def test_abc(self, loader): - assert issubclass(MibigBGCLoader, BGCLoaderBase) + assert issubclass(MibigLoader, BGCLoaderBase) assert isinstance(loader, BGCLoaderBase) def test_init(self, loader, data_dir): @@ -49,7 +49,7 @@ def test_get_files(self, loader): assert os.path.exists(files["BGC0000001"]) def test_parse_data_dir(self, data_dir): - files = MibigBGCLoader.parse_data_dir(data_dir) + files = MibigLoader.parse_data_dir(data_dir) assert isinstance(files, dict) assert len(files) == 2502 # MIBiG v3.1 has 2502 BGCs assert "BGC0000001" in files From 1058928583af85c61d602c4c87ca49562a48f2a3 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Mon, 14 Aug 2023 13:19:46 +0200 Subject: [PATCH 09/21] rename method `get_bgc_genome_mapping` to `get_strain_bgc_mapping` --- src/nplinker/genomics/mibig/mibig_loader.py | 11 ++++------- tests/genomics/test_mibig_loader.py | 4 ++-- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/src/nplinker/genomics/mibig/mibig_loader.py b/src/nplinker/genomics/mibig/mibig_loader.py index 82226431..c3bf7424 100644 --- a/src/nplinker/genomics/mibig/mibig_loader.py +++ b/src/nplinker/genomics/mibig/mibig_loader.py @@ -26,16 +26,13 @@ def __init__(self, data_dir: str): self._metadata_dict = self._parse_metadatas() self._bgc_dict = self._parse_bgcs() - def get_bgc_genome_mapping(self) -> dict[str, str]: - """Get the mapping from BGC to genome. + def get_strain_bgc_mapping(self) -> dict[str, str]: + """Get the mapping from strain to BGC. - Note that for MIBiG BGC, same value is used for BGC id and genome id. - Users don't have to provide genome id for MIBiG BGCs in the - `strain_mappings.json` file. + Note that for MIBiG BGC, same value is used for strain name and BGC id. Returns: - dict[str, str]: key is BGC id/accession, value is - genome id that uses the value of BGC accession. + dict[str, str]: key is strain name, value is BGC id. 
""" return {bid: bid for bid in self._file_dict} diff --git a/tests/genomics/test_mibig_loader.py b/tests/genomics/test_mibig_loader.py index 21caf3c2..6fac43c0 100644 --- a/tests/genomics/test_mibig_loader.py +++ b/tests/genomics/test_mibig_loader.py @@ -32,8 +32,8 @@ def test_abc(self, loader): def test_init(self, loader, data_dir): assert loader.data_dir == data_dir - def test_get_bgc_genome_mapping(self, loader): - mapping = loader.get_bgc_genome_mapping() + def test_get_strain_bgc_mapping(self, loader): + mapping = loader.get_strain_bgc_mapping() assert isinstance(mapping, dict) assert len(mapping) == 2502 for bid in mapping: From 36fda27e927be4cb8a62fda6c9cf8af4394ece4f Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Mon, 14 Aug 2023 14:30:42 +0200 Subject: [PATCH 10/21] refactor mibig loading related code to `_load_mibig` method --- src/nplinker/loader.py | 57 +++++++++++++++++++++--------------------- 1 file changed, 29 insertions(+), 28 deletions(-) diff --git a/src/nplinker/loader.py b/src/nplinker/loader.py index f06024b5..02594677 100644 --- a/src/nplinker/loader.py +++ b/src/nplinker/loader.py @@ -22,6 +22,7 @@ from nplinker.pairedomics.strain_mappings_generator import \ podp_generate_strain_mappings from nplinker.strain_collection import StrainCollection +from nplinker.strains import Strain try: @@ -112,6 +113,7 @@ def __init__(self, config_data): self._root)[-1] if not self._remote_loading else self._platform_id self.bgcs, self.gcfs, self.spectra, self.molfams = [], [], [], [] self.mibig_bgc_dict = {} + self._mibig_strain_bgc_mapping = {} self.product_types = [] self.strains = StrainCollection() self.webapp_scoring_cutoff = self._config_webapp.get( @@ -149,7 +151,8 @@ def generate_strain_mappings(self): generate_mappings_genome_id_bgc_id(self._root / "antismash") - podp_project_json_file = self._root.parent.parent / (self._platform_id + ".json") + podp_project_json_file = self._root.parent.parent / ( + self._platform_id + ".json") genome_status_json_file = self._root.parent.parent / "downloads" / self._platform_id / GENOME_STATUS_FILENAME genome_bgc_mappings_file = self._root / "antismash" / GENOME_BGC_MAPPINGS_FILENAME gnps_file_mapping_tsv_file = self._root / GNPS_FILE_MAPPINGS_FILENAME @@ -161,7 +164,10 @@ def generate_strain_mappings(self): self.strain_mappings_file) def load(self): - # load strain mappings first + if self._use_mibig: + if not self._load_mibig(): + return False + if not self._load_strain_mappings(): return False @@ -345,12 +351,28 @@ def _validate_paths(self): logger.warning('Optional file/directory "%s" does not exist', f) - # TODO: this function should be refactored to Loader class + def _load_mibig(self): + mibig_bgc_loader = MibigLoader(self.mibig_json_dir) + self.mibig_bgc_dict = mibig_bgc_loader.get_bgcs() + self._mibig_strain_bgc_mapping = mibig_bgc_loader.get_strain_bgc_mapping( + ) + return True + def _load_strain_mappings(self): + # First load user's strain mappings sc = StrainCollection.read_json(self.strain_mappings_file) for strain in sc: self.strains.add(strain) - logger.info('Loaded dataset strain IDs ({} total)'.format( + logger.info('Loaded {} non-MiBIG Strain objects'.format( + len(self.strains))) + + # Then load MiBIG strain mappings + if self._mibig_strain_bgc_mapping: + for k, v in self._mibig_strain_bgc_mapping.items(): + strain = Strain(k) + strain.add_alias(v) + self.strains.add(strain) + logger.info('Loaded {} Strain objects in total'.format( len(self.strains))) return True @@ -412,35 +434,14 @@ def _load_genomics(self): 
.format(d, d.replace(' ', '_'))) #---------------------------------------------------------------------- - # CG: download mibig metadata and run bigscape + # CG: run bigscape #---------------------------------------------------------------------- # both the bigscape and mibig_json dirs expected by nplinker may not exist at this point. in some # cases this will cause an error later in the process, but this can also potentially be # resolved automatically: - # mibig_json => download and extract the JSON database # bigscape => run BiG-SCAPE before continuing (if using the Docker image) self._load_genomics_extra() - #---------------------------------------------------------------------- - # CG: load mibig metadata to BGC object - # and update self.strains with mibig strains - #---------------------------------------------------------------------- - logger.debug(f'MibigBGCLoader({self.mibig_json_dir})') - mibig_bgc_loader = MibigLoader(self.mibig_json_dir) - self.mibig_bgc_dict = mibig_bgc_loader.get_bgcs() - - # add mibig bgc strains - # CG TODO: update strain assignment logics, - # see issue 104 https://github.com/NPLinker/nplinker/issues/104 - for bgc in self.mibig_bgc_dict.values(): - if bgc.strain is not None: - self.strains.add(bgc.strain) - else: - logger.warning("No strain specified for BGC %s", bgc.bgc_id) - - logger.debug('mibig_bgc_dict has {} entries'.format( - len(self.mibig_bgc_dict))) - #---------------------------------------------------------------------- # CG: Parse AntiSMASH dir #---------------------------------------------------------------------- @@ -684,8 +685,8 @@ def _filter_user_strains(self): # get the list of spectra which have at least one strain in the set spectra_to_retain = { spec - for spec in self.spectra for sstrain in spec.strains - if sstrain in self.include_only_strains + for spec in self.spectra + for sstrain in spec.strains if sstrain in self.include_only_strains } logger.info('Current / filtered BGC counts: {} / {}'.format( From c668ca8c9cff1feff98ba537d921c6308754aa48 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Mon, 14 Aug 2023 14:31:45 +0200 Subject: [PATCH 11/21] remove mibig downloading in class `DatasetLoader` --- src/nplinker/loader.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/src/nplinker/loader.py b/src/nplinker/loader.py index 02594677..080cf45c 100644 --- a/src/nplinker/loader.py +++ b/src/nplinker/loader.py @@ -484,19 +484,6 @@ def _load_genomics(self): return True def _load_genomics_extra(self): - if not os.path.exists(self.mibig_json_dir): - if self._use_mibig: - logger.info( - 'Attempting to download MiBIG JSON database (v{})...'. 
- format(self._mibig_version)) - download_and_extract_mibig_metadata(self._root, - self.mibig_json_dir, - self._mibig_version) - else: - logger.warning( - 'Not downloading MiBIG database automatically, use_mibig = false' - ) - if not os.path.exists(self.bigscape_dir): should_run_bigscape = self._config_docker.get( 'run_bigscape', self.RUN_BIGSCAPE_DEFAULT) From a9b808f50d74340039c504d77570064b85676c56 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Mon, 14 Aug 2023 14:44:27 +0200 Subject: [PATCH 12/21] refactor bigscape running code to `_run_bigscape` --- src/nplinker/loader.py | 61 +++++++++++------------------------------- 1 file changed, 15 insertions(+), 46 deletions(-) diff --git a/src/nplinker/loader.py b/src/nplinker/loader.py index 080cf45c..103a4f1f 100644 --- a/src/nplinker/loader.py +++ b/src/nplinker/loader.py @@ -403,56 +403,12 @@ def _load_metabolomics(self): # CG: load gemonics data into memory def _load_genomics(self): - # TODO split this method up a bit - - # hmmscan apparently can't cope with filenames that have spaces in them, and so if you try - # to run bigscape on a dataset like that it will just refuse. so, to try and avoid anyone - # having to manually fix this stupid problem, we will attempt to rename every .gbk file here - # TODO is this something nplinker should do, or dataset authors?? logger.debug("\nLoading genomics starts...") - #---------------------------------------------------------------------- - # CG: replace space with underscore for antismash folder and file names - # TODO: - # 1.[] move this part to antismash downloader as post-download process - # 2.[x] separete listing gbk files and renaming - #---------------------------------------------------------------------- - logger.debug('Collecting .gbk files (and possibly renaming)') - - # if the option is enabled, check for spaces in the folder names under - # /antismash and rename them, replacing spaces with underscores - ignore_spaces = self._antismash_ignore_spaces - if not ignore_spaces: - logger.debug('Checking for spaces in antiSMASH folder names...') - for root, dirs, files in os.walk(self.antismash_dir): - for d in dirs: - if d.find(' ') != -1: - os.rename(os.path.join(root, d), - os.path.join(root, d.replace(' ', '_'))) - logger.warn( - 'Renaming antiSMASH folder "{}" to "{}" to remove spaces! (suppress with ignore_spaces = true in config file)' - .format(d, d.replace(' ', '_'))) - - #---------------------------------------------------------------------- - # CG: run bigscape - #---------------------------------------------------------------------- - # both the bigscape and mibig_json dirs expected by nplinker may not exist at this point. 
in some - # cases this will cause an error later in the process, but this can also potentially be - # resolved automatically: - # bigscape => run BiG-SCAPE before continuing (if using the Docker image) - self._load_genomics_extra() - - #---------------------------------------------------------------------- - # CG: Parse AntiSMASH dir - #---------------------------------------------------------------------- logger.debug('Parsing AntiSMASH directory...') antismash_bgc_loader = AntismashBGCLoader(self.antismash_dir) - #---------------------------------------------------------------------- - # CG: load all bgcs and gcfs - #---------------------------------------------------------------------- logger.debug('Loading GCFs...') - # TODO: refactor load_gcfs to independent steps # Step 1: load all strains # Step 2: load all bgcs @@ -483,7 +439,21 @@ def _load_genomics(self): return True - def _load_genomics_extra(self): + # TODO CG: run bigscape before loading and after downloading + def _run_bigscape(self): + # Check for spaces in the folder names under /antismash and + # rename them by replacing spaces with underscores + if not self._antismash_ignore_spaces: + logger.debug('Checking for spaces in antiSMASH folder names...') + for root, dirs, _ in os.walk(self.antismash_dir): + for d in dirs: + if d.find(' ') != -1: + os.rename(os.path.join(root, d), + os.path.join(root, d.replace(' ', '_'))) + logger.warn( + 'Renaming antiSMASH folder "{}" to "{}" to remove spaces! (suppress with ignore_spaces = true in config file)' + .format(d, d.replace(' ', '_'))) + if not os.path.exists(self.bigscape_dir): should_run_bigscape = self._config_docker.get( 'run_bigscape', self.RUN_BIGSCAPE_DEFAULT) @@ -491,7 +461,6 @@ def _load_genomics_extra(self): 'extra_bigscape_parameters', self.EXTRA_BIGSCAPE_PARAMS_DEFAULT) if should_run_bigscape: - # TODO this should not be attempted if not in Docker env logger.info( 'Running BiG-SCAPE! 
extra_bigscape_parameters="{}"'.format( extra_bigscape_parameters)) From 682a0a5d9c0e8598fe146c3d68f4607ad8a8776c Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Mon, 14 Aug 2023 15:56:05 +0200 Subject: [PATCH 13/21] refactor method `_load_genomics` --- src/nplinker/loader.py | 66 ++++++++++++++++++++---------------------- 1 file changed, 32 insertions(+), 34 deletions(-) diff --git a/src/nplinker/loader.py b/src/nplinker/loader.py index 103a4f1f..8cae746c 100644 --- a/src/nplinker/loader.py +++ b/src/nplinker/loader.py @@ -6,9 +6,13 @@ from nplinker.class_info.class_matches import ClassMatches from nplinker.class_info.runcanopus import run_canopus from nplinker.genomics import generate_mappings_genome_id_bgc_id -from nplinker.genomics import load_gcfs from nplinker.genomics.antismash import AntismashBGCLoader -from nplinker.genomics.mibig import download_and_extract_mibig_metadata +from nplinker.genomics.bigscape import BigscapeGCFLoader +from nplinker.genomics.genomics import filter_mibig_only_gcf +from nplinker.genomics.genomics import get_bgcs_from_gcfs +from nplinker.genomics.genomics import get_strains_from_bgcs +from nplinker.genomics.genomics import map_bgc_to_gcf +from nplinker.genomics.genomics import map_strain_to_bgc from nplinker.genomics.mibig import MibigLoader from nplinker.globals import GENOME_BGC_MAPPINGS_FILENAME from nplinker.globals import GENOME_STATUS_FILENAME @@ -24,7 +28,6 @@ from nplinker.strain_collection import StrainCollection from nplinker.strains import Strain - try: from importlib.resources import files except ImportError: @@ -401,42 +404,37 @@ def _load_metabolomics(self): self.spectra, spec_dict) return True - # CG: load gemonics data into memory + # TODO CG: self.strains will be overwritten by this method, rename it? def _load_genomics(self): - logger.debug("\nLoading genomics starts...") + """Loads all genomics data (BGCs and GCFs) into the object. 
+ """ + logger.debug("\nLoading genomics data starts...") + # Step 1: load all BGC objects logger.debug('Parsing AntiSMASH directory...') - antismash_bgc_loader = AntismashBGCLoader(self.antismash_dir) - - logger.debug('Loading GCFs...') - # TODO: refactor load_gcfs to independent steps - # Step 1: load all strains - # Step 2: load all bgcs - # Step 3: load all gcfs - # Step 4: connect bgc to strain with `map_strain_to_bgc` - # Step 5: connect gcf to bgcs with `map_bgc_to_gcf` - # Step 6: `filter_mibig_only_gcf` - # Step 7: get clean gcfs, bgcs and strains with `get_bgcs_from_gcfs` - # and `get_strains_from_bgcs` - - self.gcfs, self.bgcs, self.strains, unknown_strains = load_gcfs( - self.bigscape_dir, self.strains, self.mibig_bgc_dict, - antismash_bgc_loader.get_bgcs(), antismash_bgc_loader.get_files(), - self._bigscape_cutoff) - - #---------------------------------------------------------------------- - # CG: write unknown strains in genomics to file - #---------------------------------------------------------------------- - us_path = os.path.join(self._root, 'unknown_strains_gen.csv') - logger.warning( - f'Writing unknown strains from GENOMICS data to {us_path}') - with open(us_path, 'w') as us: - us.write('# unknown strain label, filename\n') - for strain, filename in unknown_strains.items(): - us.write(f'{strain}, {filename}\n') + antismash_bgc_dict = AntismashBGCLoader(self.antismash_dir).get_bgcs() + raw_bgcs = list(antismash_bgc_dict.values()) + list( + self.mibig_bgc_dict.values()) + + # Step 2: load all GCF objects + bigscape_cluster_file = Path( + self.bigscape_dir + ) / "mix" / f"mix_clustering_c0.{self._bigscape_cutoff:02d}.tsv" + bigscape_gcf_list = BigscapeGCFLoader(bigscape_cluster_file).get_gcfs() + raw_gcfs = bigscape_gcf_list + + # Step 3: assign Strain object to BGC.strain + map_strain_to_bgc(self.strains, raw_bgcs) + + # Step 4: assign BGC objects to GCF.bgcs + map_bgc_to_gcf(raw_bgcs, raw_gcfs) - logger.debug("Loading genomics completed\n") + # Step 5: get clean GCF objects, BGC objects and Strain objects + self.gcfs = filter_mibig_only_gcf(raw_gcfs) + self.bgcs = get_bgcs_from_gcfs(self.gcfs) + self.strains = get_strains_from_bgcs(self.bgcs) + logger.debug("Loading genomics data completed\n") return True # TODO CG: run bigscape before loading and after downloading From 0991a47aeff167858b98b0282be463d084780163 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 15 Aug 2023 11:14:41 +0200 Subject: [PATCH 14/21] add TODO comments --- src/nplinker/loader.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/nplinker/loader.py b/src/nplinker/loader.py index 8cae746c..f0e77362 100644 --- a/src/nplinker/loader.py +++ b/src/nplinker/loader.py @@ -205,6 +205,9 @@ def load(self): def _start_downloads(self): downloader = PODPDownloader(self._platform_id) + # TODO CG: this step generates the real path for _root. Should generate + # it before loading process starts. Otherwise, npl.root_dir will get + # wrong value if loading from local data or not using download. 
self._root = Path(downloader.project_results_dir) logger.debug('remote loading mode, configuring root=%s', self._root) # CG: to download both MET and GEN data From 70e16401a4a63e3cd8946ad2e47d34919dcd643c Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 15 Aug 2023 14:24:56 +0200 Subject: [PATCH 15/21] add TODO comment --- src/nplinker/metabolomics/gnps/gnps_file_mapping_loader.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/nplinker/metabolomics/gnps/gnps_file_mapping_loader.py b/src/nplinker/metabolomics/gnps/gnps_file_mapping_loader.py index 766944ad..2c2e0c9c 100644 --- a/src/nplinker/metabolomics/gnps/gnps_file_mapping_loader.py +++ b/src/nplinker/metabolomics/gnps/gnps_file_mapping_loader.py @@ -26,6 +26,7 @@ def __init__(self, file: str | PathLike): NotImplementedError: Raises NotImplementedError if the GNPS format is not recognized. """ self._file: Path = Path(file) + # TODO CG: change list to set to avoid duplicates of spectra self._mapping: dict[str, list[str]] = {} self._gnps_format = gnps_format_from_file_mapping(file, False) From 10364efa011d279d6c8589a221363137a116ac68 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 16 Aug 2023 10:17:23 +0200 Subject: [PATCH 16/21] change `StrainCollection._strain_dict_name` dict value type to list - Updated methods `add`, `remove` and `lookup` accordingly - Updated and added unit tests for updated methods --- src/nplinker/strain_collection.py | 50 ++++++++++++++------- tests/test_strain_collection.py | 74 ++++++++++++++++++++++++++++++- 2 files changed, 106 insertions(+), 18 deletions(-) diff --git a/src/nplinker/strain_collection.py b/src/nplinker/strain_collection.py index fb5e0697..9b492556 100644 --- a/src/nplinker/strain_collection.py +++ b/src/nplinker/strain_collection.py @@ -14,9 +14,9 @@ class StrainCollection(): def __init__(self): """A collection of Strain objects.""" + # the order of strains is needed for scoring part, so use a list self._strains: list[Strain] = [] - # dict of strain name (id and alias) to primary strain object - self._strain_dict_name: dict[str, Strain] = {} + self._strain_dict_name: dict[str, list[Strain]] = {} def __repr__(self) -> str: return str(self) @@ -55,28 +55,46 @@ def add(self, strain: Strain) -> None: Args: strain(Strain): The strain to add. """ - # if the strain exists, merge the aliases - if strain.id in self._strain_dict_name: - existing: Strain = self.lookup(strain.id) - for alias in strain.aliases: - existing.add_alias(alias) - self._strain_dict_name[alias] = existing + if strain in self._strains: + # only one strain object per id + strain_ref = self._strain_dict_name[strain.id][0] + new_aliases = [alias for alias in strain.aliases if alias not in strain_ref.aliases] + for alias in new_aliases: + strain_ref.add_alias(alias) + if alias not in self._strain_dict_name: + self._strain_dict_name[alias] = [strain_ref] + else: + self._strain_dict_name[alias].append(strain_ref) else: self._strains.append(strain) - self._strain_dict_name[strain.id] = strain - for alias in strain.aliases: - self._strain_dict_name[alias] = strain + for name in strain.names: + if name not in self._strain_dict_name: + self._strain_dict_name[name] = [strain] + else: + self._strain_dict_name[name].append(strain) def remove(self, strain: Strain): """Remove a strain from the collection. + It removes the given strain object from the collection by strain id. + If the strain id is not found, it does nothing. + Args: strain(Strain): The strain to remove. 
""" - if strain.id in self._strain_dict_name: + if strain in self._strains: self._strains.remove(strain) - for name in strain.names: - del self._strain_dict_name[name] + # only one strain object per id + strain_ref = self._strain_dict_name[strain.id][0] + for name in strain_ref.names: + if name in self._strain_dict_name: + new_strain_list = [ + s for s in self._strain_dict_name[name] if s.id != strain.id + ] + if not new_strain_list: + del self._strain_dict_name[name] + else: + self._strain_dict_name[name] = new_strain_list def filter(self, strain_set: set[Strain]): """ @@ -98,7 +116,7 @@ def has_name(self, name: str) -> bool: """ return name in self._strain_dict_name - def lookup(self, name: str) -> Strain: + def lookup(self, name: str) -> list[Strain]: """Lookup a strain by name (id or alias). Args: @@ -106,7 +124,7 @@ def lookup(self, name: str) -> Strain: name(str): Strain name (id or alias) to lookup. Returns: - Strain: Strain identified by the given name. + list[Strain]: List of Strain objects with the given name. Raises: KeyError: If the strain name is not found. diff --git a/tests/test_strain_collection.py b/tests/test_strain_collection.py index b560b022..0d12ad5b 100644 --- a/tests/test_strain_collection.py +++ b/tests/test_strain_collection.py @@ -43,14 +43,84 @@ def test_iter(collection: StrainCollection, strain: Strain): def test_add(strain: Strain): sut = StrainCollection() sut.add(strain) + assert len(sut) == 1 assert strain in sut for name in strain.names: assert name in sut._strain_dict_name -def test_remove(collection: StrainCollection, strain: Strain): +def test_add_same_id_same_alias(strain: Strain, collection: StrainCollection): + collection.add(strain) + assert strain in collection + assert len(collection) == 1 + assert len(collection._strain_dict_name) == 2 + + +def test_add_same_id_different_alias(collection: StrainCollection): + strain = Strain("strain_1") + strain.add_alias("strain_1_b") + collection.add(strain) + assert len(collection) == 1 + assert strain in collection + assert len(collection._strain_dict_name) == 3 + assert collection._strain_dict_name["strain_1"] == [strain] + assert collection._strain_dict_name["strain_1_b"] == [strain] + + +def test_add_different_id_same_alias(strain: Strain, + collection: StrainCollection): + strain2 = Strain("strain_2") + strain2.add_alias("strain_1_a") + collection.add(strain2) + assert len(collection) == 2 + assert strain2 in collection + assert len(collection._strain_dict_name) == 3 + assert collection._strain_dict_name["strain_1"] == [strain] + assert collection._strain_dict_name["strain_2"] == [strain2] + assert collection._strain_dict_name["strain_1_a"] == [strain, strain2] + + +def test_add_different_id_different_alias(strain: Strain, + collection: StrainCollection): + strain2 = Strain("strain_2") + strain2.add_alias("strain_2_a") + collection.add(strain2) + assert len(collection) == 2 + assert strain2 in collection + assert len(collection._strain_dict_name) == 4 + assert collection._strain_dict_name["strain_1"] == [strain] + assert collection._strain_dict_name["strain_1_a"] == [strain] + assert collection._strain_dict_name["strain_2"] == [strain2] + assert collection._strain_dict_name["strain_2_a"] == [strain2] + + +def test_remove(strain: Strain): + sc = StrainCollection() + sc.remove(strain) + assert strain not in sc + + +def test_remove_same_id_same_alias(collection: StrainCollection, + strain: Strain): + collection.remove(strain) + assert strain not in collection + + +def 
test_remove_same_id_different_alias(collection: StrainCollection): + strain = Strain("strain_1") + strain.add_alias("strain_1_b") collection.remove(strain) + assert len(collection) == 0 + assert strain not in collection + assert len(collection._strain_dict_name) == 0 + + +def test_remove_different_id(collection: StrainCollection): + strain = Strain("strain_2") + collection.remove(strain) + assert len(collection) == 1 assert strain not in collection + assert len(collection._strain_dict_name) == 2 def test_filter(collection: StrainCollection, strain: Strain): @@ -68,7 +138,7 @@ def test_has_name(collection: StrainCollection): def test_lookup(collection: StrainCollection, strain: Strain): for name in strain.names: - assert collection.lookup(name) == strain + assert collection.lookup(name) == [strain] with pytest.raises(KeyError): collection.lookup("strain_not_exist") From 43d7f233c733c5f9df2cb3b47b91a55ca1c4595e Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 16 Aug 2023 12:05:37 +0200 Subject: [PATCH 17/21] update references of method `StrainCollection.lookup` --- src/nplinker/genomics/genomics.py | 9 +++++++-- src/nplinker/loader.py | 6 ++++-- src/nplinker/pairedomics/strain_mappings_generator.py | 5 +++-- src/nplinker/scoring/linking/data_links.py | 4 ++-- 4 files changed, 16 insertions(+), 8 deletions(-) diff --git a/src/nplinker/genomics/genomics.py b/src/nplinker/genomics/genomics.py index 42b8001a..e57bc32c 100644 --- a/src/nplinker/genomics/genomics.py +++ b/src/nplinker/genomics/genomics.py @@ -83,12 +83,17 @@ def map_strain_to_bgc(strains: StrainCollection, bgcs: list[BGC]): """ for bgc in bgcs: try: - strain = strains.lookup(bgc.bgc_id) + strain_list = strains.lookup(bgc.bgc_id) + if len(strain_list) > 1: + raise KeyError( + f"Multiple strain objects found for BGC id '{bgc.bgc_id}'." + f"BGC object accept only one strain." 
+ ) except KeyError as e: raise KeyError( f"Strain id '{bgc.bgc_id}' from BGC object '{bgc.bgc_id}' " "not found in the strain collection.") from e - bgc.strain = strain + bgc.strain = strain_list[0] def map_bgc_to_gcf(bgcs: list[BGC], gcfs: list[GCF]): diff --git a/src/nplinker/loader.py b/src/nplinker/loader.py index f0e77362..3613bab1 100644 --- a/src/nplinker/loader.py +++ b/src/nplinker/loader.py @@ -28,6 +28,7 @@ from nplinker.strain_collection import StrainCollection from nplinker.strains import Strain + try: from importlib.resources import files except ImportError: @@ -564,13 +565,14 @@ def _load_optional(self): for line_num, sid in enumerate(strain_list): sid = sid.strip() # get rid of newline try: - strain_obj = self.strains.lookup(sid) + strain_ref_list = self.strains.lookup(sid) except KeyError: logger.warning( 'Line {} of {}: invalid/unknown strain ID "{}"'.format( line_num + 1, self.include_strains_file, sid)) continue - self.include_only_strains.add(strain_obj) + for strain in strain_ref_list: + self.include_only_strains.add(strain) logger.debug('Found {} strain IDs in include_strains'.format( len(self.include_only_strains))) diff --git a/src/nplinker/pairedomics/strain_mappings_generator.py b/src/nplinker/pairedomics/strain_mappings_generator.py index 8f9ec568..4587f1b2 100644 --- a/src/nplinker/pairedomics/strain_mappings_generator.py +++ b/src/nplinker/pairedomics/strain_mappings_generator.py @@ -103,9 +103,10 @@ def podp_generate_strain_mappings( strain.add_alias(bgc_id) sc.add(strain) else: - strain = sc.lookup(strain_id) + # strain_list has only one element + strain_list = sc.lookup(strain_id) for bgc_id in bgc_ids: - strain.add_alias(bgc_id) + strain_list[0].add_alias(bgc_id) # Write strain mappings JSON file sc.to_json(output_json_file) diff --git a/src/nplinker/scoring/linking/data_links.py b/src/nplinker/scoring/linking/data_links.py index 30e7c1b8..76a2c088 100644 --- a/src/nplinker/scoring/linking/data_links.py +++ b/src/nplinker/scoring/linking/data_links.py @@ -149,8 +149,8 @@ def get_common_strains( if filter_no_shared and len(shared_strains) == 0: continue results[(obj, gcf)] = [ - self._strains.lookup(strain_id) - for strain_id in shared_strains + strain for strain_id in shared_strains + for strain in self._strains.lookup(strain_id) ] return results From c4899868394583d6d712874d19ef93621ecdc5cc Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Thu, 24 Aug 2023 15:53:04 +0200 Subject: [PATCH 18/21] change KeyError to ValueError in `StrainCollection.lookup` method --- src/nplinker/strain_collection.py | 4 ++-- tests/test_strain_collection.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/nplinker/strain_collection.py b/src/nplinker/strain_collection.py index 9b492556..5c1c8eaa 100644 --- a/src/nplinker/strain_collection.py +++ b/src/nplinker/strain_collection.py @@ -127,11 +127,11 @@ def lookup(self, name: str) -> list[Strain]: list[Strain]: List of Strain objects with the given name. Raises: - KeyError: If the strain name is not found. + ValueError: If the strain name is not found. 
""" if name in self._strain_dict_name: return self._strain_dict_name[name] - raise KeyError(f"Strain {name} not found in strain collection.") + raise ValueError(f"Strain {name} not found in the strain collection.") @staticmethod def read_json(file: str | PathLike) -> 'StrainCollection': diff --git a/tests/test_strain_collection.py b/tests/test_strain_collection.py index 0d12ad5b..e6fcacb9 100644 --- a/tests/test_strain_collection.py +++ b/tests/test_strain_collection.py @@ -139,7 +139,7 @@ def test_has_name(collection: StrainCollection): def test_lookup(collection: StrainCollection, strain: Strain): for name in strain.names: assert collection.lookup(name) == [strain] - with pytest.raises(KeyError): + with pytest.raises(ValueError): collection.lookup("strain_not_exist") From f73c683fcb75ea42e3797fe03d826d109bc117df Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Thu, 24 Aug 2023 15:56:20 +0200 Subject: [PATCH 19/21] add ValueError to `StrainCollection.remove` method --- src/nplinker/strain_collection.py | 7 ++++++- tests/test_strain_collection.py | 6 ++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/src/nplinker/strain_collection.py b/src/nplinker/strain_collection.py index 5c1c8eaa..f2f83c3c 100644 --- a/src/nplinker/strain_collection.py +++ b/src/nplinker/strain_collection.py @@ -77,10 +77,13 @@ def remove(self, strain: Strain): """Remove a strain from the collection. It removes the given strain object from the collection by strain id. - If the strain id is not found, it does nothing. + If the strain id is not found, raise ValueError. Args: strain(Strain): The strain to remove. + + Raises: + ValueError: If the strain is not found in the collection. """ if strain in self._strains: self._strains.remove(strain) @@ -95,6 +98,8 @@ def remove(self, strain: Strain): del self._strain_dict_name[name] else: self._strain_dict_name[name] = new_strain_list + else: + raise ValueError(f"Strain {strain} not found in strain collection.") def filter(self, strain_set: set[Strain]): """ diff --git a/tests/test_strain_collection.py b/tests/test_strain_collection.py index e6fcacb9..01d5134d 100644 --- a/tests/test_strain_collection.py +++ b/tests/test_strain_collection.py @@ -96,7 +96,8 @@ def test_add_different_id_different_alias(strain: Strain, def test_remove(strain: Strain): sc = StrainCollection() - sc.remove(strain) + with pytest.raises(ValueError): + sc.remove(strain) assert strain not in sc @@ -117,7 +118,8 @@ def test_remove_same_id_different_alias(collection: StrainCollection): def test_remove_different_id(collection: StrainCollection): strain = Strain("strain_2") - collection.remove(strain) + with pytest.raises(ValueError): + collection.remove(strain) assert len(collection) == 1 assert strain not in collection assert len(collection._strain_dict_name) == 2 From 11c956321bf4cdea6001b875a41867023b273550 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Thu, 24 Aug 2023 16:15:53 +0200 Subject: [PATCH 20/21] change KeyError to ValueError for function `map_strain_to_bgc` --- src/nplinker/genomics/genomics.py | 13 ++++++------- tests/genomics/test_genomics.py | 2 +- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/src/nplinker/genomics/genomics.py b/src/nplinker/genomics/genomics.py index e57bc32c..a9234b8f 100644 --- a/src/nplinker/genomics/genomics.py +++ b/src/nplinker/genomics/genomics.py @@ -84,15 +84,14 @@ def map_strain_to_bgc(strains: StrainCollection, bgcs: list[BGC]): for bgc in bgcs: try: strain_list = strains.lookup(bgc.bgc_id) - if len(strain_list) > 1: - 
raise KeyError( - f"Multiple strain objects found for BGC id '{bgc.bgc_id}'." - f"BGC object accept only one strain." - ) - except KeyError as e: - raise KeyError( + except ValueError as e: + raise ValueError( f"Strain id '{bgc.bgc_id}' from BGC object '{bgc.bgc_id}' " "not found in the strain collection.") from e + if len(strain_list) > 1: + raise ValueError( + f"Multiple strain objects found for BGC id '{bgc.bgc_id}'." + f"BGC object accept only one strain.") bgc.strain = strain_list[0] diff --git a/tests/genomics/test_genomics.py b/tests/genomics/test_genomics.py index 37fdef36..3c3ff5ac 100644 --- a/tests/genomics/test_genomics.py +++ b/tests/genomics/test_genomics.py @@ -127,7 +127,7 @@ def test_map_strain_to_bgc(strain_collection, bgc_list): def test_map_strain_to_bgc_error(strain_collection): bgcs = [BGC("BGC_04", "NPR")] - with pytest.raises(KeyError) as e: + with pytest.raises(ValueError) as e: map_strain_to_bgc(strain_collection, bgcs) assert "Strain id 'BGC_04' from BGC object 'BGC_04' not found" in e.value.args[ 0] From bb9b7d64dbe8ae21214b807d65885b3a0cb527ee Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Thu, 24 Aug 2023 16:17:43 +0200 Subject: [PATCH 21/21] update docstring for function `map_strain_to_bgc` --- src/nplinker/genomics/genomics.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/nplinker/genomics/genomics.py b/src/nplinker/genomics/genomics.py index a9234b8f..9d8feade 100644 --- a/src/nplinker/genomics/genomics.py +++ b/src/nplinker/genomics/genomics.py @@ -79,7 +79,8 @@ def map_strain_to_bgc(strains: StrainCollection, bgcs: list[BGC]): bgcs(list[BGC]): A list of BGC objects. Raises: - KeyError: Strain id not found in the strain collection. + ValueError: Strain id not found in the strain collection. + ValueError: Multiple strain objects found for a BGC id. """ for bgc in bgcs: try:
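
Note (not part of the patches above): a minimal usage sketch of the Strain / StrainCollection API as it stands after this series. It is illustrative only — the strain and BGC ids below are made up, and the expected values in the comments restate behaviour exercised by the updated unit tests in patches 04, 05, 06, 16, 18 and 19 rather than anything new.

    from nplinker.strains import Strain
    from nplinker.strain_collection import StrainCollection

    # Hypothetical ids, used only for illustration.
    strain = Strain("STRAIN_01")
    strain.add_alias("BGC_01")

    sc = StrainCollection()
    sc.add(strain)

    # Strain.names combines the strain id and all aliases (patch 04).
    assert strain.names == {"STRAIN_01", "BGC_01"}

    # __contains__ now expects a Strain object rather than a plain string (patch 05);
    # string-based checks go through has_name() instead (patch 06).
    assert strain in sc
    assert sc.has_name("BGC_01")

    # lookup() now returns a list of Strain objects sharing the given name (patch 16)
    # and raises ValueError for unknown names (patch 18).
    assert sc.lookup("BGC_01") == [strain]

    # remove() raises ValueError when the strain is not in the collection (patch 19).
    try:
        sc.remove(Strain("STRAIN_02"))
    except ValueError:
        pass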