Refactor __eq__ and __hash__ methods (#139)

* change BGC attributes type from list to tuple The following BGC attributes are updated: - product_prediction - mibig_bgc_class - smiles * use positional-only parameter in BGC and GCF Parameters before "/" are positional-only parameters, see https://docs.python.org/3/glossary.html#term-parameter. * update BGC's `__eq__` and `__hash__` * update GCF's `__eq__` and `__hash__` * Update gcf.py * update Spectrum's `__eq__` and `__hash__` * update MolecularFamily `__eq__` and `__hash__` * Update molecular_family.py * update Strain `__eq__` and `__hash__` * update StrainCollection `__eq__` * add TODO comments to ObjectLink
NPLinker · Apr 26, 2023 · 63beb97 · 63beb97
1 parent 3230a4c
commit 63beb97
Show file tree

Hide file tree

Showing 16 changed files with 90 additions and 61 deletions.
diff --git a/src/nplinker/genomics/antismash/antismash_loader.py b/src/nplinker/genomics/antismash/antismash_loader.py
@@ -117,10 +117,6 @@ def _parse_bgcs(bgc_files: dict[str, str]) -> dict[str, BGC]:
 def parse_bgc_genbank(file: str) -> BGC:
     """Parse a single BGC gbk file to BGC object.
 
-    Note:
-        If product info is not available in gbk file, the product of BGC
-            object (bgc.product_prediction) is set to empty list.
-
     Args:
         file(str): Path to BGC gbk file
 
@@ -143,7 +139,7 @@ def parse_bgc_genbank(file: str) -> BGC:
             f"Not found product prediction in antiSMASH Genbank file {file}")
 
     # init BGC
-    bgc = BGC(bgc_id=fname, product_prediction=product_prediction)
+    bgc = BGC(fname, *product_prediction)
     bgc.description = description
     bgc.antismash_id = antismash_id
     bgc.antismash_file = file
@@ -166,7 +162,7 @@ def _parse_antismash_genbank(record: SeqRecord.SeqRecord) -> dict:
             # space is not allowed in SMILES spec
             # biopython generates space when reading multi-line SMILES from .gbk
             if smiles is not None:
-                smiles = [i.replace(' ', '') for i in smiles]
+                smiles = tuple(i.replace(' ', '') for i in smiles)
             features["smiles"] = smiles
     return features
 

diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py
@@ -4,7 +4,6 @@
 from nplinker.logconfig import LogConfig
 from .aa_pred import predict_aa
 
-
 if TYPE_CHECKING:
     from ..strains import Strain
     from .gcf import GCF
@@ -14,9 +13,7 @@
 
 class BGC():
 
-    def __init__(self,
-                 bgc_id: str,
-                 product_prediction: list[str]):
+    def __init__(self, bgc_id: str, /, *product_prediction: str):
         """Class to model BGC (biosynthetic gene cluster) data.
 
         BGC data include both annotations and sequence data. This class is
@@ -32,18 +29,18 @@ def __init__(self,
 
         Args:
             bgc_id(str): BGC identifier, e.g. MIBiG accession, GenBank accession.
-            product_prediction(list[str]): list of BGC's (predicted) natural
-                products or product classes.
+            product_prediction(tuple[str]): BGC's (predicted) natural products
+                or product classes.
 
         Attributes:
             bgc_id(str): BGC identifier, e.g. MIBiG accession, GenBank accession.
-            product_prediction(list[str]): List of BGC's (predicted) natural
-                products or product classes.
+            product_prediction(tuple[str]): A tuple of (predicted) natural
+                products or product classes of the BGC.
                 For antiSMASH's GenBank data, the feature `region /product`
                 gives product information.
                 For MIBiG metadata, its biosynthetic class provides such info.
-            mibig_bgc_class(list[str] | None): List of MIBiG biosynthetic classes to
-                which the BGC belongs.
+            mibig_bgc_class(tuple[str] | None): A tuple of MIBiG biosynthetic
+                classes to which the BGC belongs.
                 Defaults to None.
                 MIBiG defines 6 major biosynthetic classes for natural products,
                 including "NRP", "Polyketide", "RiPP", "Terpene", "Saccharide"
@@ -52,7 +49,8 @@ def __init__(self,
                 More details see the publication: https://doi.org/10.1186/s40793-018-0318-y.
             description(str | None): Brief description of the BGC.
                 Defaults to None.
-            smiles(list[str] | None): SMILES formula of the BGC's product.
+            smiles(tuple[str] | None): A tuple of SMILES formulas of the BGC's
+                products.
                 Defaults to None.
             antismash_file(str | None): The path to the antiSMASH GenBank file.
                 Defaults to None.
@@ -72,9 +70,9 @@ def __init__(self,
         self.bgc_id = bgc_id
         self.product_prediction = product_prediction
 
-        self.mibig_bgc_class: list[str] | None = None
+        self.mibig_bgc_class: tuple[str] | None = None
         self.description: str | None = None
-        self.smiles: list[str] | None = None
+        self.smiles: tuple[str] | None = None
 
         # antismash related attributes
         self.antismash_file: str | None = None
@@ -93,11 +91,14 @@ def __str__(self):
             self.__class__.__name__, self.bgc_id, self.strain,
             self.antismash_id, self.antismash_region)
 
-    def __eq__(self, other):
-        return self.bgc_id == other.bgc_id
+    def __eq__(self, other) -> bool:
+        if isinstance(other, BGC):
+            return (self.bgc_id == other.bgc_id
+                    and self.product_prediction == other.product_prediction)
+        return NotImplemented
 
-    def __hash__(self):
-        return hash(self.bgc_id)
+    def __hash__(self) -> int:
+        return hash((self.bgc_id, self.product_prediction))
 
     def add_parent(self, gcf: GCF) -> None:
         """Add a parent GCF to the BGC.

diff --git a/src/nplinker/genomics/gcf.py b/src/nplinker/genomics/gcf.py
@@ -3,7 +3,6 @@
 from nplinker.logconfig import LogConfig
 from nplinker.strain_collection import StrainCollection
 
-
 if TYPE_CHECKING:
     from nplinker.strains import Strain
     from .bgc import BGC
@@ -13,7 +12,7 @@
 
 class GCF():
 
-    def __init__(self, gcf_id: str) -> None:
+    def __init__(self, gcf_id: str, /) -> None:
         """Class to model gene cluster family (GCF).
 
         GCF is a group of similar BGCs and generated by clustering BGCs with
@@ -46,10 +45,17 @@ def __str__(self):
     def __repr__(self):
         return str(self)
 
-    def __eq__(self, other):
-        return self.gcf_id == other.gcf_id
+    def __eq__(self, other) -> bool:
+        if isinstance(other, GCF):
+            return (self.gcf_id == other.gcf_id and self.bgcs == other.bgcs)
+        return NotImplemented
+
+    def __hash__(self) -> int:
+        """Hash function for GCF.
 
-    def __hash__(self):
+        Note that GCF class is a mutable container. We only hash the GCF id to
+        avoid the hash value changes when `self._bgcs` is updated.
+        """
         return hash(self.gcf_id)
 
     @property

diff --git a/src/nplinker/genomics/mibig/mibig_loader.py b/src/nplinker/genomics/mibig/mibig_loader.py
@@ -122,7 +122,7 @@ def parse_bgc_metadata_json(file: str) -> BGC:
         BGC: :class:`nplinker.genomics.BGC` object
     """
     metadata = MibigMetadata(file)
-    mibig_bgc = BGC(metadata.mibig_accession, metadata.biosyn_class)
+    mibig_bgc = BGC(metadata.mibig_accession, *metadata.biosyn_class)
     mibig_bgc.mibig_bgc_class = metadata.biosyn_class
     mibig_bgc.strain = Strain(metadata.mibig_accession)
     return mibig_bgc

diff --git a/src/nplinker/genomics/mibig/mibig_metadata.py b/src/nplinker/genomics/mibig/mibig_metadata.py
@@ -21,7 +21,7 @@ def __init__(self, file) -> None:
             self.metadata = json.load(f)
 
         self._mibig_accession: str
-        self._biosyn_class: list[str]
+        self._biosyn_class: tuple[str]
         self._parse_metadata()
 
     @property
@@ -30,7 +30,7 @@ def mibig_accession(self) -> str:
         return self._mibig_accession
 
     @property
-    def biosyn_class(self) -> list[str]:
+    def biosyn_class(self) -> tuple[str]:
         """Get the value of metadata item 'biosyn_class'.
 
         The 'biosyn_class' is biosynthetic class(es), namely the type of
@@ -50,8 +50,8 @@ def _parse_metadata(self) -> None:
         if 'general_params' in self.metadata:
             self._mibig_accession = self.metadata['general_params'][
                 'mibig_accession']
-            self._biosyn_class = self.metadata['general_params'][
-                'biosyn_class']
+            self._biosyn_class = tuple(self.metadata['general_params'][
+                'biosyn_class'])
         else:  # version≥2.0
             self._mibig_accession = self.metadata['cluster']['mibig_accession']
-            self._biosyn_class = self.metadata['cluster']['biosyn_class']
+            self._biosyn_class = tuple(self.metadata['cluster']['biosyn_class'])
diff --git a/src/nplinker/metabolomics/molecular_family.py b/src/nplinker/metabolomics/molecular_family.py
@@ -51,7 +51,17 @@ def __str__(self) -> str:
             self.family_id, len(self.spectra))
 
     def __eq__(self, other: Self) -> bool:
-        return bool(self.id == other.id)
+        if isinstance(other, MolecularFamily):
+            return (self.id == other.id
+                    and self.family_id == other.family_id
+                    and set(self.spectra) == set(other.spectra))
+        return NotImplemented
 
     def __hash__(self) -> int:
-        return hash(self.id)
+        """Hash function for MolecularFamily.
+
+        Note that MolecularFamily is a mutable container, so here we hash on
+        the id and family_id only to avoid the hash value changing when
+        `self.spectra` is updated.
+        """
+        return hash((self.id, self.family_id))
diff --git a/src/nplinker/metabolomics/spectrum.py b/src/nplinker/metabolomics/spectrum.py
@@ -121,11 +121,16 @@ def __str__(self):
     def __repr__(self):
         return str(self)
 
-    def __eq__(self, other):
-        return self.id == other.id
-
-    def __hash__(self):
-        return hash(self.id)
+    def __eq__(self, other) -> bool:
+        if isinstance(other, Spectrum):
+            return (self.id == other.id
+                    and self.spectrum_id == other.spectrum_id
+                    and self.precursor_mz == other.precursor_mz
+                    and self.parent_mz == other.parent_mz)
+        return NotImplemented
+
+    def __hash__(self) -> int:
+        return hash((self.id, self.spectrum_id, self.precursor_mz, self.parent_mz))
 
     def __cmp__(self, other):
         if self.parent_mz >= other.parent_mz:

diff --git a/src/nplinker/scoring/object_link.py b/src/nplinker/scoring/object_link.py
@@ -48,6 +48,8 @@ def __getitem__(self, name):
 
     def __hash__(self):
         # return the nplinker internal ID as hash value (for set/dict etc)
+        # TODO: hashable object should also have `__eq__` defined, see #136.
+        # this implementation is not ideal as the hash value is not unique
         return hash(self.source.id)
 
     def __str__(self):

diff --git a/src/nplinker/strain_collection.py b/src/nplinker/strain_collection.py
@@ -35,9 +35,11 @@ def __len__(self) -> int:
         return len(self._strains)
 
     def __eq__(self, other) -> bool:
-        return (self._strains == other._strains
-                and self._strain_dict_id == other._strain_dict_id
-                and self._strain_dict_index == other._strain_dict_index)
+        if isinstance(other, StrainCollection):
+            return (self._strains == other._strains
+                    and self._strain_dict_id == other._strain_dict_id
+                    and self._strain_dict_index == other._strain_dict_index)
+        return NotImplemented
 
     def __contains__(self, strain: str | Strain) -> bool:
         if isinstance(strain, str):

diff --git a/src/nplinker/strains.py b/src/nplinker/strains.py
@@ -26,10 +26,17 @@ def __str__(self) -> str:
         return f'Strain({self.id}) [{len(self._aliases)} aliases]'
 
     def __eq__(self, other) -> bool:
-        return (isinstance(other, Strain) and self.id == other.id
-                and self._aliases == other._aliases)
+        if isinstance(other, Strain):
+            return (self.id == other.id
+                    and self.aliases == other.aliases)
+        return NotImplemented
 
     def __hash__(self) -> int:
+        """Hash function for Strain.
+
+        Note that Strain is a mutable container, so here we hash on only the id
+        to avoid the hash value changes when `self._aliases` is updated.
+        """
         return hash(self.id)
 
     @property

diff --git a/tests/genomics/antismash/test_antismash_loader.py b/tests/genomics/antismash/test_antismash_loader.py
@@ -67,12 +67,12 @@ def test_parse_bgc_genbank():
     bgc = parse_bgc_genbank(gbk_file)
     assert isinstance(bgc, BGC)
     assert bgc.bgc_id == "NZ_AZWB01000005.region001"
-    assert bgc.product_prediction == ["NRPS", "lanthipeptide"]
+    assert bgc.product_prediction == ("NRPS", "lanthipeptide")
     assert "Salinispora pacifica CNT029 B170DRAFT_scaffold" in bgc.description
     assert bgc.antismash_id == "NZ_AZWB01000005"
     assert bgc.antismash_file == gbk_file
     assert bgc.antismash_region == "1"
-    assert bgc.smiles == ["NC([*])C(=O)NC([*])C(=O)NC(CO)C(=O)NC(Cc1ccccc1)C(=O)NCC(=O)O"]
+    assert bgc.smiles == ("NC([*])C(=O)NC([*])C(=O)NC(CO)C(=O)NC(Cc1ccccc1)C(=O)NCC(=O)O",)
 
 def test_parse_bgc_genbank_error():
     gbk_file = str(DATA_DIR / "fake_antismash.region001.gbk")

diff --git a/tests/genomics/test_bgc.py b/tests/genomics/test_bgc.py
@@ -5,9 +5,9 @@
 
 def test_default():
 
-    bgc = BGC("BGC0000001", ["Polyketide"])
+    bgc = BGC("BGC0000001", "Polyketide")
     assert bgc.bgc_id == "BGC0000001"
-    assert bgc.product_prediction == ["Polyketide"]
+    assert bgc.product_prediction == ("Polyketide",)
     assert bgc.is_mibig() is True
     assert bgc.parents == set()
     assert bgc.bigscape_classes == set()
@@ -19,7 +19,7 @@ def test_default():
 
 
 def test_add_and_detach_parent():
-    bgc = BGC("BGC0000001", ["Polyketide"])
+    bgc = BGC("BGC0000001", "Polyketide")
     gcf = GCF("1")
     bgc.add_parent(gcf)
     assert bgc.parents == {gcf}

diff --git a/tests/genomics/test_gcf.py b/tests/genomics/test_gcf.py
@@ -7,14 +7,14 @@
 
 @pytest.fixture()
 def bgc_with_strain():
-    bgc = BGC("S0001", ["NPR"])
+    bgc = BGC("S0001", "NPR")
     bgc.strain = Strain("strain001")
     yield bgc
 
 
 @pytest.fixture()
 def bgc_without_strain():
-    bgc = BGC("S002", ["NPR"])
+    bgc = BGC("S002", "NPR")
     yield bgc
 
 
@@ -59,8 +59,8 @@ def test_has_strain(bgc_with_strain):
     assert gcf.has_strain("strain002") is False
 
 def test_has_mibig_only():
-    mibig_bgc = BGC("BGC0000001", ["NPR"])
-    nonmibig_bgc = BGC("S0001", ["NPR"])
+    mibig_bgc = BGC("BGC0000001", "NPR")
+    nonmibig_bgc = BGC("S0001", "NPR")
     gcf = GCF("1")
     gcf.add_bgc(mibig_bgc)
     assert gcf.has_mibig_only() is True

diff --git a/tests/genomics/test_genomics.py b/tests/genomics/test_genomics.py
@@ -38,10 +38,10 @@ def bgc_genome_mapping() -> dict[str, str]:
 @pytest.fixture
 def bgc_list() -> list[BGC]:
     return [
-        BGC("SAMPLE0001", ["NPR"]),
-        BGC("SAMPLE0002", ["Alkaloid"]),
-        BGC("BGC0000001", ["Polyketide"]),
-        BGC("BGC0000002", ["Terpene"])
+        BGC("SAMPLE0001", "NPR"),
+        BGC("SAMPLE0002", "Alkaloid"),
+        BGC("BGC0000001", "Polyketide"),
+        BGC("BGC0000002", "Terpene")
     ]
 
 

diff --git a/tests/genomics/test_mibig_loader.py b/tests/genomics/test_mibig_loader.py
@@ -79,4 +79,4 @@ def test_parse_bgc_metadata_json():
     bgc = parse_bgc_metadata_json(str(json_file))
     assert isinstance(bgc, BGC)
     assert bgc.bgc_id == "BGC0000001"
-    assert bgc.mibig_bgc_class == ["Polyketide"]
+    assert bgc.mibig_bgc_class == ("Polyketide",)
diff --git a/tests/genomics/test_mibig_metadata.py b/tests/genomics/test_mibig_metadata.py
@@ -23,4 +23,4 @@ def test_mibig_accession(self, metadata):
         assert metadata.mibig_accession == "BGC0000001"
 
     def test_biosyn_class(self, metadata):
-        assert metadata.biosyn_class == ["Polyketide"]
+        assert metadata.biosyn_class == ("Polyketide",)