diff --git a/src/nplinker/genomics/antismash/antismash_loader.py b/src/nplinker/genomics/antismash/antismash_loader.py index 1f9ade80..e4bdcf6b 100644 --- a/src/nplinker/genomics/antismash/antismash_loader.py +++ b/src/nplinker/genomics/antismash/antismash_loader.py @@ -117,10 +117,6 @@ def _parse_bgcs(bgc_files: dict[str, str]) -> dict[str, BGC]: def parse_bgc_genbank(file: str) -> BGC: """Parse a single BGC gbk file to BGC object. - Note: - If product info is not available in gbk file, the product of BGC - object (bgc.product_prediction) is set to empty list. - Args: file(str): Path to BGC gbk file @@ -143,7 +139,7 @@ def parse_bgc_genbank(file: str) -> BGC: f"Not found product prediction in antiSMASH Genbank file {file}") # init BGC - bgc = BGC(bgc_id=fname, product_prediction=product_prediction) + bgc = BGC(fname, *product_prediction) bgc.description = description bgc.antismash_id = antismash_id bgc.antismash_file = file @@ -166,7 +162,7 @@ def _parse_antismash_genbank(record: SeqRecord.SeqRecord) -> dict: # space is not allowed in SMILES spec # biopython generates space when reading multi-line SMILES from .gbk if smiles is not None: - smiles = [i.replace(' ', '') for i in smiles] + smiles = tuple(i.replace(' ', '') for i in smiles) features["smiles"] = smiles return features diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py index b1b1e2a9..7eb7921a 100644 --- a/src/nplinker/genomics/bgc.py +++ b/src/nplinker/genomics/bgc.py @@ -4,7 +4,6 @@ from nplinker.logconfig import LogConfig from .aa_pred import predict_aa - if TYPE_CHECKING: from ..strains import Strain from .gcf import GCF @@ -14,9 +13,7 @@ class BGC(): - def __init__(self, - bgc_id: str, - product_prediction: list[str]): + def __init__(self, bgc_id: str, /, *product_prediction: str): """Class to model BGC (biosynthetic gene cluster) data. BGC data include both annotations and sequence data. 
This class is @@ -32,18 +29,18 @@ def __init__(self, Args: bgc_id(str): BGC identifier, e.g. MIBiG accession, GenBank accession. - product_prediction(list[str]): list of BGC's (predicted) natural - products or product classes. + product_prediction(tuple[str]): BGC's (predicted) natural products + or product classes. Attributes: bgc_id(str): BGC identifier, e.g. MIBiG accession, GenBank accession. - product_prediction(list[str]): List of BGC's (predicted) natural - products or product classes. + product_prediction(tuple[str]): A tuple of (predicted) natural + products or product classes of the BGC. For antiSMASH's GenBank data, the feature `region /product` gives product information. For MIBiG metadata, its biosynthetic class provides such info. - mibig_bgc_class(list[str] | None): List of MIBiG biosynthetic classes to - which the BGC belongs. + mibig_bgc_class(tuple[str] | None): A tuple of MIBiG biosynthetic + classes to which the BGC belongs. Defaults to None. MIBiG defines 6 major biosynthetic classes for natural products, including "NRP", "Polyketide", "RiPP", "Terpene", "Saccharide" @@ -52,7 +49,8 @@ def __init__(self, More details see the publication: https://doi.org/10.1186/s40793-018-0318-y. description(str | None): Brief description of the BGC. Defaults to None. - smiles(list[str] | None): SMILES formula of the BGC's product. + smiles(tuple[str] | None): A tuple of SMILES formulas of the BGC's + products. Defaults to None. antismash_file(str | None): The path to the antiSMASH GenBank file. Defaults to None. 
@@ -72,9 +70,9 @@ def __init__(self, self.bgc_id = bgc_id self.product_prediction = product_prediction - self.mibig_bgc_class: list[str] | None = None + self.mibig_bgc_class: tuple[str] | None = None self.description: str | None = None - self.smiles: list[str] | None = None + self.smiles: tuple[str] | None = None # antismash related attributes self.antismash_file: str | None = None @@ -93,11 +91,14 @@ def __str__(self): self.__class__.__name__, self.bgc_id, self.strain, self.antismash_id, self.antismash_region) - def __eq__(self, other): - return self.bgc_id == other.bgc_id + def __eq__(self, other) -> bool: + if isinstance(other, BGC): + return (self.bgc_id == other.bgc_id + and self.product_prediction == other.product_prediction) + return NotImplemented - def __hash__(self): - return hash(self.bgc_id) + def __hash__(self) -> int: + return hash((self.bgc_id, self.product_prediction)) def add_parent(self, gcf: GCF) -> None: """Add a parent GCF to the BGC. diff --git a/src/nplinker/genomics/gcf.py b/src/nplinker/genomics/gcf.py index e0c68c13..5ea61486 100644 --- a/src/nplinker/genomics/gcf.py +++ b/src/nplinker/genomics/gcf.py @@ -3,7 +3,6 @@ from nplinker.logconfig import LogConfig from nplinker.strain_collection import StrainCollection - if TYPE_CHECKING: from nplinker.strains import Strain from .bgc import BGC @@ -13,7 +12,7 @@ class GCF(): - def __init__(self, gcf_id: str) -> None: + def __init__(self, gcf_id: str, /) -> None: """Class to model gene cluster family (GCF). GCF is a group of similar BGCs and generated by clustering BGCs with @@ -46,10 +45,17 @@ def __str__(self): def __repr__(self): return str(self) - def __eq__(self, other): - return self.gcf_id == other.gcf_id + def __eq__(self, other) -> bool: + if isinstance(other, GCF): + return (self.gcf_id == other.gcf_id and self.bgcs == other.bgcs) + return NotImplemented + + def __hash__(self) -> int: + """Hash function for GCF. - def __hash__(self): + Note that GCF class is a mutable container. 
We only hash the GCF id to + avoid the hash value changes when `self._bgcs` is updated. + """ return hash(self.gcf_id) @property diff --git a/src/nplinker/genomics/mibig/mibig_loader.py b/src/nplinker/genomics/mibig/mibig_loader.py index a4f665c1..0be9f889 100644 --- a/src/nplinker/genomics/mibig/mibig_loader.py +++ b/src/nplinker/genomics/mibig/mibig_loader.py @@ -122,7 +122,7 @@ def parse_bgc_metadata_json(file: str) -> BGC: BGC: :class:`nplinker.genomics.BGC` object """ metadata = MibigMetadata(file) - mibig_bgc = BGC(metadata.mibig_accession, metadata.biosyn_class) + mibig_bgc = BGC(metadata.mibig_accession, *metadata.biosyn_class) mibig_bgc.mibig_bgc_class = metadata.biosyn_class mibig_bgc.strain = Strain(metadata.mibig_accession) return mibig_bgc diff --git a/src/nplinker/genomics/mibig/mibig_metadata.py b/src/nplinker/genomics/mibig/mibig_metadata.py index c3921f37..94c85315 100644 --- a/src/nplinker/genomics/mibig/mibig_metadata.py +++ b/src/nplinker/genomics/mibig/mibig_metadata.py @@ -21,7 +21,7 @@ def __init__(self, file) -> None: self.metadata = json.load(f) self._mibig_accession: str - self._biosyn_class: list[str] + self._biosyn_class: tuple[str] self._parse_metadata() @property @@ -30,7 +30,7 @@ def mibig_accession(self) -> str: return self._mibig_accession @property - def biosyn_class(self) -> list[str]: + def biosyn_class(self) -> tuple[str]: """Get the value of metadata item 'biosyn_class'. 
The 'biosyn_class' is biosynthetic class(es), namely the type of @@ -50,8 +50,8 @@ def _parse_metadata(self) -> None: if 'general_params' in self.metadata: self._mibig_accession = self.metadata['general_params'][ 'mibig_accession'] - self._biosyn_class = self.metadata['general_params'][ - 'biosyn_class'] + self._biosyn_class = tuple(self.metadata['general_params'][ + 'biosyn_class']) else: # version≥2.0 self._mibig_accession = self.metadata['cluster']['mibig_accession'] - self._biosyn_class = self.metadata['cluster']['biosyn_class'] + self._biosyn_class = tuple(self.metadata['cluster']['biosyn_class']) diff --git a/src/nplinker/metabolomics/molecular_family.py b/src/nplinker/metabolomics/molecular_family.py index 1eec71fc..a55a4df9 100644 --- a/src/nplinker/metabolomics/molecular_family.py +++ b/src/nplinker/metabolomics/molecular_family.py @@ -51,7 +51,17 @@ def __str__(self) -> str: self.family_id, len(self.spectra)) def __eq__(self, other: Self) -> bool: - return bool(self.id == other.id) + if isinstance(other, MolecularFamily): + return (self.id == other.id + and self.family_id == other.family_id + and set(self.spectra) == set(other.spectra)) + return NotImplemented def __hash__(self) -> int: - return hash(self.id) + """Hash function for MolecularFamily. + + Note that MolecularFamily is a mutable container, so here we hash on + the id and family_id only to avoid the hash value changing when + `self.spectra` is updated. 
+ """ + return hash((self.id, self.family_id)) diff --git a/src/nplinker/metabolomics/spectrum.py b/src/nplinker/metabolomics/spectrum.py index c65824ce..78a8e8ce 100644 --- a/src/nplinker/metabolomics/spectrum.py +++ b/src/nplinker/metabolomics/spectrum.py @@ -121,11 +121,16 @@ def __str__(self): def __repr__(self): return str(self) - def __eq__(self, other): - return self.id == other.id - - def __hash__(self): - return hash(self.id) + def __eq__(self, other) -> bool: + if isinstance(other, Spectrum): + return (self.id == other.id + and self.spectrum_id == other.spectrum_id + and self.precursor_mz == other.precursor_mz + and self.parent_mz == other.parent_mz) + return NotImplemented + + def __hash__(self) -> int: + return hash((self.id, self.spectrum_id, self.precursor_mz, self.parent_mz)) def __cmp__(self, other): if self.parent_mz >= other.parent_mz: diff --git a/src/nplinker/scoring/object_link.py b/src/nplinker/scoring/object_link.py index eb380db5..a88e543d 100644 --- a/src/nplinker/scoring/object_link.py +++ b/src/nplinker/scoring/object_link.py @@ -48,6 +48,8 @@ def __getitem__(self, name): def __hash__(self): # return the nplinker internal ID as hash value (for set/dict etc) + # TODO: hashable object should also have `__eq__` defined, see #136. 
+ # this implementation is not ideal as the hash value is not unique return hash(self.source.id) def __str__(self): diff --git a/src/nplinker/strain_collection.py b/src/nplinker/strain_collection.py index 5169f02a..00603e96 100644 --- a/src/nplinker/strain_collection.py +++ b/src/nplinker/strain_collection.py @@ -35,9 +35,11 @@ def __len__(self) -> int: return len(self._strains) def __eq__(self, other) -> bool: - return (self._strains == other._strains - and self._strain_dict_id == other._strain_dict_id - and self._strain_dict_index == other._strain_dict_index) + if isinstance(other, StrainCollection): + return (self._strains == other._strains + and self._strain_dict_id == other._strain_dict_id + and self._strain_dict_index == other._strain_dict_index) + return NotImplemented def __contains__(self, strain: str | Strain) -> bool: if isinstance(strain, str): diff --git a/src/nplinker/strains.py b/src/nplinker/strains.py index de41a8b0..661c7268 100644 --- a/src/nplinker/strains.py +++ b/src/nplinker/strains.py @@ -26,10 +26,17 @@ def __str__(self) -> str: return f'Strain({self.id}) [{len(self._aliases)} aliases]' def __eq__(self, other) -> bool: - return (isinstance(other, Strain) and self.id == other.id - and self._aliases == other._aliases) + if isinstance(other, Strain): + return (self.id == other.id + and self.aliases == other.aliases) + return NotImplemented def __hash__(self) -> int: + """Hash function for Strain. + + Note that Strain is a mutable container, so here we hash on only the id + to avoid the hash value changes when `self._aliases` is updated. 
+ """ return hash(self.id) @property diff --git a/tests/genomics/antismash/test_antismash_loader.py b/tests/genomics/antismash/test_antismash_loader.py index f3f30820..027e2c7a 100644 --- a/tests/genomics/antismash/test_antismash_loader.py +++ b/tests/genomics/antismash/test_antismash_loader.py @@ -67,12 +67,12 @@ def test_parse_bgc_genbank(): bgc = parse_bgc_genbank(gbk_file) assert isinstance(bgc, BGC) assert bgc.bgc_id == "NZ_AZWB01000005.region001" - assert bgc.product_prediction == ["NRPS", "lanthipeptide"] + assert bgc.product_prediction == ("NRPS", "lanthipeptide") assert "Salinispora pacifica CNT029 B170DRAFT_scaffold" in bgc.description assert bgc.antismash_id == "NZ_AZWB01000005" assert bgc.antismash_file == gbk_file assert bgc.antismash_region == "1" - assert bgc.smiles == ["NC([*])C(=O)NC([*])C(=O)NC(CO)C(=O)NC(Cc1ccccc1)C(=O)NCC(=O)O"] + assert bgc.smiles == ("NC([*])C(=O)NC([*])C(=O)NC(CO)C(=O)NC(Cc1ccccc1)C(=O)NCC(=O)O",) def test_parse_bgc_genbank_error(): gbk_file = str(DATA_DIR / "fake_antismash.region001.gbk") diff --git a/tests/genomics/test_bgc.py b/tests/genomics/test_bgc.py index 7434ff8d..83d7da63 100644 --- a/tests/genomics/test_bgc.py +++ b/tests/genomics/test_bgc.py @@ -5,9 +5,9 @@ def test_default(): - bgc = BGC("BGC0000001", ["Polyketide"]) + bgc = BGC("BGC0000001", "Polyketide") assert bgc.bgc_id == "BGC0000001" - assert bgc.product_prediction == ["Polyketide"] + assert bgc.product_prediction == ("Polyketide",) assert bgc.is_mibig() is True assert bgc.parents == set() assert bgc.bigscape_classes == set() @@ -19,7 +19,7 @@ def test_default(): def test_add_and_detach_parent(): - bgc = BGC("BGC0000001", ["Polyketide"]) + bgc = BGC("BGC0000001", "Polyketide") gcf = GCF("1") bgc.add_parent(gcf) assert bgc.parents == {gcf} diff --git a/tests/genomics/test_gcf.py b/tests/genomics/test_gcf.py index 0ccb20ca..a40f4728 100644 --- a/tests/genomics/test_gcf.py +++ b/tests/genomics/test_gcf.py @@ -7,14 +7,14 @@ @pytest.fixture() def 
bgc_with_strain(): - bgc = BGC("S0001", ["NPR"]) + bgc = BGC("S0001", "NPR") bgc.strain = Strain("strain001") yield bgc @pytest.fixture() def bgc_without_strain(): - bgc = BGC("S002", ["NPR"]) + bgc = BGC("S002", "NPR") yield bgc @@ -59,8 +59,8 @@ def test_has_strain(bgc_with_strain): assert gcf.has_strain("strain002") is False def test_has_mibig_only(): - mibig_bgc = BGC("BGC0000001", ["NPR"]) - nonmibig_bgc = BGC("S0001", ["NPR"]) + mibig_bgc = BGC("BGC0000001", "NPR") + nonmibig_bgc = BGC("S0001", "NPR") gcf = GCF("1") gcf.add_bgc(mibig_bgc) assert gcf.has_mibig_only() is True diff --git a/tests/genomics/test_genomics.py b/tests/genomics/test_genomics.py index a48c9d6c..dfde9e6c 100644 --- a/tests/genomics/test_genomics.py +++ b/tests/genomics/test_genomics.py @@ -38,10 +38,10 @@ def bgc_genome_mapping() -> dict[str, str]: @pytest.fixture def bgc_list() -> list[BGC]: return [ - BGC("SAMPLE0001", ["NPR"]), - BGC("SAMPLE0002", ["Alkaloid"]), - BGC("BGC0000001", ["Polyketide"]), - BGC("BGC0000002", ["Terpene"]) + BGC("SAMPLE0001", "NPR"), + BGC("SAMPLE0002", "Alkaloid"), + BGC("BGC0000001", "Polyketide"), + BGC("BGC0000002", "Terpene") ] diff --git a/tests/genomics/test_mibig_loader.py b/tests/genomics/test_mibig_loader.py index bdeea051..c122c56e 100644 --- a/tests/genomics/test_mibig_loader.py +++ b/tests/genomics/test_mibig_loader.py @@ -79,4 +79,4 @@ def test_parse_bgc_metadata_json(): bgc = parse_bgc_metadata_json(str(json_file)) assert isinstance(bgc, BGC) assert bgc.bgc_id == "BGC0000001" - assert bgc.mibig_bgc_class == ["Polyketide"] + assert bgc.mibig_bgc_class == ("Polyketide",) diff --git a/tests/genomics/test_mibig_metadata.py b/tests/genomics/test_mibig_metadata.py index 0e2096e1..08607d0e 100644 --- a/tests/genomics/test_mibig_metadata.py +++ b/tests/genomics/test_mibig_metadata.py @@ -23,4 +23,4 @@ def test_mibig_accession(self, metadata): assert metadata.mibig_accession == "BGC0000001" def test_biosyn_class(self, metadata): - assert 
metadata.biosyn_class == ["Polyketide"] + assert metadata.biosyn_class == ("Polyketide",)