Skip to content

Commit

Permalink
Refactor __eq__ and __hash__ methods (#139)
Browse files Browse the repository at this point in the history
* change BGC attributes type from list to tuple

The following BGC attributes are updated:
- product_prediction
- mibig_bgc_class
- smiles

* use positional-only parameter in BGC and GCF

Parameters before "/" are positional-only parameters, see https://docs.python.org/3/glossary.html#term-parameter.

* update BGC's `__eq__` and `__hash__`

* update GCF's `__eq__` and `__hash__`

* Update gcf.py

* update Spectrum's `__eq__` and `__hash__`

* update MolecularFamily `__eq__` and `__hash__`

* Update molecular_family.py

* update Strain `__eq__` and `__hash__`

* update StrainCollection `__eq__`

* add TODO comments to ObjectLink
  • Loading branch information
CunliangGeng authored Apr 26, 2023
1 parent 3230a4c commit 63beb97
Show file tree
Hide file tree
Showing 16 changed files with 90 additions and 61 deletions.
8 changes: 2 additions & 6 deletions src/nplinker/genomics/antismash/antismash_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,10 +117,6 @@ def _parse_bgcs(bgc_files: dict[str, str]) -> dict[str, BGC]:
def parse_bgc_genbank(file: str) -> BGC:
"""Parse a single BGC gbk file to BGC object.
Note:
If product info is not available in gbk file, the product of BGC
object (bgc.product_prediction) is set to empty list.
Args:
file(str): Path to BGC gbk file
Expand All @@ -143,7 +139,7 @@ def parse_bgc_genbank(file: str) -> BGC:
f"Not found product prediction in antiSMASH Genbank file {file}")

# init BGC
bgc = BGC(bgc_id=fname, product_prediction=product_prediction)
bgc = BGC(fname, *product_prediction)
bgc.description = description
bgc.antismash_id = antismash_id
bgc.antismash_file = file
Expand All @@ -166,7 +162,7 @@ def _parse_antismash_genbank(record: SeqRecord.SeqRecord) -> dict:
# space is not allowed in SMILES spec
# biopython generates space when reading multi-line SMILES from .gbk
if smiles is not None:
smiles = [i.replace(' ', '') for i in smiles]
smiles = tuple(i.replace(' ', '') for i in smiles)
features["smiles"] = smiles
return features

Expand Down
35 changes: 18 additions & 17 deletions src/nplinker/genomics/bgc.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
from nplinker.logconfig import LogConfig
from .aa_pred import predict_aa


if TYPE_CHECKING:
from ..strains import Strain
from .gcf import GCF
Expand All @@ -14,9 +13,7 @@

class BGC():

def __init__(self,
bgc_id: str,
product_prediction: list[str]):
def __init__(self, bgc_id: str, /, *product_prediction: str):
"""Class to model BGC (biosynthetic gene cluster) data.
BGC data include both annotations and sequence data. This class is
Expand All @@ -32,18 +29,18 @@ def __init__(self,
Args:
bgc_id(str): BGC identifier, e.g. MIBiG accession, GenBank accession.
product_prediction(list[str]): list of BGC's (predicted) natural
products or product classes.
product_prediction(tuple[str]): BGC's (predicted) natural products
or product classes.
Attributes:
bgc_id(str): BGC identifier, e.g. MIBiG accession, GenBank accession.
product_prediction(list[str]): List of BGC's (predicted) natural
products or product classes.
product_prediction(tuple[str]): A tuple of (predicted) natural
products or product classes of the BGC.
For antiSMASH's GenBank data, the feature `region /product`
gives product information.
For MIBiG metadata, its biosynthetic class provides such info.
mibig_bgc_class(list[str] | None): List of MIBiG biosynthetic classes to
which the BGC belongs.
mibig_bgc_class(tuple[str] | None): A tuple of MIBiG biosynthetic
classes to which the BGC belongs.
Defaults to None.
MIBiG defines 6 major biosynthetic classes for natural products,
including "NRP", "Polyketide", "RiPP", "Terpene", "Saccharide"
Expand All @@ -52,7 +49,8 @@ def __init__(self,
More details see the publication: https://doi.org/10.1186/s40793-018-0318-y.
description(str | None): Brief description of the BGC.
Defaults to None.
smiles(list[str] | None): SMILES formula of the BGC's product.
smiles(tuple[str] | None): A tuple of SMILES formulas of the BGC's
products.
Defaults to None.
antismash_file(str | None): The path to the antiSMASH GenBank file.
Defaults to None.
Expand All @@ -72,9 +70,9 @@ def __init__(self,
self.bgc_id = bgc_id
self.product_prediction = product_prediction

self.mibig_bgc_class: list[str] | None = None
self.mibig_bgc_class: tuple[str] | None = None
self.description: str | None = None
self.smiles: list[str] | None = None
self.smiles: tuple[str] | None = None

# antismash related attributes
self.antismash_file: str | None = None
Expand All @@ -93,11 +91,14 @@ def __str__(self):
self.__class__.__name__, self.bgc_id, self.strain,
self.antismash_id, self.antismash_region)

def __eq__(self, other):
return self.bgc_id == other.bgc_id
def __eq__(self, other) -> bool:
if isinstance(other, BGC):
return (self.bgc_id == other.bgc_id
and self.product_prediction == other.product_prediction)
return NotImplemented

def __hash__(self):
return hash(self.bgc_id)
def __hash__(self) -> int:
return hash((self.bgc_id, self.product_prediction))

def add_parent(self, gcf: GCF) -> None:
"""Add a parent GCF to the BGC.
Expand Down
16 changes: 11 additions & 5 deletions src/nplinker/genomics/gcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
from nplinker.logconfig import LogConfig
from nplinker.strain_collection import StrainCollection


if TYPE_CHECKING:
from nplinker.strains import Strain
from .bgc import BGC
Expand All @@ -13,7 +12,7 @@

class GCF():

def __init__(self, gcf_id: str) -> None:
def __init__(self, gcf_id: str, /) -> None:
"""Class to model gene cluster family (GCF).
GCF is a group of similar BGCs and generated by clustering BGCs with
Expand Down Expand Up @@ -46,10 +45,17 @@ def __str__(self):
def __repr__(self):
return str(self)

def __eq__(self, other):
return self.gcf_id == other.gcf_id
def __eq__(self, other) -> bool:
if isinstance(other, GCF):
return (self.gcf_id == other.gcf_id and self.bgcs == other.bgcs)
return NotImplemented

def __hash__(self) -> int:
"""Hash function for GCF.
def __hash__(self):
Note that GCF class is a mutable container. We only hash the GCF id to
avoid the hash value changes when `self._bgcs` is updated.
"""
return hash(self.gcf_id)

@property
Expand Down
2 changes: 1 addition & 1 deletion src/nplinker/genomics/mibig/mibig_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ def parse_bgc_metadata_json(file: str) -> BGC:
BGC: :class:`nplinker.genomics.BGC` object
"""
metadata = MibigMetadata(file)
mibig_bgc = BGC(metadata.mibig_accession, metadata.biosyn_class)
mibig_bgc = BGC(metadata.mibig_accession, *metadata.biosyn_class)
mibig_bgc.mibig_bgc_class = metadata.biosyn_class
mibig_bgc.strain = Strain(metadata.mibig_accession)
return mibig_bgc
Expand Down
10 changes: 5 additions & 5 deletions src/nplinker/genomics/mibig/mibig_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def __init__(self, file) -> None:
self.metadata = json.load(f)

self._mibig_accession: str
self._biosyn_class: list[str]
self._biosyn_class: tuple[str]
self._parse_metadata()

@property
Expand All @@ -30,7 +30,7 @@ def mibig_accession(self) -> str:
return self._mibig_accession

@property
def biosyn_class(self) -> list[str]:
def biosyn_class(self) -> tuple[str]:
"""Get the value of metadata item 'biosyn_class'.
The 'biosyn_class' is biosynthetic class(es), namely the type of
Expand All @@ -50,8 +50,8 @@ def _parse_metadata(self) -> None:
if 'general_params' in self.metadata:
self._mibig_accession = self.metadata['general_params'][
'mibig_accession']
self._biosyn_class = self.metadata['general_params'][
'biosyn_class']
self._biosyn_class = tuple(self.metadata['general_params'][
'biosyn_class'])
else: # version≥2.0
self._mibig_accession = self.metadata['cluster']['mibig_accession']
self._biosyn_class = self.metadata['cluster']['biosyn_class']
self._biosyn_class = tuple(self.metadata['cluster']['biosyn_class'])
14 changes: 12 additions & 2 deletions src/nplinker/metabolomics/molecular_family.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,17 @@ def __str__(self) -> str:
self.family_id, len(self.spectra))

def __eq__(self, other: Self) -> bool:
return bool(self.id == other.id)
if isinstance(other, MolecularFamily):
return (self.id == other.id
and self.family_id == other.family_id
and set(self.spectra) == set(other.spectra))
return NotImplemented

def __hash__(self) -> int:
return hash(self.id)
"""Hash function for MolecularFamily.
Note that MolecularFamily is a mutable container, so here we hash on
the id and family_id only to avoid the hash value changing when
`self.spectra` is updated.
"""
return hash((self.id, self.family_id))
15 changes: 10 additions & 5 deletions src/nplinker/metabolomics/spectrum.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,11 +121,16 @@ def __str__(self):
def __repr__(self):
return str(self)

def __eq__(self, other):
return self.id == other.id

def __hash__(self):
return hash(self.id)
def __eq__(self, other) -> bool:
if isinstance(other, Spectrum):
return (self.id == other.id
and self.spectrum_id == other.spectrum_id
and self.precursor_mz == other.precursor_mz
and self.parent_mz == other.parent_mz)
return NotImplemented

def __hash__(self) -> int:
return hash((self.id, self.spectrum_id, self.precursor_mz, self.parent_mz))

def __cmp__(self, other):
if self.parent_mz >= other.parent_mz:
Expand Down
2 changes: 2 additions & 0 deletions src/nplinker/scoring/object_link.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@ def __getitem__(self, name):

def __hash__(self):
# return the nplinker internal ID as hash value (for set/dict etc)
# TODO: hashable object should also have `__eq__` defined, see #136.
# this implementation is not ideal as the hash value is not unique
return hash(self.source.id)

def __str__(self):
Expand Down
8 changes: 5 additions & 3 deletions src/nplinker/strain_collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,11 @@ def __len__(self) -> int:
return len(self._strains)

def __eq__(self, other) -> bool:
return (self._strains == other._strains
and self._strain_dict_id == other._strain_dict_id
and self._strain_dict_index == other._strain_dict_index)
if isinstance(other, StrainCollection):
return (self._strains == other._strains
and self._strain_dict_id == other._strain_dict_id
and self._strain_dict_index == other._strain_dict_index)
return NotImplemented

def __contains__(self, strain: str | Strain) -> bool:
if isinstance(strain, str):
Expand Down
11 changes: 9 additions & 2 deletions src/nplinker/strains.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,17 @@ def __str__(self) -> str:
return f'Strain({self.id}) [{len(self._aliases)} aliases]'

def __eq__(self, other) -> bool:
return (isinstance(other, Strain) and self.id == other.id
and self._aliases == other._aliases)
if isinstance(other, Strain):
return (self.id == other.id
and self.aliases == other.aliases)
return NotImplemented

def __hash__(self) -> int:
"""Hash function for Strain.
Note that Strain is a mutable container, so here we hash on only the id
to avoid the hash value changes when `self._aliases` is updated.
"""
return hash(self.id)

@property
Expand Down
4 changes: 2 additions & 2 deletions tests/genomics/antismash/test_antismash_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,12 +67,12 @@ def test_parse_bgc_genbank():
bgc = parse_bgc_genbank(gbk_file)
assert isinstance(bgc, BGC)
assert bgc.bgc_id == "NZ_AZWB01000005.region001"
assert bgc.product_prediction == ["NRPS", "lanthipeptide"]
assert bgc.product_prediction == ("NRPS", "lanthipeptide")
assert "Salinispora pacifica CNT029 B170DRAFT_scaffold" in bgc.description
assert bgc.antismash_id == "NZ_AZWB01000005"
assert bgc.antismash_file == gbk_file
assert bgc.antismash_region == "1"
assert bgc.smiles == ["NC([*])C(=O)NC([*])C(=O)NC(CO)C(=O)NC(Cc1ccccc1)C(=O)NCC(=O)O"]
assert bgc.smiles == ("NC([*])C(=O)NC([*])C(=O)NC(CO)C(=O)NC(Cc1ccccc1)C(=O)NCC(=O)O",)

def test_parse_bgc_genbank_error():
gbk_file = str(DATA_DIR / "fake_antismash.region001.gbk")
Expand Down
6 changes: 3 additions & 3 deletions tests/genomics/test_bgc.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@

def test_default():

bgc = BGC("BGC0000001", ["Polyketide"])
bgc = BGC("BGC0000001", "Polyketide")
assert bgc.bgc_id == "BGC0000001"
assert bgc.product_prediction == ["Polyketide"]
assert bgc.product_prediction == ("Polyketide",)
assert bgc.is_mibig() is True
assert bgc.parents == set()
assert bgc.bigscape_classes == set()
Expand All @@ -19,7 +19,7 @@ def test_default():


def test_add_and_detach_parent():
bgc = BGC("BGC0000001", ["Polyketide"])
bgc = BGC("BGC0000001", "Polyketide")
gcf = GCF("1")
bgc.add_parent(gcf)
assert bgc.parents == {gcf}
Expand Down
8 changes: 4 additions & 4 deletions tests/genomics/test_gcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,14 @@

@pytest.fixture()
def bgc_with_strain():
bgc = BGC("S0001", ["NPR"])
bgc = BGC("S0001", "NPR")
bgc.strain = Strain("strain001")
yield bgc


@pytest.fixture()
def bgc_without_strain():
bgc = BGC("S002", ["NPR"])
bgc = BGC("S002", "NPR")
yield bgc


Expand Down Expand Up @@ -59,8 +59,8 @@ def test_has_strain(bgc_with_strain):
assert gcf.has_strain("strain002") is False

def test_has_mibig_only():
mibig_bgc = BGC("BGC0000001", ["NPR"])
nonmibig_bgc = BGC("S0001", ["NPR"])
mibig_bgc = BGC("BGC0000001", "NPR")
nonmibig_bgc = BGC("S0001", "NPR")
gcf = GCF("1")
gcf.add_bgc(mibig_bgc)
assert gcf.has_mibig_only() is True
Expand Down
8 changes: 4 additions & 4 deletions tests/genomics/test_genomics.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,10 @@ def bgc_genome_mapping() -> dict[str, str]:
@pytest.fixture
def bgc_list() -> list[BGC]:
return [
BGC("SAMPLE0001", ["NPR"]),
BGC("SAMPLE0002", ["Alkaloid"]),
BGC("BGC0000001", ["Polyketide"]),
BGC("BGC0000002", ["Terpene"])
BGC("SAMPLE0001", "NPR"),
BGC("SAMPLE0002", "Alkaloid"),
BGC("BGC0000001", "Polyketide"),
BGC("BGC0000002", "Terpene")
]


Expand Down
2 changes: 1 addition & 1 deletion tests/genomics/test_mibig_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,4 +79,4 @@ def test_parse_bgc_metadata_json():
bgc = parse_bgc_metadata_json(str(json_file))
assert isinstance(bgc, BGC)
assert bgc.bgc_id == "BGC0000001"
assert bgc.mibig_bgc_class == ["Polyketide"]
assert bgc.mibig_bgc_class == ("Polyketide",)
2 changes: 1 addition & 1 deletion tests/genomics/test_mibig_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,4 @@ def test_mibig_accession(self, metadata):
assert metadata.mibig_accession == "BGC0000001"

def test_biosyn_class(self, metadata):
assert metadata.biosyn_class == ["Polyketide"]
assert metadata.biosyn_class == ("Polyketide",)

0 comments on commit 63beb97

Please sign in to comment.