From 742d4b583dce7183a2f7ab2f7eb0db1ab863386b Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 5 Apr 2023 18:07:21 +0200 Subject: [PATCH 01/95] change BGC attributes type from list to tuple The following BGC attributes are updated: - product_prediction - mibig_bgc_class - smiles --- .../genomics/antismash/antismash_loader.py | 8 ++----- src/nplinker/genomics/bgc.py | 21 ++++++++++--------- src/nplinker/genomics/mibig/mibig_loader.py | 2 +- src/nplinker/genomics/mibig/mibig_metadata.py | 10 ++++----- .../antismash/test_antismash_loader.py | 4 ++-- tests/genomics/test_bgc.py | 6 +++--- tests/genomics/test_gcf.py | 8 +++---- tests/genomics/test_genomics.py | 8 +++---- tests/genomics/test_mibig_loader.py | 2 +- tests/genomics/test_mibig_metadata.py | 2 +- 10 files changed, 34 insertions(+), 37 deletions(-) diff --git a/src/nplinker/genomics/antismash/antismash_loader.py b/src/nplinker/genomics/antismash/antismash_loader.py index 1f9ade80..e4bdcf6b 100644 --- a/src/nplinker/genomics/antismash/antismash_loader.py +++ b/src/nplinker/genomics/antismash/antismash_loader.py @@ -117,10 +117,6 @@ def _parse_bgcs(bgc_files: dict[str, str]) -> dict[str, BGC]: def parse_bgc_genbank(file: str) -> BGC: """Parse a single BGC gbk file to BGC object. - Note: - If product info is not available in gbk file, the product of BGC - object (bgc.product_prediction) is set to empty list. 
- Args: file(str): Path to BGC gbk file @@ -143,7 +139,7 @@ def parse_bgc_genbank(file: str) -> BGC: f"Not found product prediction in antiSMASH Genbank file {file}") # init BGC - bgc = BGC(bgc_id=fname, product_prediction=product_prediction) + bgc = BGC(fname, *product_prediction) bgc.description = description bgc.antismash_id = antismash_id bgc.antismash_file = file @@ -166,7 +162,7 @@ def _parse_antismash_genbank(record: SeqRecord.SeqRecord) -> dict: # space is not allowed in SMILES spec # biopython generates space when reading multi-line SMILES from .gbk if smiles is not None: - smiles = [i.replace(' ', '') for i in smiles] + smiles = tuple(i.replace(' ', '') for i in smiles) features["smiles"] = smiles return features diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py index b1b1e2a9..16caee08 100644 --- a/src/nplinker/genomics/bgc.py +++ b/src/nplinker/genomics/bgc.py @@ -16,7 +16,7 @@ class BGC(): def __init__(self, bgc_id: str, - product_prediction: list[str]): + *product_prediction: str): """Class to model BGC (biosynthetic gene cluster) data. BGC data include both annotations and sequence data. This class is @@ -32,18 +32,18 @@ def __init__(self, Args: bgc_id(str): BGC identifier, e.g. MIBiG accession, GenBank accession. - product_prediction(list[str]): list of BGC's (predicted) natural - products or product classes. + product_prediction(tuple[str]): BGC's (predicted) natural products + or product classes. Attributes: bgc_id(str): BGC identifier, e.g. MIBiG accession, GenBank accession. - product_prediction(list[str]): List of BGC's (predicted) natural - products or product classes. + product_prediction(tuple[str]): A tuple of (predicted) natural + products or product classes of the BGC. For antiSMASH's GenBank data, the feature `region /product` gives product information. For MIBiG metadata, its biosynthetic class provides such info. - mibig_bgc_class(list[str] | None): List of MIBiG biosynthetic classes to - which the BGC belongs. 
+ mibig_bgc_class(tuple[str] | None): A tuple of MIBiG biosynthetic + classes to which the BGC belongs. Defaults to None. MIBiG defines 6 major biosynthetic classes for natural products, including "NRP", "Polyketide", "RiPP", "Terpene", "Saccharide" @@ -52,7 +52,8 @@ def __init__(self, More details see the publication: https://doi.org/10.1186/s40793-018-0318-y. description(str | None): Brief description of the BGC. Defaults to None. - smiles(list[str] | None): SMILES formula of the BGC's product. + smiles(tuple[str] | None): A tuple of SMILES formulas of the BGC's + products. Defaults to None. antismash_file(str | None): The path to the antiSMASH GenBank file. Defaults to None. @@ -72,9 +73,9 @@ def __init__(self, self.bgc_id = bgc_id self.product_prediction = product_prediction - self.mibig_bgc_class: list[str] | None = None + self.mibig_bgc_class: tuple[str] | None = None self.description: str | None = None - self.smiles: list[str] | None = None + self.smiles: tuple[str] | None = None # antismash related attributes self.antismash_file: str | None = None diff --git a/src/nplinker/genomics/mibig/mibig_loader.py b/src/nplinker/genomics/mibig/mibig_loader.py index a4f665c1..0be9f889 100644 --- a/src/nplinker/genomics/mibig/mibig_loader.py +++ b/src/nplinker/genomics/mibig/mibig_loader.py @@ -122,7 +122,7 @@ def parse_bgc_metadata_json(file: str) -> BGC: BGC: :class:`nplinker.genomics.BGC` object """ metadata = MibigMetadata(file) - mibig_bgc = BGC(metadata.mibig_accession, metadata.biosyn_class) + mibig_bgc = BGC(metadata.mibig_accession, *metadata.biosyn_class) mibig_bgc.mibig_bgc_class = metadata.biosyn_class mibig_bgc.strain = Strain(metadata.mibig_accession) return mibig_bgc diff --git a/src/nplinker/genomics/mibig/mibig_metadata.py b/src/nplinker/genomics/mibig/mibig_metadata.py index c3921f37..94c85315 100644 --- a/src/nplinker/genomics/mibig/mibig_metadata.py +++ b/src/nplinker/genomics/mibig/mibig_metadata.py @@ -21,7 +21,7 @@ def __init__(self, file) -> 
None: self.metadata = json.load(f) self._mibig_accession: str - self._biosyn_class: list[str] + self._biosyn_class: tuple[str] self._parse_metadata() @property @@ -30,7 +30,7 @@ def mibig_accession(self) -> str: return self._mibig_accession @property - def biosyn_class(self) -> list[str]: + def biosyn_class(self) -> tuple[str]: """Get the value of metadata item 'biosyn_class'. The 'biosyn_class' is biosynthetic class(es), namely the type of @@ -50,8 +50,8 @@ def _parse_metadata(self) -> None: if 'general_params' in self.metadata: self._mibig_accession = self.metadata['general_params'][ 'mibig_accession'] - self._biosyn_class = self.metadata['general_params'][ - 'biosyn_class'] + self._biosyn_class = tuple(self.metadata['general_params'][ + 'biosyn_class']) else: # versionā‰„2.0 self._mibig_accession = self.metadata['cluster']['mibig_accession'] - self._biosyn_class = self.metadata['cluster']['biosyn_class'] + self._biosyn_class = tuple(self.metadata['cluster']['biosyn_class']) diff --git a/tests/genomics/antismash/test_antismash_loader.py b/tests/genomics/antismash/test_antismash_loader.py index f3f30820..027e2c7a 100644 --- a/tests/genomics/antismash/test_antismash_loader.py +++ b/tests/genomics/antismash/test_antismash_loader.py @@ -67,12 +67,12 @@ def test_parse_bgc_genbank(): bgc = parse_bgc_genbank(gbk_file) assert isinstance(bgc, BGC) assert bgc.bgc_id == "NZ_AZWB01000005.region001" - assert bgc.product_prediction == ["NRPS", "lanthipeptide"] + assert bgc.product_prediction == ("NRPS", "lanthipeptide") assert "Salinispora pacifica CNT029 B170DRAFT_scaffold" in bgc.description assert bgc.antismash_id == "NZ_AZWB01000005" assert bgc.antismash_file == gbk_file assert bgc.antismash_region == "1" - assert bgc.smiles == ["NC([*])C(=O)NC([*])C(=O)NC(CO)C(=O)NC(Cc1ccccc1)C(=O)NCC(=O)O"] + assert bgc.smiles == ("NC([*])C(=O)NC([*])C(=O)NC(CO)C(=O)NC(Cc1ccccc1)C(=O)NCC(=O)O",) def test_parse_bgc_genbank_error(): gbk_file = str(DATA_DIR / "fake_antismash.region001.gbk") 
diff --git a/tests/genomics/test_bgc.py b/tests/genomics/test_bgc.py index 7434ff8d..83d7da63 100644 --- a/tests/genomics/test_bgc.py +++ b/tests/genomics/test_bgc.py @@ -5,9 +5,9 @@ def test_default(): - bgc = BGC("BGC0000001", ["Polyketide"]) + bgc = BGC("BGC0000001", "Polyketide") assert bgc.bgc_id == "BGC0000001" - assert bgc.product_prediction == ["Polyketide"] + assert bgc.product_prediction == ("Polyketide",) assert bgc.is_mibig() is True assert bgc.parents == set() assert bgc.bigscape_classes == set() @@ -19,7 +19,7 @@ def test_default(): def test_add_and_detach_parent(): - bgc = BGC("BGC0000001", ["Polyketide"]) + bgc = BGC("BGC0000001", "Polyketide") gcf = GCF("1") bgc.add_parent(gcf) assert bgc.parents == {gcf} diff --git a/tests/genomics/test_gcf.py b/tests/genomics/test_gcf.py index 0ccb20ca..a40f4728 100644 --- a/tests/genomics/test_gcf.py +++ b/tests/genomics/test_gcf.py @@ -7,14 +7,14 @@ @pytest.fixture() def bgc_with_strain(): - bgc = BGC("S0001", ["NPR"]) + bgc = BGC("S0001", "NPR") bgc.strain = Strain("strain001") yield bgc @pytest.fixture() def bgc_without_strain(): - bgc = BGC("S002", ["NPR"]) + bgc = BGC("S002", "NPR") yield bgc @@ -59,8 +59,8 @@ def test_has_strain(bgc_with_strain): assert gcf.has_strain("strain002") is False def test_has_mibig_only(): - mibig_bgc = BGC("BGC0000001", ["NPR"]) - nonmibig_bgc = BGC("S0001", ["NPR"]) + mibig_bgc = BGC("BGC0000001", "NPR") + nonmibig_bgc = BGC("S0001", "NPR") gcf = GCF("1") gcf.add_bgc(mibig_bgc) assert gcf.has_mibig_only() is True diff --git a/tests/genomics/test_genomics.py b/tests/genomics/test_genomics.py index a48c9d6c..dfde9e6c 100644 --- a/tests/genomics/test_genomics.py +++ b/tests/genomics/test_genomics.py @@ -38,10 +38,10 @@ def bgc_genome_mapping() -> dict[str, str]: @pytest.fixture def bgc_list() -> list[BGC]: return [ - BGC("SAMPLE0001", ["NPR"]), - BGC("SAMPLE0002", ["Alkaloid"]), - BGC("BGC0000001", ["Polyketide"]), - BGC("BGC0000002", ["Terpene"]) + BGC("SAMPLE0001", "NPR"), + 
BGC("SAMPLE0002", "Alkaloid"), + BGC("BGC0000001", "Polyketide"), + BGC("BGC0000002", "Terpene") ] diff --git a/tests/genomics/test_mibig_loader.py b/tests/genomics/test_mibig_loader.py index bdeea051..c122c56e 100644 --- a/tests/genomics/test_mibig_loader.py +++ b/tests/genomics/test_mibig_loader.py @@ -79,4 +79,4 @@ def test_parse_bgc_metadata_json(): bgc = parse_bgc_metadata_json(str(json_file)) assert isinstance(bgc, BGC) assert bgc.bgc_id == "BGC0000001" - assert bgc.mibig_bgc_class == ["Polyketide"] + assert bgc.mibig_bgc_class == ("Polyketide",) diff --git a/tests/genomics/test_mibig_metadata.py b/tests/genomics/test_mibig_metadata.py index 0e2096e1..08607d0e 100644 --- a/tests/genomics/test_mibig_metadata.py +++ b/tests/genomics/test_mibig_metadata.py @@ -23,4 +23,4 @@ def test_mibig_accession(self, metadata): assert metadata.mibig_accession == "BGC0000001" def test_biosyn_class(self, metadata): - assert metadata.biosyn_class == ["Polyketide"] + assert metadata.biosyn_class == ("Polyketide",) From f1cac59f4a071d042fbb372625c1bda2252173ae Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 5 Apr 2023 18:13:37 +0200 Subject: [PATCH 02/95] use positional-only parameter in BGC and GCF Parameters before "/" are positional-only parameters, see https://docs.python.org/3/glossary.html#term-parameter. 
--- src/nplinker/genomics/bgc.py | 5 +---- src/nplinker/genomics/gcf.py | 2 +- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py index 16caee08..1ec6be9f 100644 --- a/src/nplinker/genomics/bgc.py +++ b/src/nplinker/genomics/bgc.py @@ -4,7 +4,6 @@ from nplinker.logconfig import LogConfig from .aa_pred import predict_aa - if TYPE_CHECKING: from ..strains import Strain from .gcf import GCF @@ -14,9 +13,7 @@ class BGC(): - def __init__(self, - bgc_id: str, - *product_prediction: str): + def __init__(self, bgc_id: str, /, *product_prediction: str): """Class to model BGC (biosynthetic gene cluster) data. BGC data include both annotations and sequence data. This class is diff --git a/src/nplinker/genomics/gcf.py b/src/nplinker/genomics/gcf.py index e0c68c13..2a9e2b02 100644 --- a/src/nplinker/genomics/gcf.py +++ b/src/nplinker/genomics/gcf.py @@ -13,7 +13,7 @@ class GCF(): - def __init__(self, gcf_id: str) -> None: + def __init__(self, gcf_id: str, /) -> None: """Class to model gene cluster family (GCF). 
GCF is a group of similar BGCs and generated by clustering BGCs with From 18e77a6b08e7ac39955c779ed6314b40622e8154 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 5 Apr 2023 15:33:10 +0200 Subject: [PATCH 03/95] update BGC's `__eq__` and `__hash__` --- src/nplinker/genomics/bgc.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py index 1ec6be9f..7eb7921a 100644 --- a/src/nplinker/genomics/bgc.py +++ b/src/nplinker/genomics/bgc.py @@ -91,11 +91,14 @@ def __str__(self): self.__class__.__name__, self.bgc_id, self.strain, self.antismash_id, self.antismash_region) - def __eq__(self, other): - return self.bgc_id == other.bgc_id - - def __hash__(self): - return hash(self.bgc_id) + def __eq__(self, other) -> bool: + if isinstance(other, BGC): + return (self.bgc_id == other.bgc_id + and self.product_prediction == other.product_prediction) + return NotImplemented + + def __hash__(self) -> int: + return hash((self.bgc_id, self.product_prediction)) def add_parent(self, gcf: GCF) -> None: """Add a parent GCF to the BGC. 
From da2cf41b6528fd281288b47e30c413d48fe98f03 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 5 Apr 2023 16:10:20 +0200 Subject: [PATCH 04/95] update GCF's `__eq__` and `__hash__` --- src/nplinker/genomics/gcf.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/nplinker/genomics/gcf.py b/src/nplinker/genomics/gcf.py index 2a9e2b02..c3d9cc9c 100644 --- a/src/nplinker/genomics/gcf.py +++ b/src/nplinker/genomics/gcf.py @@ -3,7 +3,6 @@ from nplinker.logconfig import LogConfig from nplinker.strain_collection import StrainCollection - if TYPE_CHECKING: from nplinker.strains import Strain from .bgc import BGC @@ -46,10 +45,17 @@ def __str__(self): def __repr__(self): return str(self) - def __eq__(self, other): - return self.gcf_id == other.gcf_id + def __eq__(self, other) -> bool: + if isinstance(other, GCF): + return (self.gcf_id == other.gcf_id and self.bgcs == other.bgcs) + return NotImplemented + + def __hash__(self) -> int: + """Hash the GCF object. - def __hash__(self): + Note that GCF class is a mutable container. We only hash the GCF id to + avoid the hash value changes when `self._bgcs` is updated. + """ return hash(self.gcf_id) @property From 2111e5b83329650841644c3e3f7d3a9a4b6d1898 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 11 Apr 2023 10:21:24 +0200 Subject: [PATCH 05/95] Update gcf.py --- src/nplinker/genomics/gcf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nplinker/genomics/gcf.py b/src/nplinker/genomics/gcf.py index c3d9cc9c..5ea61486 100644 --- a/src/nplinker/genomics/gcf.py +++ b/src/nplinker/genomics/gcf.py @@ -51,7 +51,7 @@ def __eq__(self, other) -> bool: return NotImplemented def __hash__(self) -> int: - """Hash the GCF object. + """Hash function for GCF. Note that GCF class is a mutable container. We only hash the GCF id to avoid the hash value changes when `self._bgcs` is updated. 
From 07364767216fc70e06a50e4fec034d6463c168e9 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 11 Apr 2023 10:02:45 +0200 Subject: [PATCH 06/95] update Spectrum's `__eq__` and `__hash__` --- src/nplinker/metabolomics/spectrum.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/nplinker/metabolomics/spectrum.py b/src/nplinker/metabolomics/spectrum.py index c65824ce..78a8e8ce 100644 --- a/src/nplinker/metabolomics/spectrum.py +++ b/src/nplinker/metabolomics/spectrum.py @@ -121,11 +121,16 @@ def __str__(self): def __repr__(self): return str(self) - def __eq__(self, other): - return self.id == other.id - - def __hash__(self): - return hash(self.id) + def __eq__(self, other) -> bool: + if isinstance(other, Spectrum): + return (self.id == other.id + and self.spectrum_id == other.spectrum_id + and self.precursor_mz == other.precursor_mz + and self.parent_mz == other.parent_mz) + return NotImplemented + + def __hash__(self) -> int: + return hash((self.id, self.spectrum_id, self.precursor_mz, self.parent_mz)) def __cmp__(self, other): if self.parent_mz >= other.parent_mz: From fda12085f0da0ceaaa37dab9d822d2e5184072bc Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 11 Apr 2023 10:09:14 +0200 Subject: [PATCH 07/95] update MolecularFamily `__eq__` and `__hash__` --- src/nplinker/metabolomics/molecular_family.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/nplinker/metabolomics/molecular_family.py b/src/nplinker/metabolomics/molecular_family.py index 1eec71fc..d56f5b82 100644 --- a/src/nplinker/metabolomics/molecular_family.py +++ b/src/nplinker/metabolomics/molecular_family.py @@ -51,7 +51,16 @@ def __str__(self) -> str: self.family_id, len(self.spectra)) def __eq__(self, other: Self) -> bool: - return bool(self.id == other.id) + if isinstance(other, MolecularFamily): + return (self.id == other.id + and self.family_id == other.family_id + and set(self.spectra) == set(other.spectra)) + 
return NotImplemented def __hash__(self) -> int: - return hash(self.id) + """Hash function for MolecularFamily. + + Note that MolecularFamily is a mutable container, so here we hash on + the id and family_id only. + """ + return hash((self.id, self.family_id)) From 0a7cceeb965526a18ec1889bf46fd3b6d03b459f Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 11 Apr 2023 10:21:30 +0200 Subject: [PATCH 08/95] Update molecular_family.py --- src/nplinker/metabolomics/molecular_family.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/nplinker/metabolomics/molecular_family.py b/src/nplinker/metabolomics/molecular_family.py index d56f5b82..a55a4df9 100644 --- a/src/nplinker/metabolomics/molecular_family.py +++ b/src/nplinker/metabolomics/molecular_family.py @@ -61,6 +61,7 @@ def __hash__(self) -> int: """Hash function for MolecularFamily. Note that MolecularFamily is a mutable container, so here we hash on - the id and family_id only. + the id and family_id only to avoid the hash value changing when + `self.spectra` is updated. """ return hash((self.id, self.family_id)) From 6a373e1929b2e2ba3af3ba6b58eb68697bdbd806 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 11 Apr 2023 10:18:29 +0200 Subject: [PATCH 09/95] update Strain `__eq__` and `__hash__` --- src/nplinker/strains.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/nplinker/strains.py b/src/nplinker/strains.py index de41a8b0..661c7268 100644 --- a/src/nplinker/strains.py +++ b/src/nplinker/strains.py @@ -26,10 +26,17 @@ def __str__(self) -> str: return f'Strain({self.id}) [{len(self._aliases)} aliases]' def __eq__(self, other) -> bool: - return (isinstance(other, Strain) and self.id == other.id - and self._aliases == other._aliases) + if isinstance(other, Strain): + return (self.id == other.id + and self.aliases == other.aliases) + return NotImplemented def __hash__(self) -> int: + """Hash function for Strain. 
+ + Note that Strain is a mutable container, so here we hash on only the id + to avoid the hash value changes when `self._aliases` is updated. + """ return hash(self.id) @property From 33ed3c60156fa394c8f2ae077568df469dc3667f Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 11 Apr 2023 10:18:46 +0200 Subject: [PATCH 10/95] update StrainCollection `__eq__` --- src/nplinker/strain_collection.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/nplinker/strain_collection.py b/src/nplinker/strain_collection.py index 5169f02a..00603e96 100644 --- a/src/nplinker/strain_collection.py +++ b/src/nplinker/strain_collection.py @@ -35,9 +35,11 @@ def __len__(self) -> int: return len(self._strains) def __eq__(self, other) -> bool: - return (self._strains == other._strains - and self._strain_dict_id == other._strain_dict_id - and self._strain_dict_index == other._strain_dict_index) + if isinstance(other, StrainCollection): + return (self._strains == other._strains + and self._strain_dict_id == other._strain_dict_id + and self._strain_dict_index == other._strain_dict_index) + return NotImplemented def __contains__(self, strain: str | Strain) -> bool: if isinstance(strain, str): From f6bae56d3389a9071bb357a13570979e593dada7 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 11 Apr 2023 10:27:25 +0200 Subject: [PATCH 11/95] add TODO comments to ObjectLink --- src/nplinker/scoring/object_link.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/nplinker/scoring/object_link.py b/src/nplinker/scoring/object_link.py index eb380db5..a88e543d 100644 --- a/src/nplinker/scoring/object_link.py +++ b/src/nplinker/scoring/object_link.py @@ -48,6 +48,8 @@ def __getitem__(self, name): def __hash__(self): # return the nplinker internal ID as hash value (for set/dict etc) + # TODO: hashable object should also have `__eq__` defined, see #136. 
+ # this implementation is not ideal as the hash value is not unique return hash(self.source.id) def __str__(self): From f81accf6ec888f039160dce338bdaa905898ba19 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 11 Apr 2023 11:07:52 +0200 Subject: [PATCH 12/95] add parameter type check for `add_alias` --- src/nplinker/strains.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/nplinker/strains.py b/src/nplinker/strains.py index 661c7268..65887ced 100644 --- a/src/nplinker/strains.py +++ b/src/nplinker/strains.py @@ -54,6 +54,8 @@ def add_alias(self, alias: str) -> None: Args: alias(str): The alias to add to the list of known aliases. """ + if not isinstance(alias, str): + raise TypeError(f'Expected str, got {type(alias)}') if len(alias) == 0: logger.warning( 'Refusing to add an empty-string alias to strain {%s}', self) From 53dad8206060b2f7e56a2a1e41758d15b4227217 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 11 Apr 2023 11:50:58 +0200 Subject: [PATCH 13/95] add `__contains__` to Strain class --- src/nplinker/strains.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/nplinker/strains.py b/src/nplinker/strains.py index 65887ced..76caf37c 100644 --- a/src/nplinker/strains.py +++ b/src/nplinker/strains.py @@ -1,5 +1,6 @@ from __future__ import annotations from .logconfig import LogConfig +from typing import Iterator logger = LogConfig.getLogger(__name__) @@ -39,6 +40,11 @@ def __hash__(self) -> int: """ return hash(self.id) + def __contains__(self, alias: str) -> bool: + if not isinstance(alias, str): + raise TypeError(f'Expected str, got {type(alias)}') + return alias in self._aliases + @property def aliases(self) -> set[str]: """Get the set of known aliases. 
From 45d876537e8134ffcc4a9aca4d8dd1aec8dc8dff Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 11 Apr 2023 11:52:02 +0200 Subject: [PATCH 14/95] update `lookup` method of StrainCollection --- src/nplinker/strain_collection.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/nplinker/strain_collection.py b/src/nplinker/strain_collection.py index 00603e96..741090f2 100644 --- a/src/nplinker/strain_collection.py +++ b/src/nplinker/strain_collection.py @@ -9,7 +9,6 @@ from .utils import list_dirs from .utils import list_files - logger = LogConfig.getLogger(__name__) @@ -18,6 +17,7 @@ class StrainCollection(): def __init__(self): """A collection of Strain objects.""" self._strains: list[Strain] = [] + # dict of strain name (id and alias) to strain object self._strain_dict_id: dict[str, Strain] = {} self._strain_dict_index: dict[int, Strain] = {} @@ -122,9 +122,10 @@ def lookup(self, name: str) -> Strain: Raises: KeyError: If the strain name is not found. """ - if name not in self._strain_dict_id: - raise KeyError(f"Strain {name} not found in strain collection.") - return self._strain_dict_id[name] + for strain in self: + if name == strain.id or name in strain: + return strain + raise KeyError(f"Strain {name} not found in strain collection.") def add_from_file(self, file: str | PathLike) -> None: """Add strains from a strain mapping file. 
From 7cb75e0ad208ffa4affddfb764c69ae13f1c1025 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 11 Apr 2023 14:15:39 +0200 Subject: [PATCH 15/95] update `__contains__` in StrainCollection --- src/nplinker/strain_collection.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/src/nplinker/strain_collection.py b/src/nplinker/strain_collection.py index 741090f2..deaa47ed 100644 --- a/src/nplinker/strain_collection.py +++ b/src/nplinker/strain_collection.py @@ -9,6 +9,7 @@ from .utils import list_dirs from .utils import list_files + logger = LogConfig.getLogger(__name__) @@ -41,14 +42,20 @@ def __eq__(self, other) -> bool: and self._strain_dict_index == other._strain_dict_index) return NotImplemented - def __contains__(self, strain: str | Strain) -> bool: - if isinstance(strain, str): - value = strain in self._strain_dict_id - elif isinstance(strain, Strain): - value = strain.id in self._strain_dict_id + def __contains__(self, item: str | Strain) -> bool: + """Check if the strain collection contains the given strain. + + The given strain could be a Strain object, or a strain id or alias. 
+ """ + if isinstance(item, str): + for strain in self: + if item == strain.id or item in strain: + return True + elif isinstance(item, Strain): + return item in self._strains else: - raise TypeError(f"Expected Strain or str, got {type(strain)}") - return value + raise TypeError(f"Expected Strain or str, got {type(item)}") + return False def __iter__(self) -> Iterator[Strain]: return iter(self._strains) From f797ee50cb532be1d2ce9288e221d05c95967387 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 11 Apr 2023 14:19:40 +0200 Subject: [PATCH 16/95] remove from __eq__ --- src/nplinker/strain_collection.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/nplinker/strain_collection.py b/src/nplinker/strain_collection.py index deaa47ed..6ccb9830 100644 --- a/src/nplinker/strain_collection.py +++ b/src/nplinker/strain_collection.py @@ -38,7 +38,6 @@ def __len__(self) -> int: def __eq__(self, other) -> bool: if isinstance(other, StrainCollection): return (self._strains == other._strains - and self._strain_dict_id == other._strain_dict_id and self._strain_dict_index == other._strain_dict_index) return NotImplemented From 8a3fa1bafa59e24d08343255f6ac124543d0e445 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Thu, 13 Apr 2023 09:04:17 +0200 Subject: [PATCH 17/95] update `__eq__` logic for Strain --- src/nplinker/strains.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/nplinker/strains.py b/src/nplinker/strains.py index 76caf37c..54338027 100644 --- a/src/nplinker/strains.py +++ b/src/nplinker/strains.py @@ -28,8 +28,7 @@ def __str__(self) -> str: def __eq__(self, other) -> bool: if isinstance(other, Strain): - return (self.id == other.id - and self.aliases == other.aliases) + return self.id == other.id return NotImplemented def __hash__(self) -> int: From 88754005f724579c845fc596e197a3d24b5a336a Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Thu, 13 Apr 2023 09:11:31 +0200 Subject: [PATCH 18/95] rename `_strain_dict_id` to 
`_strain_dict_name` in StrainCollection --- src/nplinker/strain_collection.py | 16 ++++++++-------- tests/test_strain_collection.py | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/nplinker/strain_collection.py b/src/nplinker/strain_collection.py index 6ccb9830..35e63daf 100644 --- a/src/nplinker/strain_collection.py +++ b/src/nplinker/strain_collection.py @@ -19,7 +19,7 @@ def __init__(self): """A collection of Strain objects.""" self._strains: list[Strain] = [] # dict of strain name (id and alias) to strain object - self._strain_dict_id: dict[str, Strain] = {} + self._strain_dict_name: dict[str, Strain] = {} self._strain_dict_index: dict[int, Strain] = {} def __repr__(self) -> str: @@ -68,17 +68,17 @@ def add(self, strain: Strain) -> None: strain(Strain): The strain to add. """ # if the strain exists, merge the aliases - if strain.id in self._strain_dict_id: + if strain.id in self._strain_dict_name: existing: Strain = self.lookup(strain.id) for alias in strain.aliases: existing.add_alias(alias) - self._strain_dict_id[alias] = existing + self._strain_dict_name[alias] = existing else: self._strain_dict_index[len(self)] = strain self._strains.append(strain) - self._strain_dict_id[strain.id] = strain + self._strain_dict_name[strain.id] = strain for alias in strain.aliases: - self._strain_dict_id[alias] = strain + self._strain_dict_name[alias] = strain def remove(self, strain: Strain): """Remove a strain from the collection. @@ -86,12 +86,12 @@ def remove(self, strain: Strain): Args: strain(Strain): The strain to remove. 
""" - if strain.id in self._strain_dict_id: + if strain.id in self._strain_dict_name: self._strains.remove(strain) # remove from dict id - del self._strain_dict_id[strain.id] + del self._strain_dict_name[strain.id] for alias in strain.aliases: - del self._strain_dict_id[alias] + del self._strain_dict_name[alias] def filter(self, strain_set: set[Strain]): """ diff --git a/tests/test_strain_collection.py b/tests/test_strain_collection.py index 26ab2767..bd900a0f 100644 --- a/tests/test_strain_collection.py +++ b/tests/test_strain_collection.py @@ -54,7 +54,7 @@ def test_remove(collection: StrainCollection, strain: Strain): assert strain in collection collection.remove(strain) with pytest.raises(KeyError): - _ = collection._strain_dict_id[strain.id] + _ = collection._strain_dict_name[strain.id] assert strain not in collection # TODO: issue #90 # with pytest.raises(KeyError): From 2f7753978439a0b148f73a53ca60ec7e89b6d625 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Thu, 13 Apr 2023 11:48:51 +0200 Subject: [PATCH 19/95] add comments to `get_common_strains` --- src/nplinker/nplinker.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/nplinker/nplinker.py b/src/nplinker/nplinker.py index 1019abe5..006e57e2 100644 --- a/src/nplinker/nplinker.py +++ b/src/nplinker/nplinker.py @@ -493,13 +493,17 @@ def get_common_strains(self, objects_a, objects_b, filter_no_shared=True): # this is a dict with structure: # (Spectrum/MolecularFamily, GCF) => list of strain indices - common_strains = self._datalinks.common_strains( + common_strains_index_dict = self._datalinks.common_strains( objects_a, objects_b, filter_no_shared) + common_strains = {} # replace the lists of strain indices with actual strain objects - for objpair in common_strains.keys(): - common_strains[objpair] = [ - self._strains.lookup_index(x) for x in common_strains[objpair] + # TODO: bug here, the index value of common_strains_index_dict is + # not the same as the index value of 
self._strains + # Solution: lookup with strain.id instead of index + for key in common_strains_index_dict: + common_strains[key] = [ + self._strains.lookup_index(x) for x in common_strains_index_dict[key] ] return common_strains From dae421d7af40ac56313b6b6e3bfe4c1755a4c490 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Thu, 13 Apr 2023 11:55:23 +0200 Subject: [PATCH 20/95] add comments and rename variables for DataLinks --- src/nplinker/scoring/linking/data_linking.py | 48 ++++++++++++-------- 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/src/nplinker/scoring/linking/data_linking.py b/src/nplinker/scoring/linking/data_linking.py index a6f181f0..bec8b6d2 100644 --- a/src/nplinker/scoring/linking/data_linking.py +++ b/src/nplinker/scoring/linking/data_linking.py @@ -28,9 +28,8 @@ # import packages import numpy as np import pandas as pd - -from nplinker.metabolomics.molecular_family import MolecularFamily from nplinker.genomics.gcf import GCF +from nplinker.metabolomics.molecular_family import MolecularFamily from nplinker.metabolomics.spectrum import Spectrum from .data_linking_functions import calc_correlation_matrix @@ -55,6 +54,8 @@ def __init__(self): # matrices that store co-occurences with respect to strains # values = 1 where gcf/spec/fam occur in strain # values = 0 where gcf/spec/fam do not occur in strain + + # 2D array [gcf: int, strain: int] self.M_gcf_strain = [] self.M_spec_strain = [] self.M_fam_strain = [] @@ -209,10 +210,10 @@ def matrix_strain_gcf(self, gcf_list, strain_list): # Collect co-ocurences in M_spec_strain matrix M_gcf_strain = np.zeros((len(gcf_list), len(strain_list))) - for i, strain in enumerate(strain_list): - for m, gcf in enumerate(gcf_list): + for i, gcf in enumerate(gcf_list): + for j, strain in enumerate(strain_list): if gcf.has_strain(strain): - M_gcf_strain[m, i] = 1 + M_gcf_strain[i, j] = 1 self.M_gcf_strain = M_gcf_strain # extend mapping tables: @@ -304,7 +305,7 @@ def data_family_mapping(self, 
include_singletons=False): self.mapping_fam["no of members"] = num_members return self.family_members - def common_strains(self, objects_a, objects_b, filter_no_shared=False): + def common_strains(self, metabolome_objects, gcfs, filter_no_shared=False) -> dict: """ Obtain the set of common strains between all pairs from the lists objects_a and objects_b. @@ -320,11 +321,11 @@ def common_strains(self, objects_a, objects_b, filter_no_shared=False): # TODO make this work for BGCs too? - is_list_a = isinstance(objects_a, list) - is_list_b = isinstance(objects_b, list) + is_list_a = isinstance(metabolome_objects, list) + is_list_b = isinstance(gcfs, list) - type_a = type(objects_a[0]) if is_list_a else type(objects_a) - type_b = type(objects_b[0]) if is_list_b else type(objects_b) + type_a = type(metabolome_objects[0]) if is_list_a else type(metabolome_objects) + type_b = type(gcfs[0]) if is_list_b else type(gcfs) if type_a == type_b: raise Exception('Must supply objects with different types!') @@ -333,33 +334,40 @@ def common_strains(self, objects_a, objects_b, filter_no_shared=False): if type_a == GCF: type_a, type_b = type_b, type_a is_list_a, is_list_b = is_list_b, is_list_a - objects_a, objects_b = objects_b, objects_a + metabolome_objects, gcfs = gcfs, metabolome_objects if not is_list_a: - objects_a = [objects_a] + metabolome_objects = [metabolome_objects] if not is_list_b: - objects_b = [objects_b] + gcfs = [gcfs] # retrieve object IDs # TODO: issue #103 stop using gcf.id, but note that the ids_b should be # a list of int - ids_b = [gcf.id for gcf in objects_b] + gcfs_id_list = [gcf.id for gcf in gcfs] # these might be MolFams or Spectra, either way they'll have a .id attribute - ids_a = [obj.id for obj in objects_a] + ids_a = [obj.id for obj in metabolome_objects] data_a = self.M_spec_strain if type_a == Spectrum else self.M_fam_strain - data_b = self.M_gcf_strain results = {} - for a, obj_a in enumerate(objects_a): - for b, obj_b in enumerate(objects_b): + for 
i, obj_meta in enumerate(metabolome_objects): + for j, obj_gcf in enumerate(gcfs): # just AND both arrays and extract the indices with positive results + + # self.M_gcf_strain is a 2D np.array [index of gcf_list, index of strain_list] + # TODO: bug here. self.M_gcf_strain use the enumerate count of the gcf_list as index + # it's wrong to assume that enumerate count is same as the gcf.id. + # self.M_gcf_strain should use dataframe + # TODO: Bug: the result is 1D array of the enumerated count of strain_list + # for the common strains. Same here, the enumerate count is not + # the same as the strain.id result = np.where( - np.logical_and(data_a[ids_a[a]], data_b[ids_b[b]]))[0] + np.logical_and(data_a[ids_a[i]], self.M_gcf_strain[gcfs_id_list[j]]))[0] # if we want to exclude results with no shared strains if (filter_no_shared and len(result) > 0) or not filter_no_shared: - results[(obj_a, obj_b)] = result + results[(obj_meta, obj_gcf)] = result return results From 504862ea44ed7e81f5e3496915967ee002320077 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Fri, 14 Apr 2023 11:14:25 +0200 Subject: [PATCH 21/95] add comment about `met_only` parameter --- src/nplinker/nplinker.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/nplinker/nplinker.py b/src/nplinker/nplinker.py index 006e57e2..8c78f47f 100644 --- a/src/nplinker/nplinker.py +++ b/src/nplinker/nplinker.py @@ -260,6 +260,7 @@ def load_data(self, new_bigscape_cutoff=None, met_only=False): Returns: bool: True if successful, False otherwise """ + # TODO: the met_only is useless, remove it. 
NPlinker will stop working if met_only=True # typical case where load_data is being called with no params if new_bigscape_cutoff is None: logger.debug( From 8b160e66b7658000fc028ffd929f974f138caa42 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 18 Apr 2023 14:39:43 +0200 Subject: [PATCH 22/95] add todo comments to LinkFinder --- src/nplinker/scoring/linking/link_finder.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/nplinker/scoring/linking/link_finder.py b/src/nplinker/scoring/linking/link_finder.py index 2e58acd9..cf3563a5 100644 --- a/src/nplinker/scoring/linking/link_finder.py +++ b/src/nplinker/scoring/linking/link_finder.py @@ -453,6 +453,7 @@ def get_links(self, if main_score == 'likescore': likescores = [ + # TODO CG: use dataframe instead of numpy array self.likescores_spec_gcf[:, input_ids], self.likescores_fam_gcf[:, input_ids] ] From 8e9557c6c30ef78848373cce78ab1617a4c4a3cf Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 18 Apr 2023 15:25:59 +0200 Subject: [PATCH 23/95] add comments to GNPSSpectrumLoader to figure out how `spectrum_id` is set --- src/nplinker/metabolomics/gnps/gnps_spectrum_loader.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/nplinker/metabolomics/gnps/gnps_spectrum_loader.py b/src/nplinker/metabolomics/gnps/gnps_spectrum_loader.py index 59acd06f..91a83171 100644 --- a/src/nplinker/metabolomics/gnps/gnps_spectrum_loader.py +++ b/src/nplinker/metabolomics/gnps/gnps_spectrum_loader.py @@ -18,7 +18,7 @@ def __init__(self, file: str | PathLike): ms1, ms2, metadata = LoadMGF(name_field='scans').load_spectra([str(file)]) logger.info('%d molecules parsed from MGF file', len(ms1)) self._spectra = _mols_to_spectra(ms2, metadata) - + def spectra(self) -> list[Spectrum]: """Get the spectra loaded from the file. @@ -26,7 +26,7 @@ def spectra(self) -> list[Spectrum]: list[Spectrum]: the loaded spectra as a list of `Spectrum` objects. 
""" return self._spectra - + def _mols_to_spectra(ms2: list, metadata: dict[str, dict[str, str]]) -> list[Spectrum]: """Function to convert ms2 object and metadata to `Spectrum` objects. @@ -39,13 +39,15 @@ def _mols_to_spectra(ms2: list, metadata: dict[str, dict[str, str]]) -> list[Spe list[Spectrum]: List of mass spectra obtained from ms2 and metadata. """ ms2_dict = {} + # an example of m: + # (118.487999, 0.0, 18.753, , 'spectra.mgf', 0.0) for m in ms2: - if not m[3] in ms2_dict: + if not m[3] in ms2_dict: # m[3] is `nplinker.parsers.mgf.MS1` object ms2_dict[m[3]] = [] ms2_dict[m[3]].append((m[0], m[2])) spectra = [] - for i, m in enumerate(ms2_dict.keys()): + for i, m in enumerate(ms2_dict.keys()): # m is `nplinker.parsers.mgf.MS1` object new_spectrum = Spectrum(i, ms2_dict[m], int(m.name), metadata[m.name]['precursormass'], metadata[m.name]['parentmass']) From 9257e0b561fedd0710ee8b7d070d3fc9dea22d3f Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 18 Apr 2023 15:35:08 +0200 Subject: [PATCH 24/95] change Spectrum.spectrum_id from type int to str --- src/nplinker/annotations.py | 17 ++++++++--------- src/nplinker/class_info/chem_classes.py | 4 ++-- .../metabolomics/gnps/gnps_annotation_loader.py | 16 ++++++++-------- .../gnps/gnps_molecular_family_loader.py | 16 ++++++++-------- .../metabolomics/gnps/gnps_spectrum_loader.py | 2 +- src/nplinker/metabolomics/metabolomics.py | 14 +++++++------- src/nplinker/metabolomics/molecular_family.py | 4 ++-- src/nplinker/metabolomics/spectrum.py | 6 +++--- src/nplinker/scoring/np_class_scoring.py | 2 +- src/nplinker/scoring/rosetta/spec_lib.py | 2 +- tests/conftest.py | 2 +- 11 files changed, 42 insertions(+), 43 deletions(-) diff --git a/src/nplinker/annotations.py b/src/nplinker/annotations.py index 970576a7..6449b6d2 100644 --- a/src/nplinker/annotations.py +++ b/src/nplinker/annotations.py @@ -14,10 +14,9 @@ import csv import os - from deprecated import deprecated - -from nplinker.metabolomics.spectrum import 
Spectrum, GNPS_KEY +from nplinker.metabolomics.spectrum import GNPS_KEY +from nplinker.metabolomics.spectrum import Spectrum from .logconfig import LogConfig @@ -61,14 +60,14 @@ def create_gnps_annotation(spec: Spectrum, gnps_anno: dict): @deprecated(version="1.3.3", reason="Use GNPSAnnotationLoader class instead.") -def load_annotations(root: str | os.PathLike, config: str | os.PathLike, spectra: list[Spectrum], spec_dict: dict[int, Spectrum]) -> list[Spectrum]: +def load_annotations(root: str | os.PathLike, config: str | os.PathLike, spectra: list[Spectrum], spec_dict: dict[str, Spectrum]) -> list[Spectrum]: """Load the annotations from the GNPS annotation file present in root to the spectra. Args: root(str | os.PathLike): Path to the downloaded and extracted GNPS results. config(str | os.PathLike): Path to config file for custom file locations. spectra(list[Spectrum]): List of spectra to annotate. - spec_dict(dict[int, Spectrum]): Dictionary mapping to spectra passed in `spectra` variable. + spec_dict(dict[str, Spectrum]): Dictionary mapping to spectra passed in `spectra` variable. Raises: Exception: Raises exception if custom annotation config file has invalid content. @@ -76,7 +75,7 @@ def load_annotations(root: str | os.PathLike, config: str | os.PathLike, spectra Returns: list[Spectrum]: List of annotated spectra. 
""" - + if not os.path.exists(root): logger.debug(f'Annotation directory not found ({root})') return spectra @@ -89,7 +88,7 @@ def load_annotations(root: str | os.PathLike, config: str | os.PathLike, spectra logger.debug('Found {} annotations .tsv files in {}'.format( len(annotation_files), root)) - + for af in annotation_files: with open(af) as f: rdr = csv.reader(f, delimiter='\t') @@ -105,7 +104,7 @@ def load_annotations(root: str | os.PathLike, config: str | os.PathLike, spectra # each line should be a different spec ID here for line in rdr: # read the scan ID column and get the corresponding Spectrum object - scan_id = int(line[scans_index]) + scan_id = line[scans_index] if scan_id not in spec_dict: logger.warning( 'Unknown spectrum ID found in GNPS annotation file (ID={})' @@ -147,7 +146,7 @@ def load_annotations(root: str | os.PathLike, config: str | os.PathLike, spectra # note that might have multiple lines for the same spec ID! spec_annotations = {} for line in rdr: - scan_id = int(line[spec_id_index]) + scan_id = line[spec_id_index] if scan_id not in spec_dict: logger.warning( 'Unknown spectrum ID found in annotation file "{}", ID is "{}"' diff --git a/src/nplinker/class_info/chem_classes.py b/src/nplinker/class_info/chem_classes.py index 8aeb0190..00a9cdb4 100644 --- a/src/nplinker/class_info/chem_classes.py +++ b/src/nplinker/class_info/chem_classes.py @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +from collections import Counter import glob import os -from collections import Counter from canopus import Canopus from canopus.classifications_to_gnps import analyse_canopus from ..logconfig import LogConfig @@ -409,7 +409,7 @@ class prediction for a level. 
When no class is present, instead of Tuple it will classes_per_spectra = [] for spec in spectra: - spec_classes = self.spectra_classes.get(str(spec.spectrum_id)) + spec_classes = self.spectra_classes.get(spec.spectrum_id) if spec_classes: # account for spectra without prediction classes_per_spectra.append(spec_classes) diff --git a/src/nplinker/metabolomics/gnps/gnps_annotation_loader.py b/src/nplinker/metabolomics/gnps/gnps_annotation_loader.py index 13cb5028..aade4de2 100644 --- a/src/nplinker/metabolomics/gnps/gnps_annotation_loader.py +++ b/src/nplinker/metabolomics/gnps/gnps_annotation_loader.py @@ -2,9 +2,9 @@ from os import PathLike from pathlib import Path from typing import Any - from nplinker.metabolomics.abc import AnnotationLoaderBase + GNPS_URL_FORMAT = 'https://metabolomics-usi.ucsd.edu/{}/?usi=mzspec:GNPSLIBRARY:{}' class GNPSAnnotationLoader(AnnotationLoaderBase): @@ -15,28 +15,28 @@ def __init__(self, file: str | PathLike): file(str | PathLike): The GNPS annotation file. """ self._file = Path(file) - self._annotations : dict[int, dict] = dict() + self._annotations : dict[str, dict] = {} with open(self._file, mode='rt', encoding='UTF-8') as f: header = f.readline().split('\t') dict_reader = csv.DictReader(f, header, delimiter='\t') for row in dict_reader: - scan_id = int(row.pop('#Scan#')) + scan_id = row.pop('#Scan#') self._annotations[scan_id] = row - + # also insert useful URLs for t in ['png', 'json', 'svg', 'spectrum']: self._annotations[scan_id][f'{t}_url'] = GNPS_URL_FORMAT.format(t, row['SpectrumID']) - - def get_annotations(self) -> dict[int, dict]: + + def get_annotations(self) -> dict[str, dict]: """Get annotations. Returns: - dict[int, dict]: Spectra indices are keys and values are the annotations for this spectrum. + dict[str, dict]: Spectra indices are keys and values are the annotations for this spectrum. 
Examples: >>> print(loader.annotations()[100]) """ - return self._annotations \ No newline at end of file + return self._annotations diff --git a/src/nplinker/metabolomics/gnps/gnps_molecular_family_loader.py b/src/nplinker/metabolomics/gnps/gnps_molecular_family_loader.py index 47783e15..729b34b8 100644 --- a/src/nplinker/metabolomics/gnps/gnps_molecular_family_loader.py +++ b/src/nplinker/metabolomics/gnps/gnps_molecular_family_loader.py @@ -17,7 +17,7 @@ def __init__(self, file: str | PathLike): file(str | PathLike): str or PathLike object pointing towards the GNPS molecular families file to load. """ self._families: list[MolecularFamily | SingletonFamily] = [] - + for family_id, spectra_ids in _load_molecular_families(file).items(): if family_id == -1: for spectrum_id in spectra_ids: @@ -28,32 +28,32 @@ def __init__(self, file: str | PathLike): family = MolecularFamily(family_id) family.spectra_ids = spectra_ids self._families.append(family) - + def families(self) -> list[MolecularFamily]: return self._families -def _load_molecular_families(file: str | PathLike) -> dict[int, set[int]]: +def _load_molecular_families(file: str | PathLike) -> dict[int, set[str]]: """Load ids of molecular families and corresponding spectra from GNPS output file. Args: file(str | PathLike): path to the GNPS file to load molecular families. Returns: - dict[int, set[int]]: Mapping from molecular family/cluster id to the spectra ids. + dict[int, set[str]]: Mapping from molecular family/cluster id to the spectra ids. 
""" logger.debug('loading edges file: %s', file) families: dict = {} - + with open(file, mode='rt', encoding='utf-8') as f: reader = csv.reader(f, delimiter='\t') headers = next(reader) cid1_index, cid2_index, fam_index = _sniff_column_indices(file, headers) for line in reader: - spec1_id = int(line[cid1_index]) - spec2_id = int(line[cid2_index]) + spec1_id = line[cid1_index] + spec2_id = line[cid2_index] family_id = int(line[fam_index]) if families.get(family_id) is None: @@ -84,5 +84,5 @@ def _sniff_column_indices(file: str | PathLike, headers: list[str]) -> tuple[int except ValueError as ve: message = f'Unknown or missing column(s) in edges file: {file}' raise Exception(message) from ve - + return cid1_index,cid2_index,fam_index diff --git a/src/nplinker/metabolomics/gnps/gnps_spectrum_loader.py b/src/nplinker/metabolomics/gnps/gnps_spectrum_loader.py index 91a83171..0a1431cf 100644 --- a/src/nplinker/metabolomics/gnps/gnps_spectrum_loader.py +++ b/src/nplinker/metabolomics/gnps/gnps_spectrum_loader.py @@ -48,7 +48,7 @@ def _mols_to_spectra(ms2: list, metadata: dict[str, dict[str, str]]) -> list[Spe spectra = [] for i, m in enumerate(ms2_dict.keys()): # m is `nplinker.parsers.mgf.MS1` object - new_spectrum = Spectrum(i, ms2_dict[m], int(m.name), + new_spectrum = Spectrum(i, ms2_dict[m], m.name, metadata[m.name]['precursormass'], metadata[m.name]['parentmass']) new_spectrum.metadata = metadata[m.name] diff --git a/src/nplinker/metabolomics/metabolomics.py b/src/nplinker/metabolomics/metabolomics.py index 1c4cde5d..c9b2c929 100644 --- a/src/nplinker/metabolomics/metabolomics.py +++ b/src/nplinker/metabolomics/metabolomics.py @@ -35,7 +35,7 @@ def _mols_to_spectra(ms2, metadata): spectra = [] for i, m in enumerate(ms2_dict.keys()): - new_spectrum = Spectrum(i, ms2_dict[m], int(m.name), + new_spectrum = Spectrum(i, ms2_dict[m], m.name, metadata[m.name]['precursormass'], metadata[m.name]['parentmass']) new_spectrum.metadata = metadata[m.name] @@ -53,15 +53,15 @@ def 
_mols_to_spectra(ms2, metadata): return spectra @deprecated(version="1.3.3", reason="Use the GNPSMolecularFamilyLoader class instead.") -def load_edges(edges_file: str, spec_dict: dict[int, Spectrum]): +def load_edges(edges_file: str, spec_dict: dict[str, Spectrum]): """Insert information about the molecular family into the spectra. Args: edges_file(str): File containing the molecular families. - spec_dict(dict[int, Spectrum]): Dictionary with mapping from spectra_id to Spectrum. + spec_dict(dict[str, Spectrum]): Dictionary with mapping from spectra_id to Spectrum. Raises: - Exception: Raises exception if the edges file doesn't contain the correct columns. + Exception: Raises exception if the edges file doesn't contain the correct columns. """ logger.debug('loading edges file: {} [{} spectra from MGF]'.format( edges_file, len(spec_dict))) @@ -145,7 +145,7 @@ def load_dataset(strains, @deprecated(version="1.3.3", reason="Use the GNPSSpectrumLoader class instead.") -def load_spectra(mgf_file: str | PathLike, edges_file: str | PathLike) -> dict[int, Spectrum]: +def load_spectra(mgf_file: str | PathLike, edges_file: str | PathLike) -> dict[str, Spectrum]: """Wrapper function to load spectra and init the molecular family links. Args: @@ -153,14 +153,14 @@ def load_spectra(mgf_file: str | PathLike, edges_file: str | PathLike) -> dict[i edges_file(str | PathLike): File storing the molecular family information in .selfloop or .pairsinfo format. Returns: - dict[int, Spectrum]: Indexed dict of mass spectra. + dict[str, Spectrum]: Indexed dict of mass spectra. 
""" ms1, ms2, metadata = LoadMGF(name_field='scans').load_spectra([str(mgf_file)]) logger.info('%d molecules parsed from MGF file', len(ms1)) spectra = _mols_to_spectra(ms2, metadata) # above returns a list, create a dict indexed by spectrum_id to make # the rest of the parsing a bit simpler - spec_dict: dict[int, Spectrum] = {spec.spectrum_id: spec for spec in spectra} + spec_dict: dict[str, Spectrum] = {spec.spectrum_id: spec for spec in spectra} load_edges(edges_file, spec_dict) return spec_dict diff --git a/src/nplinker/metabolomics/molecular_family.py b/src/nplinker/metabolomics/molecular_family.py index a55a4df9..02b8496a 100644 --- a/src/nplinker/metabolomics/molecular_family.py +++ b/src/nplinker/metabolomics/molecular_family.py @@ -1,9 +1,9 @@ from typing_extensions import Self - from nplinker.metabolomics.spectrum import Spectrum from nplinker.strain_collection import StrainCollection from nplinker.strains import Strain + class MolecularFamily(): def __init__(self, family_id: int): @@ -16,7 +16,7 @@ def __init__(self, family_id: int): self.family_id: int = family_id self.spectra: list[Spectrum] = [] self.family = None - self.spectra_ids: set[int] = set() + self.spectra_ids: set[str] = set() # def has_strain(self, strain): # for spectrum in self.spectra: diff --git a/src/nplinker/metabolomics/spectrum.py b/src/nplinker/metabolomics/spectrum.py index 78a8e8ce..b6fa62a3 100644 --- a/src/nplinker/metabolomics/spectrum.py +++ b/src/nplinker/metabolomics/spectrum.py @@ -2,6 +2,7 @@ from nplinker.strain_collection import StrainCollection from nplinker.utils import sqrt_normalise + GNPS_KEY = 'gnps' JCAMP = '##TITLE={}\\n' +\ @@ -22,7 +23,7 @@ class Spectrum(): def __init__(self, id, peaks, - spectrum_id, + spectrum_id: str, precursor_mz, parent_mz=None, rt=None): @@ -34,8 +35,7 @@ def __init__(self, intensity for mz, intensity in self.peaks) self.total_ms2_intensity = sum( intensity for mz, intensity in self.peaks) - assert (isinstance(spectrum_id, int)) - 
self.spectrum_id = spectrum_id # == metadata.get('cluster_index') + self.spectrum_id = spectrum_id # MS1.name self.rt = rt self.precursor_mz = precursor_mz self.parent_mz = parent_mz diff --git a/src/nplinker/scoring/np_class_scoring.py b/src/nplinker/scoring/np_class_scoring.py index 51335c00..2340d9f1 100644 --- a/src/nplinker/scoring/np_class_scoring.py +++ b/src/nplinker/scoring/np_class_scoring.py @@ -261,7 +261,7 @@ def _get_met_classes(self, spec_like, method='mix'): # list of list of tuples/None - todo: add to spectrum object? # take only 'best' (first) classification per ontology level all_classes = self.npl.chem_classes.canopus. \ - spectra_classes.get(str(spec_like.spectrum_id)) + spectra_classes.get(spec_like.spectrum_id) if all_classes: spec_like_classes = [ cls_per_lvl for lvl in all_classes diff --git a/src/nplinker/scoring/rosetta/spec_lib.py b/src/nplinker/scoring/rosetta/spec_lib.py index 27bb9723..1a332a5e 100644 --- a/src/nplinker/scoring/rosetta/spec_lib.py +++ b/src/nplinker/scoring/rosetta/spec_lib.py @@ -13,8 +13,8 @@ # limitations under the License. 
from sortedcontainers import SortedList -from ...logconfig import LogConfig from nplinker.metabolomics.gnps.gnps_spectrum_loader import GNPSSpectrumLoader +from ...logconfig import LogConfig from .rosetta_functions import fast_cosine diff --git a/tests/conftest.py b/tests/conftest.py index cac0255c..123e45d4 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -36,7 +36,7 @@ def prepare_data(): @pytest.fixture -def spec_dict() -> dict[int, Spectrum]: +def spec_dict() -> dict[str, Spectrum]: mgf_file = DATA_DIR / "spectra.mgf" edges_file = DATA_DIR / "edges.pairsinfo" return load_spectra(mgf_file, edges_file) From 43f6d80d5c180155855522838f64002c59631ab4 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 19 Apr 2023 10:42:03 +0200 Subject: [PATCH 25/95] update spec_dict --- src/nplinker/metabolomics/load_gnps.py | 27 ++++++++++--------- src/nplinker/metabolomics/metabolomics.py | 8 +++--- .../test_gnps_annotation_loader.py | 4 +-- tests/scoring/test_data_links.py | 2 +- 4 files changed, 21 insertions(+), 20 deletions(-) diff --git a/src/nplinker/metabolomics/load_gnps.py b/src/nplinker/metabolomics/load_gnps.py index 0266df56..14a5ec2c 100644 --- a/src/nplinker/metabolomics/load_gnps.py +++ b/src/nplinker/metabolomics/load_gnps.py @@ -4,10 +4,11 @@ from typing import Any from deprecated import deprecated from nplinker.logconfig import LogConfig +from nplinker.metabolomics.gnps.gnps_format import gnps_format_from_file_mapping +from nplinker.metabolomics.gnps.gnps_format import GNPSFormat from nplinker.metabolomics.spectrum import Spectrum from nplinker.strain_collection import StrainCollection from nplinker.strains import Strain -from nplinker.metabolomics.gnps.gnps_format import gnps_format_from_file_mapping ,GNPSFormat logger = LogConfig.getLogger(__name__) @@ -102,7 +103,7 @@ def _parse_mzxml_header(hdr: str, strains: StrainCollection, md_table: dict[str, the strain_mappings file. 
finally it also tries to extract the growth medium label as given in the metadata_table file, again using the strain label which should match between the two files. - >>> + >>> """ @@ -160,14 +161,14 @@ def _parse_mzxml_header(hdr: str, strains: StrainCollection, md_table: dict[str, return (strain_name, growth_medium, strain_name not in strains) -def _load_clusterinfo_old(gnps_format: str, strains: StrainCollection, file: str, spec_dict: dict[int, Spectrum]) -> dict[str, int]: +def _load_clusterinfo_old(gnps_format: str, strains: StrainCollection, file: str, spec_dict: dict[str, Spectrum]) -> dict[str, int]: """ Load info about clusters from old style GNPS files. Args: gnps_format(str): Identifier for the GNPS format of the file. Has to be one of [GNPS_FORMAT_OLD_ALLFILES, GNPS_FORMAT_OLD_UNIQUEFILES] strains(StrainCollection): StrainCollection in which to search for the detected strains. file(str): Path to file from which to load the cluster information. - spec_dict(dict[int, Spectrum]): Dictionary with already loaded spectra into which the metadata read from the file will be inserted. + spec_dict(dict[str, Spectrum]): Dictionary with already loaded spectra into which the metadata read from the file will be inserted. Raises: Exception: Raises exception if not supported GNPS format was detected. 
@@ -194,7 +195,7 @@ def _load_clusterinfo_old(gnps_format: str, strains: StrainCollection, file: str for line in reader: # get the values of the important columns - clu_index = int(line[clu_index_index]) + clu_index = line[clu_index_index] if gnps_format == GNPSFormat.UniqueFiles: mzxmls = line[mzxml_index].split('|') else: @@ -314,7 +315,7 @@ def _parse_metadata_table(file: str) -> dict[str, dict[str, str|None]]: def _load_clusterinfo_fbmn(strains: StrainCollection, nodes_file: str, extra_nodes_file: str, - md_table_file: str, spec_dict: dict[int, Spectrum], ext_metadata_parsing: bool) -> tuple[dict[int, dict[str, str|None]], dict[str, int]]: + md_table_file: str, spec_dict: dict[str, Spectrum], ext_metadata_parsing: bool) -> tuple[dict[str, dict[str, str|None]], dict[str, str]]: """Load the clusterinfo from a feature based molecular networking run output from GNPS. Args: @@ -322,11 +323,11 @@ def _load_clusterinfo_fbmn(strains: StrainCollection, nodes_file: str, extra_nod nodes_file(str): File from which to load the cluster information. extra_nodes_file(str): Unknown. md_table_file(str): Path to metadata table. Deprecated. - spec_dict(dict[int, Spectrum]): Dictionary with already loaded spectra. + spec_dict(dict[str, Spectrum]): Dictionary with already loaded spectra. ext_metadata_parsing(bool): Whether to use extended metadata parsing. Returns: - tuple[dict[int, dict], dict[str, int]]: Spectra info mapping from spectrum id to all columns in the nodes file and unknown strain mapping from file identifier to spectrum id. + tuple[dict[str, dict], dict[str, str]]: Spectra info mapping from spectrum id to all columns in the nodes file and unknown strain mapping from file identifier to spectrum id. 
""" spec_info = {} @@ -347,7 +348,7 @@ def _load_clusterinfo_fbmn(strains: StrainCollection, nodes_file: str, extra_nod tmp = {} for i, v in enumerate(line): tmp[headers[i]] = v - spec_info[int(line[ci_index])] = tmp + spec_info[line[ci_index]] = tmp with open(extra_nodes_file) as f: reader = csv.reader(f, delimiter=',') @@ -358,7 +359,7 @@ def _load_clusterinfo_fbmn(strains: StrainCollection, nodes_file: str, extra_nod # nodes_file to the "row ID" from this file, and update the per-row dict # with the extra columns from this file for line in reader: - ri = int(line[ri_index]) + ri = line[ri_index] tmp = {} for i, v in enumerate(line): tmp[headers[i]] = v @@ -437,7 +438,7 @@ def _load_clusterinfo_fbmn(strains: StrainCollection, nodes_file: str, extra_nod @deprecated(version="1.3.3", reason="Use the GNPSFileMappingLoader class instead.") def load_gnps(strains: StrainCollection, nodes_file: str, quant_table_file: str, metadata_table_file: str, - ext_metadata_parsing: bool, spec_dict: dict[int, Spectrum]) -> dict[str, int]: + ext_metadata_parsing: bool, spec_dict: dict[str, Spectrum]) -> dict[str, int]: """Wrapper function to load information from GNPS outputs. Args: @@ -446,13 +447,13 @@ def load_gnps(strains: StrainCollection, nodes_file: str, quant_table_file: str, quant_table_file(str): Path to the quantification table. metadata_table_file(str): Path to the metadata table. ext_metadata_parsing(bool): Whether to use extended metadata parsing. - spec_dict(dict[int, Spectrum]): Mapping from int to spectra loaded from file. + spec_dict(dict[str, Spectrum]): Mapping from int to spectra loaded from file. Raises: Exception: Raises exception if an unknown GNPS format is encountered. Returns: - dict[str, int]: Returns a mapping from unknown strains which are found to spectra ids which occur in these unknown strains. + dict[str, int]: Returns a mapping from unknown strains which are found to spectra ids which occur in these unknown strains. 
""" gnps_format = gnps_format_from_file_mapping(nodes_file, quant_table_file is not None) diff --git a/src/nplinker/metabolomics/metabolomics.py b/src/nplinker/metabolomics/metabolomics.py index c9b2c929..041fbe2e 100644 --- a/src/nplinker/metabolomics/metabolomics.py +++ b/src/nplinker/metabolomics/metabolomics.py @@ -53,7 +53,7 @@ def _mols_to_spectra(ms2, metadata): return spectra @deprecated(version="1.3.3", reason="Use the GNPSMolecularFamilyLoader class instead.") -def load_edges(edges_file: str, spec_dict: dict[str, Spectrum]): +def load_edges(edges_file: str | PathLike, spec_dict: dict[str, Spectrum]): """Insert information about the molecular family into the spectra. Args: @@ -79,8 +79,8 @@ def load_edges(edges_file: str, spec_dict: dict[str, Spectrum]): edges_file)) for line in reader: - spec1_id = int(line[cid1_index]) - spec2_id = int(line[cid2_index]) + spec1_id = line[cid1_index] + spec2_id = line[cid2_index] cosine = float(line[cos_index]) family = int(line[fam_index]) @@ -134,7 +134,7 @@ def load_dataset(strains, # spec_dict = {spec.spectrum_id: spec for spec in spectra} # add edges info to the spectra - molfams = make_families(spec_dict.values()) + molfams = make_families(list(spec_dict.values())) # molfams = GNPSMolecularFamilyLoader(edges_file).families() unknown_strains = load_gnps(strains, nodes_file, quant_table_file, diff --git a/tests/metabolomics/test_gnps_annotation_loader.py b/tests/metabolomics/test_gnps_annotation_loader.py index 352be137..24f7cf99 100644 --- a/tests/metabolomics/test_gnps_annotation_loader.py +++ b/tests/metabolomics/test_gnps_annotation_loader.py @@ -32,7 +32,7 @@ def test_reads_all_annotations(file, expected): assert len(sut.get_annotations()) == expected -def test_annotations_are_equal(spec_dict: dict[int, Spectrum]): +def test_annotations_are_equal(spec_dict: dict[str, Spectrum]): annotations_dir = DATA_DIR / "annotations" annotations_file = annotations_dir / "gnps_annotations.tsv" @@ -44,7 +44,7 @@ def 
test_annotations_are_equal(spec_dict: dict[int, Spectrum]): spectra, spec_dict ) - expected: dict[int, dict] = {} + expected = {} for x in sut: if x.has_annotations(): expected[x.spectrum_id] = x.gnps_annotations diff --git a/tests/scoring/test_data_links.py b/tests/scoring/test_data_links.py index 004db682..e794a29b 100644 --- a/tests/scoring/test_data_links.py +++ b/tests/scoring/test_data_links.py @@ -9,7 +9,7 @@ @pytest.fixture -def spec_with_families(spec_dict) -> dict[int, Spectrum]: +def spec_with_families(spec_dict) -> dict[str, Spectrum]: make_families(spec_dict.values()) return spec_dict From 7536e42e3d2c8e439a93b36775affe03b0b086d3 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 19 Apr 2023 11:20:51 +0200 Subject: [PATCH 26/95] Update tests --- tests/metabolomics/test_load_gnps.py | 10 +++++----- tests/metabolomics/test_spectrum.py | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/metabolomics/test_load_gnps.py b/tests/metabolomics/test_load_gnps.py index f3c4961b..354895c6 100644 --- a/tests/metabolomics/test_load_gnps.py +++ b/tests/metabolomics/test_load_gnps.py @@ -1,13 +1,13 @@ from itertools import chain - import pytest -from nplinker.metabolomics.load_gnps import _messy_strain_naming_lookup, _parse_mzxml_header -from nplinker.metabolomics.gnps.gnps_format import gnps_format_from_file_mapping, GNPSFormat +from nplinker.metabolomics.gnps.gnps_format import gnps_format_from_file_mapping +from nplinker.metabolomics.gnps.gnps_format import GNPSFormat from nplinker.metabolomics.load_gnps import _load_clusterinfo_old +from nplinker.metabolomics.load_gnps import _messy_strain_naming_lookup +from nplinker.metabolomics.load_gnps import _parse_mzxml_header from nplinker.metabolomics.load_gnps import load_gnps from nplinker.strain_collection import StrainCollection from nplinker.utils import get_headers - from .. 
import DATA_DIR @@ -51,7 +51,7 @@ def test_load_clusterinfo_old(spec_dict): for spectrum_id, spec in spec_dict.items(): metadata = spec.metadata assert len(metadata.get('files')) > 1 - assert isinstance(metadata.get('cluster_index'), int) + assert isinstance(metadata.get('cluster_index'), str) assert spectrum_id == metadata.get('cluster_index') diff --git a/tests/metabolomics/test_spectrum.py b/tests/metabolomics/test_spectrum.py index bcb82d36..30473f90 100644 --- a/tests/metabolomics/test_spectrum.py +++ b/tests/metabolomics/test_spectrum.py @@ -1,13 +1,13 @@ import pytest - from nplinker.metabolomics.spectrum import Spectrum + @pytest.fixture def spectrum() -> Spectrum: spec = Spectrum( 1, peaks=[[10, 100], [20, 150]], - spectrum_id=2, + spectrum_id="2", precursor_mz=30, parent_mz=50, rt= 100 From a0cf97ab23a99b515e5f7fda782918868631bd4d Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Fri, 28 Apr 2023 11:34:18 +0200 Subject: [PATCH 27/95] update `__eq__` in MolecularFamily --- src/nplinker/metabolomics/molecular_family.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/nplinker/metabolomics/molecular_family.py b/src/nplinker/metabolomics/molecular_family.py index 02b8496a..4de13e3e 100644 --- a/src/nplinker/metabolomics/molecular_family.py +++ b/src/nplinker/metabolomics/molecular_family.py @@ -53,8 +53,7 @@ def __str__(self) -> str: def __eq__(self, other: Self) -> bool: if isinstance(other, MolecularFamily): return (self.id == other.id - and self.family_id == other.family_id - and set(self.spectra) == set(other.spectra)) + and self.family_id == other.family_id) return NotImplemented def __hash__(self) -> int: From b53af96a0ca2c0228febaae8d739735aa32c3681 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Fri, 28 Apr 2023 12:09:49 +0200 Subject: [PATCH 28/95] change `MolecularFamily.family_id` from type int to str --- src/nplinker/class_info/chem_classes.py | 2 +- .../gnps/gnps_molecular_family_loader.py | 8 +-- 
src/nplinker/metabolomics/metabolomics.py | 6 +-- src/nplinker/metabolomics/molecular_family.py | 13 ++--- src/nplinker/metabolomics/singleton_family.py | 4 +- src/nplinker/metabolomics/spectrum.py | 2 +- src/nplinker/scoring/np_class_scoring.py | 4 +- .../test_gnps_molecular_family_loader.py | 49 +++++++------------ 8 files changed, 34 insertions(+), 54 deletions(-) diff --git a/src/nplinker/class_info/chem_classes.py b/src/nplinker/class_info/chem_classes.py index 00a9cdb4..d2f184a3 100644 --- a/src/nplinker/class_info/chem_classes.py +++ b/src/nplinker/class_info/chem_classes.py @@ -399,7 +399,7 @@ class prediction for a level. When no class is present, instead of Tuple it will molfam_classes = {} for molfam in molfams: - fid = str(molfam.family_id) # the key + fid = molfam.family_id # the key spectra = molfam.spectra # if singleton family, format like '-1_spectrum-id' if fid == '-1': diff --git a/src/nplinker/metabolomics/gnps/gnps_molecular_family_loader.py b/src/nplinker/metabolomics/gnps/gnps_molecular_family_loader.py index 729b34b8..0a3b7659 100644 --- a/src/nplinker/metabolomics/gnps/gnps_molecular_family_loader.py +++ b/src/nplinker/metabolomics/gnps/gnps_molecular_family_loader.py @@ -19,7 +19,7 @@ def __init__(self, file: str | PathLike): self._families: list[MolecularFamily | SingletonFamily] = [] for family_id, spectra_ids in _load_molecular_families(file).items(): - if family_id == -1: + if family_id == '-1': for spectrum_id in spectra_ids: family = SingletonFamily() family.spectra_ids = set([spectrum_id]) @@ -33,14 +33,14 @@ def families(self) -> list[MolecularFamily]: return self._families -def _load_molecular_families(file: str | PathLike) -> dict[int, set[str]]: +def _load_molecular_families(file: str | PathLike) -> dict[str, set[str]]: """Load ids of molecular families and corresponding spectra from GNPS output file. Args: file(str | PathLike): path to the GNPS file to load molecular families. 
Returns: - dict[int, set[str]]: Mapping from molecular family/cluster id to the spectra ids. + dict[str, set[str]]: Mapping from molecular family/cluster id to the spectra ids. """ logger.debug('loading edges file: %s', file) @@ -54,7 +54,7 @@ def _load_molecular_families(file: str | PathLike) -> dict[int, set[str]]: for line in reader: spec1_id = line[cid1_index] spec2_id = line[cid2_index] - family_id = int(line[fam_index]) + family_id = line[fam_index] if families.get(family_id) is None: families[family_id] = set([spec1_id, spec2_id]) diff --git a/src/nplinker/metabolomics/metabolomics.py b/src/nplinker/metabolomics/metabolomics.py index 041fbe2e..79b1c7d5 100644 --- a/src/nplinker/metabolomics/metabolomics.py +++ b/src/nplinker/metabolomics/metabolomics.py @@ -82,13 +82,13 @@ def load_edges(edges_file: str | PathLike, spec_dict: dict[str, Spectrum]): spec1_id = line[cid1_index] spec2_id = line[cid2_index] cosine = float(line[cos_index]) - family = int(line[fam_index]) + family = line[fam_index] if spec1_id in spec_dict and spec2_id in spec_dict: spec1 = spec_dict[spec1_id] spec2 = spec_dict[spec2_id] - if family != -1: # singletons + if family != '-1': # singletons spec1.family_id = family spec2.family_id = family @@ -181,7 +181,7 @@ def make_families(spectra: list[Spectrum]) -> list[MolecularFamily]: fams, singles = 0, 0 for spectrum in spectra: family_id = spectrum.family_id - if family_id == -1: # singleton + if family_id == '-1': # singleton new_family = SingletonFamily() new_family.id = family_index family_index += 1 diff --git a/src/nplinker/metabolomics/molecular_family.py b/src/nplinker/metabolomics/molecular_family.py index 4de13e3e..13d8430e 100644 --- a/src/nplinker/metabolomics/molecular_family.py +++ b/src/nplinker/metabolomics/molecular_family.py @@ -6,25 +6,18 @@ class MolecularFamily(): - def __init__(self, family_id: int): + def __init__(self, family_id: str): """Class to model molecular families. 
Args: - family_id(int): Id for the molecular family. + family_id(str): Id for the molecular family. """ self.id: int = -1 - self.family_id: int = family_id + self.family_id: str = family_id self.spectra: list[Spectrum] = [] self.family = None self.spectra_ids: set[str] = set() - # def has_strain(self, strain): - # for spectrum in self.spectra: - # if spectrum.has_strain(strain): - # return True - - # return False - @property def strains(self) -> StrainCollection: """Get strains of spectra in the molecular family. diff --git a/src/nplinker/metabolomics/singleton_family.py b/src/nplinker/metabolomics/singleton_family.py index a742b807..36f03e49 100644 --- a/src/nplinker/metabolomics/singleton_family.py +++ b/src/nplinker/metabolomics/singleton_family.py @@ -1,10 +1,10 @@ from .molecular_family import MolecularFamily + class SingletonFamily(MolecularFamily): def __init__(self): - super().__init__(-1) + super().__init__('-1') def __str__(self): return f"Singleton molecular family (id={self.id})" - diff --git a/src/nplinker/metabolomics/spectrum.py b/src/nplinker/metabolomics/spectrum.py index b6fa62a3..0002843d 100644 --- a/src/nplinker/metabolomics/spectrum.py +++ b/src/nplinker/metabolomics/spectrum.py @@ -47,7 +47,7 @@ def __init__(self, # this is a dict indexed by Strain objects (the strains found in this Spectrum), with # the values being dicts of the form {growth_medium: peak intensity} for the parent strain self.growth_media = {} - self.family_id = -1 + self.family_id = '-1' self.family = None # a dict indexed by filename, or "gnps" self.annotations = {} diff --git a/src/nplinker/scoring/np_class_scoring.py b/src/nplinker/scoring/np_class_scoring.py index 2340d9f1..304651dc 100644 --- a/src/nplinker/scoring/np_class_scoring.py +++ b/src/nplinker/scoring/np_class_scoring.py @@ -270,7 +270,7 @@ def _get_met_classes(self, spec_like, method='mix'): spec_like_classes_names_inds = self.npl.chem_classes.canopus. 
\ spectra_classes_names_inds else: # molfam - fam_id = str(spec_like.family_id) + fam_id = spec_like.family_id if fam_id == '-1': # account for singleton families fam_id += f'_{spec_like.spectra[0].spectrum_id}' all_classes = self.npl.chem_classes.canopus.molfam_classes.get( @@ -288,7 +288,7 @@ def _get_met_classes(self, spec_like, method='mix'): spec_like_classes = self.npl.chem_classes.molnetenhancer. \ spectra_classes(spec_like.spectrum_id) else: # molfam - fam_id = str(spec_like.family_id) + fam_id = spec_like.family_id if fam_id == '-1': # account for singleton families fam_id += f'_{spec_like.spectra[0].spectrum_id}' spec_like_classes = self.npl.chem_classes.molnetenhancer. \ diff --git a/tests/metabolomics/test_gnps_molecular_family_loader.py b/tests/metabolomics/test_gnps_molecular_family_loader.py index e8ac05a1..1174e28b 100644 --- a/tests/metabolomics/test_gnps_molecular_family_loader.py +++ b/tests/metabolomics/test_gnps_molecular_family_loader.py @@ -1,42 +1,29 @@ import os -import numpy import pytest from nplinker.metabolomics.gnps.gnps_molecular_family_loader import \ GNPSMolecularFamilyLoader -from nplinker.metabolomics.metabolomics import make_families -from nplinker.metabolomics.molecular_family import MolecularFamily from .. 
import DATA_DIR -@pytest.fixture -def molecular_families(spec_dict) -> list[MolecularFamily]: - return make_families(spec_dict.values()) - -@pytest.mark.parametrize("filename", [ - os.path.join(DATA_DIR, "edges.pairsinfo"), - DATA_DIR / "edges.pairsinfo" -]) +@pytest.mark.parametrize( + "filename", + [os.path.join(DATA_DIR, "edges.pairsinfo"), DATA_DIR / "edges.pairsinfo"]) def test_has_molecular_families(filename): sut = GNPSMolecularFamilyLoader(filename) actual = sut.families() assert len(actual) == 25769 - assert len(actual[0].spectra_ids) == 19 - - -def test_families_are_identical(spec_dict, molecular_families): - filename = os.path.join(DATA_DIR, "edges.pairsinfo") - actual = GNPSMolecularFamilyLoader(filename).families() - - actual.sort(key= lambda x: min(x.spectra_ids)) - - for i, x in enumerate(actual): - x.id = i - for spec_id in x.spectra_ids: - x.add_spectrum(spec_dict[spec_id]) - - - for x in molecular_families: - for spec in x.spectra: - x.spectra_ids.add(spec.spectrum_id) - - numpy.testing.assert_array_equal(actual, molecular_families) + mf_ids = [mf.family_id for mf in actual[:30]] + assert mf_ids == [ + '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', + '14', '15', '16', '17', '18', '20', '21', '22', '23', '24', '26', '28', + '30', '31', '32', '33', '-1' + ] + num_spec_ids = [len(mf.spectra_ids) for mf in actual[:30]] + assert num_spec_ids == [ + 19, 48, 3, 3, 11, 4, 9, 3, 15, 3, 5, 2, 3, 3, 5, 3, 14, 4, 2, 2, 12, 2, + 3, 5, 2, 4, 2, 2, 2, 1 + ] + assert actual[0].spectra_ids == set( + ('13170', '13662', '15316', '15364', '16341', '17201', '17270', + '18120', '18172', '18748', '18831', '19005', '19673', '19719', + '20320', '20738', '21163', '21593', '23566')) From ca848fd0bac2d77096f48e67975bfc0ba0a781c0 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Mon, 1 May 2023 11:06:27 +0200 Subject: [PATCH 29/95] add method `has_strain` to MolecularFamily --- src/nplinker/metabolomics/molecular_family.py | 14 +++++++++++++- 1 file 
changed, 13 insertions(+), 1 deletion(-) diff --git a/src/nplinker/metabolomics/molecular_family.py b/src/nplinker/metabolomics/molecular_family.py index 13d8430e..57cd686a 100644 --- a/src/nplinker/metabolomics/molecular_family.py +++ b/src/nplinker/metabolomics/molecular_family.py @@ -15,9 +15,9 @@ def __init__(self, family_id: str): self.id: int = -1 self.family_id: str = family_id self.spectra: list[Spectrum] = [] - self.family = None self.spectra_ids: set[str] = set() + # TODO: change property to attibute @property def strains(self) -> StrainCollection: """Get strains of spectra in the molecular family. @@ -31,6 +31,18 @@ def strains(self) -> StrainCollection: strains.add(strain) return strains + def has_strain(self, strain: str | Strain) -> bool: + """Check if the given strain exists. + + Args: + strain(str | Strain): strain id or `Strain` object. + + Returns: + bool: True when the given strain exist. + """ + return strain in self.strains + + # TODO: update the logics, mf should also be added to the spectrum object def add_spectrum(self, spectrum: Spectrum): """Add a spectrum to the spectra list. 
From 583fa3384a8ae0d663a83a9eef354c8884343f1f Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Mon, 1 May 2023 11:06:50 +0200 Subject: [PATCH 30/95] Update metcalf_scoring.py --- src/nplinker/scoring/metcalf_scoring.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/nplinker/scoring/metcalf_scoring.py b/src/nplinker/scoring/metcalf_scoring.py index 65c45a5f..79e90fc4 100644 --- a/src/nplinker/scoring/metcalf_scoring.py +++ b/src/nplinker/scoring/metcalf_scoring.py @@ -7,10 +7,11 @@ from nplinker.metabolomics.spectrum import Spectrum from nplinker.pickler import load_pickled_data from nplinker.pickler import save_pickled_data -from nplinker.scoring.methods import ScoringMethod -from nplinker.scoring.object_link import ObjectLink from nplinker.scoring.linking.data_linking import DataLinks from nplinker.scoring.linking.link_finder import LinkFinder +from nplinker.scoring.methods import ScoringMethod +from nplinker.scoring.object_link import ObjectLink + logger = LogConfig.getLogger(__name__) @@ -80,8 +81,7 @@ def setup(npl): MetcalfScoring.DATALINKS.load_data(npl._spectra, npl._gcfs, npl._strains, npl.molfams) # TODO fix crash with this set to True, see https://github.com/sdrogers/nplinker/issues/57 - MetcalfScoring.DATALINKS.find_correlations( - include_singletons=False) + MetcalfScoring.DATALINKS.find_correlations() MetcalfScoring.LINKFINDER = LinkFinder() MetcalfScoring.LINKFINDER.metcalf_scoring(MetcalfScoring.DATALINKS, type='spec-gcf') From 6ebaf0dca5cdb32378ffaafaacac34b1d0c1852d Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Mon, 1 May 2023 16:34:06 +0200 Subject: [PATCH 31/95] change array to dataframe in DataLinks 1. Change array to dataframe: - self.M_gcf_strain -> self.gcf_strain_occurrence - self. M_spec_strain -> self.spec_strain_occurrence - self. M_fam_strain -> mf_strain_occurrence 2. update relevant methods to get the new dataframes 3. 
update logics of method `common_strains` using the new dataframes --- src/nplinker/scoring/linking/data_linking.py | 362 ++++++++----------- 1 file changed, 144 insertions(+), 218 deletions(-) diff --git a/src/nplinker/scoring/linking/data_linking.py b/src/nplinker/scoring/linking/data_linking.py index bec8b6d2..651390ee 100644 --- a/src/nplinker/scoring/linking/data_linking.py +++ b/src/nplinker/scoring/linking/data_linking.py @@ -1,17 +1,3 @@ -# Copyright 2021 The NPLinker Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- # Methods to find correlations between spectra/molecular families and # gene clusters/families (BGCs/GCFs) # @@ -23,21 +9,21 @@ # spec stands for spectrum # fam stands for molecular family +from __future__ import annotations from collections import Counter -from typing import Sequence -# import packages +from typing import Sequence, TYPE_CHECKING import numpy as np import pandas as pd from nplinker.genomics.gcf import GCF +from nplinker.logconfig import LogConfig from nplinker.metabolomics.molecular_family import MolecularFamily +from nplinker.metabolomics.singleton_family import SingletonFamily from nplinker.metabolomics.spectrum import Spectrum from .data_linking_functions import calc_correlation_matrix -SCORING_METHODS = ['metcalf', 'likescore', 'hg'] - -from nplinker.logconfig import LogConfig - +if TYPE_CHECKING: + from nplinker.strain_collection import StrainCollection logger = LogConfig.getLogger(__name__) @@ -51,21 +37,17 @@ class DataLinks(): """ def __init__(self): - # matrices that store co-occurences with respect to strains - # values = 1 where gcf/spec/fam occur in strain - # values = 0 where gcf/spec/fam do not occur in strain - - # 2D array [gcf: int, strain: int] - self.M_gcf_strain = [] - self.M_spec_strain = [] - self.M_fam_strain = [] + # DataFrame to store occurence of objects with respect to strains + # values = 1 where gcf/spec/fam occur in strain, 0 otherwise + self.gcf_strain_occurrence = pd.DataFrame() + self.spec_strain_occurrence = pd.DataFrame() + self.mf_strain_occurrence = pd.DataFrame() # mappings (lookup lists to map between different ids and categories self.mapping_spec = pd.DataFrame() self.mapping_gcf = pd.DataFrame() self.mapping_fam = pd.DataFrame() # labels for strain-family matrix self.mapping_strain = pd.DataFrame() - self.family_members = [] # correlation matrices for spectra <-> GCFs self.M_spec_gcf = [ @@ -79,48 +61,53 @@ def __init__(self): self.M_fam_notgcf = [] self.M_notfam_gcf = [] - def get_spec_pos(self, spec_id): 
- # get the position in the arrays of a spectrum - row = self.mapping_spec.loc[self.mapping_spec['original spec-id'] == - float(spec_id)] - return int(row.iloc[0]['spec-id']) - - def get_gcf_pos(self, gcf_id): - #Ā TODO: fix this so the original ID is present in case of re-ordering - pass - - def load_data(self, spectra, gcf_list, strain_list, molfams): + def load_data(self, spectra: Sequence[Spectrum], gcfs: Sequence[GCF], + strains: StrainCollection, + molfams: Sequence[MolecularFamily]): # load data from spectra, GCFs, and strains logger.debug("Create mappings between spectra, gcfs, and strains.") self.collect_mappings_spec(spectra) - # self.collect_mappings_spec_v2(molfams) - self.collect_mappings_gcf(gcf_list) + self.collect_mappings_gcf(gcfs) logger.debug( - "Create co-occurence matrices: spectra<->strains + and gcfs<->strains." + "Create co-occurence matrices: spectra<->strains, gcfs<->strains and mfs<->strains." ) - self.matrix_strain_gcf(gcf_list, strain_list) - self.matrix_strain_spec(spectra, strain_list) - - def find_correlations(self, include_singletons=False): + self._get_gcf_strain_occurrence(gcfs, strains) + self._get_spec_strain_occurrence(spectra, strains) + self._get_mf_strain_occurrence(molfams, strains) + + self._get_mappings_from_occurrence() + + def _get_mappings_from_occurrence(self): + self.mapping_gcf["no of strains"] = np.sum(self.gcf_strain_occurrence, + axis=1) + self.mapping_spec["no of strains"] = np.sum( + self.spec_strain_occurrence, axis=1) + self.mapping_strain["no of spectra"] = np.sum( + self.spec_strain_occurrence, axis=0) + self.mapping_fam["no of strains"] = np.sum(self.mf_strain_occurrence, + axis=1) + + def find_correlations(self): # collect correlations/ co-occurences logger.debug("Create correlation matrices: spectra<->gcfs.") self.correlation_matrices(type='spec-gcf') logger.debug("Create correlation matrices: mol-families<->gcfs.") - self.data_family_mapping(include_singletons=include_singletons) 
self.correlation_matrices(type='fam-gcf') - def collect_mappings_spec(self, obj: Sequence[Spectrum]|Sequence[MolecularFamily]): + def collect_mappings_spec(self, obj: Sequence[Spectrum] + | Sequence[MolecularFamily]): if isinstance(obj[0], Spectrum): mapping_spec = self._collect_mappings_from_spectra(obj) elif isinstance(obj[0], MolecularFamily): mapping_spec = self._collect_mappings_from_molecular_families(obj) # extend mapping tables: - self.mapping_spec["spec-id"] = mapping_spec[:, 0] - self.mapping_spec["original spec-id"] = mapping_spec[:, 1] + # TODO: why do we need the mappings??? + # "spec-id" is defined as the index of the spectrum in the input data self.mapping_spec["fam-id"] = mapping_spec[:, 2] - def _collect_mappings_from_spectra(self, spectra) -> np.ndarray[np.float64]: + def _collect_mappings_from_spectra(self, + spectra) -> np.ndarray[np.float64]: # Collect most import mapping tables from input data mapping_spec = np.zeros((len(spectra), 3)) mapping_spec[:, 0] = np.arange(0, len(spectra)) @@ -131,7 +118,9 @@ def _collect_mappings_from_spectra(self, spectra) -> np.ndarray[np.float64]: return mapping_spec - def _collect_mappings_from_molecular_families(self, molfams: Sequence[MolecularFamily]) -> np.ndarray[np.float64]: + def _collect_mappings_from_molecular_families( + self, + molfams: Sequence[MolecularFamily]) -> np.ndarray[np.float64]: num_spectra = sum(len(x.spectra_ids) for x in molfams) mapping_spec = np.zeros((num_spectra, 3)) mapping_spec[:, 0] = np.arange(0, num_spectra) @@ -201,174 +190,115 @@ def collect_mappings_gcf(self, gcf_list): ]) # extend mapping tables: - self.mapping_gcf["gcf-id"] = np.arange(0, len(bigscape_bestguess)) bigscape_guess, bigscape_guessscore = zip(*bigscape_bestguess) self.mapping_gcf["bgc class"] = bigscape_guess - self.mapping_gcf["bgc class score"] = bigscape_guessscore - def matrix_strain_gcf(self, gcf_list, strain_list): - # Collect co-ocurences in M_spec_strain matrix - M_gcf_strain = np.zeros((len(gcf_list), 
len(strain_list))) + def _get_gcf_strain_occurrence(self, gcfs: Sequence[GCF], + strains: StrainCollection) -> None: + """Get the occurence of strains in gcfs. - for i, gcf in enumerate(gcf_list): - for j, strain in enumerate(strain_list): + The occurence is a DataFrame with gcfs as rows and strains as columns, + where index is `gcf.gcf_id` and column name is `strain.id`. The values + are 1 if the gcf contains the strain and 0 otherwise. + """ + df_gcf_strain = pd.DataFrame(np.zeros((len(gcfs), len(strains))), + index=[gcf.gcf_id for gcf in gcfs], + columns=[strain.id for strain in strains]) + for gcf in gcfs: + for strain in strains: if gcf.has_strain(strain): - M_gcf_strain[i, j] = 1 + df_gcf_strain.loc[gcf.gcf_id, strain.id] = 1 + self.gcf_strain_occurrence = df_gcf_strain - self.M_gcf_strain = M_gcf_strain - # extend mapping tables: - self.mapping_gcf["no of strains"] = np.sum(self.M_gcf_strain, axis=1) - self.mapping_strain["no of gcfs"] = np.sum(self.M_gcf_strain, axis=0) - - def matrix_strain_spec(self, spectra, strain_list): - # Collect co-ocurences in M_strains_specs matrix - - M_spec_strain = np.zeros((len(spectra), len(strain_list))) - for i, spectrum in enumerate(spectra): - for j, s in enumerate(strain_list): - if spectrum.has_strain(s): - M_spec_strain[i, j] = 1 - self.M_spec_strain = M_spec_strain + def _get_spec_strain_occurrence(self, spectra: Sequence[Spectrum], + strains: StrainCollection) -> None: + """Get the occurence of strains in spectra. - # extend mapping tables: - self.mapping_spec["no of strains"] = np.sum(self.M_spec_strain, axis=1) - self.mapping_strain["no of spectra"] = np.sum(self.M_spec_strain, - axis=0) - self.mapping_strain["strain name"] = [str(s) for s in strain_list] - - def data_family_mapping(self, include_singletons=False): - # Create M_fam_strain matrix that gives co-occurences between mol. 
families and strains - # matrix dimensions are: number of families x number of strains - - family_ids = np.unique( - self.mapping_spec["fam-id"]) # get unique family ids - - # if singletons are included, check if there are a non-zero number of them (singleton - # families all have -1 as a family ID number) - if include_singletons and np.where( - self.mapping_spec["fam-id"] == -1)[0].shape[0] > 0: - # in this case the number of unique families is the number of singletons - # plus the number of normal families. the "-1" is (I think) to account for - # the single "-1" entry that will be present in "family_ids". - num_of_singletons = np.where( - self.mapping_spec["fam-id"] == -1)[0].shape[0] - num_unique_fams = num_of_singletons + len(family_ids) - 1 - else: - # if no singletons included or present in the dataset, just take the number - # of regular molfams instead - num_of_singletons = 0 - num_unique_fams = len(family_ids) - - M_fam_strain = np.zeros((num_unique_fams, self.M_spec_strain.shape[1])) - strain_fam_labels = [] - strain_fam_index = [] - - if num_of_singletons > 0: # if singletons exist + included - M_fam_strain[( - num_unique_fams - - num_of_singletons):, :] = self.M_spec_strain[np.where( - self.mapping_spec["fam-id"][:, 0] == -1)[0], :] - - # go through families (except singletons) and collect member strain occurences - self.family_members = [] - for i, fam_id in enumerate( - family_ids[np.where(family_ids != -1)].astype(int)): - family_members = np.where( - np.array(self.mapping_spec["fam-id"]) == fam_id) - self.family_members.append(family_members) - M_fam_strain[i, :] = np.sum(self.M_spec_strain[family_members, :], - axis=1) - strain_fam_labels.append(fam_id) - strain_fam_index.append(i) - - add_singleton_entries = -1 in family_ids - # TODO: i think this breaks stuff below due to mismatches in the number of rows - # in the dataframes and matrices if there are no -1 family ids. - # discovered when trying to write some code to test scoring. 
is this ever - # likely to happen with a real dataset?? - if add_singleton_entries: - strain_fam_labels.append([-1] * num_of_singletons) - strain_fam_index.append(i + 1) - - # only looking for co-occurence, hence only 1 or 0 - M_fam_strain[M_fam_strain > 1] = 1 - - self.M_fam_strain = M_fam_strain - # extend mapping table: - self.mapping_fam["family id"] = strain_fam_index - self.mapping_fam["original family id"] = strain_fam_labels - self.mapping_fam["no of strains"] = np.sum(self.M_fam_strain, axis=1) - num_members = [x[0].shape[0] for x in self.family_members] - # see above - if add_singleton_entries: - num_members.append(num_of_singletons) - self.mapping_fam["no of members"] = num_members - return self.family_members - - def common_strains(self, metabolome_objects, gcfs, filter_no_shared=False) -> dict: + The occurence is a DataFrame with spectra as rows and strains as columns, + where index is `spectrum.spectrum_id` and column name is `strain.id`. + The values are 1 if the spectrum contains the strain and 0 otherwise. """ - Obtain the set of common strains between all pairs from the lists objects_a - and objects_b. - - The two parameters can be either lists or single instances of the 3 supported - object types (GCF, Spectrum, MolecularFamily). It's possible to use a single - object together with a list as well. - - Returns a dict indexed by tuples of (Spectrum/MolecularFamily, GCF), where - the values are lists of strain indices which appear in both objects, which - can then be looked up in NPLinker.strains. 
+ df_spec_strain = pd.DataFrame( + np.zeros((len(spectra), len(strains))), + index=[spectrum.spectrum_id for spectrum in spectra], + columns=[strain.id for strain in strains]) + for spectrum in spectra: + for strain in strains: + if spectrum.has_strain(strain): + df_spec_strain.loc[spectrum.spectrum_id, strain.id] = 1 + self.spec_strain_occurrence = df_spec_strain + + def _get_mf_strain_occurrence(self, mfs: Sequence[MolecularFamily], + strains: StrainCollection) -> None: + """Get the occurence of strains in molecular families. + + The occurence is a DataFrame with molecular families as rows and + strains as columns, where index is `mf.family_id` and column name is + `strain.id`. The values are 1 if the molecular family contains the + strain and 0 otherwise. + + Note that SingletonFamily objects are excluded from given `mfs`. """ + # remove SingletonFamily objects + mfs = [mf for mf in mfs if not isinstance(mf, SingletonFamily)] + + df_mf_strain = pd.DataFrame(np.zeros((len(mfs), len(strains))), + index=[mf.family_id for mf in mfs], + columns=[strain.id for strain in strains]) + for mf in mfs: + for strain in strains: + if mf.has_strain(strain): + df_mf_strain.loc[mf.family_id, strain.id] = 1 + self.mf_strain_occurrence = df_mf_strain + + def get_common_strains( + self, + spectra_or_molfams: Sequence[Spectrum] | Sequence[MolecularFamily], + gcfs: Sequence[GCF], + filter_no_shared: bool = False + ) -> dict[tuple[Spectrum | MolecularFamily, GCF], list[str]]: + """Get common strains between given spectra/molecular families and GCFs. + + Note that SingletonFamily objects are excluded from given `spectra_or_molfams`. + + Args: + spectra_or_molfams(Sequence[Spectrum] | Sequence[MolecularFamily]): + A list of Spectrum or MolecularFamily objects. + gcfs(Sequence[GCF]): A list of GCF objects. 
+ filter_no_shared(bool): If True, return only the pair of + spectrum/molecular family and GCF that have common strains; + otherwise, return all pairs no matter they have common strains + or not. + + Returns: + dict: A dict where the keys are tuples of (Spectrum/MolecularFamily, GCF) + and values are a list of strain ids that appear in both objects. + """ + if not isinstance(spectra_or_molfams[0], (Spectrum, MolecularFamily)): + raise ValueError( + 'Must provide Spectra or MolecularFamilies as the first argument!' + ) + if not isinstance(gcfs[0], GCF): + raise ValueError('Must provide GCFs as the second argument!') - # TODO make this work for BGCs too? - - is_list_a = isinstance(metabolome_objects, list) - is_list_b = isinstance(gcfs, list) - - type_a = type(metabolome_objects[0]) if is_list_a else type(metabolome_objects) - type_b = type(gcfs[0]) if is_list_b else type(gcfs) - - if type_a == type_b: - raise Exception('Must supply objects with different types!') - - # to keep things slightly simpler, ensure the GCFs are always "b" - if type_a == GCF: - type_a, type_b = type_b, type_a - is_list_a, is_list_b = is_list_b, is_list_a - metabolome_objects, gcfs = gcfs, metabolome_objects - - if not is_list_a: - metabolome_objects = [metabolome_objects] - if not is_list_b: - gcfs = [gcfs] - - # retrieve object IDs - # TODO: issue #103 stop using gcf.id, but note that the ids_b should be - # a list of int - gcfs_id_list = [gcf.id for gcf in gcfs] - # these might be MolFams or Spectra, either way they'll have a .id attribute - ids_a = [obj.id for obj in metabolome_objects] - - data_a = self.M_spec_strain if type_a == Spectrum else self.M_fam_strain - + # Assume that 3 occurrence dataframes have same df.columns (strain ids) + strain_ids = self.gcf_strain_occurrence.columns results = {} - for i, obj_meta in enumerate(metabolome_objects): - for j, obj_gcf in enumerate(gcfs): - # just AND both arrays and extract the indices with positive results - - # self.M_gcf_strain is a 2D 
np.array [index of gcf_list, index of strain_list] - # TODO: bug here. self.M_gcf_strain use the enumerate count of the gcf_list as index - # it's wrong to assume that enumerate count is same as the gcf.id. - # self.M_gcf_strain should use dataframe - # TODO: Bug: the result is 1D array of the enumerated count of strain_list - # for the common strains. Same here, the enumerate count is not - # the same as the strain.id - result = np.where( - np.logical_and(data_a[ids_a[i]], self.M_gcf_strain[gcfs_id_list[j]]))[0] - # if we want to exclude results with no shared strains - if (filter_no_shared - and len(result) > 0) or not filter_no_shared: - results[(obj_meta, obj_gcf)] = result - + for obj in spectra_or_molfams: + if isinstance(obj, SingletonFamily): + continue + for gcf in gcfs: + if isinstance(obj, Spectrum): + shared_strains = strain_ids[np.logical_and( + self.spec_strain_occurrence.loc[obj.spectrum_id], + self.gcf_strain_occurrence.loc[gcf.gcf_id])] + else: + shared_strains = strain_ids[np.logical_and( + self.mf_strain_occurrence.loc[obj.family_id], + self.gcf_strain_occurrence.loc[gcf.gcf_id])] + if filter_no_shared and len(shared_strains) == 0: + continue + results[(obj, gcf)] = shared_strains.to_list() return results def correlation_matrices(self, type='spec-gcf'): @@ -384,9 +314,9 @@ def correlation_matrices(self, type='spec-gcf'): # Make selection for scenario spec<->gcf or fam<->gcf if type == 'spec-gcf': - M_type1_strain = self.M_spec_strain + M_type1_strain = self.spec_strain_occurrence elif type == 'fam-gcf': - M_type1_strain = self.M_fam_strain + M_type1_strain = self.mf_strain_occurrence elif type == 'spec-bgc' or type == 'fam-bgc': raise Exception("Given types are not yet supported... ") else: @@ -394,12 +324,11 @@ def correlation_matrices(self, type='spec-gcf'): "Wrong correlation 'type' given. Must be one of 'spec-gcf', 'fam-gcf', ..." 
) - logger.debug( - f"Calculating correlation matrices of type: {type}") + logger.debug(f"Calculating correlation matrices of type: {type}") # Calculate correlation matrix from co-occurence matrices M_type1_gcf, M_type1_notgcf, M_nottype1_gcf, M_nottype1_notgcf = calc_correlation_matrix( - M_type1_strain, self.M_gcf_strain) + M_type1_strain, self.gcf_strain_occurrence) # return results: if type == 'spec-gcf': @@ -414,6 +343,3 @@ def correlation_matrices(self, type='spec-gcf'): self.M_notfam_notgcf = M_nottype1_notgcf else: raise Exception("No correct correlation matrix was created.") - - # class data_links OUTPUT functions - # TODO add output functions (e.g. to search for mappings of individual specs, gcfs etc.) From 2cd8e01b98b7fe83fd819a8933afabeb2e0d4d31 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Mon, 1 May 2023 16:37:02 +0200 Subject: [PATCH 32/95] update references of the new dataframes from DataLinks --- src/nplinker/process_output.py | 2 +- src/nplinker/scoring/linking/link_finder.py | 24 +++++++++---------- .../scoring/linking/link_likelihood.py | 6 ++--- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/nplinker/process_output.py b/src/nplinker/process_output.py index 42420867..9892a583 100644 --- a/src/nplinker/process_output.py +++ b/src/nplinker/process_output.py @@ -51,7 +51,7 @@ def get_sig_spec(data_link, sig_links, scores, gcf_pos, min_n_strains=2): # Check if there are *any* strains in the GCF #Ā No strains = MiBIG # Can also filter if only (e.g. 
2 strains) - strain_sum = data_link.M_gcf_strain[gcf_pos, :].sum() + strain_sum = data_link.gcf_strain_occurrence[gcf_pos, :].sum() if strain_sum < min_n_strains: return [] col = sig_links[:, gcf_pos] #Ā get the column diff --git a/src/nplinker/scoring/linking/link_finder.py b/src/nplinker/scoring/linking/link_finder.py index cf3563a5..71aaa2e6 100644 --- a/src/nplinker/scoring/linking/link_finder.py +++ b/src/nplinker/scoring/linking/link_finder.py @@ -18,8 +18,8 @@ # import packages for plotting # TODO move plotting to separate module? try: - import seaborn as sns from matplotlib import pyplot as plt + import seaborn as sns except ImportError: print( 'Warning: plotting functionality will not be available (missing matplotlib and/or seaborn)' @@ -104,7 +104,7 @@ def metcalf_scoring(self, # Compute the expected values for all possible values of spec and gcf strains # we need the total number of strains - _, n_strains = data_links.M_gcf_strain.shape + _, n_strains = data_links.gcf_strain_occurrence.shape if self.metcalf_expected is None: sz = (n_strains + 1, n_strains + 1) self.metcalf_expected = np.zeros(sz) @@ -177,7 +177,7 @@ def hg_scoring(self, data_links, type='spec-gcf'): if type == 'spec-gcf': num_strains = np.ones( - data_links.M_spec_gcf.shape) * data_links.M_gcf_strain.shape[1] + data_links.M_spec_gcf.shape) * data_links.gcf_strain_occurrence.shape[1] overlap_counts = data_links.M_spec_gcf gcf_counts = overlap_counts + data_links.M_notspec_gcf spec_counts = overlap_counts + data_links.M_spec_notgcf @@ -189,7 +189,7 @@ def hg_scoring(self, data_links, type='spec-gcf'): self.hg_spec_gcf = hg_scores elif type == 'fam-gcf': num_strains = np.ones( - data_links.M_fam_gcf.shape) * data_links.M_gcf_strain.shape[1] + data_links.M_fam_gcf.shape) * data_links.gcf_strain_occurrence.shape[1] overlap_counts = data_links.M_fam_gcf gcf_counts = overlap_counts + data_links.M_notfam_gcf fam_counts = overlap_counts + data_links.M_fam_notgcf @@ -350,7 +350,7 @@ def 
select_link_candidates(self, P_str = np.array(data_links.mapping_strain["no of spectra"]) P_str = P_str / np.sum(P_str) - num_strains = data_links.M_gcf_strain.shape[1] + num_strains = data_links.gcf_strain_occurrence.shape[1] # Calculate the hypergeometric probability (as before) for i in range(link_candidates.shape[1]): @@ -361,11 +361,11 @@ def select_link_candidates(self, # Calculate the GCF specific probability for i in range(link_candidates.shape[1]): - id_spec = int(link_candidates[0, i]) - id_gcf = int(link_candidates[1, i]) + id_spec = link_candidates[0, i] + id_gcf = link_candidates[1, i] # find set of strains which contain GCF with id link_candidates[1,i] - XG = np.where(data_links.M_gcf_strain[id_gcf, :] == 1)[0] + XG = np.where(data_links.gcf_strain_occurrence.loc[id_gcf, :] == 1)[0] link_candidates[10, i] = pair_prob_approx(P_str, XG, @@ -374,11 +374,11 @@ def select_link_candidates(self, # Calculate the link specific probability # Find strains where GCF and spectra/family co-occur if type == 'spec-gcf': - XGS = np.where((data_links.M_gcf_strain[id_gcf, :] == 1) & - (data_links.M_spec_strain[id_spec, :] == 1))[0] + XGS = np.where((data_links.gcf_strain_occurrence[id_gcf, :] == 1) & + (data_links.spec_strain_occurrence[id_spec, :] == 1))[0] elif type == 'fam-gcf': - XGS = np.where((data_links.M_gcf_strain[id_gcf, :] == 1) - & (data_links.M_fam_strain[id_spec, :] == 1))[0] + XGS = np.where((data_links.gcf_strain_occurrence[id_gcf, :] == 1) + & (data_links.mf_strain_occurrence[id_spec, :] == 1))[0] link_candidates[11, i] = link_prob(P_str, XGS, int(Nx_list[id_gcf]), int(Ny_list[id_spec]), num_strains) diff --git a/src/nplinker/scoring/linking/link_likelihood.py b/src/nplinker/scoring/linking/link_likelihood.py index fa68f952..90446a18 100644 --- a/src/nplinker/scoring/linking/link_likelihood.py +++ b/src/nplinker/scoring/linking/link_likelihood.py @@ -49,12 +49,12 @@ def calculate_likelihoods(self, data_links, type='spec-gcf'): M_type1_type2 = 
data_links.M_spec_gcf M_type1_nottype2 = data_links.M_spec_notgcf M_nottype1_type2 = data_links.M_notspec_gcf - M_type1_cond = data_links.M_spec_strain + M_type1_cond = data_links.spec_strain_occurrence elif type == 'fam-gcf': M_type1_type2 = data_links.M_fam_gcf M_type1_nottype2 = data_links.M_fam_notgcf M_nottype1_type2 = data_links.M_notfam_gcf - M_type1_cond = data_links.M_fam_strain + M_type1_cond = data_links.mf_strain_occurrence elif type == 'spec-bgc' or type == 'fam-bgc': raise Exception("Given types are not yet supported... ") else: @@ -67,7 +67,7 @@ def calculate_likelihoods(self, data_links, type='spec-gcf'): # Calculate likelihood matrices using calc_likelihood_matrix() P_type2_given_type1, P_type2_not_type1, P_type1_given_type2, \ P_type1_not_type2 = calc_likelihood_matrix(M_type1_cond, - data_links.M_gcf_strain, + data_links.gcf_strain_occurrence, M_type1_type2, M_type1_nottype2, M_nottype1_type2) From 2834345beb34aa3fd4694c651a9cbcdca66d557f Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Mon, 1 May 2023 16:38:20 +0200 Subject: [PATCH 33/95] update logics of `get_links` in NPLinker class --- src/nplinker/nplinker.py | 75 ++++++---------------------------------- 1 file changed, 10 insertions(+), 65 deletions(-) diff --git a/src/nplinker/nplinker.py b/src/nplinker/nplinker.py index 8c78f47f..cc80872b 100644 --- a/src/nplinker/nplinker.py +++ b/src/nplinker/nplinker.py @@ -28,6 +28,7 @@ from .scoring.metcalf_scoring import MetcalfScoring from .scoring.np_class_scoring import NPClassScoring from .scoring.rosetta_scoring import RosettaScoring +from .strain_collection import StrainCollection logger = LogConfig.getLogger(__name__) @@ -289,7 +290,7 @@ def load_data(self, new_bigscape_cutoff=None, met_only=False): self._bgcs = self._loader.bgcs self._gcfs = self._loader.gcfs self._mibig_bgc_dict = self._loader.mibig_bgc_dict - self._strains = self._loader.strains + self._strains: StrainCollection = self._loader.strains self._product_types = 
self._loader.product_types self._chem_classes = self._loader.chem_classes self._class_matches = self._loader.class_matches @@ -433,24 +434,20 @@ def get_links(self, input_objects, scoring_methods, and_mode=True): targets = list( filter(lambda x: not isinstance(x, BGC), link_data.keys())) if len(targets) > 0: - shared_strains = self._datalinks.common_strains([source], - targets, True) - for objpair in shared_strains.keys(): - shared_strains[objpair] = [ - self._strains.lookup_index(x) - for x in shared_strains[objpair] - ] - if isinstance(source, GCF): + shared_strains = self._datalinks.common_strains(targets, [source], True) for target, link in link_data.items(): if (target, source) in shared_strains: - link.shared_strains = shared_strains[(target, - source)] + link.shared_strains = [ + self._strains.lookup(strain_id) for strain_id + in shared_strains[(target, source)]] else: + shared_strains = self._datalinks.common_strains([source], targets, True) for target, link in link_data.items(): if (source, target) in shared_strains: - link.shared_strains = shared_strains[(source, - target)] + link.shared_strains = [ + self._strains.lookup(strain_id) for strain_id + in shared_strains[(source, target)]] logger.debug('Finished calculating shared strain information') @@ -601,55 +598,3 @@ def scoring_method(self, name): self._scoring_methods_setup_complete[name] = True return self._scoring_methods.get(name, None)(self) - - -if __name__ == "__main__": - # can set default logging configuration this way... 
- LogConfig.setLogLevel(logging.DEBUG) - - # initialise NPLinker from the command-line arguments - npl = NPLinker(Args().get_args()) - - # load the dataset - if not npl.load_data(): - print('Failed to load the dataset!') - sys.exit(-1) - - # create a metcalf scoring object - mc = npl.scoring_method('metcalf') - if mc is not None: - # set a scoring cutoff threshold - mc.cutoff = 0.5 - - # pick some GCFs to get links for - test_gcfs = npl.gcfs[:10] - - # tell nplinker to find links for this set of GCFs using metcalf - # scoring - results = npl.get_links(test_gcfs, mc) - - # check if any links were found - if len(results) == 0: - print('No links found!') - sys.exit(0) - - # the "result" object will be a LinkCollection, holding all the information - # returned by the scoring method(s) used - print(f'{len(results)} total links found') - - # display some information about each object and its links - for obj, result in results.links.items(): - print('Results for object: {}, {} total links, {} methods used'. 
- format(obj, len(result), results.method_count)) - - # get links for this object, sorted by metcalf score - sorted_links = results.get_sorted_links(mc, obj) - for link_data in sorted_links: - print(' --> [{}] {} | {} | # shared_strains = {}'.format( - ','.join(method.name for method in link_data.methods), - link_data.target, link_data[mc], - len(link_data.shared_strains))) - - rs = npl.scoring_method('rosetta') - if rs is not None: - print('OK') From ee5ff03f07c31a477555a75d6aef863c8796c3c0 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Mon, 1 May 2023 16:38:35 +0200 Subject: [PATCH 34/95] Update test_nplinker.py - add code to remove cached results --- tests/test_nplinker.py | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/tests/test_nplinker.py b/tests/test_nplinker.py index 3b9171af..4399d24c 100644 --- a/tests/test_nplinker.py +++ b/tests/test_nplinker.py @@ -1,33 +1,39 @@ +import os +from pathlib import Path import pytest from nplinker.nplinker import NPLinker -import os - from . 
import DATA_DIR + @pytest.fixture(scope='module') -def instance() -> NPLinker: +def npl() -> NPLinker: npl = NPLinker(str(DATA_DIR / 'nplinker_demo1.toml')) npl.load_data() + # remove cached results before running tests + root_dir = Path(npl.root_dir) + score_cache = root_dir / 'metcalf' / 'metcalf_scores.pckl' + score_cache.unlink(missing_ok=True) return npl -@pytest.mark.skipif(os.environ.get('CI') == 'true', reason="Skip when running on CI") -def test_load_data(instance: NPLinker): - assert len(instance.bgcs) == 390 - assert len(instance.gcfs) == 113 - assert len(instance.spectra) == 25935 - assert len(instance.molfams) == 25769 +@pytest.mark.skipif(os.environ.get('CI') == 'true', + reason="Skip when running on CI") +def test_load_data(npl: NPLinker): + assert len(npl.bgcs) == 390 + assert len(npl.gcfs) == 113 + assert len(npl.spectra) == 25935 + assert len(npl.molfams) == 25769 -@pytest.mark.skipif(os.environ.get('CI') == 'true' , reason="Skip when running on CI") -def test_get_links(instance: NPLinker): - mc = instance.scoring_method('metcalf') +@pytest.mark.skipif(os.environ.get('CI') == 'true', + reason="Skip when running on CI") +def test_get_links(npl: NPLinker): + mc = npl.scoring_method('metcalf') mc.cutoff = 3.5 mc.standardised = True - actual = instance.get_links(instance.gcfs, mc, and_mode=True) + actual = npl.get_links(npl.gcfs, mc, and_mode=True) assert len(actual) == len(actual.sources) == len(actual.links) == 101 - + actual.filter_links(lambda link: link[mc] > 5.0) assert len(actual.links) == 60 - From ef9441863ad575cf68c79d880bf93651e6817a4e Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Mon, 1 May 2023 17:44:40 +0200 Subject: [PATCH 35/95] move SCORING_METHODS to LinkFinder --- src/nplinker/scoring/linking/link_finder.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/nplinker/scoring/linking/link_finder.py b/src/nplinker/scoring/linking/link_finder.py index 71aaa2e6..b14a6cfd 100644 --- 
a/src/nplinker/scoring/linking/link_finder.py +++ b/src/nplinker/scoring/linking/link_finder.py @@ -4,7 +4,6 @@ from scipy.stats import hypergeom from nplinker.genomics.gcf import GCF from nplinker.metabolomics.spectrum import Spectrum -from nplinker.scoring.linking.data_linking import SCORING_METHODS from .data_linking_functions import pair_prob_approx from .data_linking_functions import pair_prob_hg @@ -12,9 +11,6 @@ # CG: TODO get_links function does not work any more, need to update its logics -# CG: TODO get_links function does not work any more, need to update its logics - - # import packages for plotting # TODO move plotting to separate module? try: @@ -31,6 +27,7 @@ logger = LogConfig.getLogger(__file__) +SCORING_METHODS = ['metcalf', 'likescore', 'hg'] class LinkFinder(): """ From 1edabc5f332cbc1668a4f8509c24060f75c031a9 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Mon, 1 May 2023 17:45:52 +0200 Subject: [PATCH 36/95] update method name to `get_common_strains` --- src/nplinker/nplinker.py | 6 +++--- src/nplinker/scoring/np_class_scoring.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/nplinker/nplinker.py b/src/nplinker/nplinker.py index cc80872b..e47c35fb 100644 --- a/src/nplinker/nplinker.py +++ b/src/nplinker/nplinker.py @@ -435,14 +435,14 @@ def get_links(self, input_objects, scoring_methods, and_mode=True): filter(lambda x: not isinstance(x, BGC), link_data.keys())) if len(targets) > 0: if isinstance(source, GCF): - shared_strains = self._datalinks.common_strains(targets, [source], True) + shared_strains = self._datalinks.get_common_strains(targets, [source], True) for target, link in link_data.items(): if (target, source) in shared_strains: link.shared_strains = [ self._strains.lookup(strain_id) for strain_id in shared_strains[(target, source)]] else: - shared_strains = self._datalinks.common_strains([source], targets, True) + shared_strains = self._datalinks.get_common_strains([source], targets, True) for target, 
link in link_data.items(): if (source, target) in shared_strains: link.shared_strains = [ @@ -491,7 +491,7 @@ def get_common_strains(self, objects_a, objects_b, filter_no_shared=True): # this is a dict with structure: # (Spectrum/MolecularFamily, GCF) => list of strain indices - common_strains_index_dict = self._datalinks.common_strains( + common_strains_index_dict = self._datalinks.get_common_strains( objects_a, objects_b, filter_no_shared) common_strains = {} diff --git a/src/nplinker/scoring/np_class_scoring.py b/src/nplinker/scoring/np_class_scoring.py index 304651dc..b63b7b0a 100644 --- a/src/nplinker/scoring/np_class_scoring.py +++ b/src/nplinker/scoring/np_class_scoring.py @@ -339,7 +339,7 @@ def get_links(self, objects, link_collection): MetcalfScoring.NAME).datalinks # this is a dict with structure: # tup(Spectrum/MolecularFamily, GCF) => array of strain indices - common_strains = self.npl._datalinks.common_strains( + common_strains = self.npl._datalinks.get_common_strains( objects, targets, True) logger.info(f"Calculating NPClassScore for {len(objects)} objects to " f"{len(targets)} targets ({len(common_strains)} pairwise " From d7ad2c05751d3fda2f9e190873cb99fc831ba794 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 2 May 2023 16:39:12 +0200 Subject: [PATCH 37/95] refactor mapping dataframes in DataLinks --- src/nplinker/scoring/linking/data_linking.py | 117 ++----------------- 1 file changed, 9 insertions(+), 108 deletions(-) diff --git a/src/nplinker/scoring/linking/data_linking.py b/src/nplinker/scoring/linking/data_linking.py index 651390ee..e3c566ef 100644 --- a/src/nplinker/scoring/linking/data_linking.py +++ b/src/nplinker/scoring/linking/data_linking.py @@ -43,10 +43,11 @@ def __init__(self): self.spec_strain_occurrence = pd.DataFrame() self.mf_strain_occurrence = pd.DataFrame() - # mappings (lookup lists to map between different ids and categories + # mapping tables, check `_get_mappings_from_occurance` for details + # TODO: these 
mappings could be removed when refactoring LinkFinder self.mapping_spec = pd.DataFrame() self.mapping_gcf = pd.DataFrame() - self.mapping_fam = pd.DataFrame() # labels for strain-family matrix + self.mapping_fam = pd.DataFrame() self.mapping_strain = pd.DataFrame() # correlation matrices for spectra <-> GCFs @@ -64,28 +65,27 @@ def __init__(self): def load_data(self, spectra: Sequence[Spectrum], gcfs: Sequence[GCF], strains: StrainCollection, molfams: Sequence[MolecularFamily]): - # load data from spectra, GCFs, and strains - logger.debug("Create mappings between spectra, gcfs, and strains.") - self.collect_mappings_spec(spectra) - self.collect_mappings_gcf(gcfs) logger.debug( "Create co-occurence matrices: spectra<->strains, gcfs<->strains and mfs<->strains." ) self._get_gcf_strain_occurrence(gcfs, strains) self._get_spec_strain_occurrence(spectra, strains) self._get_mf_strain_occurrence(molfams, strains) - self._get_mappings_from_occurrence() def _get_mappings_from_occurrence(self): + # pd.Series with index = gcf.gcf_id and value = number of strains where gcf occurs self.mapping_gcf["no of strains"] = np.sum(self.gcf_strain_occurrence, axis=1) + # pd.Series with index = spectrum.spectrum_id and value = number of strains where spec occurs self.mapping_spec["no of strains"] = np.sum( self.spec_strain_occurrence, axis=1) - self.mapping_strain["no of spectra"] = np.sum( - self.spec_strain_occurrence, axis=0) + # pd.Series with index = mf.family_id and value = number of strains where mf occurs self.mapping_fam["no of strains"] = np.sum(self.mf_strain_occurrence, axis=1) + # pd.Series with index = strain.id and value = number of spectra in strain + self.mapping_strain["no of spectra"] = np.sum( + self.spec_strain_occurrence, axis=0) def find_correlations(self): # collect correlations/ co-occurences @@ -94,105 +94,6 @@ def find_correlations(self): logger.debug("Create correlation matrices: mol-families<->gcfs.") self.correlation_matrices(type='fam-gcf') - def 
collect_mappings_spec(self, obj: Sequence[Spectrum] - | Sequence[MolecularFamily]): - if isinstance(obj[0], Spectrum): - mapping_spec = self._collect_mappings_from_spectra(obj) - elif isinstance(obj[0], MolecularFamily): - mapping_spec = self._collect_mappings_from_molecular_families(obj) - - # extend mapping tables: - # TODO: why do we need the mappings??? - # "spec-id" is defined as the index of the spectrum in the input data - self.mapping_spec["fam-id"] = mapping_spec[:, 2] - - def _collect_mappings_from_spectra(self, - spectra) -> np.ndarray[np.float64]: - # Collect most import mapping tables from input data - mapping_spec = np.zeros((len(spectra), 3)) - mapping_spec[:, 0] = np.arange(0, len(spectra)) - - for i, spectrum in enumerate(spectra): - mapping_spec[i, 1] = spectrum.id - mapping_spec[i, 2] = spectrum.family.family_id - - return mapping_spec - - def _collect_mappings_from_molecular_families( - self, - molfams: Sequence[MolecularFamily]) -> np.ndarray[np.float64]: - num_spectra = sum(len(x.spectra_ids) for x in molfams) - mapping_spec = np.zeros((num_spectra, 3)) - mapping_spec[:, 0] = np.arange(0, num_spectra) - - inverted_mappings = {} - for item in molfams: - for spectrum_id in item.spectra_ids: - inverted_mappings[spectrum_id] = item.family_id - - for i, key in enumerate(inverted_mappings): - mapping_spec[i, 1] = key - mapping_spec[i, 2] = inverted_mappings[key] - - return mapping_spec - - def collect_mappings_gcf(self, gcf_list): - """ - Find classes of gene cluster (nrps, pksi etc.) 
- collect most likely class (most occuring name, preferentially not "Others") - additional score shows fraction of chosen class among all given ones - """ - - # TODO: not only collect bigclass types but also product predictions - bigscape_bestguess = [] - for i, gcf in enumerate(gcf_list): - #bigscape_class = [] - #for i, bgc in enumerate(gcf_list[i].bgcs): - # # bigscape_class.append(gcf_list[i].bgc_list[m].bigscape_class) - # bigscape_class.append(bgc.bigscape_class) - # class_counter = Counter(bigscape_class) - - # TODO: this might need properly rewritten due to changes in the way GCF/BGC - # objects store bigscape class information and handle hybrid BGCs (see genomics.py). the - # original version i've left above iterates over every BGC in the current GCF - # and extracts its .bigscape_class attribute, but now each BGC can have multiple - # classes if it happens to be a hybrid and i'm not sure the version below - # still makes sense. - # - # doesn't seem to be very important for calculating the metcalf scores though so not urgent for now. 
- bigscape_class = [] - for bgc in gcf.bgcs: - bigscape_class.extend(bgc.bigscape_classes) - class_counter = Counter(bigscape_class) - - # try not to select "Others": - if class_counter.most_common(1)[0][0] is None: - bigscape_bestguess.append(["Others", 0]) - elif class_counter.most_common( - 1)[0][0] == "Others" and class_counter.most_common( - 1)[0][1] < len(bigscape_class): - if class_counter.most_common(2)[1][0] is None: - bigscape_bestguess.append([ - class_counter.most_common(1)[0][0], - class_counter.most_common(1)[0][1] / - len(bigscape_class) - ]) - else: - bigscape_bestguess.append([ - class_counter.most_common(2)[1][0], - class_counter.most_common(2)[1][1] / - len(bigscape_class) - ]) - else: - bigscape_bestguess.append([ - class_counter.most_common(1)[0][0], - class_counter.most_common(1)[0][1] / len(bigscape_class) - ]) - - # extend mapping tables: - bigscape_guess, bigscape_guessscore = zip(*bigscape_bestguess) - self.mapping_gcf["bgc class"] = bigscape_guess - def _get_gcf_strain_occurrence(self, gcfs: Sequence[GCF], strains: StrainCollection) -> None: """Get the occurence of strains in gcfs. 
From 4f8d8115cd4e7da8edc8c29ce35e58681e0a92ce Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 2 May 2023 16:39:42 +0200 Subject: [PATCH 38/95] add TODOs and deprecation to LinkFinder --- src/nplinker/scoring/linking/link_finder.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/nplinker/scoring/linking/link_finder.py b/src/nplinker/scoring/linking/link_finder.py index b14a6cfd..0e61be88 100644 --- a/src/nplinker/scoring/linking/link_finder.py +++ b/src/nplinker/scoring/linking/link_finder.py @@ -1,4 +1,5 @@ +from deprecated import deprecated import numpy as np import pandas as pd from scipy.stats import hypergeom @@ -387,6 +388,7 @@ def select_link_candidates(self, # add other potentially relevant knowdledge # If this will grow to more collected information -> create separate function/class bgc_class = [] + # TODO CG: bgc class should be obtained from GCF object for i in link_candidates_pd["GCF id"].astype(int): bgc_class.append(data_links.mapping_gcf["bgc class"][i]) link_candidates_pd["BGC class"] = bgc_class @@ -574,6 +576,7 @@ def get_links(self, return links + @deprecated(version="1.3.3", reason="The unworkable method will be removed") def create_cytoscape_files(self, data_links, network_filename, From ab7268a96e393593506d9545f27529b68ddc37f0 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 2 May 2023 17:38:30 +0200 Subject: [PATCH 39/95] refactor cooccurrence in DataLinks --- src/nplinker/scoring/linking/data_linking.py | 81 +++++++------------ src/nplinker/scoring/linking/link_finder.py | 52 ++++++------ .../scoring/linking/link_likelihood.py | 12 +-- 3 files changed, 62 insertions(+), 83 deletions(-) diff --git a/src/nplinker/scoring/linking/data_linking.py b/src/nplinker/scoring/linking/data_linking.py index e3c566ef..b188d7b8 100644 --- a/src/nplinker/scoring/linking/data_linking.py +++ b/src/nplinker/scoring/linking/data_linking.py @@ -37,7 +37,7 @@ class DataLinks(): """ def __init__(self): - # DataFrame to store occurence of 
objects with respect to strains + # DataFrame to store occurrence of objects with respect to strains # values = 1 where gcf/spec/fam occur in strain, 0 otherwise self.gcf_strain_occurrence = pd.DataFrame() self.spec_strain_occurrence = pd.DataFrame() @@ -50,17 +50,6 @@ def __init__(self): self.mapping_fam = pd.DataFrame() self.mapping_strain = pd.DataFrame() - # correlation matrices for spectra <-> GCFs - self.M_spec_gcf = [ - ] # = int: Number of strains where spec_x and gcf_y co_occure - self.M_spec_notgcf = [ - ] # = int: Number of strains where spec_x and NOT-gcf_y co_occure - self.M_notspec_gcf = [ - ] # = int: Number of strains where NOT-spec_x and gcf_y co_occure - # and the same for mol.families <-> GCFs - self.M_fam_gcf = [] - self.M_fam_notgcf = [] - self.M_notfam_gcf = [] def load_data(self, spectra: Sequence[Spectrum], gcfs: Sequence[GCF], strains: StrainCollection, @@ -90,9 +79,9 @@ def _get_mappings_from_occurrence(self): def find_correlations(self): # collect correlations/ co-occurences logger.debug("Create correlation matrices: spectra<->gcfs.") - self.correlation_matrices(type='spec-gcf') + self._get_cooccurrence(link_type='spec-gcf') logger.debug("Create correlation matrices: mol-families<->gcfs.") - self.correlation_matrices(type='fam-gcf') + self._get_cooccurrence(link_type='fam-gcf') def _get_gcf_strain_occurrence(self, gcfs: Sequence[GCF], strains: StrainCollection) -> None: @@ -202,45 +191,35 @@ def get_common_strains( results[(obj, gcf)] = shared_strains.to_list() return results - def correlation_matrices(self, type='spec-gcf'): - """ - Collect co-occurrences accros strains: - IF type='spec-gcf': - number of co-occurences of spectra and GCFS - --> Output: M_spec_gcf matrix - IF type='fam-gcf': - number of co-occurences of mol.families and GCFS - --> Output: M_fam_gcf matrix - """ + def _get_cooccurrence(self, link_type: str = 'spec-gcf'): + """Calculate co-occurrence for given link types across strains. 
- # Make selection for scenario spec<->gcf or fam<->gcf - if type == 'spec-gcf': - M_type1_strain = self.spec_strain_occurrence - elif type == 'fam-gcf': - M_type1_strain = self.mf_strain_occurrence - elif type == 'spec-bgc' or type == 'fam-bgc': - raise Exception("Given types are not yet supported... ") + Args: + link_type(str): Type of link to calculate co-occurrence for, + either 'spec-gcf' or 'fam-gcf'. + """ + if link_type == 'spec-gcf': + met_strain_occurrence = self.spec_strain_occurrence + elif link_type == 'fam-gcf': + met_strain_occurrence = self.mf_strain_occurrence else: - raise Exception( - "Wrong correlation 'type' given. Must be one of 'spec-gcf', 'fam-gcf', ..." + raise ValueError( + f"Link type {link_type} is not supported. Use 'spec-gcf' or 'fam-gcf' instead." ) - logger.debug(f"Calculating correlation matrices of type: {type}") - - # Calculate correlation matrix from co-occurence matrices - M_type1_gcf, M_type1_notgcf, M_nottype1_gcf, M_nottype1_notgcf = calc_correlation_matrix( - M_type1_strain, self.gcf_strain_occurrence) - - # return results: - if type == 'spec-gcf': - self.M_spec_gcf = M_type1_gcf - self.M_spec_notgcf = M_type1_notgcf - self.M_notspec_gcf = M_nottype1_gcf - self.M_notspec_notgcf = M_nottype1_notgcf - elif type == 'fam-gcf': - self.M_fam_gcf = M_type1_gcf - self.M_fam_notgcf = M_type1_notgcf - self.M_notfam_gcf = M_nottype1_gcf - self.M_notfam_notgcf = M_nottype1_notgcf + logger.debug(f"Calculating correlation matrices of type: {link_type}") + + met_gcf_corr, met_notgcf_corr, notmet_gcf_corr, notmet_notgcf_corr = calc_correlation_matrix( + met_strain_occurrence, self.gcf_strain_occurrence) + + if link_type == 'spec-gcf': + # co-occurrence of spectrum and GCF across strains + self.cooccurrence_spec_gcf = met_gcf_corr + self.cooccurrence_spec_notgcf = met_notgcf_corr + self.cooccurrence_notspec_gcf = notmet_gcf_corr + self.cooccurrence_notspec_notgcf = notmet_notgcf_corr else: - raise Exception("No correct correlation matrix was 
created.") + self.cooccurrence_fam_gcf = met_gcf_corr + self.cooccurrence_fam_notgcf = met_notgcf_corr + self.cooccurrence_notfam_gcf = notmet_gcf_corr + self.cooccurrence_notfam_notgcf = notmet_notgcf_corr diff --git a/src/nplinker/scoring/linking/link_finder.py b/src/nplinker/scoring/linking/link_finder.py index 0e61be88..fb198457 100644 --- a/src/nplinker/scoring/linking/link_finder.py +++ b/src/nplinker/scoring/linking/link_finder.py @@ -141,19 +141,19 @@ def metcalf_scoring(self, # at expected_metcalf[3,6] and sqrt of the variance in the same position if type == 'spec-gcf': - metcalf_scores = np.zeros(data_links.M_spec_gcf.shape) - metcalf_scores = (data_links.M_spec_gcf * both + - data_links.M_spec_notgcf * type1_not_gcf + - data_links.M_notspec_gcf * gcf_not_type1 + - data_links.M_notspec_notgcf * not_type1_not_gcf) + metcalf_scores = np.zeros(data_links.cooccurrence_spec_gcf.shape) + metcalf_scores = (data_links.cooccurrence_spec_gcf * both + + data_links.cooccurrence_spec_notgcf * type1_not_gcf + + data_links.cooccurrence_notspec_gcf * gcf_not_type1 + + data_links.cooccurrence_notspec_notgcf * not_type1_not_gcf) self.metcalf_spec_gcf = metcalf_scores elif type == 'fam-gcf': - metcalf_scores = np.zeros(data_links.M_fam_gcf.shape) - metcalf_scores = (data_links.M_fam_gcf * both + - data_links.M_fam_notgcf * type1_not_gcf + - data_links.M_notfam_gcf * gcf_not_type1 + - data_links.M_notfam_notgcf * not_type1_not_gcf) + metcalf_scores = np.zeros(data_links.cooccurrence_fam_gcf.shape) + metcalf_scores = (data_links.cooccurrence_fam_gcf * both + + data_links.cooccurrence_fam_notgcf * type1_not_gcf + + data_links.cooccurrence_notfam_gcf * gcf_not_type1 + + data_links.cooccurrence_notfam_notgcf * not_type1_not_gcf) self.metcalf_fam_gcf = metcalf_scores return metcalf_scores @@ -168,17 +168,17 @@ def hg_scoring(self, data_links, type='spec-gcf'): # Instead of "number of strains only in GCF", it requires "number of strains in the # GCF PLUS the number shared between 
the GCF and the other object". # e.g. if a spectrum has 3 strains, a GCF has 1 strain and there is 1 shared strain, - # M_spec_gcf will correctly contain "1", but M_type1_notgcf will contain "2" instead + # cooccurrence_spec_gcf will correctly contain "1", but M_type1_notgcf will contain "2" instead # of "3", because the spectrum only has 2 distinct strains vs the GCF. - # To fix this the M_spec_gcf/M_fam_gcf matrix can just be added onto the others to give + # To fix this the cooccurrence_spec_gcf/cooccurrence_fam_gcf matrix can just be added onto the others to give # the correct totals. if type == 'spec-gcf': num_strains = np.ones( - data_links.M_spec_gcf.shape) * data_links.gcf_strain_occurrence.shape[1] - overlap_counts = data_links.M_spec_gcf - gcf_counts = overlap_counts + data_links.M_notspec_gcf - spec_counts = overlap_counts + data_links.M_spec_notgcf + data_links.cooccurrence_spec_gcf.shape) * data_links.gcf_strain_occurrence.shape[1] + overlap_counts = data_links.cooccurrence_spec_gcf + gcf_counts = overlap_counts + data_links.cooccurrence_notspec_gcf + spec_counts = overlap_counts + data_links.cooccurrence_spec_notgcf hg_scores = hypergeom.sf(overlap_counts, num_strains, gcf_counts, @@ -187,10 +187,10 @@ def hg_scoring(self, data_links, type='spec-gcf'): self.hg_spec_gcf = hg_scores elif type == 'fam-gcf': num_strains = np.ones( - data_links.M_fam_gcf.shape) * data_links.gcf_strain_occurrence.shape[1] - overlap_counts = data_links.M_fam_gcf - gcf_counts = overlap_counts + data_links.M_notfam_gcf - fam_counts = overlap_counts + data_links.M_fam_notgcf + data_links.cooccurrence_fam_gcf.shape) * data_links.gcf_strain_occurrence.shape[1] + overlap_counts = data_links.cooccurrence_fam_gcf + gcf_counts = overlap_counts + data_links.cooccurrence_notfam_gcf + fam_counts = overlap_counts + data_links.cooccurrence_fam_notgcf hg_scores = hypergeom.sf(overlap_counts, num_strains, gcf_counts, @@ -224,19 +224,19 @@ def likelihood_scoring(self, """ if type == 
'spec-gcf': - likelihood_scores = np.zeros(data_links.M_spec_gcf.shape) + likelihood_scores = np.zeros(data_links.cooccurrence_spec_gcf.shape) likelihood_scores = ( likelihoods.P_gcf_given_spec * (1 - likelihoods.P_spec_not_gcf) * - (1 - np.exp(-alpha_weighing * data_links.M_spec_gcf))) + (1 - np.exp(-alpha_weighing * data_links.cooccurrence_spec_gcf))) self.likescores_spec_gcf = likelihood_scores elif type == 'fam-gcf': - likelihood_scores = np.zeros(data_links.M_fam_gcf.shape) + likelihood_scores = np.zeros(data_links.cooccurrence_fam_gcf.shape) likelihood_scores = ( likelihoods.P_gcf_given_fam * (1 - likelihoods.P_fam_not_gcf) * - (1 - np.exp(-alpha_weighing * data_links.M_fam_gcf))) + (1 - np.exp(-alpha_weighing * data_links.cooccurrence_fam_gcf))) self.likescores_fam_gcf = likelihood_scores return likelihood_scores @@ -275,7 +275,7 @@ def select_link_candidates(self, P_gcf_not_type1 = likelihoods.P_gcf_not_spec P_type1_given_gcf = likelihoods.P_spec_given_gcf P_type1_not_gcf = likelihoods.P_spec_not_gcf - M_type1_gcf = data_links.M_spec_gcf + M_type1_gcf = data_links.cooccurrence_spec_gcf metcalf_scores = self.metcalf_spec_gcf likescores = self.likescores_spec_gcf index_names = [ @@ -290,7 +290,7 @@ def select_link_candidates(self, P_gcf_not_type1 = likelihoods.P_gcf_not_fam P_type1_given_gcf = likelihoods.P_fam_given_gcf P_type1_not_gcf = likelihoods.P_fam_not_gcf - M_type1_gcf = data_links.M_fam_gcf + M_type1_gcf = data_links.cooccurrence_fam_gcf metcalf_scores = self.metcalf_fam_gcf likescores = self.likescores_fam_gcf index_names = [ diff --git a/src/nplinker/scoring/linking/link_likelihood.py b/src/nplinker/scoring/linking/link_likelihood.py index 90446a18..c8758d19 100644 --- a/src/nplinker/scoring/linking/link_likelihood.py +++ b/src/nplinker/scoring/linking/link_likelihood.py @@ -46,14 +46,14 @@ def calculate_likelihoods(self, data_links, type='spec-gcf'): # Make selection for scenario spec<->gcf or fam<->gcf if type == 'spec-gcf': - M_type1_type2 = 
data_links.M_spec_gcf - M_type1_nottype2 = data_links.M_spec_notgcf - M_nottype1_type2 = data_links.M_notspec_gcf + M_type1_type2 = data_links.cooccurrence_spec_gcf + M_type1_nottype2 = data_links.cooccurrence_spec_notgcf + M_nottype1_type2 = data_links.cooccurrence_notspec_gcf M_type1_cond = data_links.spec_strain_occurrence elif type == 'fam-gcf': - M_type1_type2 = data_links.M_fam_gcf - M_type1_nottype2 = data_links.M_fam_notgcf - M_nottype1_type2 = data_links.M_notfam_gcf + M_type1_type2 = data_links.cooccurrence_fam_gcf + M_type1_nottype2 = data_links.cooccurrence_fam_notgcf + M_nottype1_type2 = data_links.cooccurrence_notfam_gcf M_type1_cond = data_links.mf_strain_occurrence elif type == 'spec-bgc' or type == 'fam-bgc': raise Exception("Given types are not yet supported... ") From 995d6d21024f922fc7dc437c264e8af06edf5f0e Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 2 May 2023 17:52:49 +0200 Subject: [PATCH 40/95] merge `load_data` and `find_correlations` to init in DataLinks --- src/nplinker/scoring/linking/data_linking.py | 172 +++++++++---------- src/nplinker/scoring/metcalf_scoring.py | 5 +- tests/scoring/test_scoring.py | 8 +- 3 files changed, 80 insertions(+), 105 deletions(-) diff --git a/src/nplinker/scoring/linking/data_linking.py b/src/nplinker/scoring/linking/data_linking.py index b188d7b8..07009975 100644 --- a/src/nplinker/scoring/linking/data_linking.py +++ b/src/nplinker/scoring/linking/data_linking.py @@ -1,14 +1,3 @@ -# Methods to find correlations between spectra/molecular families and -# gene clusters/families (BGCs/GCFs) -# -# (still at very much protoype/exploration stage) -# -# Naming: -# M_* stands for a matrix format -# map_* stands for a simple mapping lookup table -# spec stands for spectrum -# fam stands for molecular family - from __future__ import annotations from collections import Counter from typing import Sequence, TYPE_CHECKING @@ -35,54 +24,78 @@ class DataLinks(): 2) Mappings: Lookup-tables that link different ids 
and categories 3) Correlation matrices that show how often spectra/families and GCFs co-occur """ - - def __init__(self): - # DataFrame to store occurrence of objects with respect to strains - # values = 1 where gcf/spec/fam occur in strain, 0 otherwise - self.gcf_strain_occurrence = pd.DataFrame() - self.spec_strain_occurrence = pd.DataFrame() - self.mf_strain_occurrence = pd.DataFrame() - - # mapping tables, check `_get_mappings_from_occurance` for details - # TODO: these mappings could be removed when refactoring LinkFinder - self.mapping_spec = pd.DataFrame() - self.mapping_gcf = pd.DataFrame() - self.mapping_fam = pd.DataFrame() - self.mapping_strain = pd.DataFrame() - - - def load_data(self, spectra: Sequence[Spectrum], gcfs: Sequence[GCF], - strains: StrainCollection, - molfams: Sequence[MolecularFamily]): + def __init__(self, spectra: Sequence[Spectrum], gcfs: Sequence[GCF], + strains: StrainCollection, + molfams: Sequence[MolecularFamily]): logger.debug( - "Create co-occurence matrices: spectra<->strains, gcfs<->strains and mfs<->strains." + "Create occurrence dataframes: spectra<->strains, gcfs<->strains and mfs<->strains." 
) + # DataFrame to store occurrence of gcfs/spectra/mfs with respect to strains + # values = 1 where gcf/spec/fam occur in strain, 0 otherwise self._get_gcf_strain_occurrence(gcfs, strains) self._get_spec_strain_occurrence(spectra, strains) self._get_mf_strain_occurrence(molfams, strains) - self._get_mappings_from_occurrence() - def _get_mappings_from_occurrence(self): - # pd.Series with index = gcf.gcf_id and value = number of strains where gcf occurs - self.mapping_gcf["no of strains"] = np.sum(self.gcf_strain_occurrence, - axis=1) - # pd.Series with index = spectrum.spectrum_id and value = number of strains where spec occurs - self.mapping_spec["no of strains"] = np.sum( - self.spec_strain_occurrence, axis=1) - # pd.Series with index = mf.family_id and value = number of strains where mf occurs - self.mapping_fam["no of strains"] = np.sum(self.mf_strain_occurrence, - axis=1) - # pd.Series with index = strain.id and value = number of spectra in strain - self.mapping_strain["no of spectra"] = np.sum( - self.spec_strain_occurrence, axis=0) + # DataFrame to store mapping tables, check `_get_mappings_from_occurance` for details + # TODO: these mappings could be removed when refactoring LinkFinder + self._get_mappings_from_occurrence() - def find_correlations(self): - # collect correlations/ co-occurences + # np.array to store co-occurrence of "spectra<->gcf" or "mfs<->gcf" logger.debug("Create correlation matrices: spectra<->gcfs.") self._get_cooccurrence(link_type='spec-gcf') logger.debug("Create correlation matrices: mol-families<->gcfs.") self._get_cooccurrence(link_type='fam-gcf') + def get_common_strains( + self, + spectra_or_molfams: Sequence[Spectrum] | Sequence[MolecularFamily], + gcfs: Sequence[GCF], + filter_no_shared: bool = False + ) -> dict[tuple[Spectrum | MolecularFamily, GCF], list[str]]: + """Get common strains between given spectra/molecular families and GCFs. + + Note that SingletonFamily objects are excluded from given `spectra_or_molfams`. 
+ + Args: + spectra_or_molfams(Sequence[Spectrum] | Sequence[MolecularFamily]): + A list of Spectrum or MolecularFamily objects. + gcfs(Sequence[GCF]): A list of GCF objects. + filter_no_shared(bool): If True, return only the pair of + spectrum/molecular family and GCF that have common strains; + otherwise, return all pairs no matter they have common strains + or not. + + Returns: + dict: A dict where the keys are tuples of (Spectrum/MolecularFamily, GCF) + and values are a list of strain ids that appear in both objects. + """ + if not isinstance(spectra_or_molfams[0], (Spectrum, MolecularFamily)): + raise ValueError( + 'Must provide Spectra or MolecularFamilies as the first argument!' + ) + if not isinstance(gcfs[0], GCF): + raise ValueError('Must provide GCFs as the second argument!') + + # Assume that 3 occurrence dataframes have same df.columns (strain ids) + strain_ids = self.gcf_strain_occurrence.columns + results = {} + for obj in spectra_or_molfams: + if isinstance(obj, SingletonFamily): + continue + for gcf in gcfs: + if isinstance(obj, Spectrum): + shared_strains = strain_ids[np.logical_and( + self.spec_strain_occurrence.loc[obj.spectrum_id], + self.gcf_strain_occurrence.loc[gcf.gcf_id])] + else: + shared_strains = strain_ids[np.logical_and( + self.mf_strain_occurrence.loc[obj.family_id], + self.gcf_strain_occurrence.loc[gcf.gcf_id])] + if filter_no_shared and len(shared_strains) == 0: + continue + results[(obj, gcf)] = shared_strains.to_list() + return results + def _get_gcf_strain_occurrence(self, gcfs: Sequence[GCF], strains: StrainCollection) -> None: """Get the occurence of strains in gcfs. 
@@ -141,55 +154,24 @@ def _get_mf_strain_occurrence(self, mfs: Sequence[MolecularFamily], df_mf_strain.loc[mf.family_id, strain.id] = 1 self.mf_strain_occurrence = df_mf_strain - def get_common_strains( - self, - spectra_or_molfams: Sequence[Spectrum] | Sequence[MolecularFamily], - gcfs: Sequence[GCF], - filter_no_shared: bool = False - ) -> dict[tuple[Spectrum | MolecularFamily, GCF], list[str]]: - """Get common strains between given spectra/molecular families and GCFs. - - Note that SingletonFamily objects are excluded from given `spectra_or_molfams`. - - Args: - spectra_or_molfams(Sequence[Spectrum] | Sequence[MolecularFamily]): - A list of Spectrum or MolecularFamily objects. - gcfs(Sequence[GCF]): A list of GCF objects. - filter_no_shared(bool): If True, return only the pair of - spectrum/molecular family and GCF that have common strains; - otherwise, return all pairs no matter they have common strains - or not. - - Returns: - dict: A dict where the keys are tuples of (Spectrum/MolecularFamily, GCF) - and values are a list of strain ids that appear in both objects. - """ - if not isinstance(spectra_or_molfams[0], (Spectrum, MolecularFamily)): - raise ValueError( - 'Must provide Spectra or MolecularFamilies as the first argument!' 
- ) - if not isinstance(gcfs[0], GCF): - raise ValueError('Must provide GCFs as the second argument!') + def _get_mappings_from_occurrence(self): + self.mapping_spec = pd.DataFrame() + self.mapping_gcf = pd.DataFrame() + self.mapping_fam = pd.DataFrame() + self.mapping_strain = pd.DataFrame() - # Assume that 3 occurrence dataframes have same df.columns (strain ids) - strain_ids = self.gcf_strain_occurrence.columns - results = {} - for obj in spectra_or_molfams: - if isinstance(obj, SingletonFamily): - continue - for gcf in gcfs: - if isinstance(obj, Spectrum): - shared_strains = strain_ids[np.logical_and( - self.spec_strain_occurrence.loc[obj.spectrum_id], - self.gcf_strain_occurrence.loc[gcf.gcf_id])] - else: - shared_strains = strain_ids[np.logical_and( - self.mf_strain_occurrence.loc[obj.family_id], - self.gcf_strain_occurrence.loc[gcf.gcf_id])] - if filter_no_shared and len(shared_strains) == 0: - continue - results[(obj, gcf)] = shared_strains.to_list() - return results + # pd.Series with index = gcf.gcf_id and value = number of strains where gcf occurs + self.mapping_gcf["no of strains"] = np.sum(self.gcf_strain_occurrence, + axis=1) + # pd.Series with index = spectrum.spectrum_id and value = number of strains where spec occurs + self.mapping_spec["no of strains"] = np.sum( + self.spec_strain_occurrence, axis=1) + # pd.Series with index = mf.family_id and value = number of strains where mf occurs + self.mapping_fam["no of strains"] = np.sum(self.mf_strain_occurrence, + axis=1) + # pd.Series with index = strain.id and value = number of spectra in strain + self.mapping_strain["no of spectra"] = np.sum( + self.spec_strain_occurrence, axis=0) def _get_cooccurrence(self, link_type: str = 'spec-gcf'): """Calculate co-occurrence for given link types across strains. 
diff --git a/src/nplinker/scoring/metcalf_scoring.py b/src/nplinker/scoring/metcalf_scoring.py index 79e90fc4..b435635b 100644 --- a/src/nplinker/scoring/metcalf_scoring.py +++ b/src/nplinker/scoring/metcalf_scoring.py @@ -77,11 +77,8 @@ def setup(npl): logger.info( 'MetcalfScoring.setup preprocessing dataset (this may take some time)' ) - MetcalfScoring.DATALINKS = DataLinks() - MetcalfScoring.DATALINKS.load_data(npl._spectra, npl._gcfs, + MetcalfScoring.DATALINKS = DataLinks(npl._spectra, npl._gcfs, npl._strains, npl.molfams) - # TODO fix crash with this set to True, see https://github.com/sdrogers/nplinker/issues/57 - MetcalfScoring.DATALINKS.find_correlations() MetcalfScoring.LINKFINDER = LinkFinder() MetcalfScoring.LINKFINDER.metcalf_scoring(MetcalfScoring.DATALINKS, type='spec-gcf') diff --git a/tests/scoring/test_scoring.py b/tests/scoring/test_scoring.py index fe75611f..e43aa5ac 100644 --- a/tests/scoring/test_scoring.py +++ b/tests/scoring/test_scoring.py @@ -71,9 +71,7 @@ def do_scoring_old(gcfs, spectra, strains, standardised): def do_scoring_new(gcfs, spectra, strains, standardised): - datalinks = DataLinks() - datalinks.load_data(spectra, gcfs, strains) - datalinks.find_correlations() + datalinks = DataLinks(spectra, gcfs, strains) lf = LinkFinder() scores = lf.metcalf_scoring(datalinks) @@ -104,9 +102,7 @@ def do_scoring_old_hg(gcfs, spectra, strains): def do_scoring_new_hg(gcfs, spectra, strains): - datalinks = DataLinks() - datalinks.load_data(spectra, gcfs, strains) - datalinks.find_correlations() + datalinks = DataLinks(spectra, gcfs, strains) lf = LinkFinder() scores = lf.hg_scoring(datalinks) return scores From 742e6e5b917186c9400d431eb891fa1ec601fa9c Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 3 May 2023 10:16:52 +0200 Subject: [PATCH 41/95] refactor DataLinks attributes - Move assignment of attributes to `__init__` - Rename attributes - Replace `fam` or `molfam` with `mf` to refer to molecular family - Add docstrings --- 
src/nplinker/process_output.py | 2 +- src/nplinker/scoring/linking/data_linking.py | 167 +++++++++++------- src/nplinker/scoring/linking/link_finder.py | 80 ++++----- .../scoring/linking/link_likelihood.py | 22 +-- src/nplinker/scoring/metcalf_scoring.py | 8 +- tests/scoring/test_scoring.py | 4 +- 6 files changed, 157 insertions(+), 126 deletions(-) diff --git a/src/nplinker/process_output.py b/src/nplinker/process_output.py index 9892a583..27f23929 100644 --- a/src/nplinker/process_output.py +++ b/src/nplinker/process_output.py @@ -51,7 +51,7 @@ def get_sig_spec(data_link, sig_links, scores, gcf_pos, min_n_strains=2): # Check if there are *any* strains in the GCF #Ā No strains = MiBIG # Can also filter if only (e.g. 2 strains) - strain_sum = data_link.gcf_strain_occurrence[gcf_pos, :].sum() + strain_sum = data_link.occurrence_gcf_strain[gcf_pos, :].sum() if strain_sum < min_n_strains: return [] col = sig_links[:, gcf_pos] #Ā get the column diff --git a/src/nplinker/scoring/linking/data_linking.py b/src/nplinker/scoring/linking/data_linking.py index 07009975..3c79f46f 100644 --- a/src/nplinker/scoring/linking/data_linking.py +++ b/src/nplinker/scoring/linking/data_linking.py @@ -1,5 +1,4 @@ from __future__ import annotations -from collections import Counter from typing import Sequence, TYPE_CHECKING import numpy as np import pandas as pd @@ -18,46 +17,93 @@ class DataLinks(): - """ - DataLinks collects and structures co-occurence data - 1) Co-occurences of spectra, families, and GCFs with respect to strains - 2) Mappings: Lookup-tables that link different ids and categories - 3) Correlation matrices that show how often spectra/families and GCFs co-occur - """ - def __init__(self, spectra: Sequence[Spectrum], gcfs: Sequence[GCF], - strains: StrainCollection, - molfams: Sequence[MolecularFamily]): + + def __init__(self, gcfs: Sequence[GCF], spectra: Sequence[Spectrum], + mfs: Sequence[MolecularFamily], + strains: StrainCollection): + """DataLinks class to store 
occurrence and co-occurrence information. + + Occurrence refers to the presence of a spectrum/gcf/mf in a strain. + Co-occurrence refers to the presence of a spectrum/mf and a gcf in a strain. + + Args: + gcfs(Sequence[GCF]): A list of GCF objects. + spectra(Sequence[Spectrum]): A list of Spectrum objects. + mfs(Sequence[MolecularFamily]): A list of MolecularFamily objects. + strains(StrainCollection): A StrainCollection object. + + Attributes: + occurrence_gcf_strain(pd.DataFrame): A DataFrame to store occurrence of + gcfs with respect to strains. + occurrence_spec_strain(pd.DataFrame): A DataFrame to store occurrence of + spectra with respect to strains. + occurrence_mf_strain(pd.DataFrame): A DataFrame to store occurrence of + molecular families with respect to strains. + cooccurrence_spec_gcf(np.array): A 2D numpy array to store co-occurrence + of spectra<->gcfs. + cooccurrence_spec_notgcf(np.array): A 2D numpy array to store co-occurrence + of spectra<->not gcfs. + cooccurrence_notspec_gcf(np.array): A 2D numpy array to store co-occurrence + of not spectra<->gcfs. + cooccurrence_notspec_notgcf(np.array): A 2D numpy array to store co-occurrence + of not spectra<->not gcfs. + cooccurrence_mf_gcf(np.array): A 2D numpy array to store co-occurrence + of molecular families<->gcfs. + cooccurrence_mf_notgcf(np.array): A 2D numpy array to store co-occurrence + of molecular families<->not gcfs. + cooccurrence_notmf_gcf(np.array): A 2D numpy array to store co-occurrence + of not molecular families<->gcfs. + cooccurrence_notmf_notgcf(np.array): A 2D numpy array to store co-occurrence + of not molecular families<->not gcfs. + mapping_gcf(pd.DataFrame): A DataFrame to store mappings for gcfs. + mapping_spec(pd.DataFrame): A DataFrame to store mappings for spectra. + mapping_mf(pd.DataFrame): A DataFrame to store mappings for molecular families. + mapping_strain(pd.DataFrame): A DataFrame to store mappings for strains. 
+ """ logger.debug( "Create occurrence dataframes: spectra<->strains, gcfs<->strains and mfs<->strains." ) # DataFrame to store occurrence of gcfs/spectra/mfs with respect to strains # values = 1 where gcf/spec/fam occur in strain, 0 otherwise - self._get_gcf_strain_occurrence(gcfs, strains) - self._get_spec_strain_occurrence(spectra, strains) - self._get_mf_strain_occurrence(molfams, strains) + self.occurrence_gcf_strain = self._get_occurrence_gcf_strain( + gcfs, strains) + self.occurrence_spec_strain = self._get_occurrence_spec_strain( + spectra, strains) + self.occurrence_mf_strain = self._get_occurrence_mf_strain( + mfs, strains) # DataFrame to store mapping tables, check `_get_mappings_from_occurance` for details # TODO: these mappings could be removed when refactoring LinkFinder + self.mapping_spec = pd.DataFrame() + self.mapping_gcf = pd.DataFrame() + self.mapping_fam = pd.DataFrame() + self.mapping_strain = pd.DataFrame() self._get_mappings_from_occurrence() # np.array to store co-occurrence of "spectra<->gcf" or "mfs<->gcf" logger.debug("Create correlation matrices: spectra<->gcfs.") - self._get_cooccurrence(link_type='spec-gcf') + (self.cooccurrence_spec_gcf, self.cooccurrence_spec_notgcf, + self.cooccurrence_notspec_gcf, + self.cooccurrence_notspec_notgcf) = self._get_cooccurrence( + link_type='spec-gcf') logger.debug("Create correlation matrices: mol-families<->gcfs.") - self._get_cooccurrence(link_type='fam-gcf') + (self.cooccurrence_mf_gcf, self.cooccurrence_mf_notgcf, + self.cooccurrence_notmf_gcf, + self.cooccurrence_notmf_notgcf) = self._get_cooccurrence( + link_type='mf-gcf') def get_common_strains( self, - spectra_or_molfams: Sequence[Spectrum] | Sequence[MolecularFamily], + spectra_or_mfs: Sequence[Spectrum] | Sequence[MolecularFamily], gcfs: Sequence[GCF], filter_no_shared: bool = False ) -> dict[tuple[Spectrum | MolecularFamily, GCF], list[str]]: """Get common strains between given spectra/molecular families and GCFs. 
- Note that SingletonFamily objects are excluded from given `spectra_or_molfams`. + Note that SingletonFamily objects are excluded from given `spectra_or_mfs`. Args: - spectra_or_molfams(Sequence[Spectrum] | Sequence[MolecularFamily]): + spectra_or_mfs(Sequence[Spectrum] | Sequence[MolecularFamily]): A list of Spectrum or MolecularFamily objects. gcfs(Sequence[GCF]): A list of GCF objects. filter_no_shared(bool): If True, return only the pair of @@ -69,7 +115,7 @@ def get_common_strains( dict: A dict where the keys are tuples of (Spectrum/MolecularFamily, GCF) and values are a list of strain ids that appear in both objects. """ - if not isinstance(spectra_or_molfams[0], (Spectrum, MolecularFamily)): + if not isinstance(spectra_or_mfs[0], (Spectrum, MolecularFamily)): raise ValueError( 'Must provide Spectra or MolecularFamilies as the first argument!' ) @@ -77,27 +123,27 @@ def get_common_strains( raise ValueError('Must provide GCFs as the second argument!') # Assume that 3 occurrence dataframes have same df.columns (strain ids) - strain_ids = self.gcf_strain_occurrence.columns + strain_ids = self.occurrence_gcf_strain.columns results = {} - for obj in spectra_or_molfams: + for obj in spectra_or_mfs: if isinstance(obj, SingletonFamily): continue for gcf in gcfs: if isinstance(obj, Spectrum): shared_strains = strain_ids[np.logical_and( - self.spec_strain_occurrence.loc[obj.spectrum_id], - self.gcf_strain_occurrence.loc[gcf.gcf_id])] + self.occurrence_spec_strain.loc[obj.spectrum_id], + self.occurrence_gcf_strain.loc[gcf.gcf_id])] else: shared_strains = strain_ids[np.logical_and( - self.mf_strain_occurrence.loc[obj.family_id], - self.gcf_strain_occurrence.loc[gcf.gcf_id])] + self.occurrence_mf_strain.loc[obj.family_id], + self.occurrence_gcf_strain.loc[gcf.gcf_id])] if filter_no_shared and len(shared_strains) == 0: continue results[(obj, gcf)] = shared_strains.to_list() return results - def _get_gcf_strain_occurrence(self, gcfs: Sequence[GCF], - strains: 
StrainCollection) -> None: + def _get_occurrence_gcf_strain(self, gcfs: Sequence[GCF], + strains: StrainCollection) -> pd.DataFrame: """Get the occurence of strains in gcfs. The occurence is a DataFrame with gcfs as rows and strains as columns, @@ -111,10 +157,10 @@ def _get_gcf_strain_occurrence(self, gcfs: Sequence[GCF], for strain in strains: if gcf.has_strain(strain): df_gcf_strain.loc[gcf.gcf_id, strain.id] = 1 - self.gcf_strain_occurrence = df_gcf_strain + return df_gcf_strain - def _get_spec_strain_occurrence(self, spectra: Sequence[Spectrum], - strains: StrainCollection) -> None: + def _get_occurrence_spec_strain(self, spectra: Sequence[Spectrum], + strains: StrainCollection) -> pd.DataFrame: """Get the occurence of strains in spectra. The occurence is a DataFrame with spectra as rows and strains as columns, @@ -129,10 +175,10 @@ def _get_spec_strain_occurrence(self, spectra: Sequence[Spectrum], for strain in strains: if spectrum.has_strain(strain): df_spec_strain.loc[spectrum.spectrum_id, strain.id] = 1 - self.spec_strain_occurrence = df_spec_strain + return df_spec_strain - def _get_mf_strain_occurrence(self, mfs: Sequence[MolecularFamily], - strains: StrainCollection) -> None: + def _get_occurrence_mf_strain(self, mfs: Sequence[MolecularFamily], + strains: StrainCollection) -> pd.DataFrame: """Get the occurence of strains in molecular families. 
The occurence is a DataFrame with molecular families as rows and @@ -152,56 +198,41 @@ def _get_mf_strain_occurrence(self, mfs: Sequence[MolecularFamily], for strain in strains: if mf.has_strain(strain): df_mf_strain.loc[mf.family_id, strain.id] = 1 - self.mf_strain_occurrence = df_mf_strain + return df_mf_strain def _get_mappings_from_occurrence(self): - self.mapping_spec = pd.DataFrame() - self.mapping_gcf = pd.DataFrame() - self.mapping_fam = pd.DataFrame() - self.mapping_strain = pd.DataFrame() # pd.Series with index = gcf.gcf_id and value = number of strains where gcf occurs - self.mapping_gcf["no of strains"] = np.sum(self.gcf_strain_occurrence, - axis=1) + self.mapping_gcf["no of strains"] = self.occurrence_gcf_strain.sum( + axis=1) # pd.Series with index = spectrum.spectrum_id and value = number of strains where spec occurs - self.mapping_spec["no of strains"] = np.sum( - self.spec_strain_occurrence, axis=1) + self.mapping_spec["no of strains"] = self.occurrence_spec_strain.sum( + axis=1) # pd.Series with index = mf.family_id and value = number of strains where mf occurs - self.mapping_fam["no of strains"] = np.sum(self.mf_strain_occurrence, - axis=1) + self.mapping_fam["no of strains"] = self.occurrence_mf_strain.sum( + axis=1) # pd.Series with index = strain.id and value = number of spectra in strain - self.mapping_strain["no of spectra"] = np.sum( - self.spec_strain_occurrence, axis=0) + self.mapping_strain["no of spectra"] = self.occurrence_spec_strain.sum( + axis=0) - def _get_cooccurrence(self, link_type: str = 'spec-gcf'): + def _get_cooccurrence( + self, + link_type: str = 'spec-gcf' + ) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: """Calculate co-occurrence for given link types across strains. Args: link_type(str): Type of link to calculate co-occurrence for, - either 'spec-gcf' or 'fam-gcf'. + either 'spec-gcf' or 'mf-gcf'. 
""" if link_type == 'spec-gcf': - met_strain_occurrence = self.spec_strain_occurrence - elif link_type == 'fam-gcf': - met_strain_occurrence = self.mf_strain_occurrence + met_strain_occurrence = self.occurrence_spec_strain + elif link_type == 'mf-gcf': + met_strain_occurrence = self.occurrence_mf_strain else: raise ValueError( - f"Link type {link_type} is not supported. Use 'spec-gcf' or 'fam-gcf' instead." + f"Link type {link_type} is not supported. Use 'spec-gcf' or 'mf-gcf' instead." ) - logger.debug(f"Calculating correlation matrices of type: {link_type}") - - met_gcf_corr, met_notgcf_corr, notmet_gcf_corr, notmet_notgcf_corr = calc_correlation_matrix( - met_strain_occurrence, self.gcf_strain_occurrence) - - if link_type == 'spec-gcf': - # co-occurrence of spectrum and GCF across strains - self.cooccurrence_spec_gcf = met_gcf_corr - self.cooccurrence_spec_notgcf = met_notgcf_corr - self.cooccurrence_notspec_gcf = notmet_gcf_corr - self.cooccurrence_notspec_notgcf = notmet_notgcf_corr - else: - self.cooccurrence_fam_gcf = met_gcf_corr - self.cooccurrence_fam_notgcf = met_notgcf_corr - self.cooccurrence_notfam_gcf = notmet_gcf_corr - self.cooccurrence_notfam_notgcf = notmet_notgcf_corr + return calc_correlation_matrix(met_strain_occurrence, + self.occurrence_gcf_strain) diff --git a/src/nplinker/scoring/linking/link_finder.py b/src/nplinker/scoring/linking/link_finder.py index fb198457..7fcdae03 100644 --- a/src/nplinker/scoring/linking/link_finder.py +++ b/src/nplinker/scoring/linking/link_finder.py @@ -72,17 +72,17 @@ def get_scores(self, method, type_): if method == 'metcalf': if type_ == 'spec-gcf': return self.metcalf_spec_gcf - elif type_ == 'fam-gcf': + elif type_ == 'mf-gcf': return self.metcalf_fam_gcf elif method == 'likescore': if type_ == 'spec-gcf': return self.likescores_spec_gcf - elif type_ == 'fam-gcf': + elif type_ == 'mf-gcf': return self.likescores_fam_gcf elif method == 'hg': if type_ == 'spec-gcf': return self.hg_spec_gcf - elif type_ == 
'fam-gcf': + elif type_ == 'mf-gcf': return self.hg_fam_gcf raise Exception( @@ -102,7 +102,7 @@ def metcalf_scoring(self, # Compute the expected values for all possible values of spec and gcf strains # we need the total number of strains - _, n_strains = data_links.gcf_strain_occurrence.shape + _, n_strains = data_links.occurrence_gcf_strain.shape if self.metcalf_expected is None: sz = (n_strains + 1, n_strains + 1) self.metcalf_expected = np.zeros(sz) @@ -148,12 +148,12 @@ def metcalf_scoring(self, data_links.cooccurrence_notspec_notgcf * not_type1_not_gcf) self.metcalf_spec_gcf = metcalf_scores - elif type == 'fam-gcf': - metcalf_scores = np.zeros(data_links.cooccurrence_fam_gcf.shape) - metcalf_scores = (data_links.cooccurrence_fam_gcf * both + - data_links.cooccurrence_fam_notgcf * type1_not_gcf + - data_links.cooccurrence_notfam_gcf * gcf_not_type1 + - data_links.cooccurrence_notfam_notgcf * not_type1_not_gcf) + elif type == 'mf-gcf': + metcalf_scores = np.zeros(data_links.cooccurrence_mf_gcf.shape) + metcalf_scores = (data_links.cooccurrence_mf_gcf * both + + data_links.cooccurrence_mf_notgcf * type1_not_gcf + + data_links.cooccurrence_notmf_gcf * gcf_not_type1 + + data_links.cooccurrence_notmf_notgcf * not_type1_not_gcf) self.metcalf_fam_gcf = metcalf_scores return metcalf_scores @@ -170,12 +170,12 @@ def hg_scoring(self, data_links, type='spec-gcf'): # e.g. if a spectrum has 3 strains, a GCF has 1 strain and there is 1 shared strain, # cooccurrence_spec_gcf will correctly contain "1", but M_type1_notgcf will contain "2" instead # of "3", because the spectrum only has 2 distinct strains vs the GCF. - # To fix this the cooccurrence_spec_gcf/cooccurrence_fam_gcf matrix can just be added onto the others to give + # To fix this the cooccurrence_spec_gcf/cooccurrence_mf_gcf matrix can just be added onto the others to give # the correct totals. 
if type == 'spec-gcf': num_strains = np.ones( - data_links.cooccurrence_spec_gcf.shape) * data_links.gcf_strain_occurrence.shape[1] + data_links.cooccurrence_spec_gcf.shape) * data_links.occurrence_gcf_strain.shape[1] overlap_counts = data_links.cooccurrence_spec_gcf gcf_counts = overlap_counts + data_links.cooccurrence_notspec_gcf spec_counts = overlap_counts + data_links.cooccurrence_spec_notgcf @@ -185,12 +185,12 @@ def hg_scoring(self, data_links, type='spec-gcf'): spec_counts, loc=1) self.hg_spec_gcf = hg_scores - elif type == 'fam-gcf': + elif type == 'mf-gcf': num_strains = np.ones( - data_links.cooccurrence_fam_gcf.shape) * data_links.gcf_strain_occurrence.shape[1] - overlap_counts = data_links.cooccurrence_fam_gcf - gcf_counts = overlap_counts + data_links.cooccurrence_notfam_gcf - fam_counts = overlap_counts + data_links.cooccurrence_fam_notgcf + data_links.cooccurrence_mf_gcf.shape) * data_links.occurrence_gcf_strain.shape[1] + overlap_counts = data_links.cooccurrence_mf_gcf + gcf_counts = overlap_counts + data_links.cooccurrence_notmf_gcf + fam_counts = overlap_counts + data_links.cooccurrence_mf_notgcf hg_scores = hypergeom.sf(overlap_counts, num_strains, gcf_counts, @@ -232,11 +232,11 @@ def likelihood_scoring(self, self.likescores_spec_gcf = likelihood_scores - elif type == 'fam-gcf': - likelihood_scores = np.zeros(data_links.cooccurrence_fam_gcf.shape) + elif type == 'mf-gcf': + likelihood_scores = np.zeros(data_links.cooccurrence_mf_gcf.shape) likelihood_scores = ( likelihoods.P_gcf_given_fam * (1 - likelihoods.P_fam_not_gcf) * - (1 - np.exp(-alpha_weighing * data_links.cooccurrence_fam_gcf))) + (1 - np.exp(-alpha_weighing * data_links.cooccurrence_mf_gcf))) self.likescores_fam_gcf = likelihood_scores return likelihood_scores @@ -247,11 +247,11 @@ def select_link_candidates(self, P_cutoff=0.8, main_score='likescore', score_cutoff=0, - type='fam-gcf'): + type='mf-gcf'): """ Look for potential best candidate for links between IF type='spec-gcf': GCFs 
and spectra - IF type='fam-gcf': GCFs and mol.families + IF type='mf-gcf': GCFs and mol.families Parameters ---------- @@ -269,7 +269,7 @@ def select_link_candidates(self, score >= score_cutoff """ - # Select scenario: spec<->gcf or fam<->gcf + # Select scenario: spec<->gcf or mf<->gcf if type == 'spec-gcf': P_gcf_given_type1 = likelihoods.P_gcf_given_spec P_gcf_not_type1 = likelihoods.P_gcf_not_spec @@ -285,12 +285,12 @@ def select_link_candidates(self, "link prob specific" ] - elif type == 'fam-gcf': + elif type == 'mf-gcf': P_gcf_given_type1 = likelihoods.P_gcf_given_fam P_gcf_not_type1 = likelihoods.P_gcf_not_fam P_type1_given_gcf = likelihoods.P_fam_given_gcf P_type1_not_gcf = likelihoods.P_fam_not_gcf - M_type1_gcf = data_links.cooccurrence_fam_gcf + M_type1_gcf = data_links.cooccurrence_mf_gcf metcalf_scores = self.metcalf_fam_gcf likescores = self.likescores_fam_gcf index_names = [ @@ -304,7 +304,7 @@ def select_link_candidates(self, raise Exception("Given types are not yet supported... ") else: raise Exception( - "Wrong correlation 'type' given. Must be one of 'spec-gcf', 'fam-gcf'..." + "Wrong correlation 'type' given. Must be one of 'spec-gcf', 'mf-gcf'..." 
) dim1, dim2 = P_gcf_given_type1.shape @@ -341,14 +341,14 @@ def select_link_candidates(self, Nx_list = data_links.mapping_gcf["no of strains"] if type == 'spec-gcf': Ny_list = data_links.mapping_spec["no of strains"] - elif type == 'fam-gcf': + elif type == 'mf-gcf': Ny_list = data_links.mapping_fam["no of strains"] # Calculate probabilities of finding a spectrum in a certain strain P_str = np.array(data_links.mapping_strain["no of spectra"]) P_str = P_str / np.sum(P_str) - num_strains = data_links.gcf_strain_occurrence.shape[1] + num_strains = data_links.occurrence_gcf_strain.shape[1] # Calculate the hypergeometric probability (as before) for i in range(link_candidates.shape[1]): @@ -363,7 +363,7 @@ def select_link_candidates(self, id_gcf = link_candidates[1, i] # find set of strains which contain GCF with id link_candidates[1,i] - XG = np.where(data_links.gcf_strain_occurrence.loc[id_gcf, :] == 1)[0] + XG = np.where(data_links.occurrence_gcf_strain.loc[id_gcf, :] == 1)[0] link_candidates[10, i] = pair_prob_approx(P_str, XG, @@ -372,11 +372,11 @@ def select_link_candidates(self, # Calculate the link specific probability # Find strains where GCF and spectra/family co-occur if type == 'spec-gcf': - XGS = np.where((data_links.gcf_strain_occurrence[id_gcf, :] == 1) & - (data_links.spec_strain_occurrence[id_spec, :] == 1))[0] - elif type == 'fam-gcf': - XGS = np.where((data_links.gcf_strain_occurrence[id_gcf, :] == 1) - & (data_links.mf_strain_occurrence[id_spec, :] == 1))[0] + XGS = np.where((data_links.occurrence_gcf_strain[id_gcf, :] == 1) & + (data_links.occurrence_spec_strain[id_spec, :] == 1))[0] + elif type == 'mf-gcf': + XGS = np.where((data_links.occurrence_gcf_strain[id_gcf, :] == 1) + & (data_links.occurrence_mf_strain[id_spec, :] == 1))[0] link_candidates[11, i] = link_prob(P_str, XGS, int(Nx_list[id_gcf]), int(Ny_list[id_spec]), num_strains) @@ -401,7 +401,7 @@ def select_link_candidates(self, # return results if type == 'spec-gcf': 
self.link_candidates_gcf_spec = link_candidates_pd - elif type == 'fam-gcf': + elif type == 'mf-gcf': self.link_candidates_gcf_fam = link_candidates_pd else: raise Exception("No candidate selection was created.") @@ -580,7 +580,7 @@ def get_links(self, def create_cytoscape_files(self, data_links, network_filename, - link_type='fam-gcf', + link_type='mf-gcf', score_type='metcalf'): """ Create network file for import into Cytoscape. @@ -603,7 +603,7 @@ def create_cytoscape_files(self, import networkx as nx NPlinker_net = nx.Graph() - if link_type == 'fam-gcf': + if link_type == 'mf-gcf': link_candidates = self.link_candidates_gcf_fam type1str = 'family_id' elif link_type == 'spec-gcf': @@ -669,7 +669,7 @@ def plot_candidates(self, P_cutoff=0.8, score_type='likescore', score_cutoff=0, - type='fam-gcf'): + type='mf-gcf'): """ Plot best rated correlations between gcfs and spectra/families plot in form of seaborn clustermap @@ -690,7 +690,7 @@ def plot_candidates(self, selected_ids = np.where( (link_candidates["P(gcf|spec)"] > P_cutoff) & (link_candidates[scorestr] > score_cutoff))[0] - elif type == 'fam-gcf': + elif type == 'mf-gcf': link_candidates = self.link_candidates_gcf_fam selected_ids = np.where( (link_candidates["P(gcf|fam)"] > P_cutoff) @@ -731,7 +731,7 @@ def plot_candidates(self, columns=mapping_gcfs.astype(int)) if type == 'spec-gcf': M_links.index.name = 'spectrum number' - elif type == 'fam-gcf': + elif type == 'mf-gcf': M_links.index.name = 'molecular family number' M_links.columns.name = 'gene cluster family (GCF)' diff --git a/src/nplinker/scoring/linking/link_likelihood.py b/src/nplinker/scoring/linking/link_likelihood.py index c8758d19..830e0676 100644 --- a/src/nplinker/scoring/linking/link_likelihood.py +++ b/src/nplinker/scoring/linking/link_likelihood.py @@ -39,27 +39,27 @@ def calculate_likelihoods(self, data_links, type='spec-gcf'): IF type='spec-gcf': P(GCF_x | spec_y), P(spec_y | GCF_x), P(GCF_x | not spec_y), P(spec_y | not GCF_x) - IF 
type='fam-gcf': + IF type='mf-gcf': P(GCF_x | fam_y), P(fam_y | GCF_x), P(GCF_x | not fam_y), P(fam_y | not GCF_x) """ - # Make selection for scenario spec<->gcf or fam<->gcf + # Make selection for scenario spec<->gcf or mf<->gcf if type == 'spec-gcf': M_type1_type2 = data_links.cooccurrence_spec_gcf M_type1_nottype2 = data_links.cooccurrence_spec_notgcf M_nottype1_type2 = data_links.cooccurrence_notspec_gcf - M_type1_cond = data_links.spec_strain_occurrence - elif type == 'fam-gcf': - M_type1_type2 = data_links.cooccurrence_fam_gcf - M_type1_nottype2 = data_links.cooccurrence_fam_notgcf - M_nottype1_type2 = data_links.cooccurrence_notfam_gcf - M_type1_cond = data_links.mf_strain_occurrence + M_type1_cond = data_links.occurrence_spec_strain + elif type == 'mf-gcf': + M_type1_type2 = data_links.cooccurrence_mf_gcf + M_type1_nottype2 = data_links.cooccurrence_mf_notgcf + M_nottype1_type2 = data_links.cooccurrence_notmf_gcf + M_type1_cond = data_links.occurrence_mf_strain elif type == 'spec-bgc' or type == 'fam-bgc': raise Exception("Given types are not yet supported... ") else: raise Exception( - "Wrong correlation 'type' given. Must be one of 'spec-gcf', 'fam-gcf'..." + "Wrong correlation 'type' given. Must be one of 'spec-gcf', 'mf-gcf'..." 
) logger.debug( @@ -67,7 +67,7 @@ def calculate_likelihoods(self, data_links, type='spec-gcf'): # Calculate likelihood matrices using calc_likelihood_matrix() P_type2_given_type1, P_type2_not_type1, P_type1_given_type2, \ P_type1_not_type2 = calc_likelihood_matrix(M_type1_cond, - data_links.gcf_strain_occurrence, + data_links.occurrence_gcf_strain, M_type1_type2, M_type1_nottype2, M_nottype1_type2) @@ -76,7 +76,7 @@ def calculate_likelihoods(self, data_links, type='spec-gcf'): self.P_gcf_not_spec = P_type2_not_type1 self.P_spec_given_gcf = P_type1_given_type2 self.P_spec_not_gcf = P_type1_not_type2 - elif type == 'fam-gcf': + elif type == 'mf-gcf': self.P_gcf_given_fam = P_type2_given_type1 self.P_gcf_not_fam = P_type2_not_type1 self.P_fam_given_gcf = P_type1_given_type2 diff --git a/src/nplinker/scoring/metcalf_scoring.py b/src/nplinker/scoring/metcalf_scoring.py index b435635b..00c04487 100644 --- a/src/nplinker/scoring/metcalf_scoring.py +++ b/src/nplinker/scoring/metcalf_scoring.py @@ -77,13 +77,13 @@ def setup(npl): logger.info( 'MetcalfScoring.setup preprocessing dataset (this may take some time)' ) - MetcalfScoring.DATALINKS = DataLinks(npl._spectra, npl._gcfs, - npl._strains, npl.molfams) + MetcalfScoring.DATALINKS = DataLinks(npl._gcfs, npl._spectra, + npl._molfams, npl._strains) MetcalfScoring.LINKFINDER = LinkFinder() MetcalfScoring.LINKFINDER.metcalf_scoring(MetcalfScoring.DATALINKS, type='spec-gcf') MetcalfScoring.LINKFINDER.metcalf_scoring(MetcalfScoring.DATALINKS, - type='fam-gcf') + type='mf-gcf') logger.debug('MetcalfScoring.setup caching results') save_pickled_data((dataset_counts, MetcalfScoring.DATALINKS, MetcalfScoring.LINKFINDER), cache_file) @@ -238,7 +238,7 @@ def get_links(self, objects, link_collection): 'MetcalfScoring: input_type=GCF, result_type=Spec/MolFam, inputs={}, results={}' .format(len(objects), results[0].shape)) # for GCF input, results contains two arrays of shape (3, x), - # which contain spec-gcf and fam-gcf links 
respectively + # which contain spec-gcf and mf-gcf links respectively result_gcf_spec, result_gcf_fam = results[0], results[1] for res, type_ in [(result_gcf_spec, Spectrum), diff --git a/tests/scoring/test_scoring.py b/tests/scoring/test_scoring.py index e43aa5ac..8c46a28e 100644 --- a/tests/scoring/test_scoring.py +++ b/tests/scoring/test_scoring.py @@ -71,7 +71,7 @@ def do_scoring_old(gcfs, spectra, strains, standardised): def do_scoring_new(gcfs, spectra, strains, standardised): - datalinks = DataLinks(spectra, gcfs, strains) + datalinks = DataLinks(gcfs, spectra, strains) lf = LinkFinder() scores = lf.metcalf_scoring(datalinks) @@ -102,7 +102,7 @@ def do_scoring_old_hg(gcfs, spectra, strains): def do_scoring_new_hg(gcfs, spectra, strains): - datalinks = DataLinks(spectra, gcfs, strains) + datalinks = DataLinks(gcfs, spectra, strains) lf = LinkFinder() scores = lf.hg_scoring(datalinks) return scores From 66cda2c0b45a2ce3adb06c22c7b91b95be6ff00d Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 3 May 2023 12:14:47 +0200 Subject: [PATCH 42/95] Delete test_data_links.py --- tests/scoring/test_data_links.py | 44 -------------------------------- 1 file changed, 44 deletions(-) delete mode 100644 tests/scoring/test_data_links.py diff --git a/tests/scoring/test_data_links.py b/tests/scoring/test_data_links.py deleted file mode 100644 index e794a29b..00000000 --- a/tests/scoring/test_data_links.py +++ /dev/null @@ -1,44 +0,0 @@ -import os -import pytest -from nplinker.metabolomics.gnps.gnps_molecular_family_loader import \ - GNPSMolecularFamilyLoader -from nplinker.metabolomics.metabolomics import make_families -from nplinker.metabolomics.spectrum import Spectrum -from nplinker.scoring.linking.data_linking import DataLinks -from .. 
import DATA_DIR - - -@pytest.fixture -def spec_with_families(spec_dict) -> dict[str, Spectrum]: - make_families(spec_dict.values()) - return spec_dict - -@pytest.fixture -def molecular_families_gnps(): - filename = os.path.join(DATA_DIR, "edges.pairsinfo") - sut = GNPSMolecularFamilyLoader(filename) - return sut.families() - -def test_collect_mappings_from_spectra(spec_with_families): - sut = DataLinks() - actual = sut._collect_mappings_from_spectra(spec_with_families.values()) - - assert actual.shape == (25935,3) - - -def test_collect_mappings_from_molecular_families(molecular_families_gnps): - sut = DataLinks() - actual = sut._collect_mappings_from_molecular_families(molecular_families_gnps) - - assert actual.shape == (25935,3) - - -def test_mappings_are_equal(spec_with_families, molecular_families_gnps): - sut = DataLinks() - sut._collect_mappings_from_spectra(spec_with_families.values()) - actual = sut.mapping_spec - - sut._collect_mappings_from_molecular_families(molecular_families_gnps) - expected = sut.mapping_spec - - assert actual.eq(expected).all(axis=None) From 7094d0d99da482db64b2803077c46392a65ea69a Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 3 May 2023 11:49:53 +0200 Subject: [PATCH 43/95] update get_common_strains methods - update parameters to be more clear and specific - change strain id in returned dict to strain objects - update docstrings --- src/nplinker/nplinker.py | 104 +++++++------------ src/nplinker/scoring/linking/data_linking.py | 16 +-- src/nplinker/scoring/np_class_scoring.py | 10 +- 3 files changed, 52 insertions(+), 78 deletions(-) diff --git a/src/nplinker/nplinker.py b/src/nplinker/nplinker.py index e47c35fb..bc598723 100644 --- a/src/nplinker/nplinker.py +++ b/src/nplinker/nplinker.py @@ -1,20 +1,8 @@ -# Copyright 2021 The NPLinker Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +from __future__ import annotations import copy import logging import sys +from typing import TYPE_CHECKING from .config import Args from .config import Config from .genomics import BGC @@ -30,6 +18,9 @@ from .scoring.rosetta_scoring import RosettaScoring from .strain_collection import StrainCollection +if TYPE_CHECKING: + from collections.abc import Sequence + from .strains import Strain logger = LogConfig.getLogger(__name__) @@ -274,8 +265,7 @@ def load_data(self, new_bigscape_cutoff=None, met_only=False): else: # CG: only reload genomics data when changing bigscape cutoff # TODO: this part should be removed, reload everything if bigscape data changes. - logger.debug( - f'load_data with new cutoff = {new_bigscape_cutoff}') + logger.debug(f'load_data with new cutoff = {new_bigscape_cutoff}') # 1. change the cutoff (which by itself doesn't do anything) self._loader._bigscape_cutoff = new_bigscape_cutoff # 2. 
reload the strain mappings (MiBIG filtering may have removed strains @@ -435,19 +425,19 @@ def get_links(self, input_objects, scoring_methods, and_mode=True): filter(lambda x: not isinstance(x, BGC), link_data.keys())) if len(targets) > 0: if isinstance(source, GCF): - shared_strains = self._datalinks.get_common_strains(targets, [source], True) + shared_strains = self._datalinks.get_common_strains( + targets, [source], True) for target, link in link_data.items(): if (target, source) in shared_strains: - link.shared_strains = [ - self._strains.lookup(strain_id) for strain_id - in shared_strains[(target, source)]] + link.shared_strains = shared_strains[(target, + source)] else: - shared_strains = self._datalinks.get_common_strains([source], targets, True) + shared_strains = self._datalinks.get_common_strains( + [source], targets, True) for target, link in link_data.items(): if (source, target) in shared_strains: - link.shared_strains = [ - self._strains.lookup(strain_id) for strain_id - in shared_strains[(source, target)]] + link.shared_strains = shared_strains[(source, + target)] logger.debug('Finished calculating shared strain information') @@ -455,55 +445,32 @@ def get_links(self, input_objects, scoring_methods, and_mode=True): len(link_collection))) return link_collection - def get_common_strains(self, objects_a, objects_b, filter_no_shared=True): - """Retrive strains shared by arbitrary pairs of objects. - - Two lists of objects are required as input. Typically one list will be - MolecularFamily or Spectrum objects and the other GCF (which list is which - doesn't matter). + def get_common_strains( + self, + met: Sequence[Spectrum] | Sequence[MolecularFamily], + gcfs: Sequence[GCF], + filter_no_shared: bool = True + ) -> dict[tuple[Spectrum | MolecularFamily, GCF], list[Strain]]: + """Get common strains between given spectra/molecular families and GCFs. - The return value is a dict mapping pairs of objects to lists of Strain objects - shared by that pair. 
This list may be empty if ``filter_no_shared`` is False, - indicating no shared strains were found. - - If ``filter_no_shared`` is True, every entry in the dict with no shared strains - will be removed before it is returned, so the only entries will be those for - which shared strains exist. + Note that SingletonFamily objects are excluded from given molecular families. Args: - objects_a (list): a list of Spectrum/MolecularFamily/GCF objects - objects_b (list): a list of Spectrum/MolecularFamily/GCF objects - filter_no_shared (bool): if True, remove result entries for which no shared strains exist + met(Sequence[Spectrum] | Sequence[MolecularFamily]): + A list of Spectrum or MolecularFamily objects. + gcfs(Sequence[GCF]): A list of GCF objects. + filter_no_shared(bool): If True, the pairs of spectrum/mf and GCF + without common strains will be removed from the returned dict; Returns: - A dict mapping pairs of objects (obj1, obj2) to lists of Strain objects. - - NOTE: The ordering of the pairs is *fixed* to be (metabolomic, genomic). In - other words, if objects_a = [GCF1, GC2, ...] and objects_b = [Spectrum1, - Spectrum2, ...], the object pairs will be (Spectrum1, GCF1), (Spectrum2, - GCF2), and so on. The same applies if objects_a and objects_b are swapped, - the metabolomic objects (Spectrum or MolecularFamily) will be the obj1 - entry in each pair. + dict: A dict where the keys are tuples of (Spectrum/MolecularFamily, GCF) + and values are a list of shared Strain objects. 
""" if not self._datalinks: self._datalinks = self.scoring_method( MetcalfScoring.NAME).datalinks - - # this is a dict with structure: - # (Spectrum/MolecularFamily, GCF) => list of strain indices - common_strains_index_dict = self._datalinks.get_common_strains( - objects_a, objects_b, filter_no_shared) - - common_strains = {} - # replace the lists of strain indices with actual strain objects - # TODO: bug here, the index value of common_strains_index_dict is - # not the same as the index value of self._strains - # Solution: lookup with strain.id instead of index - for key in common_strains_index_dict: - common_strains[key] = [ - self._strains.lookup_index(x) for x in common_strains_index_dict[key] - ] - + common_strains = self._datalinks.get_common_strains( + met, gcfs, filter_no_shared) return common_strains def has_bgc(self, bgc_id): @@ -512,15 +479,18 @@ def has_bgc(self, bgc_id): def lookup_bgc(self, bgc_id): """If BGC ``bgc_id`` exists, return it. Otherwise return None""" - return self._bgcs[self._bgc_lookup[bgc_id]] if self.has_bgc(bgc_id) else None + return self._bgcs[self._bgc_lookup[bgc_id]] if self.has_bgc( + bgc_id) else None def lookup_gcf(self, gcf_id): """If GCF ``gcf_id`` exists, return it. Otherwise return None""" - return self._gcfs[self._gcf_lookup[gcf_id]] if gcf_id in self._gcf_lookup else None + return self._gcfs[ + self._gcf_lookup[gcf_id]] if gcf_id in self._gcf_lookup else None def lookup_spectrum(self, name): """If Spectrum ``name`` exists, return it. 
Otherwise return None""" - return self._spectra[self._spec_lookup[name]] if name in self._spec_lookup else None + return self._spectra[ + self._spec_lookup[name]] if name in self._spec_lookup else None @property def strains(self): diff --git a/src/nplinker/scoring/linking/data_linking.py b/src/nplinker/scoring/linking/data_linking.py index 3c79f46f..daa46915 100644 --- a/src/nplinker/scoring/linking/data_linking.py +++ b/src/nplinker/scoring/linking/data_linking.py @@ -12,6 +12,8 @@ if TYPE_CHECKING: from nplinker.strain_collection import StrainCollection + from nplinker.strains import Strain + logger = LogConfig.getLogger(__name__) @@ -60,6 +62,8 @@ def __init__(self, gcfs: Sequence[GCF], spectra: Sequence[Spectrum], mapping_mf(pd.DataFrame): A DataFrame to store mappings for molecular families. mapping_strain(pd.DataFrame): A DataFrame to store mappings for strains. """ + self._strains = strains + logger.debug( "Create occurrence dataframes: spectra<->strains, gcfs<->strains and mfs<->strains." ) @@ -97,7 +101,7 @@ def get_common_strains( spectra_or_mfs: Sequence[Spectrum] | Sequence[MolecularFamily], gcfs: Sequence[GCF], filter_no_shared: bool = False - ) -> dict[tuple[Spectrum | MolecularFamily, GCF], list[str]]: + ) -> dict[tuple[Spectrum | MolecularFamily, GCF], list[Strain]]: """Get common strains between given spectra/molecular families and GCFs. Note that SingletonFamily objects are excluded from given `spectra_or_mfs`. @@ -106,14 +110,12 @@ def get_common_strains( spectra_or_mfs(Sequence[Spectrum] | Sequence[MolecularFamily]): A list of Spectrum or MolecularFamily objects. gcfs(Sequence[GCF]): A list of GCF objects. - filter_no_shared(bool): If True, return only the pair of - spectrum/molecular family and GCF that have common strains; - otherwise, return all pairs no matter they have common strains - or not. 
+ filter_no_shared(bool): If True, the pairs of spectrum/mf and GCF + without common strains will be removed from the returned dict; Returns: dict: A dict where the keys are tuples of (Spectrum/MolecularFamily, GCF) - and values are a list of strain ids that appear in both objects. + and values are a list of shared Strain objects. """ if not isinstance(spectra_or_mfs[0], (Spectrum, MolecularFamily)): raise ValueError( @@ -139,7 +141,7 @@ def get_common_strains( self.occurrence_gcf_strain.loc[gcf.gcf_id])] if filter_no_shared and len(shared_strains) == 0: continue - results[(obj, gcf)] = shared_strains.to_list() + results[(obj, gcf)] = [self._strains.lookup(strain_id) for strain_id in shared_strains] return results def _get_occurrence_gcf_strain(self, gcfs: Sequence[GCF], diff --git a/src/nplinker/scoring/np_class_scoring.py b/src/nplinker/scoring/np_class_scoring.py index b63b7b0a..79d1a71f 100644 --- a/src/nplinker/scoring/np_class_scoring.py +++ b/src/nplinker/scoring/np_class_scoring.py @@ -337,10 +337,12 @@ def get_links(self, objects, link_collection): if not self.npl._datalinks: self.npl._datalinks = self.npl.scoring_method( MetcalfScoring.NAME).datalinks - # this is a dict with structure: - # tup(Spectrum/MolecularFamily, GCF) => array of strain indices - common_strains = self.npl._datalinks.get_common_strains( - objects, targets, True) + if obj_is_gen: + common_strains = self.npl.get_common_strains( + targets, objects) + else: + common_strains = self.npl.get_common_strains( + objects, targets) logger.info(f"Calculating NPClassScore for {len(objects)} objects to " f"{len(targets)} targets ({len(common_strains)} pairwise " f"interactions that share at least 1 strain). 
This might " From 0b137468bab7fbadd57709751d2aec90c3de4bd1 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 3 May 2023 12:08:57 +0200 Subject: [PATCH 44/95] remove lookup_index method from StrainCollection (#90) - remove method `lookup_index` - remove attribute `_strain_dict_index` --- src/nplinker/strain_collection.py | 18 +----------------- tests/test_strain_collection.py | 13 +------------ 2 files changed, 2 insertions(+), 29 deletions(-) diff --git a/src/nplinker/strain_collection.py b/src/nplinker/strain_collection.py index 35e63daf..16bca126 100644 --- a/src/nplinker/strain_collection.py +++ b/src/nplinker/strain_collection.py @@ -1,5 +1,4 @@ import csv -import os from os import PathLike from pathlib import Path from typing import Iterator @@ -9,7 +8,6 @@ from .utils import list_dirs from .utils import list_files - logger = LogConfig.getLogger(__name__) @@ -20,7 +18,6 @@ def __init__(self): self._strains: list[Strain] = [] # dict of strain name (id and alias) to strain object self._strain_dict_name: dict[str, Strain] = {} - self._strain_dict_index: dict[int, Strain] = {} def __repr__(self) -> str: return str(self) @@ -38,7 +35,7 @@ def __len__(self) -> int: def __eq__(self, other) -> bool: if isinstance(other, StrainCollection): return (self._strains == other._strains - and self._strain_dict_index == other._strain_dict_index) + and self._strain_dict_name == other._strain_dict_name) return NotImplemented def __contains__(self, item: str | Strain) -> bool: @@ -74,7 +71,6 @@ def add(self, strain: Strain) -> None: existing.add_alias(alias) self._strain_dict_name[alias] = existing else: - self._strain_dict_index[len(self)] = strain self._strains.append(strain) self._strain_dict_name[strain.id] = strain for alias in strain.aliases: @@ -88,7 +84,6 @@ def remove(self, strain: Strain): """ if strain.id in self._strain_dict_name: self._strains.remove(strain) - # remove from dict id del self._strain_dict_name[strain.id] for alias in strain.aliases: del 
self._strain_dict_name[alias] @@ -102,17 +97,6 @@ def filter(self, strain_set: set[Strain]): if strain not in strain_set: self.remove(strain) - def lookup_index(self, index: int) -> Strain: - """Return the strain from lookup by index. - - Args: - index(int): Position index from which to retrieve the strain - - Returns: - Strain: Strain identified by the given index. - """ - return self._strain_dict_index[index] - def lookup(self, name: str) -> Strain: """Lookup a strain by name (id or alias). diff --git a/tests/test_strain_collection.py b/tests/test_strain_collection.py index bd900a0f..dcdad3df 100644 --- a/tests/test_strain_collection.py +++ b/tests/test_strain_collection.py @@ -41,13 +41,13 @@ def test_iter(collection: StrainCollection, strain: Strain): for actual in collection: assert actual == strain + def test_add(strain: Strain): sut = StrainCollection() sut.add(strain) assert strain in sut for alias in strain.aliases: assert alias in sut - assert sut._strain_dict_index[0] == strain def test_remove(collection: StrainCollection, strain: Strain): @@ -56,9 +56,6 @@ def test_remove(collection: StrainCollection, strain: Strain): with pytest.raises(KeyError): _ = collection._strain_dict_name[strain.id] assert strain not in collection - # TODO: issue #90 - # with pytest.raises(KeyError): - # collection.lookup_index(0) def test_filter(collection: StrainCollection, strain: Strain): @@ -70,13 +67,6 @@ def test_filter(collection: StrainCollection, strain: Strain): assert len(collection) == 1 -def test_lookup_index(collection: StrainCollection, strain: Strain): - actual = collection.lookup_index(0) - assert actual == strain - with pytest.raises(KeyError): - collection.lookup_index(1) - - def test_lookup(collection: StrainCollection, strain: Strain): assert collection.lookup(strain.id) == strain for alias in strain.aliases: @@ -89,7 +79,6 @@ def test_add_from_file(): sut = StrainCollection() sut.add_from_file(DATA_DIR / "strain_mappings.csv") assert len(sut) == 27 - 
assert len(sut.lookup_index(1).aliases) == 29 def test_save_to_file(collection: StrainCollection, tmp_path): From 96d221041bb58a504b73b9f50dda48e6145d92ba Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 3 May 2023 14:39:54 +0200 Subject: [PATCH 45/95] Remove integer id from GCF --- src/nplinker/genomics/gcf.py | 2 +- src/nplinker/loader.py | 6 ------ src/nplinker/scoring/linking/link_finder.py | 5 ++--- 3 files changed, 3 insertions(+), 10 deletions(-) diff --git a/src/nplinker/genomics/gcf.py b/src/nplinker/genomics/gcf.py index 5ea61486..62669bf5 100644 --- a/src/nplinker/genomics/gcf.py +++ b/src/nplinker/genomics/gcf.py @@ -3,6 +3,7 @@ from nplinker.logconfig import LogConfig from nplinker.strain_collection import StrainCollection + if TYPE_CHECKING: from nplinker.strains import Strain from .bgc import BGC @@ -35,7 +36,6 @@ def __init__(self, gcf_id: str, /) -> None: self.bigscape_class: str | None = None # CG TODO: remove attribute id, see issue 103 # https://github.com/NPLinker/nplinker/issues/103 - self.id: int | None = None self.bgc_ids: set[str] = set() self.strains: StrainCollection = StrainCollection() diff --git a/src/nplinker/loader.py b/src/nplinker/loader.py index d4865630..3f44b4df 100644 --- a/src/nplinker/loader.py +++ b/src/nplinker/loader.py @@ -659,12 +659,6 @@ def _load_genomics(self): antismash_bgc_loader.get_files(), self._bigscape_cutoff) - # CG TODO: remove the gcf.id, see issue 103 - # https://github.com/NPLinker/nplinker/issues/103 - # This is only place to set gcf.id value. 
- for i, gcf in enumerate(self.gcfs): - gcf.id = i - #---------------------------------------------------------------------- # CG: write unknown strains in genomics to file #---------------------------------------------------------------------- diff --git a/src/nplinker/scoring/linking/link_finder.py b/src/nplinker/scoring/linking/link_finder.py index 7fcdae03..5997a08f 100644 --- a/src/nplinker/scoring/linking/link_finder.py +++ b/src/nplinker/scoring/linking/link_finder.py @@ -439,15 +439,14 @@ def get_links(self, query_size = 1 # Check type of input_object: + # TODO CG: replace integer ids with string ids # If GCF: if isinstance(input_object[0], GCF): input_type = "gcf" link_levels = [0, 1] # Get necessary ids - # CG: TODO update the logics here: - # don't use integer gcf.id, use string gcf.gcf_id instead. - input_ids = np.array([gcf.id for gcf in input_object], + input_ids = np.array([gcf.gcf_id for gcf in input_object], dtype=np.int32) if main_score == 'likescore': From 44d2c675b46e1e6225ce3f3e6dcddd3c6d6bdf31 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 3 May 2023 14:51:30 +0200 Subject: [PATCH 46/95] update lookup methods and attributes in NPLikner class --- src/nplinker/nplinker.py | 32 +++++++++++--------------------- 1 file changed, 11 insertions(+), 21 deletions(-) diff --git a/src/nplinker/nplinker.py b/src/nplinker/nplinker.py index bc598723..02799ed0 100644 --- a/src/nplinker/nplinker.py +++ b/src/nplinker/nplinker.py @@ -122,6 +122,7 @@ def __init__(self, userconfig=None): self._class_matches = None self._bgc_lookup = {} + self._gcf_lookup = {} self._spec_lookup = {} self._scoring_methods = {} @@ -286,25 +287,17 @@ def load_data(self, new_bigscape_cutoff=None, met_only=False): self._class_matches = self._loader.class_matches logger.debug('Generating lookup tables: genomics') - self._bgc_lookup = {} - for i, bgc in enumerate(self._bgcs): - self._bgc_lookup[bgc.bgc_id] = i - - self._gcf_lookup = {} - for i, gcf in enumerate(self._gcfs): - 
self._gcf_lookup[gcf.gcf_id] = i + self._bgc_lookup = {bgc.bgc_id: bgc for bgc in self._bgcs} + self._gcf_lookup = {gcf.gcf_id: gcf for gcf in self._gcfs} # don't need to do these two if cutoff changed (indicating genomics data # was reloaded but not metabolomics) if new_bigscape_cutoff is None: logger.debug('Generating lookup tables: metabolomics') - self._spec_lookup = {} - for i, spec in enumerate(self._spectra): - self._spec_lookup[spec.spectrum_id] = i - - self._molfam_lookup = {} - for i, molfam in enumerate(self._molfams): - self._molfam_lookup[molfam.id] = i + self._spec_lookup = { + spec.spectrum_id: spec + for spec in self._spectra + } logger.debug('load_data: completed') return True @@ -479,18 +472,15 @@ def has_bgc(self, bgc_id): def lookup_bgc(self, bgc_id): """If BGC ``bgc_id`` exists, return it. Otherwise return None""" - return self._bgcs[self._bgc_lookup[bgc_id]] if self.has_bgc( - bgc_id) else None + return self._bgc_lookup.get(bgc_id, None) def lookup_gcf(self, gcf_id): """If GCF ``gcf_id`` exists, return it. Otherwise return None""" - return self._gcfs[ - self._gcf_lookup[gcf_id]] if gcf_id in self._gcf_lookup else None + return self._gcf_lookup.get(gcf_id, None) - def lookup_spectrum(self, name): + def lookup_spectrum(self, spectrum_id): """If Spectrum ``name`` exists, return it. 
Otherwise return None""" - return self._spectra[ - self._spec_lookup[name]] if name in self._spec_lookup else None + return self._spec_lookup.get(spectrum_id, None) @property def strains(self): From 92aee38528709994e7814bd579931d69e2bec6fd Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 3 May 2023 15:26:40 +0200 Subject: [PATCH 47/95] change cooccurrence from array to DataFrame in DataLinks --- src/nplinker/scoring/linking/data_linking.py | 47 +++++++++++++------- 1 file changed, 31 insertions(+), 16 deletions(-) diff --git a/src/nplinker/scoring/linking/data_linking.py b/src/nplinker/scoring/linking/data_linking.py index daa46915..0edec183 100644 --- a/src/nplinker/scoring/linking/data_linking.py +++ b/src/nplinker/scoring/linking/data_linking.py @@ -14,15 +14,13 @@ from nplinker.strain_collection import StrainCollection from nplinker.strains import Strain - logger = LogConfig.getLogger(__name__) class DataLinks(): def __init__(self, gcfs: Sequence[GCF], spectra: Sequence[Spectrum], - mfs: Sequence[MolecularFamily], - strains: StrainCollection): + mfs: Sequence[MolecularFamily], strains: StrainCollection): """DataLinks class to store occurrence and co-occurrence information. Occurrence refers to the presence of a spectrum/gcf/mf in a strain. @@ -41,21 +39,21 @@ def __init__(self, gcfs: Sequence[GCF], spectra: Sequence[Spectrum], spectra with respect to strains. occurrence_mf_strain(pd.DataFrame): A DataFrame to store occurrence of molecular families with respect to strains. - cooccurrence_spec_gcf(np.array): A 2D numpy array to store co-occurrence + cooccurrence_spec_gcf(pd.DataFrame): A DataFrame to store co-occurrence of spectra<->gcfs. - cooccurrence_spec_notgcf(np.array): A 2D numpy array to store co-occurrence + cooccurrence_spec_notgcf(pd.DataFrame): A DataFrame to store co-occurrence of spectra<->not gcfs. 
- cooccurrence_notspec_gcf(np.array): A 2D numpy array to store co-occurrence + cooccurrence_notspec_gcf(pd.DataFrame): A DataFrame to store co-occurrence of not spectra<->gcfs. - cooccurrence_notspec_notgcf(np.array): A 2D numpy array to store co-occurrence + cooccurrence_notspec_notgcf(pd.DataFrame): A DataFrame to store co-occurrence of not spectra<->not gcfs. - cooccurrence_mf_gcf(np.array): A 2D numpy array to store co-occurrence + cooccurrence_mf_gcf(pd.DataFrame): A DataFrame to store co-occurrence of molecular families<->gcfs. - cooccurrence_mf_notgcf(np.array): A 2D numpy array to store co-occurrence + cooccurrence_mf_notgcf(pd.DataFrame): A DataFrame to store co-occurrence of molecular families<->not gcfs. - cooccurrence_notmf_gcf(np.array): A 2D numpy array to store co-occurrence + cooccurrence_notmf_gcf(pd.DataFrame): A DataFrame to store co-occurrence of not molecular families<->gcfs. - cooccurrence_notmf_notgcf(np.array): A 2D numpy array to store co-occurrence + cooccurrence_notmf_notgcf(pd.DataFrame): A DataFrame to store co-occurrence of not molecular families<->not gcfs. mapping_gcf(pd.DataFrame): A DataFrame to store mappings for gcfs. mapping_spec(pd.DataFrame): A DataFrame to store mappings for spectra. 
@@ -84,7 +82,7 @@ def __init__(self, gcfs: Sequence[GCF], spectra: Sequence[Spectrum], self.mapping_strain = pd.DataFrame() self._get_mappings_from_occurrence() - # np.array to store co-occurrence of "spectra<->gcf" or "mfs<->gcf" + # DataFrame to store co-occurrence of "spectra<->gcf" or "mfs<->gcf" logger.debug("Create correlation matrices: spectra<->gcfs.") (self.cooccurrence_spec_gcf, self.cooccurrence_spec_notgcf, self.cooccurrence_notspec_gcf, @@ -141,7 +139,10 @@ def get_common_strains( self.occurrence_gcf_strain.loc[gcf.gcf_id])] if filter_no_shared and len(shared_strains) == 0: continue - results[(obj, gcf)] = [self._strains.lookup(strain_id) for strain_id in shared_strains] + results[(obj, gcf)] = [ + self._strains.lookup(strain_id) + for strain_id in shared_strains + ] return results def _get_occurrence_gcf_strain(self, gcfs: Sequence[GCF], @@ -220,7 +221,7 @@ def _get_mappings_from_occurrence(self): def _get_cooccurrence( self, link_type: str = 'spec-gcf' - ) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: + ) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]: """Calculate co-occurrence for given link types across strains. Args: @@ -236,5 +237,19 @@ def _get_cooccurrence( f"Link type {link_type} is not supported. Use 'spec-gcf' or 'mf-gcf' instead." 
) logger.debug(f"Calculating correlation matrices of type: {link_type}") - return calc_correlation_matrix(met_strain_occurrence, - self.occurrence_gcf_strain) + m1, m2, m3, m4 = calc_correlation_matrix(met_strain_occurrence, + self.occurrence_gcf_strain) + df_met_gcf = pd.DataFrame(m1, + index=met_strain_occurrence.index, + columns=self.occurrence_gcf_strain.index) + df_met_notgcf = pd.DataFrame(m2, + index=met_strain_occurrence.index, + columns=self.occurrence_gcf_strain.index) + df_notmet_gcf = pd.DataFrame(m3, + index=met_strain_occurrence.index, + columns=self.occurrence_gcf_strain.index) + df_notmet_notgcf = pd.DataFrame( + m4, + index=met_strain_occurrence.index, + columns=self.occurrence_gcf_strain.index) + return df_met_gcf, df_met_notgcf, df_notmet_gcf, df_notmet_notgcf From d23994034f5a24027f6f49658c6691f5b7a3a125 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 3 May 2023 15:41:56 +0200 Subject: [PATCH 48/95] format link_finder.py --- src/nplinker/scoring/linking/link_finder.py | 56 +++++++++++---------- 1 file changed, 30 insertions(+), 26 deletions(-) diff --git a/src/nplinker/scoring/linking/link_finder.py b/src/nplinker/scoring/linking/link_finder.py index 5997a08f..d0100e05 100644 --- a/src/nplinker/scoring/linking/link_finder.py +++ b/src/nplinker/scoring/linking/link_finder.py @@ -1,4 +1,3 @@ - from deprecated import deprecated import numpy as np import pandas as pd @@ -8,10 +7,8 @@ from .data_linking_functions import pair_prob_approx from .data_linking_functions import pair_prob_hg - # CG: TODO get_links function does not work any more, need to update its logics - # import packages for plotting # TODO move plotting to separate module? 
try: @@ -25,11 +22,11 @@ from nplinker.logconfig import LogConfig from nplinker.metabolomics.molecular_family import MolecularFamily - logger = LogConfig.getLogger(__file__) SCORING_METHODS = ['metcalf', 'likescore', 'hg'] + class LinkFinder(): """ Class to: @@ -142,18 +139,20 @@ def metcalf_scoring(self, if type == 'spec-gcf': metcalf_scores = np.zeros(data_links.cooccurrence_spec_gcf.shape) - metcalf_scores = (data_links.cooccurrence_spec_gcf * both + - data_links.cooccurrence_spec_notgcf * type1_not_gcf + - data_links.cooccurrence_notspec_gcf * gcf_not_type1 + - data_links.cooccurrence_notspec_notgcf * not_type1_not_gcf) + metcalf_scores = ( + data_links.cooccurrence_spec_gcf * both + + data_links.cooccurrence_spec_notgcf * type1_not_gcf + + data_links.cooccurrence_notspec_gcf * gcf_not_type1 + + data_links.cooccurrence_notspec_notgcf * not_type1_not_gcf) self.metcalf_spec_gcf = metcalf_scores elif type == 'mf-gcf': metcalf_scores = np.zeros(data_links.cooccurrence_mf_gcf.shape) - metcalf_scores = (data_links.cooccurrence_mf_gcf * both + - data_links.cooccurrence_mf_notgcf * type1_not_gcf + - data_links.cooccurrence_notmf_gcf * gcf_not_type1 + - data_links.cooccurrence_notmf_notgcf * not_type1_not_gcf) + metcalf_scores = ( + data_links.cooccurrence_mf_gcf * both + + data_links.cooccurrence_mf_notgcf * type1_not_gcf + + data_links.cooccurrence_notmf_gcf * gcf_not_type1 + + data_links.cooccurrence_notmf_notgcf * not_type1_not_gcf) self.metcalf_fam_gcf = metcalf_scores return metcalf_scores @@ -174,8 +173,8 @@ def hg_scoring(self, data_links, type='spec-gcf'): # the correct totals. 
if type == 'spec-gcf': - num_strains = np.ones( - data_links.cooccurrence_spec_gcf.shape) * data_links.occurrence_gcf_strain.shape[1] + num_strains = np.ones(data_links.cooccurrence_spec_gcf.shape + ) * data_links.occurrence_gcf_strain.shape[1] overlap_counts = data_links.cooccurrence_spec_gcf gcf_counts = overlap_counts + data_links.cooccurrence_notspec_gcf spec_counts = overlap_counts + data_links.cooccurrence_spec_notgcf @@ -186,8 +185,8 @@ def hg_scoring(self, data_links, type='spec-gcf'): loc=1) self.hg_spec_gcf = hg_scores elif type == 'mf-gcf': - num_strains = np.ones( - data_links.cooccurrence_mf_gcf.shape) * data_links.occurrence_gcf_strain.shape[1] + num_strains = np.ones(data_links.cooccurrence_mf_gcf.shape + ) * data_links.occurrence_gcf_strain.shape[1] overlap_counts = data_links.cooccurrence_mf_gcf gcf_counts = overlap_counts + data_links.cooccurrence_notmf_gcf fam_counts = overlap_counts + data_links.cooccurrence_mf_notgcf @@ -224,11 +223,13 @@ def likelihood_scoring(self, """ if type == 'spec-gcf': - likelihood_scores = np.zeros(data_links.cooccurrence_spec_gcf.shape) + likelihood_scores = np.zeros( + data_links.cooccurrence_spec_gcf.shape) likelihood_scores = ( likelihoods.P_gcf_given_spec * (1 - likelihoods.P_spec_not_gcf) * - (1 - np.exp(-alpha_weighing * data_links.cooccurrence_spec_gcf))) + (1 - + np.exp(-alpha_weighing * data_links.cooccurrence_spec_gcf))) self.likescores_spec_gcf = likelihood_scores @@ -363,7 +364,8 @@ def select_link_candidates(self, id_gcf = link_candidates[1, i] # find set of strains which contain GCF with id link_candidates[1,i] - XG = np.where(data_links.occurrence_gcf_strain.loc[id_gcf, :] == 1)[0] + XG = np.where( + data_links.occurrence_gcf_strain.loc[id_gcf, :] == 1)[0] link_candidates[10, i] = pair_prob_approx(P_str, XG, @@ -372,11 +374,13 @@ def select_link_candidates(self, # Calculate the link specific probability # Find strains where GCF and spectra/family co-occur if type == 'spec-gcf': - XGS = 
np.where((data_links.occurrence_gcf_strain[id_gcf, :] == 1) & - (data_links.occurrence_spec_strain[id_spec, :] == 1))[0] + XGS = np.where( + (data_links.occurrence_gcf_strain[id_gcf, :] == 1) + & (data_links.occurrence_spec_strain[id_spec, :] == 1))[0] elif type == 'mf-gcf': - XGS = np.where((data_links.occurrence_gcf_strain[id_gcf, :] == 1) - & (data_links.occurrence_mf_strain[id_spec, :] == 1))[0] + XGS = np.where( + (data_links.occurrence_gcf_strain[id_gcf, :] == 1) + & (data_links.occurrence_mf_strain[id_spec, :] == 1))[0] link_candidates[11, i] = link_prob(P_str, XGS, int(Nx_list[id_gcf]), int(Ny_list[id_spec]), num_strains) @@ -520,8 +524,7 @@ def get_links(self, hg_scores[linklevel] >= score_cutoff) else: # should never happen - raise Exception( - f'Unknown scoring type! "{main_score}"') + raise Exception(f'Unknown scoring type! "{main_score}"') else: # TODO is this best way to get same output as above code? # to keep the remainder of the method identical in the case of no cutoff @@ -575,7 +578,8 @@ def get_links(self, return links - @deprecated(version="1.3.3", reason="The unworkable method will be removed") + @deprecated(version="1.3.3", + reason="The unworkable method will be removed") def create_cytoscape_files(self, data_links, network_filename, From 8ed8ce9e7444ab3ddf252d090085358fb11dda35 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 3 May 2023 16:32:59 +0200 Subject: [PATCH 49/95] temp replace array with dataframe in LinkFinder for metcalf scoring --- src/nplinker/scoring/linking/link_finder.py | 46 +++++++-------------- 1 file changed, 14 insertions(+), 32 deletions(-) diff --git a/src/nplinker/scoring/linking/link_finder.py b/src/nplinker/scoring/linking/link_finder.py index d0100e05..3fa9bf88 100644 --- a/src/nplinker/scoring/linking/link_finder.py +++ b/src/nplinker/scoring/linking/link_finder.py @@ -7,6 +7,7 @@ from .data_linking_functions import pair_prob_approx from .data_linking_functions import pair_prob_hg + # CG: TODO get_links 
function does not work any more, need to update its logics # import packages for plotting @@ -22,6 +23,7 @@ from nplinker.logconfig import LogConfig from nplinker.metabolomics.molecular_family import MolecularFamily + logger = LogConfig.getLogger(__file__) SCORING_METHODS = ['metcalf', 'likescore', 'hg'] @@ -46,8 +48,8 @@ def __init__(self): """ # metcalf scores - self.metcalf_spec_gcf = [] - self.metcalf_fam_gcf = [] + self.metcalf_spec_gcf = pd.DataFrame() + self.metcalf_fam_gcf = pd.DataFrame() # likelihood scores self.likescores_spec_gcf = [] @@ -138,7 +140,6 @@ def metcalf_scoring(self, # at expected_metcalf[3,6] and sqrt of the variance in the same position if type == 'spec-gcf': - metcalf_scores = np.zeros(data_links.cooccurrence_spec_gcf.shape) metcalf_scores = ( data_links.cooccurrence_spec_gcf * both + data_links.cooccurrence_spec_notgcf * type1_not_gcf + @@ -147,7 +148,6 @@ def metcalf_scoring(self, self.metcalf_spec_gcf = metcalf_scores elif type == 'mf-gcf': - metcalf_scores = np.zeros(data_links.cooccurrence_mf_gcf.shape) metcalf_scores = ( data_links.cooccurrence_mf_gcf * both + data_links.cooccurrence_mf_notgcf * type1_not_gcf + @@ -155,7 +155,6 @@ def metcalf_scoring(self, data_links.cooccurrence_notmf_notgcf * not_type1_not_gcf) self.metcalf_fam_gcf = metcalf_scores - return metcalf_scores def hg_scoring(self, data_links, type='spec-gcf'): """ @@ -446,13 +445,9 @@ def get_links(self, # TODO CG: replace integer ids with string ids # If GCF: if isinstance(input_object[0], GCF): + input_ids = [gcf.gcf_id for gcf in input_object] input_type = "gcf" link_levels = [0, 1] - - # Get necessary ids - input_ids = np.array([gcf.gcf_id for gcf in input_object], - dtype=np.int32) - if main_score == 'likescore': likescores = [ # TODO CG: use dataframe instead of numpy array @@ -461,48 +456,32 @@ def get_links(self, ] elif main_score == 'metcalf': metcalf_scores = [ - self.metcalf_spec_gcf[:, input_ids], - self.metcalf_fam_gcf[:, input_ids] + 
self.metcalf_spec_gcf.loc[:, input_ids], + self.metcalf_fam_gcf.loc[:, input_ids] ] elif main_score == 'hg': hg_scores = [ self.hg_spec_gcf[:, input_ids], self.hg_fam_gcf[:, input_ids] ] - - # If Spectrum: elif isinstance(input_object[0], Spectrum): - # Get necessary ids - input_ids = np.array([spec.id for spec in input_object], - dtype=np.int32) - + input_ids = [spec.spectrum_id for spec in input_object] input_type = "spec" link_levels = [0] if main_score == 'likescore': likescores = [self.likescores_spec_gcf[input_ids, :], []] elif main_score == 'metcalf': - metcalf_scores = [self.metcalf_spec_gcf[input_ids, :], []] + metcalf_scores = [self.metcalf_spec_gcf.loc[input_ids, :], []] elif main_score == 'hg': hg_scores = [self.hg_spec_gcf[input_ids, :], []] - # If MolecularFamily: elif isinstance(input_object[0], MolecularFamily): - - # Get necessary ids - # TODO: include Singletons, maybe optinal - #input_ids = np.zeros(query_size) - #mapping_fam_id = data_links.mapping_fam["original family id"] - #for i, family in enumerate(input_object): - # input_ids[i] = np.where(mapping_fam_id == int(family.family_id))[0] - #input_ids = input_ids.astype(int) - input_ids = np.array([mf.id for mf in input_object], - dtype=np.int32) - + input_ids = [mf.family_id for mf in input_object] input_type = "fam" link_levels = [1] if main_score == 'likescore': likescores = [[], self.likescores_fam_gcf[input_ids, :]] elif main_score == 'metcalf': - metcalf_scores = [[], self.metcalf_fam_gcf[input_ids, :]] + metcalf_scores = [[], self.metcalf_fam_gcf.loc[input_ids, :]] elif main_score == 'hg': hg_scores = [[], self.hg_fam_gcf[input_ids, :]] else: @@ -517,6 +496,8 @@ def get_links(self, candidate_ids = np.where( likescores[linklevel] >= score_cutoff) elif main_score == 'metcalf': + # get a tuple of arrays of indices where the metcalf score is above the cutoff + # the first array is the row indices, the second is the column indices candidate_ids = np.where( metcalf_scores[linklevel] >= 
score_cutoff) elif main_score == 'hg': @@ -532,6 +513,7 @@ def get_links(self, # currently abusing np.where like this candidate_ids = np.where(metcalf_scores[linklevel] != np.nan) + # TODO: 230503 continue replace array with dataframe link_candidates = np.zeros((3, candidate_ids[0].shape[0])) # this is supposed to construct a (3, x) array, where: From e4bdd958ea457672e39a808b52564a37d955fc89 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Mon, 8 May 2023 10:33:18 +0200 Subject: [PATCH 50/95] refactor `LinkFinder.get_scores` method --- src/nplinker/scoring/linking/link_finder.py | 41 ++++++++++++++------- 1 file changed, 27 insertions(+), 14 deletions(-) diff --git a/src/nplinker/scoring/linking/link_finder.py b/src/nplinker/scoring/linking/link_finder.py index 3fa9bf88..3b6c788d 100644 --- a/src/nplinker/scoring/linking/link_finder.py +++ b/src/nplinker/scoring/linking/link_finder.py @@ -67,26 +67,39 @@ def __init__(self): self.metcalf_expected = None self.metcalf_variance = None - def get_scores(self, method, type_): - if method == 'metcalf': - if type_ == 'spec-gcf': + def get_scores(self, scoring_method: str, link_type: str) -> pd.DataFrame: + """Get the scores for a given method and link type. + + Args: + scoring_method (str): The scoring method to use. Available methods + are 'metcalf', 'likescore', and 'hg'. + link_type (str): The type of link to get scores for. Available + types are 'spec-gcf' and 'mf-gcf'. + + Returns: + pd.DataFrame: The scores for the given method and link type. + + Raises: + ValueError: If the scoring method or link type is unknown. 
+ """ + if scoring_method == 'metcalf': + if link_type == 'spec-gcf': return self.metcalf_spec_gcf - elif type_ == 'mf-gcf': + if link_type == 'mf-gcf': return self.metcalf_fam_gcf - elif method == 'likescore': - if type_ == 'spec-gcf': + if scoring_method == 'likescore': + if link_type == 'spec-gcf': return self.likescores_spec_gcf - elif type_ == 'mf-gcf': + if link_type == 'mf-gcf': return self.likescores_fam_gcf - elif method == 'hg': - if type_ == 'spec-gcf': + if scoring_method == 'hg': + if link_type == 'spec-gcf': return self.hg_spec_gcf - elif type_ == 'mf-gcf': + if link_type == 'mf-gcf': return self.hg_fam_gcf - - raise Exception( - 'Unknown method or type (method="{}", type="{}")'.format( - method, type_)) + raise ValueError( + f'Unknown method or type method="{scoring_method}", type="{link_type}"' + ) def metcalf_scoring(self, data_links, From 14a1aaccc8d1627eda6cacf4f87f2c8905c3e238 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Mon, 8 May 2023 11:31:29 +0200 Subject: [PATCH 51/95] refactor `LinkFinder.metcalf_scoring` method - rename parameter name - wrap parameters for weights to one parameter - extract private method `_cal_mean_std` --- src/nplinker/scoring/linking/link_finder.py | 160 ++++++++++---------- src/nplinker/scoring/metcalf_scoring.py | 12 +- tests/scoring/test_scoring.py | 4 +- 3 files changed, 91 insertions(+), 85 deletions(-) diff --git a/src/nplinker/scoring/linking/link_finder.py b/src/nplinker/scoring/linking/link_finder.py index 3b6c788d..7722c2ca 100644 --- a/src/nplinker/scoring/linking/link_finder.py +++ b/src/nplinker/scoring/linking/link_finder.py @@ -1,3 +1,5 @@ +from __future__ import annotations +from typing import TYPE_CHECKING from deprecated import deprecated import numpy as np import pandas as pd @@ -24,9 +26,13 @@ from nplinker.metabolomics.molecular_family import MolecularFamily +if TYPE_CHECKING: + from .data_linking import DataLinks + logger = LogConfig.getLogger(__file__) -SCORING_METHODS = ['metcalf', 
'likescore', 'hg'] +SCORE_TYPES = ['metcalf', 'likescore', 'hg'] +LINK_TYPES = ['spec-gcf', 'mf-gcf'] class LinkFinder(): @@ -64,15 +70,15 @@ def __init__(self): self.link_candidates_gcf_fam = [] # metcalf caching - self.metcalf_expected = None - self.metcalf_variance = None + self.metcalf_mean = None + self.metcalf_std = None - def get_scores(self, scoring_method: str, link_type: str) -> pd.DataFrame: - """Get the scores for a given method and link type. + def get_scores(self, score_type: str, link_type: str) -> pd.DataFrame: + """Get the scores for given scoring type and link type. Args: - scoring_method (str): The scoring method to use. Available methods - are 'metcalf', 'likescore', and 'hg'. + score_type (str): The type of scoring method to use. Available + scoring methods are 'metcalf', 'likescore', and 'hg'. link_type (str): The type of link to get scores for. Available types are 'spec-gcf' and 'mf-gcf'. @@ -80,94 +86,94 @@ def get_scores(self, scoring_method: str, link_type: str) -> pd.DataFrame: pd.DataFrame: The scores for the given method and link type. Raises: - ValueError: If the scoring method or link type is unknown. + ValueError: If an invalid score type or link type is given. """ - if scoring_method == 'metcalf': + if score_type not in SCORE_TYPES: + raise ValueError( + f'Invalid score type "{score_type}". Must be one of "{SCORE_TYPES}"' + ) + if link_type not in LINK_TYPES: + raise ValueError( + f'Invalid link type "{link_type}". 
Must be one of "{LINK_TYPES}"' + ) + if score_type == 'metcalf': if link_type == 'spec-gcf': return self.metcalf_spec_gcf if link_type == 'mf-gcf': return self.metcalf_fam_gcf - if scoring_method == 'likescore': + if score_type == 'likescore': if link_type == 'spec-gcf': return self.likescores_spec_gcf if link_type == 'mf-gcf': return self.likescores_fam_gcf - if scoring_method == 'hg': + if score_type == 'hg': if link_type == 'spec-gcf': return self.hg_spec_gcf if link_type == 'mf-gcf': return self.hg_fam_gcf - raise ValueError( - f'Unknown method or type method="{scoring_method}", type="{link_type}"' - ) - - def metcalf_scoring(self, - data_links, - both=10, - type1_not_gcf=-10, - gcf_not_type1=0, - not_type1_not_gcf=1, - type='spec-gcf'): - """ - Calculate metcalf scores from DataLinks() co-occurence matrices + + def metcalf_scoring( + self, + data_links: DataLinks, + link_type: str = 'spec-gcf', + scoring_weights: tuple[int, int, int, int] = (10, -10, 0, 1) + ) -> None: + """Calculate metcalf scores. + + Args: + data_links (DataLinks): The DataLinks object to use for scoring. + link_type (str, optional): The type of link to score. Available + types are 'spec-gcf' and 'mf-gcf'. Defaults to 'spec-gcf'. + scoring_weights (tuple[int,int,int,int], optional): The weights to + use for Metcalf scoring. The weights are applied to + '(met_gcf, met_not_gcf, gcf_not_met, not_met_not_gcf)'. + Defaults to (10, -10, 0, 1). 
""" + if link_type == 'spec-gcf': + self.metcalf_spec_gcf = ( + data_links.cooccurrence_spec_gcf * scoring_weights[0] + + data_links.cooccurrence_spec_notgcf * scoring_weights[1] + + data_links.cooccurrence_notspec_gcf * scoring_weights[2] + + data_links.cooccurrence_notspec_notgcf * scoring_weights[3]) + if link_type == 'mf-gcf': + self.metcalf_fam_gcf = ( + data_links.cooccurrence_mf_gcf * scoring_weights[0] + + data_links.cooccurrence_mf_notgcf * scoring_weights[1] + + data_links.cooccurrence_notmf_gcf * scoring_weights[2] + + data_links.cooccurrence_notmf_notgcf * scoring_weights[3]) + if self.metcalf_mean is None or self.metcalf_std is None: + self.metcalf_mean, self.metcalf_std = self._cal_mean_std( + data_links, scoring_weights) + + def _cal_mean_std(self, data_links, scoring_weights): # Compute the expected values for all possible values of spec and gcf strains # we need the total number of strains _, n_strains = data_links.occurrence_gcf_strain.shape - if self.metcalf_expected is None: - sz = (n_strains + 1, n_strains + 1) - self.metcalf_expected = np.zeros(sz) - self.metcalf_variance = np.zeros(sz) - - for n in range(n_strains + 1): - for m in range(n_strains + 1): - max_overlap = min(n, m) - min_overlap = max( - 0, - n + m - n_strains) # minimum possible strain overlap - expected_value = 0 - expected_sq = 0 - for o in range(min_overlap, max_overlap + 1): - o_prob = hypergeom.pmf(o, n_strains, n, m) - # compute metcalf for n strains in type 1 and m in gcf - score = o * both - score += type1_not_gcf * (n - o) - score += gcf_not_type1 * (m - o) - score += not_type1_not_gcf * (n_strains - (n + m - o)) - expected_value += o_prob * score - expected_sq += o_prob * (score**2) - - self.metcalf_expected[n, m] = expected_value - expected_sq = expected_sq - expected_value**2 - if expected_sq < 1e-09: - expected_sq = 1 - self.metcalf_variance[n, m] = expected_sq - - self.metcalf_variance_sqrt = np.sqrt(self.metcalf_variance) - - # now, we would like an option to take 
any actual score an subtract the - # expected value and then divide by the square root of the variance - #Ā e.g. if we have a score computed between a type 1 object that has - # 3 strains, and a gcf with 6 strains, we should use the expected value - # at expected_metcalf[3,6] and sqrt of the variance in the same position - - if type == 'spec-gcf': - metcalf_scores = ( - data_links.cooccurrence_spec_gcf * both + - data_links.cooccurrence_spec_notgcf * type1_not_gcf + - data_links.cooccurrence_notspec_gcf * gcf_not_type1 + - data_links.cooccurrence_notspec_notgcf * not_type1_not_gcf) - self.metcalf_spec_gcf = metcalf_scores - - elif type == 'mf-gcf': - metcalf_scores = ( - data_links.cooccurrence_mf_gcf * both + - data_links.cooccurrence_mf_notgcf * type1_not_gcf + - data_links.cooccurrence_notmf_gcf * gcf_not_type1 + - data_links.cooccurrence_notmf_notgcf * not_type1_not_gcf) - - self.metcalf_fam_gcf = metcalf_scores + sz = (n_strains + 1, n_strains + 1) + mean = np.zeros(sz) + variance = np.zeros(sz) + for n in range(n_strains + 1): + for m in range(n_strains + 1): + max_overlap = min(n, m) + min_overlap = max(0, n + m - n_strains) + expected_value = 0 + expected_sq = 0 + for o in range(min_overlap, max_overlap + 1): + o_prob = hypergeom.pmf(o, n_strains, n, m) + # compute metcalf for n strains in type 1 and m in gcf + score = o * scoring_weights[0] + score += scoring_weights[1] * (n - o) + score += scoring_weights[2] * (m - o) + score += scoring_weights[3] * (n_strains - (n + m - o)) + expected_value += o_prob * score + expected_sq += o_prob * (score**2) + mean[n, m] = expected_value + expected_sq = expected_sq - expected_value**2 + if expected_sq < 1e-09: + expected_sq = 1 + variance[n, m] = expected_sq + return mean, np.sqrt(variance) def hg_scoring(self, data_links, type='spec-gcf'): """ diff --git a/src/nplinker/scoring/metcalf_scoring.py b/src/nplinker/scoring/metcalf_scoring.py index 00c04487..4daa06f0 100644 --- a/src/nplinker/scoring/metcalf_scoring.py +++ 
b/src/nplinker/scoring/metcalf_scoring.py @@ -81,9 +81,9 @@ def setup(npl): npl._molfams, npl._strains) MetcalfScoring.LINKFINDER = LinkFinder() MetcalfScoring.LINKFINDER.metcalf_scoring(MetcalfScoring.DATALINKS, - type='spec-gcf') + link_type='spec-gcf') MetcalfScoring.LINKFINDER.metcalf_scoring(MetcalfScoring.DATALINKS, - type='mf-gcf') + link_type='mf-gcf') logger.debug('MetcalfScoring.setup caching results') save_pickled_data((dataset_counts, MetcalfScoring.DATALINKS, MetcalfScoring.LINKFINDER), cache_file) @@ -117,8 +117,8 @@ def _metcalf_postprocess_met(self, linkfinder, results, input_type): gen_strains = len(gcf.strains) # lookup expected + variance values based on strain counts - expected = linkfinder.metcalf_expected[met_strains][gen_strains] - variance_sqrt = linkfinder.metcalf_variance_sqrt[met_strains][ + expected = linkfinder.metcalf_mean[met_strains][gen_strains] + variance_sqrt = linkfinder.metcalf_std[met_strains][ gen_strains] # calculate the final score based on the basic Metcalf score for these two @@ -163,9 +163,9 @@ def _metcalf_postprocess_gen(self, linkfinder, results, input_type): met_strains = len(met_obj.strains) # lookup expected + variance values based on strain counts - expected = linkfinder.metcalf_expected[met_strains][ + expected = linkfinder.metcalf_mean[met_strains][ gen_strains] - variance_sqrt = linkfinder.metcalf_variance_sqrt[met_strains][ + variance_sqrt = linkfinder.metcalf_std[met_strains][ gen_strains] # calculate the final score based on the basic Metcalf score for these two diff --git a/tests/scoring/test_scoring.py b/tests/scoring/test_scoring.py index 8c46a28e..2825ad9e 100644 --- a/tests/scoring/test_scoring.py +++ b/tests/scoring/test_scoring.py @@ -84,8 +84,8 @@ def do_scoring_new(gcfs, spectra, strains, standardised): # (note that spectrum = type 1 object here) met_strains = len(spec.strains) gen_strains = len(gcf.strains) - expected = lf.metcalf_expected[met_strains][gen_strains] - variance = 
lf.metcalf_variance[met_strains][gen_strains] + expected = lf.metcalf_mean[met_strains][gen_strains] + variance = lf.metcalf_std[met_strains][gen_strains] scores[j][i] = (scores[j][i] - expected) / np.sqrt(variance) return scores From d2d6a1033dfed33efbb1c911245b068070e1e5ee Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Mon, 8 May 2023 12:12:27 +0200 Subject: [PATCH 52/95] refactor get_links --- src/nplinker/scoring/linking/link_finder.py | 234 ++++++++------------ src/nplinker/scoring/metcalf_scoring.py | 4 +- 2 files changed, 95 insertions(+), 143 deletions(-) diff --git a/src/nplinker/scoring/linking/link_finder.py b/src/nplinker/scoring/linking/link_finder.py index 7722c2ca..b409d8d6 100644 --- a/src/nplinker/scoring/linking/link_finder.py +++ b/src/nplinker/scoring/linking/link_finder.py @@ -339,7 +339,7 @@ def select_link_candidates(self, else: raise Exception( 'Wrong scoring type given. Must be one of: {}'.format( - SCORING_METHODS)) + SCORE_TYPES)) logger.debug(candidate_ids[0].shape[0], " candidates selected with ", index_names[2], " >= ", P_cutoff, " and a link score >= ", @@ -429,156 +429,108 @@ def select_link_candidates(self, raise Exception("No candidate selection was created.") return link_candidates_pd - def get_links(self, - data_links, - input_object, - main_score='likescore', - score_cutoff=0.5): - """ - Output likely links for 'input_object' + @staticmethod + def _isinstance(*objects, obj_type=GCF) -> bool: + return all(isinstance(x, obj_type) for x in objects) - Parameters - ---------- - input_objects: object() - Object or list of objects of either class: spectra, families, or GCFs - main_score: str - Which main score to use ('metcalf', 'likescore') - score_cutoff: - Thresholds to conly consider candidates for which: - score >= score_cutoff - """ + def get_links( + self, + *objects: tuple[GCF, ...] | tuple[Spectrum, ...] 
+ | tuple[MolecularFamily, ...], + score_type: str = 'likescore', + score_cutoff: float = 0.5 + ) -> list[tuple[tuple[str], tuple[str], tuple[float]]]: + """Get scores for links between objects. - if main_score not in SCORING_METHODS: - raise Exception( - 'Wrong scoring type given. Must be one of: {}'.format( - SCORING_METHODS)) + Args: + objects(tuple): GCF, Spectrum or MolecularFamily objects. + score_type(str): Type of score to use for link calculation. + Must be one of 'likescore', 'metcalf' and 'hg'. + score_cutoff(float): Minimum score to consider a link (ā‰„score_cutoff). + Default is 0.5. + Returns: + list: List of tuples containing the ids of the linked objects and the score. + The tuple contains three tuples: + - the first tuple contains the ids of the input/source objects, + - the second tuple contains the ids of the target objects, + - the third tuple contains the scores. - # Check if input is list: - if isinstance(input_object, list): - query_size = len(input_object) - else: - input_object = [input_object] - query_size = 1 - - # Check type of input_object: - # TODO CG: replace integer ids with string ids - # If GCF: - if isinstance(input_object[0], GCF): - input_ids = [gcf.gcf_id for gcf in input_object] - input_type = "gcf" - link_levels = [0, 1] - if main_score == 'likescore': - likescores = [ - # TODO CG: use dataframe instead of numpy array - self.likescores_spec_gcf[:, input_ids], - self.likescores_fam_gcf[:, input_ids] - ] - elif main_score == 'metcalf': - metcalf_scores = [ - self.metcalf_spec_gcf.loc[:, input_ids], - self.metcalf_fam_gcf.loc[:, input_ids] - ] - elif main_score == 'hg': - hg_scores = [ - self.hg_spec_gcf[:, input_ids], self.hg_fam_gcf[:, - input_ids] - ] - elif isinstance(input_object[0], Spectrum): - input_ids = [spec.spectrum_id for spec in input_object] - input_type = "spec" - link_levels = [0] - if main_score == 'likescore': - likescores = [self.likescores_spec_gcf[input_ids, :], []] - elif main_score == 'metcalf': - 
metcalf_scores = [self.metcalf_spec_gcf.loc[input_ids, :], []] - elif main_score == 'hg': - hg_scores = [self.hg_spec_gcf[input_ids, :], []] - elif isinstance(input_object[0], MolecularFamily): - input_ids = [mf.family_id for mf in input_object] - input_type = "fam" - link_levels = [1] - if main_score == 'likescore': - likescores = [[], self.likescores_fam_gcf[input_ids, :]] - elif main_score == 'metcalf': - metcalf_scores = [[], self.metcalf_fam_gcf.loc[input_ids, :]] - elif main_score == 'hg': - hg_scores = [[], self.hg_fam_gcf[input_ids, :]] + Raises: + TypeError: If input objects are not GCF, Spectrum or MolecularFamily objects. + ValueError: If score type is not one of 'likescore', 'metcalf' and 'hg'. + """ + if self._isinstance(*objects, GCF): + obj_type = 'gcf' + elif self._isinstance(*objects, Spectrum): + obj_type = 'spec' + elif self._isinstance(*objects, MolecularFamily): + obj_type = 'fam' else: - raise Exception( - "Input_object must be Spectrum, MolecularFamily, or GCF object (single or list)." + raise TypeError( + 'Input objects must be GCF, Spectrum or MolecularFamily objects.' + ) + + if score_type not in SCORE_TYPES: + raise ValueError( + f'Invalid score type "{score_type}". Must be one of "{SCORE_TYPES}"' ) links = [] - for linklevel in link_levels: - if score_cutoff is not None: - if main_score == 'likescore': - candidate_ids = np.where( - likescores[linklevel] >= score_cutoff) - elif main_score == 'metcalf': - # get a tuple of arrays of indices where the metcalf score is above the cutoff - # the first array is the row indices, the second is the column indices - candidate_ids = np.where( - metcalf_scores[linklevel] >= score_cutoff) - elif main_score == 'hg': - candidate_ids = np.where( - hg_scores[linklevel] >= score_cutoff) - else: - # should never happen - raise Exception(f'Unknown scoring type! "{main_score}"') - else: - # TODO is this best way to get same output as above code? 
- # to keep the remainder of the method identical in the case of no cutoff - # being supplied, while still returning all the candidate links, I'm - # currently abusing np.where like this - candidate_ids = np.where(metcalf_scores[linklevel] != np.nan) - - # TODO: 230503 continue replace array with dataframe - link_candidates = np.zeros((3, candidate_ids[0].shape[0])) - - # this is supposed to construct a (3, x) array, where: - # - 1st index gives list of source/input object IDs - # - 2nd index gives list of destination/link object IDs - # - 3rd index gives list of scores for the link between the given pair of objects - # - x = number of links found - - # if there is only a single object given as input, things are pretty simple here: - if query_size == 1: - # first, can set every index of the input object ID array to the - # single object ID we've been give - link_candidates[0, :] = input_ids - # then, based on input type copy the other object IDs from candidate_ids - if input_type == 'gcf': - link_candidates[1, :] = candidate_ids[0].astype(int) - else: - link_candidates[1, :] = candidate_ids[1].astype(int) - else: - # if there is a list of input objects, things are slightly more complex - # - the "input IDs" element of the output array now needs to be set by - # a lookup into the original input_ids array based on candidate_ids - # - the "output IDs" element is taken directly from the other element - # of candidate IDs - if input_type == 'gcf': - link_candidates[0, :] = input_ids[candidate_ids[1].astype( - int)] - link_candidates[1, :] = candidate_ids[0].astype(int) - else: - link_candidates[0, :] = input_ids[candidate_ids[0].astype( - int)] - link_candidates[1, :] = candidate_ids[1].astype(int) - - # finally, copy in the actual scores too - if main_score == 'likescore': - link_candidates[2, :] = likescores[linklevel][candidate_ids] - elif main_score == 'metcalf': - link_candidates[ - 2, :] = metcalf_scores[linklevel][candidate_ids] - elif main_score == 'hg': - 
link_candidates[2, :] = hg_scores[linklevel][candidate_ids] - - links.append(link_candidates) + # TODO CG: replace integer ids with string ids for `hg` and `likescore` + if obj_type == 'gcf': + obj_ids = [gcf.gcf_id for gcf in objects] + if score_type == 'likescore': + all_scores = (self.likescores_spec_gcf[:, obj_ids], + self.likescores_fam_gcf[:, obj_ids]) + if score_type == 'metcalf': + all_scores = (self.metcalf_spec_gcf.loc[:, obj_ids], + self.metcalf_fam_gcf.loc[:, obj_ids]) + if score_type == 'hg': + all_scores = (self.hg_spec_gcf[:, obj_ids], + self.hg_fam_gcf[:, obj_ids]) + for scores in all_scores: + links.append(self._get_scores_source_gcf(scores, score_cutoff)) + + if obj_type == 'spec': + obj_ids = [spec.spectrum_id for spec in objects] + if score_type == 'likescore': + all_scores = self.likescores_spec_gcf[obj_ids, :] + if score_type == 'metcalf': + all_scores = self.metcalf_spec_gcf.loc[obj_ids, :] + if score_type == 'hg': + all_scores = self.hg_spec_gcf[obj_ids, :] + links.append(self._get_scores_source_met(all_scores, score_cutoff)) + + if obj_type == 'fam': + obj_ids = [mf.family_id for mf in objects] + if score_type == 'likescore': + all_scores = self.likescores_fam_gcf[obj_ids, :] + if score_type == 'metcalf': + all_scores = self.metcalf_fam_gcf.loc[obj_ids, :] + if score_type == 'hg': + all_scores = self.hg_fam_gcf[obj_ids:] + links.append(self._get_scores_source_met(all_scores, score_cutoff)) return links + def _get_scores_source_gcf(self, scores, score_cutoff): + candidate_met_gcf_indexes = np.where(scores >= score_cutoff) + src_obj_ids = scores.columns[candidate_met_gcf_indexes[1]].to_list() + target_obj_ids = scores.index[candidate_met_gcf_indexes[0]].to_list() + scores_candidate = scores.to_numpy()[candidate_met_gcf_indexes].tolist( + ) + return tuple(src_obj_ids), tuple(target_obj_ids), tuple( + scores_candidate) + + def _get_scores_source_met(self, scores, score_cutoff): + candidate_met_gcf_indexes = np.where(scores >= score_cutoff) + 
src_obj_ids = scores.index[candidate_met_gcf_indexes[0]].to_list() + target_obj_ids = scores.columns[candidate_met_gcf_indexes[1]].to_list() + scores_candidate = scores.to_numpy()[candidate_met_gcf_indexes].tolist( + ) + return tuple(src_obj_ids), tuple(target_obj_ids), tuple( + scores_candidate) + @deprecated(version="1.3.3", reason="The unworkable method will be removed") def create_cytoscape_files(self, diff --git a/src/nplinker/scoring/metcalf_scoring.py b/src/nplinker/scoring/metcalf_scoring.py index 4daa06f0..9fb76fe6 100644 --- a/src/nplinker/scoring/metcalf_scoring.py +++ b/src/nplinker/scoring/metcalf_scoring.py @@ -203,13 +203,13 @@ def get_links(self, objects, link_collection): logger.debug('MetcalfScoring: standardised = {}'.format( self.standardised)) if not self.standardised: - results = linkfinder.get_links(datalinks, objects, self.name, + results = linkfinder.get_links(objects, self.name, self.cutoff) else: # get the basic Metcalf scores BUT ignore the cutoff value here by setting # it to None. The actual user-supplied cutoff value is applied further down # once the standardised scores for these results have been calculated. 
- results = linkfinder.get_links(datalinks, objects, self.name, None) + results = linkfinder.get_links(objects, self.name, None) # The "results" object varies slightly depending on the input provided # to the LinkFinder class: From 2078488f41dadaa74255486ead4367d34c5bc738 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 9 May 2023 10:14:11 +0200 Subject: [PATCH 53/95] remove unused methods and scorings from LinkFinder - remove unused `likescore` and `hg` scoring types - remove all unused methods --- src/nplinker/scoring/linking/link_finder.py | 574 +------------------- 1 file changed, 13 insertions(+), 561 deletions(-) diff --git a/src/nplinker/scoring/linking/link_finder.py b/src/nplinker/scoring/linking/link_finder.py index b409d8d6..a97775bd 100644 --- a/src/nplinker/scoring/linking/link_finder.py +++ b/src/nplinker/scoring/linking/link_finder.py @@ -1,29 +1,12 @@ from __future__ import annotations from typing import TYPE_CHECKING -from deprecated import deprecated import numpy as np import pandas as pd from scipy.stats import hypergeom from nplinker.genomics.gcf import GCF -from nplinker.metabolomics.spectrum import Spectrum -from .data_linking_functions import pair_prob_approx -from .data_linking_functions import pair_prob_hg - - -# CG: TODO get_links function does not work any more, need to update its logics - -# import packages for plotting -# TODO move plotting to separate module? 
-try: - from matplotlib import pyplot as plt - import seaborn as sns -except ImportError: - print( - 'Warning: plotting functionality will not be available (missing matplotlib and/or seaborn)' - ) - from nplinker.logconfig import LogConfig from nplinker.metabolomics.molecular_family import MolecularFamily +from nplinker.metabolomics.spectrum import Spectrum if TYPE_CHECKING: @@ -31,7 +14,6 @@ logger = LogConfig.getLogger(__file__) -SCORE_TYPES = ['metcalf', 'likescore', 'hg'] LINK_TYPES = ['spec-gcf', 'mf-gcf'] @@ -52,67 +34,15 @@ def __init__(self): Separate tables will exist for different linking scenarios, such as gcfs <-> spectra OR gcf <-> mol.families """ - # metcalf scores - self.metcalf_spec_gcf = pd.DataFrame() - self.metcalf_fam_gcf = pd.DataFrame() - - # likelihood scores - self.likescores_spec_gcf = [] - self.likescores_fam_gcf = [] - - # hg scores - self.hg_spec_gcf = [] - self.hg_fam_gcf = [] - - # link candidate tables - self.link_candidates_gcf_spec = [] - self.link_candidates_gcf_fam = [] + self.raw_score_spec_gcf = pd.DataFrame() + self.raw_score_fam_gcf = pd.DataFrame() # metcalf caching self.metcalf_mean = None self.metcalf_std = None - def get_scores(self, score_type: str, link_type: str) -> pd.DataFrame: - """Get the scores for given scoring type and link type. - - Args: - score_type (str): The type of scoring method to use. Available - scoring methods are 'metcalf', 'likescore', and 'hg'. - link_type (str): The type of link to get scores for. Available - types are 'spec-gcf' and 'mf-gcf'. - - Returns: - pd.DataFrame: The scores for the given method and link type. - - Raises: - ValueError: If an invalid score type or link type is given. - """ - if score_type not in SCORE_TYPES: - raise ValueError( - f'Invalid score type "{score_type}". Must be one of "{SCORE_TYPES}"' - ) - if link_type not in LINK_TYPES: - raise ValueError( - f'Invalid link type "{link_type}". 
Must be one of "{LINK_TYPES}"' - ) - if score_type == 'metcalf': - if link_type == 'spec-gcf': - return self.metcalf_spec_gcf - if link_type == 'mf-gcf': - return self.metcalf_fam_gcf - if score_type == 'likescore': - if link_type == 'spec-gcf': - return self.likescores_spec_gcf - if link_type == 'mf-gcf': - return self.likescores_fam_gcf - if score_type == 'hg': - if link_type == 'spec-gcf': - return self.hg_spec_gcf - if link_type == 'mf-gcf': - return self.hg_fam_gcf - - def metcalf_scoring( + def cal_score( self, data_links: DataLinks, link_type: str = 'spec-gcf', @@ -130,13 +60,13 @@ def metcalf_scoring( Defaults to (10, -10, 0, 1). """ if link_type == 'spec-gcf': - self.metcalf_spec_gcf = ( + self.raw_score_spec_gcf = ( data_links.cooccurrence_spec_gcf * scoring_weights[0] + data_links.cooccurrence_spec_notgcf * scoring_weights[1] + data_links.cooccurrence_notspec_gcf * scoring_weights[2] + data_links.cooccurrence_notspec_notgcf * scoring_weights[3]) if link_type == 'mf-gcf': - self.metcalf_fam_gcf = ( + self.raw_score_fam_gcf = ( data_links.cooccurrence_mf_gcf * scoring_weights[0] + data_links.cooccurrence_mf_notgcf * scoring_weights[1] + data_links.cooccurrence_notmf_gcf * scoring_weights[2] + @@ -175,277 +105,16 @@ def _cal_mean_std(self, data_links, scoring_weights): variance[n, m] = expected_sq return mean, np.sqrt(variance) - def hg_scoring(self, data_links, type='spec-gcf'): - """ - Calculate metcalf scores from DataLinks() co-occurence matrices - """ - - # NOTE:can't use the correlation matrices directly for this scoring method because - # it seems to require more inclusive counts of the strains in each object. - # Instead of "number of strains only in GCF", it requires "number of strains in the - # GCF PLUS the number shared between the GCF and the other object". - # e.g. 
if a spectrum has 3 strains, a GCF has 1 strain and there is 1 shared strain, - # cooccurrence_spec_gcf will correctly contain "1", but M_type1_notgcf will contain "2" instead - # of "3", because the spectrum only has 2 distinct strains vs the GCF. - # To fix this the cooccurrence_spec_gcf/cooccurrence_mf_gcf matrix can just be added onto the others to give - # the correct totals. - - if type == 'spec-gcf': - num_strains = np.ones(data_links.cooccurrence_spec_gcf.shape - ) * data_links.occurrence_gcf_strain.shape[1] - overlap_counts = data_links.cooccurrence_spec_gcf - gcf_counts = overlap_counts + data_links.cooccurrence_notspec_gcf - spec_counts = overlap_counts + data_links.cooccurrence_spec_notgcf - hg_scores = hypergeom.sf(overlap_counts, - num_strains, - gcf_counts, - spec_counts, - loc=1) - self.hg_spec_gcf = hg_scores - elif type == 'mf-gcf': - num_strains = np.ones(data_links.cooccurrence_mf_gcf.shape - ) * data_links.occurrence_gcf_strain.shape[1] - overlap_counts = data_links.cooccurrence_mf_gcf - gcf_counts = overlap_counts + data_links.cooccurrence_notmf_gcf - fam_counts = overlap_counts + data_links.cooccurrence_mf_notgcf - hg_scores = hypergeom.sf(overlap_counts, - num_strains, - gcf_counts, - fam_counts, - loc=1) - self.hg_fam_gcf = hg_scores - - return hg_scores - - def likelihood_scoring(self, - data_links, - likelihoods, - alpha_weighing=0.5, - type='spec-gcf'): - """ - Calculate likelihood scores from DataLinks() co-occurence matrices. 
- - Idea: - Score reflect the directionality BGC-->compound-->spectrum, which - suggests that the most relevant likelihoods are: - P(gcf|type1) - If type1 is the result of only one particular gene cluster, - this value should be high (close or equal to 1) - P(type1|not gcf) - Following the same logic, this value should be very - small or 0 ("no gene cluster, no compound") - - Score: - Score = P(gcf|type1) * (1 - P(type1|not gcf) * weighing function - - weighing function is here a function of the number of strains they co-occur. - weighing function = (1 - exp(-alpha_weighing * num_of_co-occurrences) - """ - - if type == 'spec-gcf': - likelihood_scores = np.zeros( - data_links.cooccurrence_spec_gcf.shape) - likelihood_scores = ( - likelihoods.P_gcf_given_spec * - (1 - likelihoods.P_spec_not_gcf) * - (1 - - np.exp(-alpha_weighing * data_links.cooccurrence_spec_gcf))) - - self.likescores_spec_gcf = likelihood_scores - - elif type == 'mf-gcf': - likelihood_scores = np.zeros(data_links.cooccurrence_mf_gcf.shape) - likelihood_scores = ( - likelihoods.P_gcf_given_fam * (1 - likelihoods.P_fam_not_gcf) * - (1 - np.exp(-alpha_weighing * data_links.cooccurrence_mf_gcf))) - - self.likescores_fam_gcf = likelihood_scores - return likelihood_scores - - def select_link_candidates(self, - data_links, - likelihoods, - P_cutoff=0.8, - main_score='likescore', - score_cutoff=0, - type='mf-gcf'): - """ - Look for potential best candidate for links between - IF type='spec-gcf': GCFs and spectra - IF type='mf-gcf': GCFs and mol.families - - Parameters - ---------- - data_links: DataLinks() object - containing co-occurence matrices - likelihood: LinkLikelihood() object - containing co-occurence likelihoods - P_cutoff: float - Thresholds to conly consider candidates for which: - P_gcf_given_type1 >= P_cutoff - main_score: str - Which main score to use ('metcalf', 'likescore') - score_cutoff: - Thresholds to conly consider candidates for which: - score >= score_cutoff - """ - - # Select 
scenario: spec<->gcf or mf<->gcf - if type == 'spec-gcf': - P_gcf_given_type1 = likelihoods.P_gcf_given_spec - P_gcf_not_type1 = likelihoods.P_gcf_not_spec - P_type1_given_gcf = likelihoods.P_spec_given_gcf - P_type1_not_gcf = likelihoods.P_spec_not_gcf - M_type1_gcf = data_links.cooccurrence_spec_gcf - metcalf_scores = self.metcalf_spec_gcf - likescores = self.likescores_spec_gcf - index_names = [ - "spectrum_id", "GCF id", "P(gcf|spec)", "P(spec|gcf)", - "P(gcf|not spec)", "P(spec|not gcf)", "co-occur in # strains", - "metcalf score", "likelihood score", "HG prob", "link prob", - "link prob specific" - ] - - elif type == 'mf-gcf': - P_gcf_given_type1 = likelihoods.P_gcf_given_fam - P_gcf_not_type1 = likelihoods.P_gcf_not_fam - P_type1_given_gcf = likelihoods.P_fam_given_gcf - P_type1_not_gcf = likelihoods.P_fam_not_gcf - M_type1_gcf = data_links.cooccurrence_mf_gcf - metcalf_scores = self.metcalf_fam_gcf - likescores = self.likescores_fam_gcf - index_names = [ - "family_id", "GCF id", "P(gcf|fam)", "P(fam|gcf)", - "P(gcf|not fam)", "P(fam|not gcf)", "co-occur in # strains", - "metcalf score", "likelihood score", "HG prob", "link prob", - "link prob specific" - ] - - elif type == 'spec-bgc' or type == 'fam-bgc': - raise Exception("Given types are not yet supported... ") - else: - raise Exception( - "Wrong correlation 'type' given. Must be one of 'spec-gcf', 'mf-gcf'..." - ) - - dim1, dim2 = P_gcf_given_type1.shape - - # PRE-SELECTION: - # Select candidates with P_gcf_given_spec >= P_cutoff AND score >= score_cutoff - if main_score == 'likescore': - candidate_ids = np.where((P_gcf_given_type1[:, :] >= P_cutoff) - & (likescores >= score_cutoff)) - elif main_score == 'metcalf': - candidate_ids = np.where((P_gcf_given_type1[:, :] >= P_cutoff) - & (metcalf_scores >= score_cutoff)) - else: - raise Exception( - 'Wrong scoring type given. 
Must be one of: {}'.format( - SCORE_TYPES)) - - logger.debug(candidate_ids[0].shape[0], " candidates selected with ", - index_names[2], " >= ", P_cutoff, " and a link score >= ", - score_cutoff, ".") - - link_candidates = np.zeros((12, candidate_ids[0].shape[0])) - link_candidates[0, :] = candidate_ids[0] # spectrum/fam number - link_candidates[1, :] = candidate_ids[1] # gcf id - link_candidates[2, :] = P_gcf_given_type1[candidate_ids] - link_candidates[3, :] = P_type1_given_gcf[candidate_ids] - link_candidates[4, :] = P_gcf_not_type1[candidate_ids] - link_candidates[5, :] = P_type1_not_gcf[candidate_ids] - link_candidates[6, :] = M_type1_gcf[candidate_ids] - link_candidates[7, :] = metcalf_scores[candidate_ids] - link_candidates[8, :] = likescores[candidate_ids] - - # Calculate probability to find similar link by chance - Nx_list = data_links.mapping_gcf["no of strains"] - if type == 'spec-gcf': - Ny_list = data_links.mapping_spec["no of strains"] - elif type == 'mf-gcf': - Ny_list = data_links.mapping_fam["no of strains"] - - # Calculate probabilities of finding a spectrum in a certain strain - P_str = np.array(data_links.mapping_strain["no of spectra"]) - P_str = P_str / np.sum(P_str) - - num_strains = data_links.occurrence_gcf_strain.shape[1] - - # Calculate the hypergeometric probability (as before) - for i in range(link_candidates.shape[1]): - link_candidates[9, i] = pair_prob_hg( - link_candidates[6, i], num_strains, - Nx_list[link_candidates[1, i]], - Ny_list[int(link_candidates[0, i])]) - - # Calculate the GCF specific probability - for i in range(link_candidates.shape[1]): - id_spec = link_candidates[0, i] - id_gcf = link_candidates[1, i] - - # find set of strains which contain GCF with id link_candidates[1,i] - XG = np.where( - data_links.occurrence_gcf_strain.loc[id_gcf, :] == 1)[0] - - link_candidates[10, - i] = pair_prob_approx(P_str, XG, - int(Ny_list[id_spec]), - int(link_candidates[6, i])) - # Calculate the link specific probability - # Find 
strains where GCF and spectra/family co-occur - if type == 'spec-gcf': - XGS = np.where( - (data_links.occurrence_gcf_strain[id_gcf, :] == 1) - & (data_links.occurrence_spec_strain[id_spec, :] == 1))[0] - elif type == 'mf-gcf': - XGS = np.where( - (data_links.occurrence_gcf_strain[id_gcf, :] == 1) - & (data_links.occurrence_mf_strain[id_spec, :] == 1))[0] - link_candidates[11, - i] = link_prob(P_str, XGS, int(Nx_list[id_gcf]), - int(Ny_list[id_spec]), num_strains) - - # Transform into pandas Dataframe (to avoid index confusions): - link_candidates_pd = pd.DataFrame(link_candidates.transpose(1, 0), - columns=index_names) - - # add other potentially relevant knowdledge - # If this will grow to more collected information -> create separate function/class - bgc_class = [] - # TODO CG: bgc class should be obtained from GCF object - for i in link_candidates_pd["GCF id"].astype(int): - bgc_class.append(data_links.mapping_gcf["bgc class"][i]) - link_candidates_pd["BGC class"] = bgc_class - - # Change some columns to int - link_candidates_pd.iloc[:, [0, 1, 6, 7]] = link_candidates_pd.iloc[:, [ - 0, 1, 6, 7 - ]].astype(int) - - # return results - if type == 'spec-gcf': - self.link_candidates_gcf_spec = link_candidates_pd - elif type == 'mf-gcf': - self.link_candidates_gcf_fam = link_candidates_pd - else: - raise Exception("No candidate selection was created.") - return link_candidates_pd - - @staticmethod - def _isinstance(*objects, obj_type=GCF) -> bool: - return all(isinstance(x, obj_type) for x in objects) - def get_links( self, *objects: tuple[GCF, ...] | tuple[Spectrum, ...] | tuple[MolecularFamily, ...], - score_type: str = 'likescore', score_cutoff: float = 0.5 ) -> list[tuple[tuple[str], tuple[str], tuple[float]]]: """Get scores for links between objects. Args: objects(tuple): GCF, Spectrum or MolecularFamily objects. - score_type(str): Type of score to use for link calculation. - Must be one of 'likescore', 'metcalf' and 'hg'. 
score_cutoff(float): Minimum score to consider a link (≄score_cutoff). Default is 0.5.
 
         Returns:
@@ -457,7 +126,6 @@ def get_links(
 
         Raises:
             TypeError: If input objects are not GCF, Spectrum or MolecularFamily objects.
-            ValueError: If score type is not one of 'likescore', 'metcalf' and 'hg'.
         """
         if self._isinstance(*objects, GCF):
             obj_type = 'gcf'
@@ -470,49 +138,26 @@ def get_links(
                 'Input objects must be GCF, Spectrum or MolecularFamily objects.'
             )
 
-        if score_type not in SCORE_TYPES:
-            raise ValueError(
-                f'Invalid score type "{score_type}". Must be one of "{SCORE_TYPES}"'
-            )
-
         links = []
-        # TODO CG: replace integer ids with string ids for `hg` and `likescore`
         if obj_type == 'gcf':
             obj_ids = [gcf.gcf_id for gcf in objects]
-            if score_type == 'likescore':
-                all_scores = (self.likescores_spec_gcf[:, obj_ids],
-                              self.likescores_fam_gcf[:, obj_ids])
-            if score_type == 'metcalf':
-                all_scores = (self.metcalf_spec_gcf.loc[:, obj_ids],
-                              self.metcalf_fam_gcf.loc[:, obj_ids])
-            if score_type == 'hg':
-                all_scores = (self.hg_spec_gcf[:, obj_ids],
-                              self.hg_fam_gcf[:, obj_ids])
+            all_scores = (self.raw_score_spec_gcf.loc[:, obj_ids],
+                          self.raw_score_fam_gcf.loc[:, obj_ids])
             for scores in all_scores:
                 links.append(self._get_scores_source_gcf(scores, score_cutoff))
-
         if obj_type == 'spec':
             obj_ids = [spec.spectrum_id for spec in objects]
-            if score_type == 'likescore':
-                all_scores = self.likescores_spec_gcf[obj_ids, :]
-            if score_type == 'metcalf':
-                all_scores = self.metcalf_spec_gcf.loc[obj_ids, :]
-            if score_type == 'hg':
-                all_scores = self.hg_spec_gcf[obj_ids, :]
+            all_scores = self.raw_score_spec_gcf.loc[obj_ids, :]
             links.append(self._get_scores_source_met(all_scores, score_cutoff))
-
         if obj_type == 'fam':
             obj_ids = [mf.family_id for mf in objects]
-            if score_type == 'likescore':
-                all_scores = self.likescores_fam_gcf[obj_ids, :]
-            if score_type == 'metcalf':
-                all_scores = self.metcalf_fam_gcf.loc[obj_ids, :]
-            if score_type == 'hg':
-                all_scores = 
self.hg_fam_gcf[obj_ids:] + all_scores = self.raw_score_fam_gcf.loc[obj_ids, :] links.append(self._get_scores_source_met(all_scores, score_cutoff)) - return links + def _isinstance(self, *objects, obj_type=GCF) -> bool: + return all(isinstance(x, obj_type) for x in objects) + def _get_scores_source_gcf(self, scores, score_cutoff): candidate_met_gcf_indexes = np.where(scores >= score_cutoff) src_obj_ids = scores.columns[candidate_met_gcf_indexes[1]].to_list() @@ -530,196 +175,3 @@ def _get_scores_source_met(self, scores, score_cutoff): ) return tuple(src_obj_ids), tuple(target_obj_ids), tuple( scores_candidate) - - @deprecated(version="1.3.3", - reason="The unworkable method will be removed") - def create_cytoscape_files(self, - data_links, - network_filename, - link_type='mf-gcf', - score_type='metcalf'): - """ - Create network file for import into Cytoscape. - Network file will be generated using networkx. - The type of network created here is a bipartite network. - mass spec side --> bipartite = 0 - gene cluster side --> bipartite = 1 - Output format is a graphml file. - - Parameters - ---------- - data_links: DataLinks() object - containing co-occurence matrices - likelihood: LinkLikelihood() object - containing co-occurence likelihoods - network_filename: str - Filename to save generated model as graphml file. - """ - - import networkx as nx - NPlinker_net = nx.Graph() - - if link_type == 'mf-gcf': - link_candidates = self.link_candidates_gcf_fam - type1str = 'family_id' - elif link_type == 'spec-gcf': - link_candidates = self.link_candidates_gcf_spec - type1str = 'spectrum_id' - else: - raise Exception("Wrong link-type given.") - - # Select score type - if score_type == 'metcalf': - scorestr = 'metcalf score' - elif score_type == 'likescore': - scorestr = 'likelihood score' - else: - raise Exception( - "Wrong score_type given. Must be one of: 'metcalf', 'likescore' ." 
- ) - - # Add nodes (all partners from link_candidate table): - # mass spec side --> bipartite = 0 - type1_names = [] - for type1 in link_candidates[type1str].astype(int): - type1_names.append(type1str + str(type1)) - - type1_names_unique = list(set(type1_names)) - NPlinker_net.add_nodes_from(type1_names_unique, bipartite=0) - - # gene cluster side --> bipartite = 1 - type2_names = [] - for type2 in link_candidates['GCF id']: - type2_names.append("GCF_" + str(type2)) - - type2_names_unique = list(set(type2_names)) - NPlinker_net.add_nodes_from(type2_names_unique, bipartite=1) - - # Add edges: - for i in range(0, link_candidates.shape[0]): - NPlinker_net.add_edge(type1_names[i], - type2_names[i], - weight=float(link_candidates[scorestr][i])) - - # Add edges between molecular family members - if link_type == 'spec-gcf': - type1_unique = (np.unique(link_candidates[type1str])).astype(int) - map_spec_fam = data_links.mapping_spec["fam-id"] - for type1 in type1_unique: - if map_spec_fam[type1] > 0: # if no singleton - members = data_links.family_members[int( - map_spec_fam[type1])][0] - - # select other family members if among link candidates - members_present = [ - x for x in list(members) if x in list(type1_unique) - ] - for member in members_present: - NPlinker_net.add_edge(type1str + str(type1), - type1str + str(member)) - - # export graph for drawing (e.g. using Cytoscape) - nx.write_graphml(NPlinker_net, network_filename) - - def plot_candidates(self, - P_cutoff=0.8, - score_type='likescore', - score_cutoff=0, - type='mf-gcf'): - """ - Plot best rated correlations between gcfs and spectra/families - plot in form of seaborn clustermap - """ - - # Select score type - if score_type == 'metcalf': - scorestr = 'metcalf score' - elif score_type == 'likescore': - scorestr = 'likelihood score' - else: - raise Exception( - "Wrong score_type given. Must be one of: 'metcalf', 'likescore' ." 
- ) - - if type == 'spec-gcf': - link_candidates = self.link_candidates_gcf_spec - selected_ids = np.where( - (link_candidates["P(gcf|spec)"] > P_cutoff) - & (link_candidates[scorestr] > score_cutoff))[0] - elif type == 'mf-gcf': - link_candidates = self.link_candidates_gcf_fam - selected_ids = np.where( - (link_candidates["P(gcf|fam)"] > P_cutoff) - & (link_candidates[scorestr] > score_cutoff))[0] - else: - raise Exception("Wrong correlation 'type' given.") - - mapping_fams = np.unique(link_candidates.iloc[selected_ids, 0]) - mapping_gcfs = np.unique(link_candidates.iloc[selected_ids, 1]) - unique_fams = len(mapping_fams) - unique_gcfs = len(mapping_gcfs) - - M_links = np.zeros((unique_fams, unique_gcfs)) - - # define colors for different BGC classes - bigscape_classes_dict = { - "Others": "C0", - "NRPS": "C1", - "PKS-NRP_Hybrids": "C2", - "PKSother": "C3", - "PKSI": "C4", - "RiPPs": "C5", - "Saccharides": "C6", - "Terpene": "C7" - } - col_colors = [] - - # Create matrix with relevant link scores - # TODO replace by better numpy method... 
- for i in range(0, link_candidates.shape[0]): - x = np.where(mapping_fams == link_candidates.iloc[i, 0])[0] - y = np.where(mapping_gcfs == link_candidates.iloc[i, 1])[0] - M_links[x, y] = link_candidates[scorestr][i] - - # make pandas dataframe from numpy array - M_links = pd.DataFrame(M_links, - index=mapping_fams.astype(int), - columns=mapping_gcfs.astype(int)) - if type == 'spec-gcf': - M_links.index.name = 'spectrum number' - elif type == 'mf-gcf': - M_links.index.name = 'molecular family number' - M_links.columns.name = 'gene cluster family (GCF)' - - # add color label representing gene cluster class - for bgc_class in link_candidates["BGC class"]: - if bgc_class is None: - col_colors.append((0, 0, 0)) - else: - col_colors.append(bigscape_classes_dict[bgc_class]) - - # bgc_type_colors = pd.Series(mapping_gcfs.astype(int), index=M_links.columns).map(col_colors) - graph = sns.clustermap( - M_links, - metric="correlation", - method="weighted", - cmap= - "Reds", #sns.cubehelix_palette(8, start=0, rot=.3, dark=0, light=1), - vmin=0, - vmax=np.max(link_candidates[scorestr]), - col_cluster=True, - col_colors=col_colors, - robust=True) - graph.fig.suptitle('Correlation map') - - # Rotate labels - plt.setp(graph.ax_heatmap.xaxis.get_majorticklabels(), rotation=90) - plt.setp(graph.ax_heatmap.yaxis.get_majorticklabels(), rotation=0) - - # Make labels smaller - plt.setp(graph.ax_heatmap.xaxis.get_majorticklabels(), fontsize=7) - plt.setp(graph.ax_heatmap.yaxis.get_majorticklabels(), fontsize=7) - - plt.ylabel("scoring index") - - return M_links From 5d9a9169fd5ca0a403b6bedeb86662ce298b1a53 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 9 May 2023 11:23:19 +0200 Subject: [PATCH 54/95] refactor returned type of `LinkFinder.get_links` method --- src/nplinker/scoring/linking/data_linking.py | 3 +- src/nplinker/scoring/linking/link_finder.py | 118 ++++++++++--------- 2 files changed, 64 insertions(+), 57 deletions(-) diff --git 
a/src/nplinker/scoring/linking/data_linking.py b/src/nplinker/scoring/linking/data_linking.py index 0edec183..e0849421 100644 --- a/src/nplinker/scoring/linking/data_linking.py +++ b/src/nplinker/scoring/linking/data_linking.py @@ -9,13 +9,14 @@ from nplinker.metabolomics.spectrum import Spectrum from .data_linking_functions import calc_correlation_matrix - if TYPE_CHECKING: from nplinker.strain_collection import StrainCollection from nplinker.strains import Strain logger = LogConfig.getLogger(__name__) +LINK_TYPES = ['spec-gcf', 'mf-gcf'] + class DataLinks(): diff --git a/src/nplinker/scoring/linking/link_finder.py b/src/nplinker/scoring/linking/link_finder.py index a97775bd..8bf16e1c 100644 --- a/src/nplinker/scoring/linking/link_finder.py +++ b/src/nplinker/scoring/linking/link_finder.py @@ -7,6 +7,7 @@ from nplinker.logconfig import LogConfig from nplinker.metabolomics.molecular_family import MolecularFamily from nplinker.metabolomics.spectrum import Spectrum +from .data_linking import LINK_TYPES if TYPE_CHECKING: @@ -14,19 +15,8 @@ logger = LogConfig.getLogger(__file__) -LINK_TYPES = ['spec-gcf', 'mf-gcf'] - class LinkFinder(): - """ - Class to: - 1) Score potential links based on collected information from: - DataLinks, LinkLikelihood (and potentially other resources) - Different scores can be used for this! - - 2) Rank and output selected candidates - 3) Create output plots and tables - """ def __init__(self): """ @@ -58,7 +48,14 @@ def cal_score( use for Metcalf scoring. The weights are applied to '(met_gcf, met_not_gcf, gcf_not_met, not_met_not_gcf)'. Defaults to (10, -10, 0, 1). + + Raises: + ValueError: If an invalid link type is provided. """ + if link_type not in LINK_TYPES: + raise ValueError( + f'Invalid link type: {link_type}. 
Must be one of {LINK_TYPES}')
+
         if link_type == 'spec-gcf':
             self.raw_score_spec_gcf = (
                 data_links.cooccurrence_spec_gcf * scoring_weights[0] +
@@ -105,12 +102,10 @@ def _cal_mean_std(self, data_links, scoring_weights):
             variance[n, m] = expected_sq
         return mean, np.sqrt(variance)
 
-    def get_links(
-        self,
-        *objects: tuple[GCF, ...] | tuple[Spectrum, ...]
-        | tuple[MolecularFamily, ...],
-        score_cutoff: float = 0.5
-    ) -> list[tuple[tuple[str], tuple[str], tuple[float]]]:
+    def get_links(self,
+                  *objects: tuple[GCF, ...] | tuple[Spectrum, ...]
+                  | tuple[MolecularFamily, ...],
+                  score_cutoff: float = 0.5) -> list[pd.DataFrame]:
         """Get scores for links between objects.
 
         Args:
@@ -118,60 +113,71 @@ def get_links(
             score_cutoff(float): Minimum score to consider a link (≄score_cutoff). Default is 0.5.
 
         Returns:
-            list: List of tuples containing the ids of the linked objects and the score.
-                The tuple contains three tuples:
-                - the first tuple contains the ids of the input/source objects,
-                - the second tuple contains the ids of the target objects,
-                - the third tuple contains the scores.
+            list: List of data frames containing the ids of the linked objects
+                and the score. The data frame contains three rows:
+                - the first row contains the ids of the input/source objects,
+                - the second row contains the ids of the target objects,
+                - the third row contains the scores.
 
         Raises:
             TypeError: If input objects are not GCF, Spectrum or MolecularFamily objects.
         """
-        if self._isinstance(*objects, GCF):
+        if self._isinstance(GCF, *objects):
             obj_type = 'gcf'
-        elif self._isinstance(*objects, Spectrum):
+        elif self._isinstance(Spectrum, *objects):
             obj_type = 'spec'
-        elif self._isinstance(*objects, MolecularFamily):
-            obj_type = 'fam'
+        elif self._isinstance(MolecularFamily, *objects):
+            obj_type = 'mf'
         else:
+            types = [type(i) for i in objects]
             raise TypeError(
-                'Input objects must be GCF, Spectrum or MolecularFamily objects.'
+                f'Invalid type "{set(types)}". 
Input objects must be GCF, Spectrum or MolecularFamily objects.' ) links = [] if obj_type == 'gcf': obj_ids = [gcf.gcf_id for gcf in objects] - all_scores = (self.raw_score_spec_gcf.loc[:, obj_ids], - self.raw_score_fam_gcf.loc[:, obj_ids]) - for scores in all_scores: - links.append(self._get_scores_source_gcf(scores, score_cutoff)) + # spec-gcf + scores = self.raw_score_spec_gcf.loc[:, obj_ids] + df = self._get_scores_source_gcf(scores, score_cutoff) + df.name = LINK_TYPES[0] + links.append(df) + # mf-gcf + scores = self.raw_score_fam_gcf.loc[:, obj_ids] + df = self._get_scores_source_gcf(scores, score_cutoff) + df.name = LINK_TYPES[1] + links.append(df) + if obj_type == 'spec': obj_ids = [spec.spectrum_id for spec in objects] - all_scores = self.raw_score_spec_gcf.loc[obj_ids, :] - links.append(self._get_scores_source_met(all_scores, score_cutoff)) - if obj_type == 'fam': + scores = self.raw_score_spec_gcf.loc[obj_ids, :] + df = self._get_scores_source_met(scores, score_cutoff) + df.name = LINK_TYPES[0] + links.append(df) + + if obj_type == 'mf': obj_ids = [mf.family_id for mf in objects] - all_scores = self.raw_score_fam_gcf.loc[obj_ids, :] - links.append(self._get_scores_source_met(all_scores, score_cutoff)) + scores = self.raw_score_fam_gcf.loc[obj_ids, :] + df = self._get_scores_source_met(scores, score_cutoff) + df.name = LINK_TYPES[1] + links.append(df) return links - def _isinstance(self, *objects, obj_type=GCF) -> bool: - return all(isinstance(x, obj_type) for x in objects) - - def _get_scores_source_gcf(self, scores, score_cutoff): - candidate_met_gcf_indexes = np.where(scores >= score_cutoff) - src_obj_ids = scores.columns[candidate_met_gcf_indexes[1]].to_list() - target_obj_ids = scores.index[candidate_met_gcf_indexes[0]].to_list() - scores_candidate = scores.to_numpy()[candidate_met_gcf_indexes].tolist( - ) - return tuple(src_obj_ids), tuple(target_obj_ids), tuple( - scores_candidate) - - def _get_scores_source_met(self, scores, score_cutoff): - 
candidate_met_gcf_indexes = np.where(scores >= score_cutoff) - src_obj_ids = scores.index[candidate_met_gcf_indexes[0]].to_list() - target_obj_ids = scores.columns[candidate_met_gcf_indexes[1]].to_list() - scores_candidate = scores.to_numpy()[candidate_met_gcf_indexes].tolist( - ) - return tuple(src_obj_ids), tuple(target_obj_ids), tuple( - scores_candidate) + def _isinstance(self, _type, *objects) -> bool: + return all(isinstance(x, _type) for x in objects) + + def _get_scores_source_gcf(self, scores, score_cutoff) -> pd.DataFrame: + row_indexes, col_indexes = np.where(scores >= score_cutoff) + src_obj_ids = scores.columns[col_indexes].to_list() + target_obj_ids = scores.index[row_indexes].to_list() + scores_candidate = scores.values[row_indexes, col_indexes].tolist() + return pd.DataFrame([src_obj_ids, target_obj_ids, scores_candidate], + index=['source', 'target', 'score']) + + def _get_scores_source_met(self, scores, score_cutoff) -> pd.DataFrame: + row_indexes, col_indexes = np.where(scores >= score_cutoff) + src_obj_ids = scores.index[row_indexes].to_list() + target_obj_ids = scores.columns[col_indexes].to_list() + scores_candidate = scores.values[row_indexes, col_indexes].tolist() + return pd.DataFrame([src_obj_ids, target_obj_ids, scores_candidate], + index=['source', 'target', 'score']) From 47770c57fd361d6ba00a3e5691a1bf05930fe6ee Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 9 May 2023 12:20:17 +0200 Subject: [PATCH 55/95] add `lookup_mf` method in NPLinker class --- src/nplinker/nplinker.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/nplinker/nplinker.py b/src/nplinker/nplinker.py index 02799ed0..943cb21b 100644 --- a/src/nplinker/nplinker.py +++ b/src/nplinker/nplinker.py @@ -18,6 +18,7 @@ from .scoring.rosetta_scoring import RosettaScoring from .strain_collection import StrainCollection + if TYPE_CHECKING: from collections.abc import Sequence from .strains import Strain @@ -124,6 +125,7 @@ def 
__init__(self, userconfig=None): self._bgc_lookup = {} self._gcf_lookup = {} self._spec_lookup = {} + self._mf_lookup = {} self._scoring_methods = {} config_methods = self._config.config.get('scoring_methods', []) @@ -298,6 +300,7 @@ def load_data(self, new_bigscape_cutoff=None, met_only=False): spec.spectrum_id: spec for spec in self._spectra } + self._mf_lookup = {mf.family_id: mf for mf in self._molfams} logger.debug('load_data: completed') return True @@ -392,8 +395,8 @@ def get_links(self, input_objects, scoring_methods, and_mode=True): objects_for_method = input_objects[i] logger.debug('Calling scoring method {} on {} objects'.format( method.name, len(objects_for_method))) - link_collection = method.get_links(objects_for_method, - link_collection) + link_collection = method.get_links(*objects_for_method, + link_collection=link_collection) if not self._datalinks: logger.debug('Creating internal datalinks object') @@ -482,6 +485,10 @@ def lookup_spectrum(self, spectrum_id): """If Spectrum ``name`` exists, return it. Otherwise return None""" return self._spec_lookup.get(spectrum_id, None) + def lookup_mf(self, mf_id): + """If MolecularFamily `family_id` exists, return it. 
Otherwise return None""" + return self._mf_lookup.get(mf_id, None) + @property def strains(self): """Returns a list of all the strains in the dataset""" From 25606e129fdab7dcca2c68ad39963af571ab47d8 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 9 May 2023 09:40:57 +0200 Subject: [PATCH 56/95] refactor MetcalfScoring class --- src/nplinker/scoring/metcalf_scoring.py | 389 ++++++++++-------------- 1 file changed, 169 insertions(+), 220 deletions(-) diff --git a/src/nplinker/scoring/metcalf_scoring.py b/src/nplinker/scoring/metcalf_scoring.py index 9fb76fe6..8bc9d7d0 100644 --- a/src/nplinker/scoring/metcalf_scoring.py +++ b/src/nplinker/scoring/metcalf_scoring.py @@ -1,6 +1,8 @@ +from __future__ import annotations import os +from typing import TYPE_CHECKING import numpy as np -from nplinker.genomics import BGC +import pandas as pd from nplinker.genomics import GCF from nplinker.logconfig import LogConfig from nplinker.metabolomics.molecular_family import MolecularFamily @@ -11,7 +13,10 @@ from nplinker.scoring.linking.link_finder import LinkFinder from nplinker.scoring.methods import ScoringMethod from nplinker.scoring.object_link import ObjectLink +from .linking.data_linking import LINK_TYPES +if TYPE_CHECKING: + from .link_collection import LinkCollection logger = LogConfig.getLogger(__name__) @@ -22,17 +27,13 @@ class MetcalfScoring(ScoringMethod): LINKFINDER = None NAME = 'metcalf' - # enumeration for accessing results of LinkFinder.get_links, which are (3, num_links) arrays: - # - R_SRC_ID: the ID of an object that was supplied as input to get_links - # - R_DST_ID: the ID of an object that was discovered to have a link to an input object - # - R_SCORE: the score for the link between a pair of objects - R_SRC_ID, R_DST_ID, R_SCORE = range(3) - def __init__(self, npl): super().__init__(npl) self.cutoff = 1.0 self.standardised = True + # TODO CG: not sure why using staticmethod here. 
Check later and refactor if possible + # TODO CG: refactor this method and extract code for cache file to a separate method @staticmethod def setup(npl): logger.info( @@ -46,7 +47,6 @@ def setup(npl): # the metcalf preprocessing can take a long time for large datasets, so it's # better to cache as the data won't change unless the number of objects does - dataset_counts = [ len(npl.bgcs), len(npl.gcfs), @@ -77,246 +77,195 @@ def setup(npl): logger.info( 'MetcalfScoring.setup preprocessing dataset (this may take some time)' ) - MetcalfScoring.DATALINKS = DataLinks(npl._gcfs, npl._spectra, - npl._molfams, npl._strains) + MetcalfScoring.DATALINKS = DataLinks(npl.gcfs, npl.spectra, + npl.molfams, npl.strains) MetcalfScoring.LINKFINDER = LinkFinder() - MetcalfScoring.LINKFINDER.metcalf_scoring(MetcalfScoring.DATALINKS, - link_type='spec-gcf') - MetcalfScoring.LINKFINDER.metcalf_scoring(MetcalfScoring.DATALINKS, - link_type='mf-gcf') + MetcalfScoring.LINKFINDER.cal_score(MetcalfScoring.DATALINKS, + link_type=LINK_TYPES[0]) + MetcalfScoring.LINKFINDER.cal_score(MetcalfScoring.DATALINKS, + link_type=LINK_TYPES[1]) logger.debug('MetcalfScoring.setup caching results') save_pickled_data((dataset_counts, MetcalfScoring.DATALINKS, MetcalfScoring.LINKFINDER), cache_file) logger.info('MetcalfScoring.setup completed') + # TODO CG: is it needed? 
remove it if not @property def datalinks(self): return MetcalfScoring.DATALINKS - def _metcalf_postprocess_met(self, linkfinder, results, input_type): - logger.debug( - 'Postprocessing results for standardised Metcalf scores (met input)' - ) - # results will be links from EITHER Spectrum OR MolFam => GCF here - - # need to know if the metabolomic objects given as input are Spectrum/MolFam - met_objs = self.npl.spectra if input_type == Spectrum else self.npl.molfams - new_src, new_dst, new_sco = [], [], [] - - # go through each pair of input objects and calculate their standardised scores - for i in range(len(results[0][self.R_SRC_ID])): - met_obj = met_objs[int(results[0][self.R_SRC_ID][i])] - # met_obj will now be either a Spectrum or a MolecularFamily, but - # doesn't matter which (in this implementation at least) because they - # both have a .strains attribute which is the only thing we need. For - # Spectra it's the number of strains, for a MolFam it's the total - # number of *unique* strains across all Spectra in that family. 
- met_strains = len(met_obj.strains) - gcf = self.npl.gcfs[int(results[0][self.R_DST_ID][i])] - gen_strains = len(gcf.strains) - - # lookup expected + variance values based on strain counts - expected = linkfinder.metcalf_mean[met_strains][gen_strains] - variance_sqrt = linkfinder.metcalf_std[met_strains][ - gen_strains] - - # calculate the final score based on the basic Metcalf score for these two - # particular objects - final_score = (results[0][self.R_SCORE][i] - - expected) / variance_sqrt - - # finally apply the scoring cutoff and store the result - if self.cutoff is None or (final_score >= self.cutoff): - new_src.append(int(results[0][self.R_SRC_ID][i])) - new_dst.append(int(results[0][self.R_DST_ID][i])) - new_sco.append(final_score) - - # overwrite original "results" with equivalent new data structure - return [np.array([new_src, new_dst, new_sco])] - - def _metcalf_postprocess_gen(self, linkfinder, results, input_type): - logger.debug( - 'Postprocessing results for standardised Metcalf scores (gen input)' - ) - # results will be links from GCF to BOTH Spectrum and MolFams here (first - # element Spectra, second MolFams) - - new_results = [] - met_objs_list = [self.npl.spectra, self.npl.molfams] - - # iterate over the Spectrum results and then the MolFam results - for m, met_objs in enumerate(met_objs_list): - new_src, new_dst, new_sco = [], [], [] - - # go through each pair of input objects and calculate their standardised scores - for i in range(len(results[m][self.R_SRC_ID])): - gcf = self.npl.gcfs[int(results[m][self.R_SRC_ID][i])] - gen_strains = len(gcf.strains) - - # met_obj will now be either a Spectrum or a MolecularFamily, but - # doesn't matter which (in this implementation at least) because they - # both have a .strains attribute which is the only thing we need. For - # Spectra it's the number of strains, for a MolFam it's the total - # number of *unique* strains across all Spectra in that family. 
- met_obj = met_objs[int(results[m][self.R_DST_ID][i])] - met_strains = len(met_obj.strains) - - # lookup expected + variance values based on strain counts - expected = linkfinder.metcalf_mean[met_strains][ - gen_strains] - variance_sqrt = linkfinder.metcalf_std[met_strains][ - gen_strains] - - # calculate the final score based on the basic Metcalf score for these two - # particular objects - final_score = (results[m][self.R_SCORE][i] - - expected) / variance_sqrt - - # finally apply the scoring cutoff and store the result - if self.cutoff is None or (final_score >= self.cutoff): - new_src.append(int(results[m][self.R_SRC_ID][i])) - new_dst.append(int(results[m][self.R_DST_ID][i])) - new_sco.append(final_score) - - # overwrite original "results" with equivalent new data structure - new_results.append(np.array([new_src, new_dst, new_sco])) - - return new_results - - def get_links(self, objects, link_collection): - # enforce constraint that the list must contain a set of identically typed objects - if not all(isinstance(x, type(objects[0])) for x in objects): - raise Exception( - 'MetcalfScoring: uniformly-typed list of objects is required') - - # also can't handle BGCs here, must be one of the other 3 types (GCF/Spectrum/MolecularFamily) - if isinstance(objects[0], BGC): - raise Exception( - 'MetcalfScoring requires input type GCF/Spectrum/MolecularFamily, not BGC' + def get_links(self, *objects, link_collection) -> LinkCollection: + if self._isinstance(GCF, *objects): + obj_type = 'gcf' + elif self._isinstance(Spectrum, *objects): + obj_type = 'spec' + elif self._isinstance(MolecularFamily, *objects): + obj_type = 'mf' + else: + types = [type(i) for i in objects] + raise TypeError( + f'Invalid type {set(types)}. Input objects must be GCF, Spectrum or MolecularFamily objects.' 
) - datalinks = MetcalfScoring.DATALINKS - linkfinder = MetcalfScoring.LINKFINDER - input_type = type(objects[0]) + if self.LINKFINDER is None: + raise ValueError(( + 'LinkFinder object not found. Have you called `MetcalfScoring.setup(npl)`?' + )) - logger.debug('MetcalfScoring: standardised = {}'.format( - self.standardised)) + logger.debug(f'MetcalfScoring: standardised = {self.standardised}') if not self.standardised: - results = linkfinder.get_links(objects, self.name, - self.cutoff) + scores_list = self.LINKFINDER.get_links(*objects, + score_cutoff=self.cutoff) + # TODO CG: verify the logics of standardised score and add unit tests else: - # get the basic Metcalf scores BUT ignore the cutoff value here by setting - # it to None. The actual user-supplied cutoff value is applied further down - # once the standardised scores for these results have been calculated. - results = linkfinder.get_links(objects, self.name, None) - - # The "results" object varies slightly depending on the input provided - # to the LinkFinder class: - # - given Spectra/MolFam input, it will be a single element list containing - # a (3, x) array, where the first row contains source (input) object - # IDs, the second contains destination (linked) object IDs, and the - # third contains regular Metcalf scores for those pairs of objects. - # - however for GCF input, "results" is instead a 2-element list where - # each entry has the same structure as described above, with the first - # entry describing GCF-Spectrum links and the second GCF-MolFam links. 
- - gcf_input = (input_type == GCF) - - if not gcf_input: - results = self._metcalf_postprocess_met( - linkfinder, results, input_type) + # use negative infinity as the score cutoff to ensure we get all links + # the self.cutoff will be applied later in the postprocessing step + scores_list = self.LINKFINDER.get_links(*objects, + score_cutoff=np.NINF) + if obj_type == 'gcf': + scores_list = self._cal_standardised_score_gen( + self.LINKFINDER, scores_list) else: - results = self._metcalf_postprocess_gen( - linkfinder, results, input_type) + scores_list = self._cal_standardised_score_met( + self.LINKFINDER, scores_list) - scores_found = set() - metcalf_results = {} - - if input_type == GCF: + link_scores = {} + if obj_type == 'gcf': logger.debug( - 'MetcalfScoring: input_type=GCF, result_type=Spec/MolFam, inputs={}, results={}' - .format(len(objects), results[0].shape)) - # for GCF input, results contains two arrays of shape (3, x), - # which contain spec-gcf and mf-gcf links respectively - result_gcf_spec, result_gcf_fam = results[0], results[1] - - for res, type_ in [(result_gcf_spec, Spectrum), - (result_gcf_fam, MolecularFamily)]: - if res.shape[1] == 0: - if type_ != MolecularFamily: - logger.debug( - 'Found no links for {} input objects (type {})'. 
- format(len(objects), type_)) - continue # no results - - # for each entry in the results (each Spectrum or MolecularFamily) - for j in range(res.shape[1]): - # extract the ID of the object and get the object itself - obj_id = int(res[self.R_DST_ID, j]) - obj = self.npl._spectra[ - obj_id] if type_ == Spectrum else self.npl._molfams[ - obj_id] - - # retrieve the GCF object too (can use its internal ID to index - # directly into the .gcfs list) - gcf = self.npl._gcfs[int(res[self.R_SRC_ID][j])] - - # record that this GCF has at least one link associated with it - scores_found.add(gcf) - - # save the scores - if gcf not in metcalf_results: - metcalf_results[gcf] = {} - metcalf_results[gcf][obj] = ObjectLink( - gcf, obj, self, res[self.R_SCORE, j]) - + f'MetcalfScoring: input_type=GCF, result_type=Spec/MolFam, ' + f'#inputs={len(objects)}.') + for scores in scores_list: + # when no links found + if scores.shape[1] == 0: + logger.debug( + f'MetcalfScoring: found no "{scores.name}" links') + else: + # when links found + for col_index in range(scores.shape[1]): + gcf = self.npl.lookup_gcf(scores.loc['source', + col_index]) + if scores.name == LINK_TYPES[0]: + met = self.npl.lookup_spectrum( + scores.loc['target', col_index]) + else: + met = self.npl.lookup_mf(scores.loc['target', + col_index]) + if gcf not in link_scores: + link_scores[gcf] = {} + # TODO CG: use id instead of object for gcf, met and self? + link_scores[gcf][met] = ObjectLink( + gcf, met, self, scores.loc['score', col_index]) + logger.debug( + f'MetcalfScoring: found {len(link_scores)} {scores.name} links.' 
+ ) else: logger.debug( - 'MetcalfScoring: input_type=Spec/MolFam, result_type=GCF, inputs={}, results={}' - .format(len(objects), results[0].shape)) - # for non-GCF input, result is a list containing a single array, shape (3, x) - # where x is the total number of links found - results = results[0] - if results.shape[1] == 0: - logger.debug('Found no links for {} input objects'.format( - len(objects))) - link_collection._add_links_from_method(self, metcalf_results) - # can just bail out here in this case - logger.debug('MetcalfScoring: completed') - return link_collection - - # for each entry in the results (each GCF) - for j in range(results.shape[1]): - # extract the ID of the GCF and use that to get the object itself - gcf = self.npl._gcfs[int(results[self.R_DST_ID, j])] - - # retrieve the Spec/MolFam object too (can use its internal ID to index - # directly into the appropriate list) - obj_id = int(results[self.R_SRC_ID, j]) - obj = self.npl._spectra[ - obj_id] if input_type == Spectrum else self.npl._molfams[ - obj_id] - - # record that this Spectrum or MolecularFamily has at least one link associated with it - scores_found.add(obj) - - # save the scores - if obj not in metcalf_results: - metcalf_results[obj] = {} - metcalf_results[obj][gcf] = ObjectLink( - obj, gcf, self, results[self.R_SCORE, j]) - - logger.debug('MetcalfScoring found {} results'.format( - len(metcalf_results))) - link_collection._add_links_from_method(self, metcalf_results) + f'MetcalfScoring: input_type=Spec/MolFam, result_type=GCF, ' + f'#inputs={len(objects)}.') + scores = scores_list[0] + # when no links found + if scores.shape[1] == 0: + logger.debug( + f'MetcalfScoring: found no links "{scores.name}" for input objects' + ) + else: + for col_index in range(scores.shape[1]): + gcf = self.npl.lookup_gcf(scores.loc['target', col_index]) + if scores.name == LINK_TYPES[0]: + met = self.npl.lookup_spectrum(scores.loc['source', + col_index]) + else: + met = 
self.npl.lookup_mf(scores.loc['source', + col_index]) + if met not in link_scores: + link_scores[met] = {} + link_scores[met][gcf] = ObjectLink( + met, gcf, self, scores.loc['score', col_index]) + logger.debug( + f'MetcalfScoring: found {len(link_scores)} {scores.name} links.' + ) + + link_collection._add_links_from_method(self, link_scores) logger.debug('MetcalfScoring: completed') return link_collection + def _isinstance(self, _type, *objects) -> bool: + return all(isinstance(x, _type) for x in objects) + + def _cal_standardised_score_met(self, linkfinder, + results) -> list[pd.DataFrame]: + logger.debug('Calculating standardised Metcalf scores (met input)') + raw_score = results[0] + z_scores = [] + for col_index in range(raw_score.shape[1]): + gcf = self.npl.lookup_gcf(raw_score.loc['target', col_index]) + if raw_score.name == LINK_TYPES[0]: + met = self.npl.lookup_spectrum(raw_score.at['source', + col_index]) + else: + met = self.npl.lookup_mf(raw_score.at['source', col_index]) + + num_gcf_strains = len(gcf.strains) + num_met_strains = len(met.strains) + mean = linkfinder.metcalf_mean[num_met_strains][num_gcf_strains] + sqrt = linkfinder.metcalf_std[num_met_strains][num_gcf_strains] + z_score = (raw_score.at['score', col_index] - mean) / sqrt + z_scores.append(z_score) + + z_scores = np.array(z_scores) + mask = z_scores >= self.cutoff + + scores_df = pd.DataFrame([ + raw_score.loc['source'].values[mask], + raw_score.loc['target'].values[mask], z_scores[mask] + ], + index=raw_score.index) + scores_df.name = raw_score.name + + return [scores_df] + + def _cal_standardised_score_gen(self, linkfinder, + results) -> list[pd.DataFrame]: + logger.debug('Calculating standardised Metcalf scores (gen input)') + postprocessed_scores = [] + for raw_score in results: + z_scores = [] + for col_index in range(raw_score.shape[1]): + gcf = self.npl.lookup_gcf(raw_score.loc['source', col_index]) + if raw_score.name == LINK_TYPES[0]: + met = 
self.npl.lookup_spectrum(raw_score.at['target', + col_index]) + else: + met = self.npl.lookup_mf(raw_score.at['target', col_index]) + + num_gcf_strains = len(gcf.strains) + num_met_strains = len(met.strains) + mean = linkfinder.metcalf_mean[num_met_strains][ + num_gcf_strains] + sqrt = linkfinder.metcalf_std[num_met_strains][num_gcf_strains] + z_score = (raw_score.at['score', col_index] - mean) / sqrt + z_scores.append(z_score) + + z_scores = np.array(z_scores) + mask = z_scores >= self.cutoff + + scores_df = pd.DataFrame([ + raw_score.loc['source'].values[mask], + raw_score.loc['target'].values[mask], z_scores[mask] + ], + index=raw_score.index) + scores_df.name = raw_score.name + postprocessed_scores.append(scores_df) + + return postprocessed_scores + + # TODO CG: refactor this method def format_data(self, data): # for metcalf the data will just be a floating point value (i.e. the score) return f'{data:.4f}' + # TODO CG: refactor this method def sort(self, objects, reverse=True): # sort based on score return sorted(objects, From e22a8ce380579cfc9f33b149abee02d9f94ceefd Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 10 May 2023 08:39:25 +0200 Subject: [PATCH 57/95] add deprecation to LinkLikelihood class --- src/nplinker/scoring/linking/link_likelihood.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/nplinker/scoring/linking/link_likelihood.py b/src/nplinker/scoring/linking/link_likelihood.py index 830e0676..3a7b1e59 100644 --- a/src/nplinker/scoring/linking/link_likelihood.py +++ b/src/nplinker/scoring/linking/link_likelihood.py @@ -1,3 +1,4 @@ +from deprecated import deprecated from nplinker.logconfig import LogConfig from nplinker.scoring.linking.data_linking_functions import \ calc_likelihood_matrix @@ -5,7 +6,7 @@ logger = LogConfig.getLogger(__file__) - +@deprecated(version='1.3.3', reason="It's unused and will be removed in 2.0.0") class LinkLikelihood(): """ Class to: From 79ecb4dd138197c5eb8983147dfbac1b0df9e9ee Mon Sep 
17 00:00:00 2001 From: Cunliang Geng Date: Wed, 10 May 2023 08:59:34 +0200 Subject: [PATCH 58/95] add `__init__.py` to linking module --- src/nplinker/scoring/__init__.py | 10 ++++++++++ src/nplinker/scoring/linking/__init__.py | 10 ++++++++++ src/nplinker/scoring/linking/data_linking.py | 1 + src/nplinker/scoring/linking/link_finder.py | 4 ++-- src/nplinker/scoring/metcalf_scoring.py | 12 ++++++------ tests/scoring/test_data_linking_functions.py | 2 +- tests/scoring/test_scoring.py | 4 ++-- 7 files changed, 32 insertions(+), 11 deletions(-) create mode 100644 src/nplinker/scoring/linking/__init__.py diff --git a/src/nplinker/scoring/__init__.py b/src/nplinker/scoring/__init__.py index e69de29b..0aaefeaf 100644 --- a/src/nplinker/scoring/__init__.py +++ b/src/nplinker/scoring/__init__.py @@ -0,0 +1,10 @@ +import logging +from .link_collection import LinkCollection +from .metcalf_scoring import MetcalfScoring +from .methods import ScoringMethod +from .object_link import ObjectLink + + +logging.getLogger(__name__).addHandler(logging.NullHandler()) + +__all__ = ["LinkCollection", "MetcalfScoring", "ScoringMethod", "ObjectLink"] diff --git a/src/nplinker/scoring/linking/__init__.py b/src/nplinker/scoring/linking/__init__.py new file mode 100644 index 00000000..bb9043df --- /dev/null +++ b/src/nplinker/scoring/linking/__init__.py @@ -0,0 +1,10 @@ +import logging +from .data_linking import DataLinks +from .data_linking import LINK_TYPES +from .data_linking_functions import calc_correlation_matrix +from .link_finder import LinkFinder + + +logging.getLogger(__name__).addHandler(logging.NullHandler()) + +__all__ = ["DataLinks", "LINK_TYPES", "calc_correlation_matrix", "LinkFinder"] diff --git a/src/nplinker/scoring/linking/data_linking.py b/src/nplinker/scoring/linking/data_linking.py index e0849421..c8515aaf 100644 --- a/src/nplinker/scoring/linking/data_linking.py +++ b/src/nplinker/scoring/linking/data_linking.py @@ -9,6 +9,7 @@ from nplinker.metabolomics.spectrum import 
Spectrum from .data_linking_functions import calc_correlation_matrix + if TYPE_CHECKING: from nplinker.strain_collection import StrainCollection from nplinker.strains import Strain diff --git a/src/nplinker/scoring/linking/link_finder.py b/src/nplinker/scoring/linking/link_finder.py index 8bf16e1c..5a3c167c 100644 --- a/src/nplinker/scoring/linking/link_finder.py +++ b/src/nplinker/scoring/linking/link_finder.py @@ -7,11 +7,11 @@ from nplinker.logconfig import LogConfig from nplinker.metabolomics.molecular_family import MolecularFamily from nplinker.metabolomics.spectrum import Spectrum -from .data_linking import LINK_TYPES +from . import LINK_TYPES if TYPE_CHECKING: - from .data_linking import DataLinks + from . import DataLinks logger = LogConfig.getLogger(__file__) diff --git a/src/nplinker/scoring/metcalf_scoring.py b/src/nplinker/scoring/metcalf_scoring.py index 8bc9d7d0..bff24861 100644 --- a/src/nplinker/scoring/metcalf_scoring.py +++ b/src/nplinker/scoring/metcalf_scoring.py @@ -9,14 +9,14 @@ from nplinker.metabolomics.spectrum import Spectrum from nplinker.pickler import load_pickled_data from nplinker.pickler import save_pickled_data -from nplinker.scoring.linking.data_linking import DataLinks -from nplinker.scoring.linking.link_finder import LinkFinder -from nplinker.scoring.methods import ScoringMethod -from nplinker.scoring.object_link import ObjectLink -from .linking.data_linking import LINK_TYPES +from .linking import DataLinks +from .linking import LINK_TYPES +from .linking import LinkFinder +from .methods import ScoringMethod +from .object_link import ObjectLink if TYPE_CHECKING: - from .link_collection import LinkCollection + from . 
import LinkCollection logger = LogConfig.getLogger(__name__) diff --git a/tests/scoring/test_data_linking_functions.py b/tests/scoring/test_data_linking_functions.py index 9f55d1ac..d0cd8c95 100644 --- a/tests/scoring/test_data_linking_functions.py +++ b/tests/scoring/test_data_linking_functions.py @@ -15,7 +15,7 @@ # test functions import numpy as np -from nplinker.scoring.linking.data_linking_functions import calc_correlation_matrix +from nplinker.scoring.linking import calc_correlation_matrix def test_calc_correlation_matrix(): diff --git a/tests/scoring/test_scoring.py b/tests/scoring/test_scoring.py index 2825ad9e..b04f74b1 100644 --- a/tests/scoring/test_scoring.py +++ b/tests/scoring/test_scoring.py @@ -3,8 +3,8 @@ from nplinker.genomics import BGC from nplinker.genomics import GCF from nplinker.metabolomics.spectrum import Spectrum -from nplinker.scoring.linking.data_linking import DataLinks -from nplinker.scoring.linking.link_finder import LinkFinder +from nplinker.scoring.linking import DataLinks +from nplinker.scoring.linking import LinkFinder from nplinker.scoring.linking.misc_deprecated import hg_scoring from nplinker.scoring.linking.misc_deprecated import metcalf_scoring from nplinker.strains import Strain From ac94cd867067a5f9caadee04bde3a5c3724333ca Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 10 May 2023 09:48:24 +0200 Subject: [PATCH 59/95] rename `data_linking.py` to `data_links.py` --- src/nplinker/scoring/linking/__init__.py | 4 ++-- .../scoring/linking/{data_linking.py => data_links.py} | 0 2 files changed, 2 insertions(+), 2 deletions(-) rename src/nplinker/scoring/linking/{data_linking.py => data_links.py} (100%) diff --git a/src/nplinker/scoring/linking/__init__.py b/src/nplinker/scoring/linking/__init__.py index bb9043df..d2419513 100644 --- a/src/nplinker/scoring/linking/__init__.py +++ b/src/nplinker/scoring/linking/__init__.py @@ -1,6 +1,6 @@ import logging -from .data_linking import DataLinks -from .data_linking import 
LINK_TYPES +from .data_links import DataLinks +from .data_links import LINK_TYPES from .data_linking_functions import calc_correlation_matrix from .link_finder import LinkFinder diff --git a/src/nplinker/scoring/linking/data_linking.py b/src/nplinker/scoring/linking/data_links.py similarity index 100% rename from src/nplinker/scoring/linking/data_linking.py rename to src/nplinker/scoring/linking/data_links.py From 8c98e9633dc32cde2e468108c4646a1010633b90 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 10 May 2023 09:49:58 +0200 Subject: [PATCH 60/95] rename `data_linking_functions.py` to `utils.py` --- src/nplinker/scoring/linking/__init__.py | 2 +- src/nplinker/scoring/linking/data_links.py | 2 +- src/nplinker/scoring/linking/link_likelihood.py | 2 +- .../linking/{data_linking_functions.py => utils.py} | 0 tests/scoring/test_data_linking_functions.py | 12 ++++++------ 5 files changed, 9 insertions(+), 9 deletions(-) rename src/nplinker/scoring/linking/{data_linking_functions.py => utils.py} (100%) diff --git a/src/nplinker/scoring/linking/__init__.py b/src/nplinker/scoring/linking/__init__.py index d2419513..5bf1db00 100644 --- a/src/nplinker/scoring/linking/__init__.py +++ b/src/nplinker/scoring/linking/__init__.py @@ -1,7 +1,7 @@ import logging from .data_links import DataLinks from .data_links import LINK_TYPES -from .data_linking_functions import calc_correlation_matrix +from .utils import calc_correlation_matrix from .link_finder import LinkFinder diff --git a/src/nplinker/scoring/linking/data_links.py b/src/nplinker/scoring/linking/data_links.py index c8515aaf..ab4b0dcb 100644 --- a/src/nplinker/scoring/linking/data_links.py +++ b/src/nplinker/scoring/linking/data_links.py @@ -7,7 +7,7 @@ from nplinker.metabolomics.molecular_family import MolecularFamily from nplinker.metabolomics.singleton_family import SingletonFamily from nplinker.metabolomics.spectrum import Spectrum -from .data_linking_functions import calc_correlation_matrix +from .utils import 
calc_correlation_matrix if TYPE_CHECKING: diff --git a/src/nplinker/scoring/linking/link_likelihood.py b/src/nplinker/scoring/linking/link_likelihood.py index 3a7b1e59..b13e1e97 100644 --- a/src/nplinker/scoring/linking/link_likelihood.py +++ b/src/nplinker/scoring/linking/link_likelihood.py @@ -1,6 +1,6 @@ from deprecated import deprecated from nplinker.logconfig import LogConfig -from nplinker.scoring.linking.data_linking_functions import \ +from nplinker.scoring.linking.utils import \ calc_likelihood_matrix diff --git a/src/nplinker/scoring/linking/data_linking_functions.py b/src/nplinker/scoring/linking/utils.py similarity index 100% rename from src/nplinker/scoring/linking/data_linking_functions.py rename to src/nplinker/scoring/linking/utils.py diff --git a/tests/scoring/test_data_linking_functions.py b/tests/scoring/test_data_linking_functions.py index d0cd8c95..200c198e 100644 --- a/tests/scoring/test_data_linking_functions.py +++ b/tests/scoring/test_data_linking_functions.py @@ -37,7 +37,7 @@ def test_calc_correlation_matrix(): [1, 2, 2, 2, 2, 3], [1, 2, 2, 2, 2, 3]])) -from nplinker.scoring.linking.data_linking_functions import calc_likelihood_matrix +from nplinker.scoring.linking.utils import calc_likelihood_matrix def test_calc_likelihood_matrix(): @@ -59,7 +59,7 @@ def test_calc_likelihood_matrix(): assert LBA.shape == (len(A), len(B)) # must have shape len(A), len(B) -from nplinker.scoring.linking.data_linking_functions import pair_prob_hg +from nplinker.scoring.linking.utils import pair_prob_hg def test_pair_prob_hg(): @@ -70,7 +70,7 @@ def test_pair_prob_hg(): assert pair_prob_hg(1, 100, 2, 2) == 98 / 100 * 2 / 99 + 2 / 100 * 98 / 99 -from nplinker.scoring.linking.data_linking_functions import hit_prob_dist +from nplinker.scoring.linking.utils import hit_prob_dist def test_hit_prob_dist(): @@ -82,7 +82,7 @@ def test_hit_prob_dist(): assert pks[0][0] == 0.99**100 -from nplinker.scoring.linking.data_linking_functions import permutation_unique +from 
nplinker.scoring.linking.utils import permutation_unique def test_permutation_unique(): @@ -101,7 +101,7 @@ def test_permutation_unique(): testlist))) == 20 # math.factorial(5)/math.factorial(3) -from nplinker.scoring.linking.data_linking_functions import pair_prob +from nplinker.scoring.linking.utils import pair_prob def test_pair_prob(): @@ -124,7 +124,7 @@ def test_pair_prob(): assert pair_prob(P_str, XG, Ny, hits) < (2 / 90 + 0.00000001) -from nplinker.scoring.linking.data_linking_functions import link_prob +from nplinker.scoring.linking.utils import link_prob def test_link_prob(): From 96dcb66fc5d11e1e6ca58eb0da352cc295f43a16 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 10 May 2023 09:56:45 +0200 Subject: [PATCH 61/95] rename `test_data_linking_functions.py` to `test_linking_utils.py`.py --- .../{test_data_linking_functions.py => test_linking_utils.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/scoring/{test_data_linking_functions.py => test_linking_utils.py} (100%) diff --git a/tests/scoring/test_data_linking_functions.py b/tests/scoring/test_linking_utils.py similarity index 100% rename from tests/scoring/test_data_linking_functions.py rename to tests/scoring/test_linking_utils.py From 18eb841b478edcc2c49abf75744cd5ff4a73d82a Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 10 May 2023 09:54:49 +0200 Subject: [PATCH 62/95] Delete test_scoring.py --- tests/scoring/test_scoring.py | 132 ---------------------------------- 1 file changed, 132 deletions(-) delete mode 100644 tests/scoring/test_scoring.py diff --git a/tests/scoring/test_scoring.py b/tests/scoring/test_scoring.py deleted file mode 100644 index b04f74b1..00000000 --- a/tests/scoring/test_scoring.py +++ /dev/null @@ -1,132 +0,0 @@ -import numpy as np -import pandas as pd -from nplinker.genomics import BGC -from nplinker.genomics import GCF -from nplinker.metabolomics.spectrum import Spectrum -from nplinker.scoring.linking import DataLinks -from 
nplinker.scoring.linking import LinkFinder -from nplinker.scoring.linking.misc_deprecated import hg_scoring -from nplinker.scoring.linking.misc_deprecated import metcalf_scoring -from nplinker.strains import Strain - - -np.random.seed(50) - - -def create_strains(n=10): - return [Strain(f'strain_{x:02d}') for x in range(n)] - - -def create_gcfs(strains, n=3): - gcfs = [] - for i in range(n): - gcf = GCF(f'fake_gcf_{i}') - num_strains = np.random.randint(1, len(strains)) - randoms = list(range(len(strains))) - np.random.shuffle(randoms) - randoms = randoms[:num_strains] - - for j in range(num_strains): - bgc = BGC(j, strains[randoms[j]]) - gcf.add_bgc(bgc) - - gcfs.append(gcf) - - return gcfs - - -def create_spectra(strains, n=3): - spectra = [] - - families = np.random.randint(0, n * 2, n) - - for i in range(n): - # (id, peaks, spectrum_id, precursor_mz, parent_mz=None, rt=None): - spec = Spectrum(i, [(1, 2), (3, 4)], i, np.random.random()) - num_strains = np.random.randint(1, len(strains)) - randoms = list(range(len(strains))) - np.random.shuffle(randoms) - randoms = randoms[:num_strains] - spec.family = int(families[i]) - for j in range(num_strains): - spec.add_strain(strains[randoms[j]], 'foo', 1) - - spectra.append(spec) - - return spectra - - -def do_scoring_old(gcfs, spectra, strains, standardised): - scores = {} - for gcf in gcfs: - scores[gcf] = {} - for spec in spectra: - score = metcalf_scoring(spec, - gcf, - strains, - standardised=standardised) - scores[gcf][spec] = score - - return scores - - -def do_scoring_new(gcfs, spectra, strains, standardised): - datalinks = DataLinks(gcfs, spectra, strains) - lf = LinkFinder() - scores = lf.metcalf_scoring(datalinks) - - # print(lf.metcalf_expected) - if standardised: - # standardised scoring thing - for i, gcf in enumerate(gcfs): - for j, spec in enumerate(spectra): - # get expected score, variance for objects with the current combo of strain counts - # (note that spectrum = type 1 object here) - met_strains = 
len(spec.strains) - gen_strains = len(gcf.strains) - expected = lf.metcalf_mean[met_strains][gen_strains] - variance = lf.metcalf_std[met_strains][gen_strains] - scores[j][i] = (scores[j][i] - expected) / np.sqrt(variance) - return scores - - -def do_scoring_old_hg(gcfs, spectra, strains): - scores = {} - for gcf in gcfs: - scores[gcf] = {} - for spec in spectra: - score, _ = hg_scoring(spec, gcf, strains) - scores[gcf][spec] = score - - return scores - - -def do_scoring_new_hg(gcfs, spectra, strains): - datalinks = DataLinks(gcfs, spectra, strains) - lf = LinkFinder() - scores = lf.hg_scoring(datalinks) - return scores - - -def run_metcalf_test(n_strains=3, n_gcfs=5, n_spectra=4, standardised=False): - strains = create_strains(n_strains) - gcfs = create_gcfs(strains, n_gcfs) - spectra = create_spectra(strains, n_spectra) - - old_scores = do_scoring_old(gcfs, spectra, strains, standardised) - new_scores = do_scoring_new(gcfs, spectra, strains, standardised) - - dfdata = {'nonvec_score': [], 'vec_score': [], 'gcf': [], 'spec': []} - - for i, gcf in enumerate(gcfs): - for j, spec in enumerate(spectra): - dfdata['nonvec_score'].append(old_scores[gcf][spec]) - dfdata['vec_score'].append(new_scores[j][i]) - dfdata['gcf'].append(gcf) - dfdata['spec'].append(spec) - - return pd.DataFrame(data=dfdata) - - -if __name__ == "__main__": - run_metcalf_test() From 4f12f355b04c1ef63ff29a13314140560335bd10 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 10 May 2023 11:38:42 +0200 Subject: [PATCH 63/95] add dtype to DataLinks dataframes --- src/nplinker/scoring/linking/data_links.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/nplinker/scoring/linking/data_links.py b/src/nplinker/scoring/linking/data_links.py index ab4b0dcb..e28b3bb5 100644 --- a/src/nplinker/scoring/linking/data_links.py +++ b/src/nplinker/scoring/linking/data_links.py @@ -157,7 +157,7 @@ def _get_occurrence_gcf_strain(self, gcfs: Sequence[GCF], """ df_gcf_strain = 
pd.DataFrame(np.zeros((len(gcfs), len(strains))), index=[gcf.gcf_id for gcf in gcfs], - columns=[strain.id for strain in strains]) + columns=[strain.id for strain in strains], dtype=int) for gcf in gcfs: for strain in strains: if gcf.has_strain(strain): @@ -175,7 +175,7 @@ def _get_occurrence_spec_strain(self, spectra: Sequence[Spectrum], df_spec_strain = pd.DataFrame( np.zeros((len(spectra), len(strains))), index=[spectrum.spectrum_id for spectrum in spectra], - columns=[strain.id for strain in strains]) + columns=[strain.id for strain in strains], dtype=int) for spectrum in spectra: for strain in strains: if spectrum.has_strain(strain): @@ -198,7 +198,7 @@ def _get_occurrence_mf_strain(self, mfs: Sequence[MolecularFamily], df_mf_strain = pd.DataFrame(np.zeros((len(mfs), len(strains))), index=[mf.family_id for mf in mfs], - columns=[strain.id for strain in strains]) + columns=[strain.id for strain in strains], dtype=int) for mf in mfs: for strain in strains: if mf.has_strain(strain): @@ -243,15 +243,15 @@ def _get_cooccurrence( self.occurrence_gcf_strain) df_met_gcf = pd.DataFrame(m1, index=met_strain_occurrence.index, - columns=self.occurrence_gcf_strain.index) + columns=self.occurrence_gcf_strain.index, dtype=int) df_met_notgcf = pd.DataFrame(m2, index=met_strain_occurrence.index, - columns=self.occurrence_gcf_strain.index) + columns=self.occurrence_gcf_strain.index, dtype=int) df_notmet_gcf = pd.DataFrame(m3, index=met_strain_occurrence.index, - columns=self.occurrence_gcf_strain.index) + columns=self.occurrence_gcf_strain.index, dtype=int) df_notmet_notgcf = pd.DataFrame( m4, index=met_strain_occurrence.index, - columns=self.occurrence_gcf_strain.index) + columns=self.occurrence_gcf_strain.index, dtype=int) return df_met_gcf, df_met_notgcf, df_notmet_gcf, df_notmet_notgcf From e3fdd86f6c43a036a1934964e001e813478ecec4 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 10 May 2023 11:42:17 +0200 Subject: [PATCH 64/95] remove mapping dataframes and relevant 
method from DataLinks Removed: - self.mapping_spec - self.mapping_gcf - self.mapping_fam -self.mapping_strain - _get_mappings_from_occurrence() method --- src/nplinker/scoring/linking/data_links.py | 48 +++++++--------------- 1 file changed, 14 insertions(+), 34 deletions(-) diff --git a/src/nplinker/scoring/linking/data_links.py b/src/nplinker/scoring/linking/data_links.py index e28b3bb5..7d275335 100644 --- a/src/nplinker/scoring/linking/data_links.py +++ b/src/nplinker/scoring/linking/data_links.py @@ -57,10 +57,6 @@ def __init__(self, gcfs: Sequence[GCF], spectra: Sequence[Spectrum], of not molecular families<->gcfs. cooccurrence_notmf_notgcf(pd.DataFrame): A DataFrame to store co-occurrence of not molecular families<->not gcfs. - mapping_gcf(pd.DataFrame): A DataFrame to store mappings for gcfs. - mapping_spec(pd.DataFrame): A DataFrame to store mappings for spectra. - mapping_mf(pd.DataFrame): A DataFrame to store mappings for molecular families. - mapping_strain(pd.DataFrame): A DataFrame to store mappings for strains. 
""" self._strains = strains @@ -76,14 +72,6 @@ def __init__(self, gcfs: Sequence[GCF], spectra: Sequence[Spectrum], self.occurrence_mf_strain = self._get_occurrence_mf_strain( mfs, strains) - # DataFrame to store mapping tables, check `_get_mappings_from_occurance` for details - # TODO: these mappings could be removed when refactoring LinkFinder - self.mapping_spec = pd.DataFrame() - self.mapping_gcf = pd.DataFrame() - self.mapping_fam = pd.DataFrame() - self.mapping_strain = pd.DataFrame() - self._get_mappings_from_occurrence() - # DataFrame to store co-occurrence of "spectra<->gcf" or "mfs<->gcf" logger.debug("Create correlation matrices: spectra<->gcfs.") (self.cooccurrence_spec_gcf, self.cooccurrence_spec_notgcf, @@ -157,7 +145,8 @@ def _get_occurrence_gcf_strain(self, gcfs: Sequence[GCF], """ df_gcf_strain = pd.DataFrame(np.zeros((len(gcfs), len(strains))), index=[gcf.gcf_id for gcf in gcfs], - columns=[strain.id for strain in strains], dtype=int) + columns=[strain.id for strain in strains], + dtype=int) for gcf in gcfs: for strain in strains: if gcf.has_strain(strain): @@ -175,7 +164,8 @@ def _get_occurrence_spec_strain(self, spectra: Sequence[Spectrum], df_spec_strain = pd.DataFrame( np.zeros((len(spectra), len(strains))), index=[spectrum.spectrum_id for spectrum in spectra], - columns=[strain.id for strain in strains], dtype=int) + columns=[strain.id for strain in strains], + dtype=int) for spectrum in spectra: for strain in strains: if spectrum.has_strain(strain): @@ -198,28 +188,14 @@ def _get_occurrence_mf_strain(self, mfs: Sequence[MolecularFamily], df_mf_strain = pd.DataFrame(np.zeros((len(mfs), len(strains))), index=[mf.family_id for mf in mfs], - columns=[strain.id for strain in strains], dtype=int) + columns=[strain.id for strain in strains], + dtype=int) for mf in mfs: for strain in strains: if mf.has_strain(strain): df_mf_strain.loc[mf.family_id, strain.id] = 1 return df_mf_strain - def _get_mappings_from_occurrence(self): - - # pd.Series with 
index = gcf.gcf_id and value = number of strains where gcf occurs - self.mapping_gcf["no of strains"] = self.occurrence_gcf_strain.sum( - axis=1) - # pd.Series with index = spectrum.spectrum_id and value = number of strains where spec occurs - self.mapping_spec["no of strains"] = self.occurrence_spec_strain.sum( - axis=1) - # pd.Series with index = mf.family_id and value = number of strains where mf occurs - self.mapping_fam["no of strains"] = self.occurrence_mf_strain.sum( - axis=1) - # pd.Series with index = strain.id and value = number of spectra in strain - self.mapping_strain["no of spectra"] = self.occurrence_spec_strain.sum( - axis=0) - def _get_cooccurrence( self, link_type: str = 'spec-gcf' @@ -243,15 +219,19 @@ def _get_cooccurrence( self.occurrence_gcf_strain) df_met_gcf = pd.DataFrame(m1, index=met_strain_occurrence.index, - columns=self.occurrence_gcf_strain.index, dtype=int) + columns=self.occurrence_gcf_strain.index, + dtype=int) df_met_notgcf = pd.DataFrame(m2, index=met_strain_occurrence.index, - columns=self.occurrence_gcf_strain.index, dtype=int) + columns=self.occurrence_gcf_strain.index, + dtype=int) df_notmet_gcf = pd.DataFrame(m3, index=met_strain_occurrence.index, - columns=self.occurrence_gcf_strain.index, dtype=int) + columns=self.occurrence_gcf_strain.index, + dtype=int) df_notmet_notgcf = pd.DataFrame( m4, index=met_strain_occurrence.index, - columns=self.occurrence_gcf_strain.index, dtype=int) + columns=self.occurrence_gcf_strain.index, + dtype=int) return df_met_gcf, df_met_notgcf, df_notmet_gcf, df_notmet_notgcf From 7ef4da3473935f9cd22214441b17abe49049641d Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 10 May 2023 11:58:12 +0200 Subject: [PATCH 65/95] Create test_data_links.py --- tests/scoring/test_data_links.py | 174 +++++++++++++++++++++++++++++++ 1 file changed, 174 insertions(+) create mode 100644 tests/scoring/test_data_links.py diff --git a/tests/scoring/test_data_links.py b/tests/scoring/test_data_links.py new file 
mode 100644 index 00000000..e812e24f --- /dev/null +++ b/tests/scoring/test_data_links.py @@ -0,0 +1,174 @@ +import pandas as pd +from pandas.util.testing import assert_frame_equal +from pytest import fixture +from nplinker.genomics import GCF +from nplinker.metabolomics.molecular_family import MolecularFamily +from nplinker.metabolomics.spectrum import Spectrum +from nplinker.scoring.linking import DataLinks +from nplinker.strain_collection import StrainCollection +from nplinker.strains import Strain + + +@fixture(scope='module') +def strains_list(): + return Strain('strain1'), Strain('strain2'), Strain('strain3') + + +@fixture(scope='module') +def strains(strains_list): + strains = StrainCollection() + for strain in strains_list: + strains.add(strain) + return strains + + +@fixture(scope='module') +def gcfs(strains_list): + gcf1 = GCF('gcf1') + gcf1.strains.add(strains_list[0]) + gcf2 = GCF('gcf2') + gcf2.strains.add(strains_list[1]) + gcf3 = GCF('gcf3') + gcf3.strains.add(strains_list[0]) + gcf3.strains.add(strains_list[1]) + return gcf1, gcf2, gcf3 + + +@fixture(scope='module') +def spectra(strains_list): + spectrum1 = Spectrum(1, [(1, 1)], "spectrum1", None) + spectrum1.strains.add(strains_list[0]) + spectrum2 = Spectrum(2, [(1, 1)], "spectrum2", None) + spectrum2.strains.add(strains_list[1]) + spectrum3 = Spectrum(3, [(1, 1)], "spectrum3", None) + spectrum3.strains.add(strains_list[0]) + spectrum3.strains.add(strains_list[1]) + return spectrum1, spectrum2, spectrum3 + + +@fixture(scope='module') +def mfs(spectra): + mf1 = MolecularFamily('mf1') + mf1.add_spectrum(spectra[0]) + mf2 = MolecularFamily('mf2') + mf2.add_spectrum(spectra[1]) + mf3 = MolecularFamily('mf3') + mf3.add_spectrum(spectra[2]) + return mf1, mf2, mf3 + + +@fixture(scope='module') +def datalinks(gcfs, spectra, mfs, strains): + return DataLinks(gcfs, spectra, mfs, strains) + + +def test_init(datalinks): + # test occorrences + col_names = ['strain1', 'strain2', 'strain3'] + assert_frame_equal( 
+ datalinks.occurrence_gcf_strain, + pd.DataFrame([[1, 0, 0], [0, 1, 0], [1, 1, 0]], + index=['gcf1', 'gcf2', 'gcf3'], + columns=col_names)) + assert_frame_equal( + datalinks.occurrence_spec_strain, + pd.DataFrame([[1, 0, 0], [0, 1, 0], [1, 1, 0]], + index=['spectrum1', 'spectrum2', 'spectrum3'], + columns=col_names)) + assert_frame_equal( + datalinks.occurrence_mf_strain, + pd.DataFrame([[1, 0, 0], [0, 1, 0], [1, 1, 0]], + index=['mf1', 'mf2', 'mf3'], + columns=col_names)) + # test co-occorrences spec-gcf + col_names = ['gcf1', 'gcf2', 'gcf3'] + assert_frame_equal( + datalinks.cooccurrence_spec_gcf, + pd.DataFrame([[1, 0, 1], [0, 1, 1], [1, 1, 2]], + index=['spectrum1', 'spectrum2', 'spectrum3'], + columns=col_names)) + assert_frame_equal( + datalinks.cooccurrence_spec_notgcf, + pd.DataFrame([[0, 1, 0], [1, 0, 0], [1, 1, 0]], + index=['spectrum1', 'spectrum2', 'spectrum3'], + columns=col_names)) + assert_frame_equal( + datalinks.cooccurrence_notspec_gcf, + pd.DataFrame([[0, 1, 1], [1, 0, 1], [0, 0, 0]], + index=['spectrum1', 'spectrum2', 'spectrum3'], + columns=col_names)) + assert_frame_equal( + datalinks.cooccurrence_notspec_notgcf, + pd.DataFrame([[2, 1, 1], [1, 2, 1], [1, 1, 1]], + index=['spectrum1', 'spectrum2', 'spectrum3'], + columns=col_names)) + # test co-occorrences mf-gcf + assert_frame_equal( + datalinks.cooccurrence_mf_gcf, + pd.DataFrame([[1, 0, 1], [0, 1, 1], [1, 1, 2]], + index=['mf1', 'mf2', 'mf3'], + columns=col_names)) + assert_frame_equal( + datalinks.cooccurrence_mf_notgcf, + pd.DataFrame([[0, 1, 0], [1, 0, 0], [1, 1, 0]], + index=['mf1', 'mf2', 'mf3'], + columns=col_names)) + assert_frame_equal( + datalinks.cooccurrence_notmf_gcf, + pd.DataFrame([[0, 1, 1], [1, 0, 1], [0, 0, 0]], + index=['mf1', 'mf2', 'mf3'], + columns=col_names)) + assert_frame_equal( + datalinks.cooccurrence_notmf_notgcf, + pd.DataFrame([[2, 1, 1], [1, 2, 1], [1, 1, 1]], + index=['mf1', 'mf2', 'mf3'], + columns=col_names)) + + +def test_get_common_strains_spec(datalinks, 
spectra, gcfs, strains_list): + sut = datalinks.get_common_strains(spectra[:2], gcfs) + expected = { + (spectra[0], gcfs[0]): [strains_list[0]], + (spectra[0], gcfs[1]): [], + (spectra[0], gcfs[2]): [strains_list[0]], + (spectra[1], gcfs[0]): [], + (spectra[1], gcfs[1]): [strains_list[1]], + (spectra[1], gcfs[2]): [strains_list[1]] + } + assert sut == expected + + sut = datalinks.get_common_strains(spectra[:2], + gcfs, + filter_no_shared=True) + expected = { + (spectra[0], gcfs[0]): [strains_list[0]], + (spectra[0], gcfs[2]): [strains_list[0]], + (spectra[1], gcfs[1]): [strains_list[1]], + (spectra[1], gcfs[2]): [strains_list[1]] + } + assert sut == expected + + +def test_get_common_strains_mf(datalinks, mfs, gcfs, strains_list): + sut = datalinks.get_common_strains(mfs[:2], gcfs) + expected = { + (mfs[0], gcfs[0]): [strains_list[0]], + (mfs[0], gcfs[1]): [], + (mfs[0], gcfs[2]): [strains_list[0]], + (mfs[1], gcfs[0]): [], + (mfs[1], gcfs[1]): [strains_list[1]], + (mfs[1], gcfs[2]): [strains_list[1]] + } + assert sut == expected + + sut = datalinks.get_common_strains(mfs[:2], gcfs, filter_no_shared=True) + expected = { + (mfs[0], gcfs[0]): [strains_list[0]], + (mfs[0], gcfs[2]): [strains_list[0]], + (mfs[1], gcfs[1]): [strains_list[1]], + (mfs[1], gcfs[2]): [strains_list[1]] + } + assert sut == expected + + +# TODO: add tests for the DataLinks class From fd01596c8fb839cba11274b72e40c35b660f0e3d Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 10 May 2023 12:06:17 +0200 Subject: [PATCH 66/95] add `conftest.py` for scoring tests --- tests/scoring/conftest.py | 54 +++++++++++++++++++++++++++++ tests/scoring/test_data_links.py | 58 +------------------------------- 2 files changed, 55 insertions(+), 57 deletions(-) create mode 100644 tests/scoring/conftest.py diff --git a/tests/scoring/conftest.py b/tests/scoring/conftest.py new file mode 100644 index 00000000..d627bcf1 --- /dev/null +++ b/tests/scoring/conftest.py @@ -0,0 +1,54 @@ +from pytest import fixture 
+from nplinker.genomics import GCF +from nplinker.metabolomics.molecular_family import MolecularFamily +from nplinker.metabolomics.spectrum import Spectrum +from nplinker.strain_collection import StrainCollection +from nplinker.strains import Strain + + +@fixture +def strains_list(): + return Strain('strain1'), Strain('strain2'), Strain('strain3') + + +@fixture +def strains(strains_list): + strains = StrainCollection() + for strain in strains_list: + strains.add(strain) + return strains + + +@fixture +def gcfs(strains_list): + gcf1 = GCF('gcf1') + gcf1.strains.add(strains_list[0]) + gcf2 = GCF('gcf2') + gcf2.strains.add(strains_list[1]) + gcf3 = GCF('gcf3') + gcf3.strains.add(strains_list[0]) + gcf3.strains.add(strains_list[1]) + return gcf1, gcf2, gcf3 + + +@fixture +def spectra(strains_list): + spectrum1 = Spectrum(1, [(1, 1)], "spectrum1", None) + spectrum1.strains.add(strains_list[0]) + spectrum2 = Spectrum(2, [(1, 1)], "spectrum2", None) + spectrum2.strains.add(strains_list[1]) + spectrum3 = Spectrum(3, [(1, 1)], "spectrum3", None) + spectrum3.strains.add(strains_list[0]) + spectrum3.strains.add(strains_list[1]) + return spectrum1, spectrum2, spectrum3 + + +@fixture +def mfs(spectra): + mf1 = MolecularFamily('mf1') + mf1.add_spectrum(spectra[0]) + mf2 = MolecularFamily('mf2') + mf2.add_spectrum(spectra[1]) + mf3 = MolecularFamily('mf3') + mf3.add_spectrum(spectra[2]) + return mf1, mf2, mf3 diff --git a/tests/scoring/test_data_links.py b/tests/scoring/test_data_links.py index e812e24f..36fc7b04 100644 --- a/tests/scoring/test_data_links.py +++ b/tests/scoring/test_data_links.py @@ -1,63 +1,10 @@ import pandas as pd from pandas.util.testing import assert_frame_equal from pytest import fixture -from nplinker.genomics import GCF -from nplinker.metabolomics.molecular_family import MolecularFamily -from nplinker.metabolomics.spectrum import Spectrum from nplinker.scoring.linking import DataLinks -from nplinker.strain_collection import StrainCollection -from 
nplinker.strains import Strain -@fixture(scope='module') -def strains_list(): - return Strain('strain1'), Strain('strain2'), Strain('strain3') - - -@fixture(scope='module') -def strains(strains_list): - strains = StrainCollection() - for strain in strains_list: - strains.add(strain) - return strains - - -@fixture(scope='module') -def gcfs(strains_list): - gcf1 = GCF('gcf1') - gcf1.strains.add(strains_list[0]) - gcf2 = GCF('gcf2') - gcf2.strains.add(strains_list[1]) - gcf3 = GCF('gcf3') - gcf3.strains.add(strains_list[0]) - gcf3.strains.add(strains_list[1]) - return gcf1, gcf2, gcf3 - - -@fixture(scope='module') -def spectra(strains_list): - spectrum1 = Spectrum(1, [(1, 1)], "spectrum1", None) - spectrum1.strains.add(strains_list[0]) - spectrum2 = Spectrum(2, [(1, 1)], "spectrum2", None) - spectrum2.strains.add(strains_list[1]) - spectrum3 = Spectrum(3, [(1, 1)], "spectrum3", None) - spectrum3.strains.add(strains_list[0]) - spectrum3.strains.add(strains_list[1]) - return spectrum1, spectrum2, spectrum3 - - -@fixture(scope='module') -def mfs(spectra): - mf1 = MolecularFamily('mf1') - mf1.add_spectrum(spectra[0]) - mf2 = MolecularFamily('mf2') - mf2.add_spectrum(spectra[1]) - mf3 = MolecularFamily('mf3') - mf3.add_spectrum(spectra[2]) - return mf1, mf2, mf3 - - -@fixture(scope='module') +@fixture def datalinks(gcfs, spectra, mfs, strains): return DataLinks(gcfs, spectra, mfs, strains) @@ -169,6 +116,3 @@ def test_get_common_strains_mf(datalinks, mfs, gcfs, strains_list): (mfs[1], gcfs[2]): [strains_list[1]] } assert sut == expected - - -# TODO: add tests for the DataLinks class From 9f0b289970582004d1865c294aca5790d5deaab3 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 10 May 2023 14:09:16 +0200 Subject: [PATCH 67/95] update LinkFinder's attribute and private method - refactor method `_cal_mean_std` - rename attribute `raw_score_fam_gcf` to `raw_score_mf_gcf` --- src/nplinker/scoring/linking/link_finder.py | 23 ++++++++++++--------- 1 file changed, 13 
insertions(+), 10 deletions(-) diff --git a/src/nplinker/scoring/linking/link_finder.py b/src/nplinker/scoring/linking/link_finder.py index 5a3c167c..38559e2d 100644 --- a/src/nplinker/scoring/linking/link_finder.py +++ b/src/nplinker/scoring/linking/link_finder.py @@ -16,6 +16,7 @@ logger = LogConfig.getLogger(__file__) +# TODO CG: this class could be merged to MetcalfScoring class class LinkFinder(): def __init__(self): @@ -26,12 +27,13 @@ def __init__(self): """ # metcalf scores self.raw_score_spec_gcf = pd.DataFrame() - self.raw_score_fam_gcf = pd.DataFrame() + self.raw_score_mf_gcf = pd.DataFrame() # metcalf caching self.metcalf_mean = None self.metcalf_std = None + # TODO CG: cal_score method could be integrated to __init__ def cal_score( self, data_links: DataLinks, @@ -63,20 +65,19 @@ def cal_score( data_links.cooccurrence_notspec_gcf * scoring_weights[2] + data_links.cooccurrence_notspec_notgcf * scoring_weights[3]) if link_type == 'mf-gcf': - self.raw_score_fam_gcf = ( + self.raw_score_mf_gcf = ( data_links.cooccurrence_mf_gcf * scoring_weights[0] + data_links.cooccurrence_mf_notgcf * scoring_weights[1] + data_links.cooccurrence_notmf_gcf * scoring_weights[2] + data_links.cooccurrence_notmf_notgcf * scoring_weights[3]) + n_strains = data_links.occurrence_gcf_strain.shape[1] if self.metcalf_mean is None or self.metcalf_std is None: self.metcalf_mean, self.metcalf_std = self._cal_mean_std( - data_links, scoring_weights) + n_strains, scoring_weights) - def _cal_mean_std(self, data_links, scoring_weights): - # Compute the expected values for all possible values of spec and gcf strains - # we need the total number of strains - _, n_strains = data_links.occurrence_gcf_strain.shape + # TODO CG: read paper and check the logics of this method + def _cal_mean_std(self, n_strains, scoring_weights): sz = (n_strains + 1, n_strains + 1) mean = np.zeros(sz) variance = np.zeros(sz) @@ -131,7 +132,7 @@ def get_links(self, else: types = [type(i) for i in objects] raise 
TypeError( - f'Invalid type "{set(types)}". Input objects must be GCF, Spectrum or MolecularFamily objects.' + f'Invalid type {set(types)}. Input objects must be GCF, Spectrum or MolecularFamily objects.' ) links = [] @@ -143,7 +144,7 @@ def get_links(self, df.name = LINK_TYPES[0] links.append(df) # mf-gcf - scores = self.raw_score_fam_gcf.loc[:, obj_ids] + scores = self.raw_score_mf_gcf.loc[:, obj_ids] df = self._get_scores_source_gcf(scores, score_cutoff) df.name = LINK_TYPES[1] links.append(df) @@ -157,7 +158,7 @@ def get_links(self, if obj_type == 'mf': obj_ids = [mf.family_id for mf in objects] - scores = self.raw_score_fam_gcf.loc[obj_ids, :] + scores = self.raw_score_mf_gcf.loc[obj_ids, :] df = self._get_scores_source_met(scores, score_cutoff) df.name = LINK_TYPES[1] links.append(df) @@ -166,6 +167,8 @@ def get_links(self, def _isinstance(self, _type, *objects) -> bool: return all(isinstance(x, _type) for x in objects) + # TODO CG: the returned data could be changed to dict, like the that in + # the get_links method of the MetcalfScoring class def _get_scores_source_gcf(self, scores, score_cutoff) -> pd.DataFrame: row_indexes, col_indexes = np.where(scores >= score_cutoff) src_obj_ids = scores.columns[col_indexes].to_list() From 3e3bdfa504dc335f4e4be227a48da82630f09711 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 10 May 2023 15:47:17 +0200 Subject: [PATCH 68/95] Create test_link_finder.py --- tests/scoring/test_link_finder.py | 168 ++++++++++++++++++++++++++++++ 1 file changed, 168 insertions(+) create mode 100644 tests/scoring/test_link_finder.py diff --git a/tests/scoring/test_link_finder.py b/tests/scoring/test_link_finder.py new file mode 100644 index 00000000..be28dcd3 --- /dev/null +++ b/tests/scoring/test_link_finder.py @@ -0,0 +1,168 @@ +import numpy as np +import pandas as pd +from pandas.util.testing import assert_frame_equal +import pytest +from pytest import fixture +from nplinker.scoring.linking import DataLinks +from 
nplinker.scoring.linking import LinkFinder + + +@fixture +def linkfinder(): + return LinkFinder() + + +@fixture +def datalinks(gcfs, spectra, mfs, strains): + return DataLinks(gcfs, spectra, mfs, strains) + + +def test_init(linkfinder): + assert_frame_equal(linkfinder.raw_score_spec_gcf, pd.DataFrame()) + assert_frame_equal(linkfinder.raw_score_mf_gcf, pd.DataFrame()) + assert linkfinder.metcalf_mean is None + assert linkfinder.metcalf_std is None + + +def test_cal_score(linkfinder, datalinks): + linkfinder.cal_score(datalinks, link_type='spec-gcf') + assert_frame_equal( + linkfinder.raw_score_spec_gcf, + pd.DataFrame([[12, -9, 11], [-9, 12, 11], [1, 1, 21]], + index=['spectrum1', 'spectrum2', 'spectrum3'], + columns=['gcf1', 'gcf2', 'gcf3'])) + + linkfinder.cal_score(datalinks, link_type='mf-gcf') + assert_frame_equal( + linkfinder.raw_score_mf_gcf, + pd.DataFrame([[12, -9, 11], [-9, 12, 11], [1, 1, 21]], + index=['mf1', 'mf2', 'mf3'], + columns=['gcf1', 'gcf2', 'gcf3'])) + + # TODO CG: add tests for values after refactoring _cal_mean_std + assert isinstance(linkfinder.metcalf_mean, np.ndarray) + assert isinstance(linkfinder.metcalf_std, np.ndarray) + assert linkfinder.metcalf_mean.shape == (4, 4 + ) # (n_strains+1 , n_strains+1) + assert linkfinder.metcalf_mean.shape == (4, 4) + # generated metcalf_mean + # array([[ 3., 2., 1., 0.], + # [ -8., -2., 4., 10.], + # [-19., -6., 7., 20.], + # [-30., -10., 10., 30.]]) + # generated metcalf_std + # array([[1. , 1. , 1. , 1. ], + # [1. , 9.89949494, 9.89949494, 1. ], + # [1. , 9.89949494, 9.89949494, 1. ], + # [1. , 1. , 1. , 1. 
]]) + + +def test_get_links_gcf(linkfinder, datalinks, gcfs): + linkfinder.cal_score(datalinks, link_type='spec-gcf') + linkfinder.cal_score(datalinks, link_type='mf-gcf') + index_names = ['source', 'target', 'score'] + # cutoff = negative infinity (float) + links = linkfinder.get_links(*gcfs, score_cutoff=np.NINF) + assert len(links) == 2 + assert_frame_equal( + links[0], + pd.DataFrame([['gcf1', 'gcf2', 'gcf3'] * 3, + [ + *['spectrum1'] * 3, + *['spectrum2'] * 3, + *['spectrum3'] * 3, + ], [12, -9, 11, -9, 12, 11, 1, 1, 21]], + index=index_names)) + assert_frame_equal( + links[1], + pd.DataFrame([['gcf1', 'gcf2', 'gcf3'] * 3, + [ + *['mf1'] * 3, + *['mf2'] * 3, + *['mf3'] * 3, + ], [12, -9, 11, -9, 12, 11, 1, 1, 21]], + index=index_names)) + + # cutoff = 0 + links = linkfinder.get_links(*gcfs, score_cutoff=0) + assert_frame_equal( + links[0], + pd.DataFrame([['gcf1', 'gcf3', 'gcf2', 'gcf3', 'gcf1', 'gcf2', 'gcf3'], + [ + *['spectrum1'] * 2, + *['spectrum2'] * 2, + *['spectrum3'] * 3, + ], [12, 11, 12, 11, 1, 1, 21]], + index=index_names)) + assert_frame_equal( + links[1], + pd.DataFrame([['gcf1', 'gcf3', 'gcf2', 'gcf3', 'gcf1', 'gcf2', 'gcf3'], + [ + *['mf1'] * 2, + *['mf2'] * 2, + *['mf3'] * 3, + ], [12, 11, 12, 11, 1, 1, 21]], + index=index_names)) + + +def test_get_links_spec(linkfinder, datalinks, spectra): + linkfinder.cal_score(datalinks, link_type='spec-gcf') + linkfinder.cal_score(datalinks, link_type='mf-gcf') + index_names = ['source', 'target', 'score'] + # cutoff = negative infinity (float) + links = linkfinder.get_links(*spectra, score_cutoff=np.NINF) + assert len(links) == 1 + assert_frame_equal( + links[0], + pd.DataFrame([[ + *['spectrum1'] * 3, + *['spectrum2'] * 3, + *['spectrum3'] * 3, + ], ['gcf1', 'gcf2', 'gcf3'] * 3, [12, -9, 11, -9, 12, 11, 1, 1, 21]], + index=index_names)) + # cutoff = 0 + links = linkfinder.get_links(*spectra, score_cutoff=0) + assert_frame_equal( + links[0], + pd.DataFrame([[ + *['spectrum1'] * 2, + *['spectrum2'] * 2, + 
*['spectrum3'] * 3, + ], ['gcf1', 'gcf3', 'gcf2', 'gcf3', 'gcf1', 'gcf2', 'gcf3'], + [12, 11, 12, 11, 1, 1, 21]], + index=index_names)) + + +def test_get_links_mf(linkfinder, datalinks, mfs): + linkfinder.cal_score(datalinks, link_type='spec-gcf') + linkfinder.cal_score(datalinks, link_type='mf-gcf') + index_names = ['source', 'target', 'score'] + # cutoff = negative infinity (float) + links = linkfinder.get_links(*mfs, score_cutoff=np.NINF) + assert len(links) == 1 + + assert_frame_equal( + links[0], + pd.DataFrame([[ + *['mf1'] * 3, + *['mf2'] * 3, + *['mf3'] * 3, + ], ['gcf1', 'gcf2', 'gcf3'] * 3, [12, -9, 11, -9, 12, 11, 1, 1, 21]], + index=index_names)) + + links = linkfinder.get_links(*mfs, score_cutoff=0) + assert_frame_equal( + links[0], + pd.DataFrame([[ + *['mf1'] * 2, + *['mf2'] * 2, + *['mf3'] * 3, + ], ['gcf1', 'gcf3', 'gcf2', 'gcf3', 'gcf1', 'gcf2', 'gcf3'], + [12, 11, 12, 11, 1, 1, 21]], + index=index_names)) + + +def test_get_links_exceptions(linkfinder): + with pytest.raises(TypeError) as e: + linkfinder.get_links("") + assert "Invalid type {}" in str(e.value) From 46880a805e900d50a20d2e7130eda902947e4168 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Thu, 11 May 2023 10:44:32 +0200 Subject: [PATCH 69/95] Update vscode plugin autodocstring template - fix indentation bug in autodocsting - remove `Examples:` section --- .vscode/vscode_docstring_google_adapted.mustache | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.vscode/vscode_docstring_google_adapted.mustache b/.vscode/vscode_docstring_google_adapted.mustache index 42a7d1e1..f2d2b35d 100644 --- a/.vscode/vscode_docstring_google_adapted.mustache +++ b/.vscode/vscode_docstring_google_adapted.mustache @@ -26,7 +26,3 @@ Returns: {{typePlaceholder}}: {{descriptionPlaceholder}} {{/returns}} {{/returnsExist}} - -Examples: - >>> {{#placeholder}} - {{/placeholder}} \ No newline at end of file From 31586ddec8acbd6c5284ca5a4d52b00bdf7d6348 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Thu, 11 
May 2023 12:15:53 +0200 Subject: [PATCH 70/95] add scope for fixtures --- tests/scoring/conftest.py | 10 +++++----- tests/scoring/test_data_links.py | 2 +- tests/scoring/test_link_finder.py | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/scoring/conftest.py b/tests/scoring/conftest.py index d627bcf1..6a15f05f 100644 --- a/tests/scoring/conftest.py +++ b/tests/scoring/conftest.py @@ -6,12 +6,12 @@ from nplinker.strains import Strain -@fixture +@fixture(scope='session') def strains_list(): return Strain('strain1'), Strain('strain2'), Strain('strain3') -@fixture +@fixture(scope='session') def strains(strains_list): strains = StrainCollection() for strain in strains_list: @@ -19,7 +19,7 @@ def strains(strains_list): return strains -@fixture +@fixture(scope='session') def gcfs(strains_list): gcf1 = GCF('gcf1') gcf1.strains.add(strains_list[0]) @@ -31,7 +31,7 @@ def gcfs(strains_list): return gcf1, gcf2, gcf3 -@fixture +@fixture(scope='session') def spectra(strains_list): spectrum1 = Spectrum(1, [(1, 1)], "spectrum1", None) spectrum1.strains.add(strains_list[0]) @@ -43,7 +43,7 @@ def spectra(strains_list): return spectrum1, spectrum2, spectrum3 -@fixture +@fixture(scope='session') def mfs(spectra): mf1 = MolecularFamily('mf1') mf1.add_spectrum(spectra[0]) diff --git a/tests/scoring/test_data_links.py b/tests/scoring/test_data_links.py index 36fc7b04..94d1d24c 100644 --- a/tests/scoring/test_data_links.py +++ b/tests/scoring/test_data_links.py @@ -4,7 +4,7 @@ from nplinker.scoring.linking import DataLinks -@fixture +@fixture(scope='module') def datalinks(gcfs, spectra, mfs, strains): return DataLinks(gcfs, spectra, mfs, strains) diff --git a/tests/scoring/test_link_finder.py b/tests/scoring/test_link_finder.py index be28dcd3..871f4e90 100644 --- a/tests/scoring/test_link_finder.py +++ b/tests/scoring/test_link_finder.py @@ -7,12 +7,12 @@ from nplinker.scoring.linking import LinkFinder -@fixture +@fixture(scope='module') def linkfinder(): 
return LinkFinder() -@fixture +@fixture(scope='module') def datalinks(gcfs, spectra, mfs, strains): return DataLinks(gcfs, spectra, mfs, strains) From 22ffd9039940e092bed6f13e4732bf671a8dc96e Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 10 May 2023 17:11:39 +0200 Subject: [PATCH 71/95] Create test_metcalf_scoring.py --- tests/scoring/test_metcalf_scoring.py | 238 ++++++++++++++++++++++++++ 1 file changed, 238 insertions(+) create mode 100644 tests/scoring/test_metcalf_scoring.py diff --git a/tests/scoring/test_metcalf_scoring.py b/tests/scoring/test_metcalf_scoring.py new file mode 100644 index 00000000..8bb715fc --- /dev/null +++ b/tests/scoring/test_metcalf_scoring.py @@ -0,0 +1,238 @@ +import numpy as np +from numpy.testing import assert_array_equal +from pandas.util.testing import assert_frame_equal +import pytest +from pytest import fixture +from nplinker.nplinker import NPLinker +from nplinker.scoring import LinkCollection +from nplinker.scoring import MetcalfScoring +from nplinker.scoring import ObjectLink +from nplinker.scoring.linking import DataLinks +from nplinker.scoring.linking import LinkFinder +from .. import DATA_DIR + + +@fixture(scope='module') +def datalinks(gcfs, spectra, mfs, strains): + """DataLinks object. See `test_data_links.py` for its actual values.""" + return DataLinks(gcfs, spectra, mfs, strains) + + +@fixture(scope='module') +def linkfinder(datalinks): + """LinkFinder object. See `test_link_finder.py` for its actual values.""" + linkfinder = LinkFinder() + linkfinder.cal_score(datalinks, link_type='spec-gcf') + linkfinder.cal_score(datalinks, link_type='mf-gcf') + return linkfinder + + +@fixture(scope='module') +def npl(gcfs, spectra, mfs, strains, tmp_path_factory): + """Constructed NPLinker object. + + This NPLinker object does not do loading `npl.load_data()`, instead we + manually set its attributes to the values we want to test. 
+ + The config file `nplinker_demo1.toml` does not affect the tests, just + making sure the NPLinker object can be created succesfully. + """ + npl = NPLinker(str(DATA_DIR / 'nplinker_demo1.toml')) + npl._gcfs = gcfs + npl._spectra = spectra + npl._molfams = mfs + npl._strains = strains + npl._gcf_lookup = {gcf.gcf_id: gcf for gcf in gcfs} + npl._mf_lookup = {mf.family_id: mf for mf in mfs} + npl._spec_lookup = {spec.spectrum_id: spec for spec in spectra} + # tmp path to store 'metcalf/metcalf_scores.pckl' file + # Must use `tmp_path_factory` (session scope) instead of `tmp_path` (function scope) + npl._loader._root = tmp_path_factory.mktemp('npl_test') + return npl + + +@fixture(scope='module') +def mc(npl): + """MetcalfScoring object.""" + mc = MetcalfScoring(npl) + mc.setup(npl) + return mc + + +def test_init(npl): + mc = MetcalfScoring(npl) + assert mc.npl == npl + assert mc.name == 'metcalf' + assert mc.cutoff == 1.0 + assert mc.standardised is True + assert mc.DATALINKS is None + assert mc.LINKFINDER is None + + +def test_setup(mc, datalinks, linkfinder): + """Test `setup` method when cache file does not exist.""" + assert isinstance(mc.DATALINKS, DataLinks) + assert isinstance(mc.LINKFINDER, LinkFinder) + + assert_frame_equal(mc.DATALINKS.occurrence_gcf_strain, + datalinks.occurrence_gcf_strain) + assert_frame_equal(mc.DATALINKS.cooccurrence_spec_gcf, + datalinks.cooccurrence_spec_gcf) + + assert_frame_equal(mc.LINKFINDER.raw_score_spec_gcf, + linkfinder.raw_score_spec_gcf) + assert_frame_equal(mc.LINKFINDER.raw_score_mf_gcf, + linkfinder.raw_score_mf_gcf) + assert_array_equal(mc.LINKFINDER.metcalf_mean, linkfinder.metcalf_mean) + assert_array_equal(mc.LINKFINDER.metcalf_std, linkfinder.metcalf_std) + + +def test_setup_load_cache(mc, npl, datalinks, linkfinder, caplog): + """Test `setup` method when cache file exists.""" + mc.setup(npl) + assert "MetcalfScoring.setup loading cached data" in caplog.text + assert "MetcalfScoring.setup caching results" not in 
caplog.text + + assert isinstance(mc.DATALINKS, DataLinks) + assert isinstance(mc.LINKFINDER, LinkFinder) + + assert_frame_equal(mc.DATALINKS.occurrence_gcf_strain, + datalinks.occurrence_gcf_strain) + assert_frame_equal(mc.DATALINKS.cooccurrence_spec_gcf, + datalinks.cooccurrence_spec_gcf) + + assert_frame_equal(mc.LINKFINDER.raw_score_spec_gcf, + linkfinder.raw_score_spec_gcf) + assert_frame_equal(mc.LINKFINDER.raw_score_mf_gcf, + linkfinder.raw_score_mf_gcf) + assert_array_equal(mc.LINKFINDER.metcalf_mean, linkfinder.metcalf_mean) + assert_array_equal(mc.LINKFINDER.metcalf_std, linkfinder.metcalf_std) + + +def test_get_links_gcf_standardised_false(mc, gcfs, spectra, mfs): + """Test `get_links` method when input is GCF objects and `standardised` is False.""" + # test raw scores (no standardisation) + mc.standardised = False + + # when cutoff is negative infinity, i.e. taking all scores + mc.cutoff = np.NINF + links = mc.get_links(*gcfs, link_collection=LinkCollection()) + assert isinstance(links, LinkCollection) + links = links.links # dict of link values + assert len(links) == 3 + assert {i.gcf_id for i in links.keys()} == {'gcf1', 'gcf2', 'gcf3'} + assert isinstance(links[gcfs[0]][spectra[0]], ObjectLink) + # check actual values in `test_get_links_gcf` of test_link_finder.py + assert links[gcfs[0]][spectra[0]].data(mc) == 12 + assert links[gcfs[1]][spectra[0]].data(mc) == -9 + assert links[gcfs[2]][spectra[0]].data(mc) == 11 + assert links[gcfs[0]][mfs[0]].data(mc) == 12 + assert links[gcfs[1]][mfs[1]].data(mc) == 12 + assert links[gcfs[2]][mfs[2]].data(mc) == 21 + + # when test cutoff is 0, i.e. 
taking scores >= 0 + mc.cutoff = 0 + links = mc.get_links(*gcfs, link_collection=LinkCollection()) + assert isinstance(links, LinkCollection) + links = links.links + assert {i.gcf_id for i in links.keys()} == {'gcf1', 'gcf2', 'gcf3'} + assert isinstance(links[gcfs[0]][spectra[0]], ObjectLink) + assert links[gcfs[0]][spectra[0]].data(mc) == 12 + assert links[gcfs[1]].get(spectra[0]) is None + assert links[gcfs[2]][spectra[0]].data(mc) == 11 + assert links[gcfs[0]][mfs[0]].data(mc) == 12 + assert links[gcfs[1]][mfs[1]].data(mc) == 12 + assert links[gcfs[2]][mfs[2]].data(mc) == 21 + + +@pytest.mark.skip(reason='To add after refactoring relevant code.') +def test_get_links_gcf_standardised_true(mc, gcfs, spectra, mfs): + """Test `get_links` method when input is GCF objects and `standardised` is True.""" + mc.standardised = True + ... + + +def test_get_links_spec_standardised_false(mc, gcfs, spectra): + """Test `get_links` method when input is Spectrum objects and `standardised` is False.""" + mc.standardised = False + + mc.cutoff = np.NINF + links = mc.get_links(*spectra, link_collection=LinkCollection()) + assert isinstance(links, LinkCollection) + links = links.links # dict of link values + assert len(links) == 3 + assert {i.spectrum_id + for i in links.keys()} == {'spectrum1', 'spectrum2', 'spectrum3'} + assert isinstance(links[spectra[0]][gcfs[0]], ObjectLink) + assert links[spectra[0]][gcfs[0]].data(mc) == 12 + assert links[spectra[0]][gcfs[1]].data(mc) == -9 + assert links[spectra[0]][gcfs[2]].data(mc) == 11 + + mc.cutoff = 0 + links = mc.get_links(*spectra, link_collection=LinkCollection()) + assert isinstance(links, LinkCollection) + links = links.links # dict of link values + assert len(links) == 3 + assert {i.spectrum_id + for i in links.keys()} == {'spectrum1', 'spectrum2', 'spectrum3'} + assert isinstance(links[spectra[0]][gcfs[0]], ObjectLink) + assert links[spectra[0]][gcfs[0]].data(mc) == 12 + assert links[spectra[0]].get(gcfs[1]) is None + assert 
links[spectra[0]][gcfs[2]].data(mc) == 11 + + +@pytest.mark.skip(reason='To add after refactoring relevant code.') +def test_get_links_spec_standardised_true(mc, gcfs, spectra): + """Test `get_links` method when input is Spectrum objects and `standardised` is True.""" + mc.standardised = True + ... + + +def test_get_links_mf_standardised_false(mc, gcfs, mfs): + """Test `get_links` method when input is MolecularFamily objects and `standardised` is False.""" + mc.standardised = False + + mc.cutoff = np.NINF + links = mc.get_links(*mfs, link_collection=LinkCollection()) + assert isinstance(links, LinkCollection) + links = links.links + assert len(links) == 3 + assert {i.family_id for i in links.keys()} == {'mf1', 'mf2', 'mf3'} + assert isinstance(links[mfs[0]][gcfs[0]], ObjectLink) + assert links[mfs[0]][gcfs[0]].data(mc) == 12 + assert links[mfs[0]][gcfs[1]].data(mc) == -9 + assert links[mfs[0]][gcfs[2]].data(mc) == 11 + + mc.cutoff = 0 + links = mc.get_links(*mfs, link_collection=LinkCollection()) + assert isinstance(links, LinkCollection) + links = links.links + assert len(links) == 3 + assert {i.family_id for i in links.keys()} == {'mf1', 'mf2', 'mf3'} + assert isinstance(links[mfs[0]][gcfs[0]], ObjectLink) + assert links[mfs[0]][gcfs[0]].data(mc) == 12 + assert links[mfs[0]].get(gcfs[1]) is None + assert links[mfs[0]][gcfs[2]].data(mc) == 11 + + +@pytest.mark.skip(reason='To add after refactoring relevant code.') +def test_get_links_mf_standardised_true(mc, gcfs, mfs): + """Test `get_links` method when input is MolecularFamily objects and `standardised` is True.""" + mc.standardised = True + ... 
+ + +def test_get_links_invalid_input(mc): + """Test `get_links` method when input is invalid.""" + with pytest.raises(TypeError) as e: + mc.get_links("", link_collection=LinkCollection()) + assert "Invalid type {}" in str(e.value) + + +def test_get_links_no_linkfinder(npl, gcfs): + """Test `get_links` method when no LinkFinder object is found.""" + mc = MetcalfScoring(npl) + mc.LINKFINDER = None + with pytest.raises(ValueError) as e: + mc.get_links(*gcfs, link_collection=LinkCollection()) + assert "LinkFinder object not found." in str(e.value) From 79082a6413be03a8cfe9b5ee45feebcda9fcb8c3 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Thu, 11 May 2023 16:49:07 +0200 Subject: [PATCH 72/95] add docstrings and type hints to `MetcalfScoring` class --- src/nplinker/scoring/metcalf_scoring.py | 63 +++++++++++++++++++++---- 1 file changed, 55 insertions(+), 8 deletions(-) diff --git a/src/nplinker/scoring/metcalf_scoring.py b/src/nplinker/scoring/metcalf_scoring.py index bff24861..5f9d61c8 100644 --- a/src/nplinker/scoring/metcalf_scoring.py +++ b/src/nplinker/scoring/metcalf_scoring.py @@ -15,19 +15,41 @@ from .methods import ScoringMethod from .object_link import ObjectLink + if TYPE_CHECKING: from . import LinkCollection + from ..nplinker import NPLinker logger = LogConfig.getLogger(__name__) class MetcalfScoring(ScoringMethod): + """Metcalf scoring method. + + Attributes: + DATALINKS (DataLinks): The DataLinks object to use for scoring. + LINKFINDER (LinkFinder): The LinkFinder object to use for scoring. + NAME (str): The name of the scoring method. This is set to 'metcalf'. + """ DATALINKS = None LINKFINDER = None NAME = 'metcalf' - def __init__(self, npl): + def __init__(self, npl: NPLinker) -> None: + """Create a MetcalfScoring object. + + Args: + npl (NPLinker): The NPLinker object to use for scoring. + + Attributes: + cutoff (float): The cutoff value to use for scoring. Scores below + this value will be discarded. Defaults to 1.0. 
+ standardised (bool): Whether to use standardised scores. Defaults + to True. + name (str): The name of the scoring method. It's set to a fixed value + 'metcalf'. + """ super().__init__(npl) self.cutoff = 1.0 self.standardised = True @@ -35,7 +57,11 @@ def __init__(self, npl): # TODO CG: not sure why using staticmethod here. Check later and refactor if possible # TODO CG: refactor this method and extract code for cache file to a separate method @staticmethod - def setup(npl): + def setup(npl: NPLinker): + """Setup the MetcalfScoring object. + + DataLinks and LinkFinder objects are created and cached for later use. + """ logger.info( 'MetcalfScoring.setup (bgcs={}, gcfs={}, spectra={}, molfams={}, strains={})' .format(len(npl.bgcs), len(npl.gcfs), len(npl.spectra), @@ -92,10 +118,30 @@ def setup(npl): # TODO CG: is it needed? remove it if not @property - def datalinks(self): + def datalinks(self) -> DataLinks: return MetcalfScoring.DATALINKS - def get_links(self, *objects, link_collection) -> LinkCollection: + def get_links(self, *objects: tuple[GCF, ...] | tuple[Spectrum, ...] + | tuple[MolecularFamily, ...], + link_collection: LinkCollection) -> LinkCollection: + """Get links for the given objects and add them to the given LinkCollection. + + The given objects are treated as input or source objects, which must + be GCF, Spectrum or MolecularFamily objects. + + Args: + objects(tuple): The objects to get links for. Must be GCF, Spectrum + or MolecularFamily objects. + link_collection: The LinkCollection object to add the links to. + + Returns: + LinkCollection: The LinkCollection object with the new links added. + + Raises: + TypeError: If the input objects are not of the correct type. + ValueError: If LinkFinder instance has not been created + (MetcalfScoring object has not been setup). 
+ """ if self._isinstance(GCF, *objects): obj_type = 'gcf' elif self._isinstance(Spectrum, *objects): @@ -191,10 +237,11 @@ def get_links(self, *objects, link_collection) -> LinkCollection: return link_collection def _isinstance(self, _type, *objects) -> bool: + """Check if all objects are of the given type.""" return all(isinstance(x, _type) for x in objects) - def _cal_standardised_score_met(self, linkfinder, - results) -> list[pd.DataFrame]: + def _cal_standardised_score_met(self, linkfinder: LinkFinder, + results: list) -> list[pd.DataFrame]: logger.debug('Calculating standardised Metcalf scores (met input)') raw_score = results[0] z_scores = [] @@ -225,8 +272,8 @@ def _cal_standardised_score_met(self, linkfinder, return [scores_df] - def _cal_standardised_score_gen(self, linkfinder, - results) -> list[pd.DataFrame]: + def _cal_standardised_score_gen(self, linkfinder: LinkFinder, + results: list) -> list[pd.DataFrame]: logger.debug('Calculating standardised Metcalf scores (gen input)') postprocessed_scores = [] for raw_score in results: From 15a0bd1f69048a5fdfba6c478290fe67ecdacb0a Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Thu, 11 May 2023 17:13:05 +0200 Subject: [PATCH 73/95] add util func `isinstance_all` --- src/nplinker/scoring/linking/__init__.py | 8 ++++++-- src/nplinker/scoring/linking/utils.py | 4 ++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src/nplinker/scoring/linking/__init__.py b/src/nplinker/scoring/linking/__init__.py index 5bf1db00..f33d043b 100644 --- a/src/nplinker/scoring/linking/__init__.py +++ b/src/nplinker/scoring/linking/__init__.py @@ -1,10 +1,14 @@ import logging from .data_links import DataLinks from .data_links import LINK_TYPES -from .utils import calc_correlation_matrix from .link_finder import LinkFinder +from .utils import calc_correlation_matrix +from .utils import isinstance_all logging.getLogger(__name__).addHandler(logging.NullHandler()) -__all__ = ["DataLinks", "LINK_TYPES", 
"calc_correlation_matrix", "LinkFinder"] +__all__ = [ + "DataLinks", "LINK_TYPES", "LinkFinder", "calc_correlation_matrix", + "isinstance_all" +] diff --git a/src/nplinker/scoring/linking/utils.py b/src/nplinker/scoring/linking/utils.py index 20824c62..044627ad 100644 --- a/src/nplinker/scoring/linking/utils.py +++ b/src/nplinker/scoring/linking/utils.py @@ -18,6 +18,10 @@ import numpy as np +def isinstance_all(*objects, objtype) -> bool: + """Check if all objects are of the given type.""" + return all(isinstance(x, objtype) for x in objects) + def calc_correlation_matrix(M_type1_cond, M_type2_cond): """ Calculate correlation matrices from co-occurence matrices From f0f570a00d127ab717a9eeeb5d9347dbe5d87de2 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Thu, 11 May 2023 17:17:25 +0200 Subject: [PATCH 74/95] replace `_isinstance` with util func `isinstance_all` --- src/nplinker/scoring/linking/link_finder.py | 10 ++++------ src/nplinker/scoring/metcalf_scoring.py | 11 ++++------- 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/src/nplinker/scoring/linking/link_finder.py b/src/nplinker/scoring/linking/link_finder.py index 38559e2d..10ffa811 100644 --- a/src/nplinker/scoring/linking/link_finder.py +++ b/src/nplinker/scoring/linking/link_finder.py @@ -8,6 +8,7 @@ from nplinker.metabolomics.molecular_family import MolecularFamily from nplinker.metabolomics.spectrum import Spectrum from . import LINK_TYPES +from .utils import isinstance_all if TYPE_CHECKING: @@ -123,11 +124,11 @@ def get_links(self, Raises: TypeError: If input objects are not GCF, Spectrum or MolecularFamily objects. 
""" - if self._isinstance(GCF, *objects): + if isinstance_all(*objects, objtype=GCF): obj_type = 'gcf' - elif self._isinstance(Spectrum, *objects): + elif isinstance_all(*objects, objtype=Spectrum): obj_type = 'spec' - elif self._isinstance(MolecularFamily, *objects): + elif isinstance_all(*objects, objtype=MolecularFamily): obj_type = 'mf' else: types = [type(i) for i in objects] @@ -164,9 +165,6 @@ def get_links(self, links.append(df) return links - def _isinstance(self, _type, *objects) -> bool: - return all(isinstance(x, _type) for x in objects) - # TODO CG: the returned data could be changed to dict, like the that in # the get_links method of the MetcalfScoring class def _get_scores_source_gcf(self, scores, score_cutoff) -> pd.DataFrame: diff --git a/src/nplinker/scoring/metcalf_scoring.py b/src/nplinker/scoring/metcalf_scoring.py index 5f9d61c8..8dfd8432 100644 --- a/src/nplinker/scoring/metcalf_scoring.py +++ b/src/nplinker/scoring/metcalf_scoring.py @@ -10,6 +10,7 @@ from nplinker.pickler import load_pickled_data from nplinker.pickler import save_pickled_data from .linking import DataLinks +from .linking import isinstance_all from .linking import LINK_TYPES from .linking import LinkFinder from .methods import ScoringMethod @@ -142,11 +143,11 @@ def get_links(self, *objects: tuple[GCF, ...] | tuple[Spectrum, ...] ValueError: If LinkFinder instance has not been created (MetcalfScoring object has not been setup). """ - if self._isinstance(GCF, *objects): + if isinstance_all(*objects, objtype=GCF): obj_type = 'gcf' - elif self._isinstance(Spectrum, *objects): + elif isinstance_all(*objects, objtype=Spectrum): obj_type = 'spec' - elif self._isinstance(MolecularFamily, *objects): + elif isinstance_all(*objects, objtype=MolecularFamily): obj_type = 'mf' else: types = [type(i) for i in objects] @@ -236,10 +237,6 @@ def get_links(self, *objects: tuple[GCF, ...] | tuple[Spectrum, ...] 
logger.debug('MetcalfScoring: completed') return link_collection - def _isinstance(self, _type, *objects) -> bool: - """Check if all objects are of the given type.""" - return all(isinstance(x, _type) for x in objects) - def _cal_standardised_score_met(self, linkfinder: LinkFinder, results: list) -> list[pd.DataFrame]: logger.debug('Calculating standardised Metcalf scores (met input)') From 9f4bd2bc477d5797fb78124e2a107f811802eedd Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Thu, 11 May 2023 18:27:54 +0200 Subject: [PATCH 75/95] update validation of args for `DataLinks` --- src/nplinker/scoring/linking/data_links.py | 23 ++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/src/nplinker/scoring/linking/data_links.py b/src/nplinker/scoring/linking/data_links.py index 7d275335..1cde6883 100644 --- a/src/nplinker/scoring/linking/data_links.py +++ b/src/nplinker/scoring/linking/data_links.py @@ -8,6 +8,7 @@ from nplinker.metabolomics.singleton_family import SingletonFamily from nplinker.metabolomics.spectrum import Spectrum from .utils import calc_correlation_matrix +from .utils import isinstance_all if TYPE_CHECKING: @@ -96,7 +97,8 @@ def get_common_strains( Args: spectra_or_mfs(Sequence[Spectrum] | Sequence[MolecularFamily]): - A list of Spectrum or MolecularFamily objects. + A list of Spectrum or MolecularFamily objects and all objects + must be of the same type. gcfs(Sequence[GCF]): A list of GCF objects. filter_no_shared(bool): If True, the pairs of spectrum/mf and GCF without common strains will be removed from the returned dict; @@ -104,13 +106,22 @@ def get_common_strains( Returns: dict: A dict where the keys are tuples of (Spectrum/MolecularFamily, GCF) and values are a list of shared Strain objects. + + Raises: + ValueError: If the first argument is not a list of Spectrum or + MolecularFamily objects, or the second argument is not a list of + GCF objects. 
""" - if not isinstance(spectra_or_mfs[0], (Spectrum, MolecularFamily)): + # Check input arguments + if len(spectra_or_mfs) == 0 or len(gcfs) == 0: + raise ValueError('Empty list for first or second argument.') + if not isinstance_all(*spectra_or_mfs, + objtype=Spectrum) and not isinstance_all( + *spectra_or_mfs, objtype=MolecularFamily): raise ValueError( - 'Must provide Spectra or MolecularFamilies as the first argument!' - ) - if not isinstance(gcfs[0], GCF): - raise ValueError('Must provide GCFs as the second argument!') + 'First argument must be Spectrum or MolecularFamily objects.') + if not isinstance_all(*gcfs, objtype=GCF): + raise ValueError('Second argument must be GCF objects.') # Assume that 3 occurrence dataframes have same df.columns (strain ids) strain_ids = self.occurrence_gcf_strain.columns From 1bd05a86673e088b0b1a88f73988f198f197a07f Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Thu, 11 May 2023 18:29:54 +0200 Subject: [PATCH 76/95] Update test_data_links.py - add docstrings - add more tests --- tests/scoring/test_data_links.py | 70 ++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/tests/scoring/test_data_links.py b/tests/scoring/test_data_links.py index 94d1d24c..c89bc5cc 100644 --- a/tests/scoring/test_data_links.py +++ b/tests/scoring/test_data_links.py @@ -1,6 +1,8 @@ import pandas as pd from pandas.util.testing import assert_frame_equal +import pytest from pytest import fixture +from nplinker.metabolomics.singleton_family import SingletonFamily from nplinker.scoring.linking import DataLinks @@ -10,6 +12,11 @@ def datalinks(gcfs, spectra, mfs, strains): def test_init(datalinks): + """Test that the DataLinks object is initialised correctly. + + Multiple private methods are called in the init method, so we test + that the correct dataframes are created. 
+ """ # test occorrences col_names = ['strain1', 'strain2', 'strain3'] assert_frame_equal( @@ -73,6 +80,7 @@ def test_init(datalinks): def test_get_common_strains_spec(datalinks, spectra, gcfs, strains_list): + """Test get_common_strains method for input spectra and gcfs.""" sut = datalinks.get_common_strains(spectra[:2], gcfs) expected = { (spectra[0], gcfs[0]): [strains_list[0]], @@ -97,6 +105,7 @@ def test_get_common_strains_spec(datalinks, spectra, gcfs, strains_list): def test_get_common_strains_mf(datalinks, mfs, gcfs, strains_list): + """Test get_common_strains method for input mfs and gcfs.""" sut = datalinks.get_common_strains(mfs[:2], gcfs) expected = { (mfs[0], gcfs[0]): [strains_list[0]], @@ -116,3 +125,64 @@ def test_get_common_strains_mf(datalinks, mfs, gcfs, strains_list): (mfs[1], gcfs[2]): [strains_list[1]] } assert sut == expected + + +def test_get_common_strains_sf(datalinks, mfs, gcfs, strains_list): + """Test get_common_strains method for input SingletonFamily.""" + smf = SingletonFamily() + + sut = datalinks.get_common_strains([smf], gcfs) + assert sut == {} + + # the expected are same as `test_get_common_strains_mf` + mfs_mix = (*mfs[:2], smf) + sut = datalinks.get_common_strains(mfs_mix, gcfs) + expected = { + (mfs[0], gcfs[0]): [strains_list[0]], + (mfs[0], gcfs[1]): [], + (mfs[0], gcfs[2]): [strains_list[0]], + (mfs[1], gcfs[0]): [], + (mfs[1], gcfs[1]): [strains_list[1]], + (mfs[1], gcfs[2]): [strains_list[1]] + } + assert sut == expected + + sut = datalinks.get_common_strains(mfs_mix, gcfs, filter_no_shared=True) + expected = { + (mfs[0], gcfs[0]): [strains_list[0]], + (mfs[0], gcfs[2]): [strains_list[0]], + (mfs[1], gcfs[1]): [strains_list[1]], + (mfs[1], gcfs[2]): [strains_list[1]] + } + assert sut == expected + + +@pytest.mark.parametrize( + "first_arg, expected", + [([], "Empty list for first or second argument."), + ((), "Empty list for first or second argument."), + ([1], "First argument must be Spectrum or MolecularFamily 
objects.")]) +def test_get_common_strains_exception_first_arg(datalinks, spectra, mfs, gcfs, + first_arg, expected): + """Test get_common_strains method for invalid 1st arugment.""" + with pytest.raises(ValueError) as e: + datalinks.get_common_strains(first_arg, gcfs) + assert expected in str(e.value) + + # mixed input + with pytest.raises(ValueError) as e: + datalinks.get_common_strains(spectra + mfs, gcfs) + assert "First argument must be Spectrum or MolecularFamily objects." in str( + e.value) + + +@pytest.mark.parametrize("second_arg, expected", + [([], "Empty list for first or second argument."), + ((), "Empty list for first or second argument."), + ([1], "Second argument must be GCF objects.")]) +def test_get_common_strains_exception_second_arg(datalinks, spectra, + second_arg, expected): + """Test get_common_strains method for invalid 2nd argument.""" + with pytest.raises(ValueError) as e: + datalinks.get_common_strains(spectra, second_arg) + assert expected in str(e.value) From a19f43f99aaddb8ba97f77f06e9a5c117e28f50e Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Thu, 11 May 2023 18:39:51 +0200 Subject: [PATCH 77/95] add type hints for returned values to unit tests --- tests/scoring/conftest.py | 13 ++++++++----- tests/scoring/test_data_links.py | 2 +- tests/scoring/test_link_finder.py | 2 +- tests/scoring/test_metcalf_scoring.py | 8 ++++---- 4 files changed, 14 insertions(+), 11 deletions(-) diff --git a/tests/scoring/conftest.py b/tests/scoring/conftest.py index 6a15f05f..73606506 100644 --- a/tests/scoring/conftest.py +++ b/tests/scoring/conftest.py @@ -7,12 +7,12 @@ @fixture(scope='session') -def strains_list(): +def strains_list() -> tuple[Strain, Strain, Strain]: return Strain('strain1'), Strain('strain2'), Strain('strain3') @fixture(scope='session') -def strains(strains_list): +def strains(strains_list) -> StrainCollection: strains = StrainCollection() for strain in strains_list: strains.add(strain) @@ -20,7 +20,7 @@ def strains(strains_list): 
@fixture(scope='session') -def gcfs(strains_list): +def gcfs(strains_list) -> tuple[GCF, GCF, GCF]: gcf1 = GCF('gcf1') gcf1.strains.add(strains_list[0]) gcf2 = GCF('gcf2') @@ -32,7 +32,7 @@ def gcfs(strains_list): @fixture(scope='session') -def spectra(strains_list): +def spectra(strains_list) -> tuple[Spectrum, Spectrum, Spectrum]: spectrum1 = Spectrum(1, [(1, 1)], "spectrum1", None) spectrum1.strains.add(strains_list[0]) spectrum2 = Spectrum(2, [(1, 1)], "spectrum2", None) @@ -44,7 +44,10 @@ def spectra(strains_list): @fixture(scope='session') -def mfs(spectra): +def mfs(spectra) -> tuple[MolecularFamily, MolecularFamily, MolecularFamily]: + """For simplicity, we just use one Spectrum object for each MolecularFamily + object, and notice that they are not SingletonFamily object. + """ mf1 = MolecularFamily('mf1') mf1.add_spectrum(spectra[0]) mf2 = MolecularFamily('mf2') diff --git a/tests/scoring/test_data_links.py b/tests/scoring/test_data_links.py index c89bc5cc..5921c61b 100644 --- a/tests/scoring/test_data_links.py +++ b/tests/scoring/test_data_links.py @@ -7,7 +7,7 @@ @fixture(scope='module') -def datalinks(gcfs, spectra, mfs, strains): +def datalinks(gcfs, spectra, mfs, strains) -> DataLinks: return DataLinks(gcfs, spectra, mfs, strains) diff --git a/tests/scoring/test_link_finder.py b/tests/scoring/test_link_finder.py index 871f4e90..faab635e 100644 --- a/tests/scoring/test_link_finder.py +++ b/tests/scoring/test_link_finder.py @@ -8,7 +8,7 @@ @fixture(scope='module') -def linkfinder(): +def linkfinder() -> LinkFinder: return LinkFinder() diff --git a/tests/scoring/test_metcalf_scoring.py b/tests/scoring/test_metcalf_scoring.py index 8bb715fc..34b59d58 100644 --- a/tests/scoring/test_metcalf_scoring.py +++ b/tests/scoring/test_metcalf_scoring.py @@ -13,13 +13,13 @@ @fixture(scope='module') -def datalinks(gcfs, spectra, mfs, strains): +def datalinks(gcfs, spectra, mfs, strains) -> DataLinks: """DataLinks object. 
See `test_data_links.py` for its actual values.""" return DataLinks(gcfs, spectra, mfs, strains) @fixture(scope='module') -def linkfinder(datalinks): +def linkfinder(datalinks) -> LinkFinder: """LinkFinder object. See `test_link_finder.py` for its actual values.""" linkfinder = LinkFinder() linkfinder.cal_score(datalinks, link_type='spec-gcf') @@ -28,7 +28,7 @@ def linkfinder(datalinks): @fixture(scope='module') -def npl(gcfs, spectra, mfs, strains, tmp_path_factory): +def npl(gcfs, spectra, mfs, strains, tmp_path_factory) -> NPLinker: """Constructed NPLinker object. This NPLinker object does not do loading `npl.load_data()`, instead we @@ -52,7 +52,7 @@ def npl(gcfs, spectra, mfs, strains, tmp_path_factory): @fixture(scope='module') -def mc(npl): +def mc(npl) -> MetcalfScoring: """MetcalfScoring object.""" mc = MetcalfScoring(npl) mc.setup(npl) From 6fc0ef5c48856e6e77adbbd71690d77adaf75789 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Fri, 12 May 2023 11:31:56 +0200 Subject: [PATCH 78/95] update exception types for invalid input --- src/nplinker/scoring/linking/data_links.py | 10 +++--- src/nplinker/scoring/linking/link_finder.py | 4 +++ src/nplinker/scoring/metcalf_scoring.py | 4 +++ tests/scoring/test_data_links.py | 34 +++++++++++++-------- tests/scoring/test_metcalf_scoring.py | 34 ++++++++++++++++----- 5 files changed, 61 insertions(+), 25 deletions(-) diff --git a/src/nplinker/scoring/linking/data_links.py b/src/nplinker/scoring/linking/data_links.py index 1cde6883..733e53f9 100644 --- a/src/nplinker/scoring/linking/data_links.py +++ b/src/nplinker/scoring/linking/data_links.py @@ -108,9 +108,9 @@ def get_common_strains( and values are a list of shared Strain objects. Raises: - ValueError: If the first argument is not a list of Spectrum or - MolecularFamily objects, or the second argument is not a list of - GCF objects. + ValueError: If given `spectra_or_mfs` or `gcfs` is empty. 
+ TypeError: If given `spectra_or_mfs` or `gcfs` is not a list of + Spectrum/MolecularFamily or GCF objects, respectively. """ # Check input arguments if len(spectra_or_mfs) == 0 or len(gcfs) == 0: @@ -118,10 +118,10 @@ def get_common_strains( if not isinstance_all(*spectra_or_mfs, objtype=Spectrum) and not isinstance_all( *spectra_or_mfs, objtype=MolecularFamily): - raise ValueError( + raise TypeError( 'First argument must be Spectrum or MolecularFamily objects.') if not isinstance_all(*gcfs, objtype=GCF): - raise ValueError('Second argument must be GCF objects.') + raise TypeError('Second argument must be GCF objects.') # Assume that 3 occurrence dataframes have same df.columns (strain ids) strain_ids = self.occurrence_gcf_strain.columns diff --git a/src/nplinker/scoring/linking/link_finder.py b/src/nplinker/scoring/linking/link_finder.py index 10ffa811..ddef34aa 100644 --- a/src/nplinker/scoring/linking/link_finder.py +++ b/src/nplinker/scoring/linking/link_finder.py @@ -122,8 +122,12 @@ def get_links(self, - the third row contains the scores. Raises: + ValueError: If input objects are empty. TypeError: If input objects are not GCF, Spectrum or MolecularFamily objects. """ + if len(objects) == 0: + raise ValueError('Empty input objects.') + if isinstance_all(*objects, objtype=GCF): obj_type = 'gcf' elif isinstance_all(*objects, objtype=Spectrum): diff --git a/src/nplinker/scoring/metcalf_scoring.py b/src/nplinker/scoring/metcalf_scoring.py index 8dfd8432..586f1595 100644 --- a/src/nplinker/scoring/metcalf_scoring.py +++ b/src/nplinker/scoring/metcalf_scoring.py @@ -139,10 +139,14 @@ def get_links(self, *objects: tuple[GCF, ...] | tuple[Spectrum, ...] LinkCollection: The LinkCollection object with the new links added. Raises: + ValueError: If the input objects are empty. TypeError: If the input objects are not of the correct type. ValueError: If LinkFinder instance has not been created (MetcalfScoring object has not been setup). 
""" + if len(objects) == 0: + raise ValueError('Empty input objects.') + if isinstance_all(*objects, objtype=GCF): obj_type = 'gcf' elif isinstance_all(*objects, objtype=Spectrum): diff --git a/tests/scoring/test_data_links.py b/tests/scoring/test_data_links.py index 5921c61b..7872266a 100644 --- a/tests/scoring/test_data_links.py +++ b/tests/scoring/test_data_links.py @@ -157,32 +157,40 @@ def test_get_common_strains_sf(datalinks, mfs, gcfs, strains_list): assert sut == expected +def test_get_common_strains_invalid_value(datalinks, spectra, gcfs): + """Test get_common_strains method for empty arguments.""" + with pytest.raises(ValueError) as e: + datalinks.get_common_strains([], gcfs) + assert "Empty list for first or second argument." in str(e.value) + with pytest.raises(ValueError) as e: + datalinks.get_common_strains(spectra, []) + assert "Empty list for first or second argument." in str(e.value) + + @pytest.mark.parametrize( "first_arg, expected", - [([], "Empty list for first or second argument."), - ((), "Empty list for first or second argument."), - ([1], "First argument must be Spectrum or MolecularFamily objects.")]) -def test_get_common_strains_exception_first_arg(datalinks, spectra, mfs, gcfs, - first_arg, expected): + [([1], "First argument must be Spectrum or MolecularFamily objects."), + ([1, 2], "First argument must be Spectrum or MolecularFamily objects."), + ("12", "First argument must be Spectrum or MolecularFamily objects.")]) +def test_get_common_strains_invalid_type_first_arg(datalinks, spectra, mfs, + gcfs, first_arg, expected): """Test get_common_strains method for invalid 1st arugment.""" - with pytest.raises(ValueError) as e: + with pytest.raises(TypeError) as e: datalinks.get_common_strains(first_arg, gcfs) assert expected in str(e.value) # mixed input - with pytest.raises(ValueError) as e: + with pytest.raises(TypeError) as e: datalinks.get_common_strains(spectra + mfs, gcfs) assert "First argument must be Spectrum or MolecularFamily 
objects." in str( e.value) @pytest.mark.parametrize("second_arg, expected", - [([], "Empty list for first or second argument."), - ((), "Empty list for first or second argument."), - ([1], "Second argument must be GCF objects.")]) -def test_get_common_strains_exception_second_arg(datalinks, spectra, - second_arg, expected): + [([1], "Second argument must be GCF objects.")]) +def test_get_common_strains_invalid_type_second_arg(datalinks, spectra, + second_arg, expected): """Test get_common_strains method for invalid 2nd argument.""" - with pytest.raises(ValueError) as e: + with pytest.raises(TypeError) as e: datalinks.get_common_strains(spectra, second_arg) assert expected in str(e.value) diff --git a/tests/scoring/test_metcalf_scoring.py b/tests/scoring/test_metcalf_scoring.py index 34b59d58..e9398e5e 100644 --- a/tests/scoring/test_metcalf_scoring.py +++ b/tests/scoring/test_metcalf_scoring.py @@ -14,13 +14,13 @@ @fixture(scope='module') def datalinks(gcfs, spectra, mfs, strains) -> DataLinks: - """DataLinks object. See `test_data_links.py` for its actual values.""" + """DataLinks object. See `test_data_links.py` for its values.""" return DataLinks(gcfs, spectra, mfs, strains) @fixture(scope='module') def linkfinder(datalinks) -> LinkFinder: - """LinkFinder object. See `test_link_finder.py` for its actual values.""" + """LinkFinder object. 
See `test_link_finder.py` for its values.""" linkfinder = LinkFinder() linkfinder.cal_score(datalinks, link_type='spec-gcf') linkfinder.cal_score(datalinks, link_type='mf-gcf') @@ -122,7 +122,7 @@ def test_get_links_gcf_standardised_false(mc, gcfs, spectra, mfs): assert len(links) == 3 assert {i.gcf_id for i in links.keys()} == {'gcf1', 'gcf2', 'gcf3'} assert isinstance(links[gcfs[0]][spectra[0]], ObjectLink) - # check actual values in `test_get_links_gcf` of test_link_finder.py + # expected values are from `test_get_links_gcf` of test_link_finder.py assert links[gcfs[0]][spectra[0]].data(mc) == 12 assert links[gcfs[1]][spectra[0]].data(mc) == -9 assert links[gcfs[2]][spectra[0]].data(mc) == 11 @@ -222,11 +222,31 @@ def test_get_links_mf_standardised_true(mc, gcfs, mfs): ... -def test_get_links_invalid_input(mc): - """Test `get_links` method when input is invalid.""" +@pytest.mark.parametrize("objects, expected", [([], "Empty input objects"), + ("", "Empty input objects")]) +def test_get_links_invalid_input_value(mc, objects, expected): + with pytest.raises(ValueError) as e: + mc.get_links(*objects, link_collection=LinkCollection()) + assert expected in str(e.value) + + +@pytest.mark.parametrize("objects, expected", + [([1], "Invalid type {}"), + ([1, 2], "Invalid type {}"), + ("12", "Invalid type {}")]) +def test_get_links_invalid_input_type(mc, objects, expected): + with pytest.raises(TypeError) as e: + mc.get_links(*objects, link_collection=LinkCollection()) + assert expected in str(e.value) + + +def test_get_links_invalid_mixed_types(mc, spectra, mfs): + objects = (*spectra, *mfs) with pytest.raises(TypeError) as e: - mc.get_links("", link_collection=LinkCollection()) - assert "Invalid type {}" in str(e.value) + mc.get_links(*objects, link_collection=LinkCollection()) + assert "Invalid type" in str(e.value) + assert ".MolecularFamily" in str(e.value) + assert ".Spectrum" in str(e.value) def test_get_links_no_linkfinder(npl, gcfs): From 
91bcff79212f2128fb04f7063ad8476114171081 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Fri, 12 May 2023 11:54:05 +0200 Subject: [PATCH 79/95] add docstrings and type hints to `LinkFinder` class --- src/nplinker/scoring/linking/link_finder.py | 64 ++++++++++++--------- 1 file changed, 38 insertions(+), 26 deletions(-) diff --git a/src/nplinker/scoring/linking/link_finder.py b/src/nplinker/scoring/linking/link_finder.py index ddef34aa..39bbb555 100644 --- a/src/nplinker/scoring/linking/link_finder.py +++ b/src/nplinker/scoring/linking/link_finder.py @@ -17,37 +17,41 @@ logger = LogConfig.getLogger(__file__) -# TODO CG: this class could be merged to MetcalfScoring class +# TODO CG: this class could be merged to MetcalfScoring class? class LinkFinder(): - def __init__(self): + def __init__(self) -> None: + """Initialise LinkFinder object. + + Attributes: + raw_score_spec_gcf (pd.DataFrame): The raw Metcalf scores for + spectrum-GCF links. + raw_score_mf_gcf (pd.DataFrame): The raw Metcalf scores for + molecular family-GCF links. + metcalf_mean (np.ndarray | None): The mean value used for + standardising Metcalf scores. + metcalf_std (np.ndarray | None): The standard deviation value used + for standardising Metcalf scores. """ - Create tables of prospective link candidates. - Separate tables will exist for different linking scenarios, such as - gcfs <-> spectra OR gcf <-> mol.families - """ - # metcalf scores self.raw_score_spec_gcf = pd.DataFrame() self.raw_score_mf_gcf = pd.DataFrame() - - # metcalf caching self.metcalf_mean = None self.metcalf_std = None - # TODO CG: cal_score method could be integrated to __init__ + # TODO CG: cal_score method could be integrated to __init__? def cal_score( self, data_links: DataLinks, link_type: str = 'spec-gcf', scoring_weights: tuple[int, int, int, int] = (10, -10, 0, 1) ) -> None: - """Calculate metcalf scores. + """Calculate Metcalf scores. Args: - data_links (DataLinks): The DataLinks object to use for scoring. 
- link_type (str, optional): The type of link to score. Available - types are 'spec-gcf' and 'mf-gcf'. Defaults to 'spec-gcf'. - scoring_weights (tuple[int,int,int,int], optional): The weights to + data_links(DataLinks): The DataLinks object to use for scoring. + link_type(str): The type of link to score. Must be 'spec-gcf' or + 'mf-gcf'. Defaults to 'spec-gcf'. + scoring_weights(tuple[int,int,int,int]): The weights to use for Metcalf scoring. The weights are applied to '(met_gcf, met_not_gcf, gcf_not_met, not_met_not_gcf)'. Defaults to (10, -10, 0, 1). @@ -72,13 +76,16 @@ def cal_score( data_links.cooccurrence_notmf_gcf * scoring_weights[2] + data_links.cooccurrence_notmf_notgcf * scoring_weights[3]) + # TODO CG: this part should be moved outside of this method n_strains = data_links.occurrence_gcf_strain.shape[1] if self.metcalf_mean is None or self.metcalf_std is None: self.metcalf_mean, self.metcalf_std = self._cal_mean_std( n_strains, scoring_weights) # TODO CG: read paper and check the logics of this method - def _cal_mean_std(self, n_strains, scoring_weights): + def _cal_mean_std( + self, n_strains: int, scoring_weights: tuple[int, int, int, int] + ) -> tuple[np.ndarray, np.ndarray]: sz = (n_strains + 1, n_strains + 1) mean = np.zeros(sz) variance = np.zeros(sz) @@ -104,22 +111,27 @@ def _cal_mean_std(self, n_strains, scoring_weights): variance[n, m] = expected_sq return mean, np.sqrt(variance) + # TODO CG: the data type of returned should be improved for faster + # processing. Maybe using dict instead of pd.DataFrame? + # like that in the get_links method of the MetcalfScoring class def get_links(self, *objects: tuple[GCF, ...] | tuple[Spectrum, ...] | tuple[MolecularFamily, ...], score_cutoff: float = 0.5) -> list[pd.DataFrame]: - """Get scores for links between objects. + """Get links and scores for given objects. Args: - objects(tuple): GCF, Spectrum or MolecularFamily objects. 
+ objects(tuple): A list of GCF, Spectrum or MolecularFamily objects + and all objects must be of the same type. score_cutoff(float): Minimum score to consider a link (ā‰„score_cutoff). Default is 0.5. Returns: list: List of data frames containing the ids of the linked objects - and the score. The data frame contains three rows: - - the first row contains the ids of the input/source objects, - - the second row contains the ids of the target objects, - - the third row contains the scores. + and the score. The data frame has index names of + 'source', 'target' and 'score': + - the 'source' row contains the ids of the input/source objects, + - the 'target' row contains the ids of the target objects, + - the 'score' row contains the scores. Raises: ValueError: If input objects are empty. @@ -169,9 +181,8 @@ def get_links(self, links.append(df) return links - # TODO CG: the returned data could be changed to dict, like the that in - # the get_links method of the MetcalfScoring class - def _get_scores_source_gcf(self, scores, score_cutoff) -> pd.DataFrame: + def _get_scores_source_gcf(self, scores: pd.DataFrame, + score_cutoff: float) -> pd.DataFrame: row_indexes, col_indexes = np.where(scores >= score_cutoff) src_obj_ids = scores.columns[col_indexes].to_list() target_obj_ids = scores.index[row_indexes].to_list() @@ -179,7 +190,8 @@ def _get_scores_source_gcf(self, scores, score_cutoff) -> pd.DataFrame: return pd.DataFrame([src_obj_ids, target_obj_ids, scores_candidate], index=['source', 'target', 'score']) - def _get_scores_source_met(self, scores, score_cutoff) -> pd.DataFrame: + def _get_scores_source_met(self, scores: pd.DataFrame, + score_cutoff: float) -> pd.DataFrame: row_indexes, col_indexes = np.where(scores >= score_cutoff) src_obj_ids = scores.index[row_indexes].to_list() target_obj_ids = scores.columns[col_indexes].to_list() From 989471aadc8c51f5ccfabc6dd7c9aff0b2fc10d2 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Fri, 12 May 2023 11:54:29 +0200 Subject: 
[PATCH 80/95] add more unit tests for `LinkFinder` --- tests/scoring/test_link_finder.py | 64 ++++++++++++++++++++++--------- 1 file changed, 46 insertions(+), 18 deletions(-) diff --git a/tests/scoring/test_link_finder.py b/tests/scoring/test_link_finder.py index faab635e..3aa2e4e9 100644 --- a/tests/scoring/test_link_finder.py +++ b/tests/scoring/test_link_finder.py @@ -14,6 +14,7 @@ def linkfinder() -> LinkFinder: @fixture(scope='module') def datalinks(gcfs, spectra, mfs, strains): + """DataLinks object. See `test_data_links.py` for its actual values.""" return DataLinks(gcfs, spectra, mfs, strains) @@ -24,14 +25,20 @@ def test_init(linkfinder): assert linkfinder.metcalf_std is None -def test_cal_score(linkfinder, datalinks): +def test_cal_score_raw_score(linkfinder, datalinks): + """Test `cal_score` method for `raw_score_spec_gcf` and `raw_score_mf_gcf`. + + The expected values are calculated manually by using values from `test_init` + of `test_data_links.py` and the default scoring weights. 
+ """ + # link type = 'spec-gcf' linkfinder.cal_score(datalinks, link_type='spec-gcf') assert_frame_equal( linkfinder.raw_score_spec_gcf, pd.DataFrame([[12, -9, 11], [-9, 12, 11], [1, 1, 21]], index=['spectrum1', 'spectrum2', 'spectrum3'], columns=['gcf1', 'gcf2', 'gcf3'])) - + # link type = 'mf-gcf' linkfinder.cal_score(datalinks, link_type='mf-gcf') assert_frame_equal( linkfinder.raw_score_mf_gcf, @@ -39,31 +46,29 @@ def test_cal_score(linkfinder, datalinks): index=['mf1', 'mf2', 'mf3'], columns=['gcf1', 'gcf2', 'gcf3'])) - # TODO CG: add tests for values after refactoring _cal_mean_std + +def test_cal_score_mean_std(linkfinder, datalinks): + """Test `cal_score` method for `metcalf_mean` and `metcalf_std`.""" + linkfinder.cal_score(datalinks, link_type='spec-gcf') assert isinstance(linkfinder.metcalf_mean, np.ndarray) assert isinstance(linkfinder.metcalf_std, np.ndarray) assert linkfinder.metcalf_mean.shape == (4, 4 ) # (n_strains+1 , n_strains+1) assert linkfinder.metcalf_mean.shape == (4, 4) - # generated metcalf_mean - # array([[ 3., 2., 1., 0.], - # [ -8., -2., 4., 10.], - # [-19., -6., 7., 20.], - # [-30., -10., 10., 30.]]) - # generated metcalf_std - # array([[1. , 1. , 1. , 1. ], - # [1. , 9.89949494, 9.89949494, 1. ], - # [1. , 9.89949494, 9.89949494, 1. ], - # [1. , 1. , 1. , 1. 
]]) + # TODO CG: add tests for values after refactoring _cal_mean_std method + # assert linkfinder.metcalf_mean == expected_array def test_get_links_gcf(linkfinder, datalinks, gcfs): + """Test `get_links` method for input GCF objects.""" linkfinder.cal_score(datalinks, link_type='spec-gcf') linkfinder.cal_score(datalinks, link_type='mf-gcf') index_names = ['source', 'target', 'score'] + # cutoff = negative infinity (float) links = linkfinder.get_links(*gcfs, score_cutoff=np.NINF) assert len(links) == 2 + # expected values got from `test_cal_score_raw_score` assert_frame_equal( links[0], pd.DataFrame([['gcf1', 'gcf2', 'gcf3'] * 3, @@ -85,6 +90,7 @@ def test_get_links_gcf(linkfinder, datalinks, gcfs): # cutoff = 0 links = linkfinder.get_links(*gcfs, score_cutoff=0) + assert len(links) == 2 assert_frame_equal( links[0], pd.DataFrame([['gcf1', 'gcf3', 'gcf2', 'gcf3', 'gcf1', 'gcf2', 'gcf3'], @@ -106,6 +112,7 @@ def test_get_links_gcf(linkfinder, datalinks, gcfs): def test_get_links_spec(linkfinder, datalinks, spectra): + """Test `get_links` method for input Spectrum objects.""" linkfinder.cal_score(datalinks, link_type='spec-gcf') linkfinder.cal_score(datalinks, link_type='mf-gcf') index_names = ['source', 'target', 'score'] @@ -134,13 +141,13 @@ def test_get_links_spec(linkfinder, datalinks, spectra): def test_get_links_mf(linkfinder, datalinks, mfs): + """Test `get_links` method for input MolecularFamily objects.""" linkfinder.cal_score(datalinks, link_type='spec-gcf') linkfinder.cal_score(datalinks, link_type='mf-gcf') index_names = ['source', 'target', 'score'] # cutoff = negative infinity (float) links = linkfinder.get_links(*mfs, score_cutoff=np.NINF) assert len(links) == 1 - assert_frame_equal( links[0], pd.DataFrame([[ @@ -149,7 +156,7 @@ def test_get_links_mf(linkfinder, datalinks, mfs): *['mf3'] * 3, ], ['gcf1', 'gcf2', 'gcf3'] * 3, [12, -9, 11, -9, 12, 11, 1, 1, 21]], index=index_names)) - + # cutoff = 0 links = linkfinder.get_links(*mfs, score_cutoff=0) 
assert_frame_equal( links[0], @@ -162,7 +169,28 @@ def test_get_links_mf(linkfinder, datalinks, mfs): index=index_names)) -def test_get_links_exceptions(linkfinder): +@pytest.mark.parametrize("objects, expected", [([], "Empty input objects"), + ("", "Empty input objects")]) +def test_get_links_invalid_value(linkfinder, objects, expected): + with pytest.raises(ValueError) as e: + linkfinder.get_links(*objects) + assert expected in str(e.value) + + +@pytest.mark.parametrize("objects, expected", + [([1], "Invalid type {}"), + ([1, 2], "Invalid type {}"), + ("12", "Invalid type {}")]) +def test_get_links_invalid_type(linkfinder, objects, expected): + with pytest.raises(TypeError) as e: + linkfinder.get_links(*objects) + assert expected in str(e.value) + + +def test_get_links_invalid_mixed_types(linkfinder, spectra, mfs): + objects = (*spectra, *mfs) with pytest.raises(TypeError) as e: - linkfinder.get_links("") - assert "Invalid type {}" in str(e.value) + linkfinder.get_links(*objects) + assert "Invalid type" in str(e.value) + assert ".MolecularFamily" in str(e.value) + assert ".Spectrum" in str(e.value) From 04a5b27a94015c1204012d387e6addb15850edfe Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Fri, 12 May 2023 12:26:40 +0200 Subject: [PATCH 81/95] fix input type bug for `DataLinks.get_common_strains` --- src/nplinker/scoring/linking/data_links.py | 14 +++--- tests/scoring/test_data_links.py | 58 +++++++++++++++++----- 2 files changed, 51 insertions(+), 21 deletions(-) diff --git a/src/nplinker/scoring/linking/data_links.py b/src/nplinker/scoring/linking/data_links.py index 733e53f9..53a62ba1 100644 --- a/src/nplinker/scoring/linking/data_links.py +++ b/src/nplinker/scoring/linking/data_links.py @@ -10,7 +10,6 @@ from .utils import calc_correlation_matrix from .utils import isinstance_all - if TYPE_CHECKING: from nplinker.strain_collection import StrainCollection from nplinker.strains import Strain @@ -87,7 +86,7 @@ def __init__(self, gcfs: Sequence[GCF], spectra: 
Sequence[Spectrum], def get_common_strains( self, - spectra_or_mfs: Sequence[Spectrum] | Sequence[MolecularFamily], + spectra_or_mfs: Sequence[Spectrum | MolecularFamily], gcfs: Sequence[GCF], filter_no_shared: bool = False ) -> dict[tuple[Spectrum | MolecularFamily, GCF], list[Strain]]: @@ -96,9 +95,8 @@ def get_common_strains( Note that SingletonFamily objects are excluded from given `spectra_or_mfs`. Args: - spectra_or_mfs(Sequence[Spectrum] | Sequence[MolecularFamily]): - A list of Spectrum or MolecularFamily objects and all objects - must be of the same type. + spectra_or_mfs(Sequence[Spectrum | MolecularFamily]): + A list of Spectrum and/or MolecularFamily objects. gcfs(Sequence[GCF]): A list of GCF objects. filter_no_shared(bool): If True, the pairs of spectrum/mf and GCF without common strains will be removed from the returned dict; @@ -116,10 +114,10 @@ def get_common_strains( if len(spectra_or_mfs) == 0 or len(gcfs) == 0: raise ValueError('Empty list for first or second argument.') if not isinstance_all(*spectra_or_mfs, - objtype=Spectrum) and not isinstance_all( - *spectra_or_mfs, objtype=MolecularFamily): + objtype=(Spectrum, MolecularFamily)): raise TypeError( - 'First argument must be Spectrum or MolecularFamily objects.') + 'First argument must be Spectrum and/or MolecularFamily objects.' 
+ ) if not isinstance_all(*gcfs, objtype=GCF): raise TypeError('Second argument must be GCF objects.') diff --git a/tests/scoring/test_data_links.py b/tests/scoring/test_data_links.py index 7872266a..c652ea2c 100644 --- a/tests/scoring/test_data_links.py +++ b/tests/scoring/test_data_links.py @@ -127,6 +127,43 @@ def test_get_common_strains_mf(datalinks, mfs, gcfs, strains_list): assert sut == expected +def test_get_common_strains_spec_mf(datalinks, spectra, mfs, gcfs, + strains_list): + """Test get_common_strains method for mixed input of spectra and mfs.""" + mixed_input = (*spectra[:2], *mfs[:2]) + sut = datalinks.get_common_strains(mixed_input, gcfs) + expected = { + (spectra[0], gcfs[0]): [strains_list[0]], + (spectra[0], gcfs[1]): [], + (spectra[0], gcfs[2]): [strains_list[0]], + (spectra[1], gcfs[0]): [], + (spectra[1], gcfs[1]): [strains_list[1]], + (spectra[1], gcfs[2]): [strains_list[1]], + (mfs[0], gcfs[0]): [strains_list[0]], + (mfs[0], gcfs[1]): [], + (mfs[0], gcfs[2]): [strains_list[0]], + (mfs[1], gcfs[0]): [], + (mfs[1], gcfs[1]): [strains_list[1]], + (mfs[1], gcfs[2]): [strains_list[1]] + } + assert sut == expected + + sut = datalinks.get_common_strains(mixed_input, + gcfs, + filter_no_shared=True) + expected = { + (spectra[0], gcfs[0]): [strains_list[0]], + (spectra[0], gcfs[2]): [strains_list[0]], + (spectra[1], gcfs[1]): [strains_list[1]], + (spectra[1], gcfs[2]): [strains_list[1]], + (mfs[0], gcfs[0]): [strains_list[0]], + (mfs[0], gcfs[2]): [strains_list[0]], + (mfs[1], gcfs[1]): [strains_list[1]], + (mfs[1], gcfs[2]): [strains_list[1]] + } + assert sut == expected + + def test_get_common_strains_sf(datalinks, mfs, gcfs, strains_list): """Test get_common_strains method for input SingletonFamily.""" smf = SingletonFamily() @@ -167,24 +204,19 @@ def test_get_common_strains_invalid_value(datalinks, spectra, gcfs): assert "Empty list for first or second argument." 
in str(e.value) -@pytest.mark.parametrize( - "first_arg, expected", - [([1], "First argument must be Spectrum or MolecularFamily objects."), - ([1, 2], "First argument must be Spectrum or MolecularFamily objects."), - ("12", "First argument must be Spectrum or MolecularFamily objects.")]) -def test_get_common_strains_invalid_type_first_arg(datalinks, spectra, mfs, - gcfs, first_arg, expected): +@pytest.mark.parametrize("first_arg, expected", [ + ([1], "First argument must be Spectrum and/or MolecularFamily objects."), + ([1, 2 + ], "First argument must be Spectrum and/or MolecularFamily objects."), + ("12", "First argument must be Spectrum and/or MolecularFamily objects.") +]) +def test_get_common_strains_invalid_type_first_arg(datalinks, gcfs, first_arg, + expected): """Test get_common_strains method for invalid 1st arugment.""" with pytest.raises(TypeError) as e: datalinks.get_common_strains(first_arg, gcfs) assert expected in str(e.value) - # mixed input - with pytest.raises(TypeError) as e: - datalinks.get_common_strains(spectra + mfs, gcfs) - assert "First argument must be Spectrum or MolecularFamily objects." 
in str( - e.value) - @pytest.mark.parametrize("second_arg, expected", [([1], "Second argument must be GCF objects.")]) From 9c7dd499f19a7607523db9f8a6268dbe98c36d3a Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Fri, 12 May 2023 14:08:17 +0200 Subject: [PATCH 82/95] Create test_nplinker_scoring.py --- tests/scoring/test_nplinker_scoring.py | 198 +++++++++++++++++++++++++ 1 file changed, 198 insertions(+) create mode 100644 tests/scoring/test_nplinker_scoring.py diff --git a/tests/scoring/test_nplinker_scoring.py b/tests/scoring/test_nplinker_scoring.py new file mode 100644 index 00000000..fc0656c3 --- /dev/null +++ b/tests/scoring/test_nplinker_scoring.py @@ -0,0 +1,198 @@ +import numpy as np +import pytest +from pytest import fixture +from nplinker.nplinker import NPLinker +from nplinker.scoring import LinkCollection +from nplinker.scoring import MetcalfScoring +from nplinker.scoring import ObjectLink +from nplinker.scoring.linking import DataLinks +from nplinker.scoring.linking import LinkFinder +from .. import DATA_DIR + + +@fixture(scope='module') +def datalinks(gcfs, spectra, mfs, strains) -> DataLinks: + """DataLinks object. See `test_data_links.py` for its values.""" + return DataLinks(gcfs, spectra, mfs, strains) + + +@fixture(scope='module') +def linkfinder(datalinks) -> LinkFinder: + """LinkFinder object. See `test_link_finder.py` for its values.""" + linkfinder = LinkFinder() + linkfinder.cal_score(datalinks, link_type='spec-gcf') + linkfinder.cal_score(datalinks, link_type='mf-gcf') + return linkfinder + + +@fixture(scope='module') +def npl(gcfs, spectra, mfs, strains, tmp_path_factory) -> NPLinker: + """Constructed NPLinker object. + + This NPLinker object does not do loading `npl.load_data()`, instead we + manually set its attributes to the values we want to test. + + The config file `nplinker_demo1.toml` does not affect the tests, just + making sure the NPLinker object can be created succesfully. 
+ """ + npl = NPLinker(str(DATA_DIR / 'nplinker_demo1.toml')) + npl._gcfs = gcfs + npl._spectra = spectra + npl._molfams = mfs + npl._strains = strains + npl._gcf_lookup = {gcf.gcf_id: gcf for gcf in gcfs} + npl._mf_lookup = {mf.family_id: mf for mf in mfs} + npl._spec_lookup = {spec.spectrum_id: spec for spec in spectra} + # tmp path to store 'metcalf/metcalf_scores.pckl' file + # Must use `tmp_path_factory` (session scope) instead of `tmp_path` (function scope) + npl._loader._root = tmp_path_factory.mktemp('npl_test') + return npl + + +@fixture(scope='module') +def mc(npl) -> MetcalfScoring: + """MetcalfScoring object.""" + mc = MetcalfScoring(npl) + mc.setup(npl) + return mc + + +def test_get_links_gcf_standardised_false(npl, mc, gcfs, spectra, mfs, + strains_list): + """Test `get_links` method when input is GCF objects and `standardised` is False.""" + # test raw scores (no standardisation) + mc.standardised = False + + # when cutoff is negative infinity, i.e. taking all scores + mc.cutoff = np.NINF + links = npl.get_links(list(gcfs), mc, and_mode=True) + assert isinstance(links, LinkCollection) + links = links.links # dict of link values + assert len(links) == 3 + assert {i.gcf_id for i in links.keys()} == {'gcf1', 'gcf2', 'gcf3'} + assert isinstance(links[gcfs[0]][spectra[0]], ObjectLink) + # expected values are from `test_get_links_gcf` of test_link_finder.py + assert links[gcfs[0]][spectra[0]].data(mc) == 12 + assert links[gcfs[1]][spectra[0]].data(mc) == -9 + assert links[gcfs[2]][spectra[0]].data(mc) == 11 + assert links[gcfs[0]][mfs[0]].data(mc) == 12 + assert links[gcfs[1]][mfs[1]].data(mc) == 12 + assert links[gcfs[2]][mfs[2]].data(mc) == 21 + # expected values are from `test_get_common_strains_spec` of test_data_links.py + assert links[gcfs[0]][spectra[0]].shared_strains == [strains_list[0]] + assert links[gcfs[1]][spectra[0]].shared_strains == [] + assert links[gcfs[2]][spectra[0]].shared_strains == [strains_list[0]] + assert 
links[gcfs[0]][mfs[0]].shared_strains == [strains_list[0]] + assert links[gcfs[1]][mfs[1]].shared_strains == [strains_list[1]] + assert set(links[gcfs[2]][mfs[2]].shared_strains) == set(strains_list[0:2]) + + # when test cutoff is 0, i.e. taking scores >= 0 + mc.cutoff = 0 + links = npl.get_links(list(gcfs), mc, and_mode=True) + assert isinstance(links, LinkCollection) + links = links.links + assert {i.gcf_id for i in links.keys()} == {'gcf1', 'gcf2', 'gcf3'} + assert isinstance(links[gcfs[0]][spectra[0]], ObjectLink) + # test scores + assert links[gcfs[0]][spectra[0]].data(mc) == 12 + assert links[gcfs[1]].get(spectra[0]) is None + assert links[gcfs[2]][spectra[0]].data(mc) == 11 + assert links[gcfs[0]][mfs[0]].data(mc) == 12 + assert links[gcfs[1]][mfs[1]].data(mc) == 12 + assert links[gcfs[2]][mfs[2]].data(mc) == 21 + # test shared strains + assert links[gcfs[0]][spectra[0]].shared_strains == [strains_list[0]] + assert links[gcfs[2]][spectra[0]].shared_strains == [strains_list[0]] + assert links[gcfs[0]][mfs[0]].shared_strains == [strains_list[0]] + assert links[gcfs[1]][mfs[1]].shared_strains == [strains_list[1]] + assert set(links[gcfs[2]][mfs[2]].shared_strains) == set(strains_list[0:2]) + + +@pytest.mark.skip(reason='To add after refactoring relevant code.') +def test_get_links_gcf_standardised_true(npl, mc, gcfs, spectra, mfs, + strains_list): + """Test `get_links` method when input is GCF objects and `standardised` is True.""" + mc.standardised = True + ... 
+ + +def test_get_links_spec_standardised_false(npl, mc, gcfs, spectra, + strains_list): + """Test `get_links` method when input is Spectrum objects and `standardised` is False.""" + mc.standardised = False + + mc.cutoff = np.NINF + links = npl.get_links(list(spectra), mc, and_mode=True) + assert isinstance(links, LinkCollection) + links = links.links # dict of link values + assert len(links) == 3 + assert {i.spectrum_id + for i in links.keys()} == {'spectrum1', 'spectrum2', 'spectrum3'} + assert isinstance(links[spectra[0]][gcfs[0]], ObjectLink) + assert links[spectra[0]][gcfs[0]].data(mc) == 12 + assert links[spectra[0]][gcfs[1]].data(mc) == -9 + assert links[spectra[0]][gcfs[2]].data(mc) == 11 + assert links[spectra[0]][gcfs[0]].shared_strains == [strains_list[0]] + assert links[spectra[0]][gcfs[1]].shared_strains == [] + assert links[spectra[0]][gcfs[2]].shared_strains == [strains_list[0]] + + mc.cutoff = 0 + links = npl.get_links(list(spectra), mc, and_mode=True) + assert isinstance(links, LinkCollection) + links = links.links # dict of link values + assert len(links) == 3 + assert {i.spectrum_id + for i in links.keys()} == {'spectrum1', 'spectrum2', 'spectrum3'} + assert isinstance(links[spectra[0]][gcfs[0]], ObjectLink) + assert links[spectra[0]][gcfs[0]].data(mc) == 12 + assert links[spectra[0]].get(gcfs[1]) is None + assert links[spectra[0]][gcfs[2]].data(mc) == 11 + assert links[spectra[0]][gcfs[0]].shared_strains == [strains_list[0]] + assert links[spectra[0]][gcfs[2]].shared_strains == [strains_list[0]] + + +@pytest.mark.skip(reason='To add after refactoring relevant code.') +def test_get_links_spec_standardised_true(npl, mc, gcfs, spectra, + strains_list): + """Test `get_links` method when input is Spectrum objects and `standardised` is True.""" + mc.standardised = True + ... 
+ + +def test_get_links_mf_standardised_false(npl, mc, gcfs, mfs, strains_list): + """Test `get_links` method when input is MolecularFamily objects and `standardised` is False.""" + mc.standardised = False + + mc.cutoff = np.NINF + links = npl.get_links(list(mfs), mc, and_mode=True) + assert isinstance(links, LinkCollection) + links = links.links + assert len(links) == 3 + assert {i.family_id for i in links.keys()} == {'mf1', 'mf2', 'mf3'} + assert isinstance(links[mfs[0]][gcfs[0]], ObjectLink) + assert links[mfs[0]][gcfs[0]].data(mc) == 12 + assert links[mfs[0]][gcfs[1]].data(mc) == -9 + assert links[mfs[0]][gcfs[2]].data(mc) == 11 + assert links[mfs[0]][gcfs[0]].shared_strains == [strains_list[0]] + assert links[mfs[0]][gcfs[1]].shared_strains == [] + assert links[mfs[0]][gcfs[2]].shared_strains == [strains_list[0]] + + mc.cutoff = 0 + links = npl.get_links(list(mfs), mc, and_mode=True) + assert isinstance(links, LinkCollection) + links = links.links + assert len(links) == 3 + assert {i.family_id for i in links.keys()} == {'mf1', 'mf2', 'mf3'} + assert isinstance(links[mfs[0]][gcfs[0]], ObjectLink) + assert links[mfs[0]][gcfs[0]].data(mc) == 12 + assert links[mfs[0]].get(gcfs[1]) is None + assert links[mfs[0]][gcfs[2]].data(mc) == 11 + assert links[mfs[0]][gcfs[0]].shared_strains == [strains_list[0]] + assert links[mfs[0]][gcfs[2]].shared_strains == [strains_list[0]] + + +@pytest.mark.skip(reason='To add after refactoring relevant code.') +def test_get_links_mf_standardised_true(npl, mc, gcfs, mfs, strains_list): + """Test `get_links` method when input is MolecularFamily objects and `standardised` is True.""" + mc.standardised = True + ... 
From 60ccbdbdfb3cb18cd9e6fc37c79d76024a1f1247 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Fri, 12 May 2023 14:30:44 +0200 Subject: [PATCH 83/95] add todo comments to `NPLinker` class --- src/nplinker/nplinker.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/nplinker/nplinker.py b/src/nplinker/nplinker.py index 943cb21b..12b266e7 100644 --- a/src/nplinker/nplinker.py +++ b/src/nplinker/nplinker.py @@ -305,6 +305,7 @@ def load_data(self, new_bigscape_cutoff=None, met_only=False): logger.debug('load_data: completed') return True + # TODO CG: refactor this method and update its unit tests def get_links(self, input_objects, scoring_methods, and_mode=True): """Find links for a set of input objects (BGCs/GCFs/Spectra/MolFams) From 9a1af05994caaa4e9c4f80a138eb935316ee7283 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Fri, 12 May 2023 14:31:48 +0200 Subject: [PATCH 84/95] remove local integration tests for scoring part of `NPLinker` - rename `test_nplinker.py` to `test_nplinker_local.py` --- ...est_nplinker.py => test_nplinker_local.py} | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) rename tests/{test_nplinker.py => test_nplinker_local.py} (62%) diff --git a/tests/test_nplinker.py b/tests/test_nplinker_local.py similarity index 62% rename from tests/test_nplinker.py rename to tests/test_nplinker_local.py index 4399d24c..77640e6f 100644 --- a/tests/test_nplinker.py +++ b/tests/test_nplinker_local.py @@ -5,6 +5,11 @@ from . import DATA_DIR +# NOTE: This file only contains tests that run locally and are skipped on CI. +# Basically, only tests related to data loading should be put here. +# For tests on scoring/links, add them to `scoring/test_nplinker_scoring.py`. 
+ + @pytest.fixture(scope='module') def npl() -> NPLinker: npl = NPLinker(str(DATA_DIR / 'nplinker_demo1.toml')) @@ -23,17 +28,3 @@ def test_load_data(npl: NPLinker): assert len(npl.gcfs) == 113 assert len(npl.spectra) == 25935 assert len(npl.molfams) == 25769 - - -@pytest.mark.skipif(os.environ.get('CI') == 'true', - reason="Skip when running on CI") -def test_get_links(npl: NPLinker): - mc = npl.scoring_method('metcalf') - mc.cutoff = 3.5 - mc.standardised = True - - actual = npl.get_links(npl.gcfs, mc, and_mode=True) - assert len(actual) == len(actual.sources) == len(actual.links) == 101 - - actual.filter_links(lambda link: link[mc] > 5.0) - assert len(actual.links) == 60 From b0e44e12109d57390c92aa3a10da2337bd0c2496 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Fri, 12 May 2023 14:38:27 +0200 Subject: [PATCH 85/95] remove unused imports --- src/nplinker/strains.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/nplinker/strains.py b/src/nplinker/strains.py index 54338027..7b07a588 100644 --- a/src/nplinker/strains.py +++ b/src/nplinker/strains.py @@ -1,6 +1,5 @@ from __future__ import annotations from .logconfig import LogConfig -from typing import Iterator logger = LogConfig.getLogger(__name__) From 6ee04e93cfe9b44a1901c2a6aecfc1cecce1653e Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Fri, 12 May 2023 16:24:53 +0200 Subject: [PATCH 86/95] Fix mypy warnings as much as possible --- src/nplinker/genomics/bgc.py | 3 ++- src/nplinker/genomics/genomics.py | 3 ++- src/nplinker/metabolomics/molecular_family.py | 2 +- src/nplinker/scoring/linking/link_finder.py | 2 ++ src/nplinker/scoring/metcalf_scoring.py | 12 +++++++++++- src/nplinker/scoring/methods.py | 6 +++++- 6 files changed, 23 insertions(+), 5 deletions(-) diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py index 7eb7921a..668d159c 100644 --- a/src/nplinker/genomics/bgc.py +++ b/src/nplinker/genomics/bgc.py @@ -4,6 +4,7 @@ from nplinker.logconfig import LogConfig from 
.aa_pred import predict_aa + if TYPE_CHECKING: from ..strains import Strain from .gcf import GCF @@ -121,7 +122,7 @@ def strain(self, strain: Strain) -> None: self._strain = strain @property - def bigscape_classes(self) -> set[str]: + def bigscape_classes(self) -> set[str | None]: """Get BiG-SCAPE's BGC classes. BiG-SCAPE's BGC classes are similar to those defined in MiBIG but have diff --git a/src/nplinker/genomics/genomics.py b/src/nplinker/genomics/genomics.py index c4e694b0..2ed1ab62 100644 --- a/src/nplinker/genomics/genomics.py +++ b/src/nplinker/genomics/genomics.py @@ -245,7 +245,8 @@ def _filter_gcfs( for bgc in bgcs_to_remove: bgcs.remove(bgc) - strains.remove(bgc.strain) + if bgc.strain is not None: + strains.remove(bgc.strain) logger.info( 'Remove GCFs that has only MIBiG BGCs: removing {} GCFs and {} BGCs'. diff --git a/src/nplinker/metabolomics/molecular_family.py b/src/nplinker/metabolomics/molecular_family.py index 57cd686a..c4e584c5 100644 --- a/src/nplinker/metabolomics/molecular_family.py +++ b/src/nplinker/metabolomics/molecular_family.py @@ -55,7 +55,7 @@ def __str__(self) -> str: return 'MolFam(family_id={}, spectra={})'.format( self.family_id, len(self.spectra)) - def __eq__(self, other: Self) -> bool: + def __eq__(self, other) -> bool: if isinstance(other, MolecularFamily): return (self.id == other.id and self.family_id == other.family_id) diff --git a/src/nplinker/scoring/linking/link_finder.py b/src/nplinker/scoring/linking/link_finder.py index 39bbb555..9b194493 100644 --- a/src/nplinker/scoring/linking/link_finder.py +++ b/src/nplinker/scoring/linking/link_finder.py @@ -154,6 +154,8 @@ def get_links(self, links = [] if obj_type == 'gcf': + # TODO CG: the hint and mypy warnings will be gone after renaming all + # string ids to `.id` obj_ids = [gcf.gcf_id for gcf in objects] # spec-gcf scores = self.raw_score_spec_gcf.loc[:, obj_ids] diff --git a/src/nplinker/scoring/metcalf_scoring.py b/src/nplinker/scoring/metcalf_scoring.py index 
586f1595..4bf53177 100644 --- a/src/nplinker/scoring/metcalf_scoring.py +++ b/src/nplinker/scoring/metcalf_scoring.py @@ -181,7 +181,9 @@ def get_links(self, *objects: tuple[GCF, ...] | tuple[Spectrum, ...] scores_list = self._cal_standardised_score_met( self.LINKFINDER, scores_list) - link_scores = {} + link_scores: dict[GCF | Spectrum | MolecularFamily, + dict[GCF | Spectrum | MolecularFamily, + ObjectLink]] = {} if obj_type == 'gcf': logger.debug( f'MetcalfScoring: input_type=GCF, result_type=Spec/MolFam, ' @@ -243,6 +245,10 @@ def get_links(self, *objects: tuple[GCF, ...] | tuple[Spectrum, ...] def _cal_standardised_score_met(self, linkfinder: LinkFinder, results: list) -> list[pd.DataFrame]: + if linkfinder.metcalf_mean is None or linkfinder.metcalf_std is None: + raise ValueError( + 'Metcalf mean and std not found. Have you called `MetcalfScoring.setup(npl)`?' + ) logger.debug('Calculating standardised Metcalf scores (met input)') raw_score = results[0] z_scores = [] @@ -275,6 +281,10 @@ def _cal_standardised_score_met(self, linkfinder: LinkFinder, def _cal_standardised_score_gen(self, linkfinder: LinkFinder, results: list) -> list[pd.DataFrame]: + if linkfinder.metcalf_mean is None or linkfinder.metcalf_std is None: + raise ValueError( + 'Metcalf mean and std not found. Have you called `MetcalfScoring.setup(npl)`?' + ) logger.debug('Calculating standardised Metcalf scores (gen input)') postprocessed_scores = [] for raw_score in results: diff --git a/src/nplinker/scoring/methods.py b/src/nplinker/scoring/methods.py index ff6e7f54..e42b6cdd 100644 --- a/src/nplinker/scoring/methods.py +++ b/src/nplinker/scoring/methods.py @@ -12,9 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import annotations +from typing import TYPE_CHECKING from nplinker.logconfig import LogConfig +if TYPE_CHECKING: + from . 
import LinkCollection logger = LogConfig.getLogger(__name__) @@ -35,7 +39,7 @@ def setup(npl): """Perform any one-off initialisation required (will only be called once)""" pass - def get_links(self, objects, link_collection): + def get_links(self, *objects, link_collection: LinkCollection) -> LinkCollection: """Given a set of objects, return link information""" return link_collection From 88f48b32bea8e4c23e3272f1b8c41bf33c5a49d4 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Mon, 15 May 2023 11:59:49 +0200 Subject: [PATCH 87/95] check strain existence using strain dict --- src/nplinker/strain_collection.py | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/src/nplinker/strain_collection.py b/src/nplinker/strain_collection.py index 08291fc7..42387c9b 100644 --- a/src/nplinker/strain_collection.py +++ b/src/nplinker/strain_collection.py @@ -17,7 +17,7 @@ class StrainCollection(): def __init__(self): """A collection of Strain objects.""" self._strains: list[Strain] = [] - # dict of strain name (id and alias) to strain object + # dict of strain name (id and alias) to primary strain object self._strain_dict_name: dict[str, Strain] = {} def __repr__(self) -> str: @@ -45,14 +45,10 @@ def __contains__(self, item: str | Strain) -> bool: The given strain could be a Strain object, or a strain id or alias. 
""" if isinstance(item, str): - for strain in self: - if item == strain.id or item in strain: - return True - elif isinstance(item, Strain): - return item in self._strains - else: - raise TypeError(f"Expected Strain or str, got {type(item)}") - return False + return item in self._strain_dict_name + if isinstance(item, Strain): + return item.id in self._strain_dict_name + raise TypeError(f"Expected Strain or str, got {type(item)}") def __iter__(self) -> Iterator[Strain]: return iter(self._strains) @@ -101,10 +97,8 @@ def filter(self, strain_set: set[Strain]): def lookup(self, name: str) -> Strain: """Lookup a strain by name (id or alias). - If the name is found, return the strain object; Otherwise, raise a - KeyError. - Args: + name(str): Strain name (id or alias) to lookup. Returns: @@ -113,9 +107,8 @@ def lookup(self, name: str) -> Strain: Raises: KeyError: If the strain name is not found. """ - for strain in self: - if name == strain.id or name in strain: - return strain + if name in self: + return self._strain_dict_name[name] raise KeyError(f"Strain {name} not found in strain collection.") def add_from_file(self, file: str | PathLike) -> None: From 27bcc2dfa881948c1f8a212072d19fb8fb2f2b50 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 20 Jun 2023 14:22:16 +0200 Subject: [PATCH 88/95] change calculate abbrevation from "cal" to "calc" --- src/nplinker/scoring/linking/link_finder.py | 8 +++--- src/nplinker/scoring/metcalf_scoring.py | 12 ++++----- tests/scoring/test_link_finder.py | 30 ++++++++++----------- tests/scoring/test_metcalf_scoring.py | 4 +-- tests/scoring/test_nplinker_scoring.py | 4 +-- 5 files changed, 29 insertions(+), 29 deletions(-) diff --git a/src/nplinker/scoring/linking/link_finder.py b/src/nplinker/scoring/linking/link_finder.py index 9b194493..39300f58 100644 --- a/src/nplinker/scoring/linking/link_finder.py +++ b/src/nplinker/scoring/linking/link_finder.py @@ -38,8 +38,8 @@ def __init__(self) -> None: self.metcalf_mean = None 
self.metcalf_std = None - # TODO CG: cal_score method could be integrated to __init__? - def cal_score( + # TODO CG: calc_score method could be integrated to __init__? + def calc_score( self, data_links: DataLinks, link_type: str = 'spec-gcf', @@ -79,11 +79,11 @@ def cal_score( # TODO CG: this part should be moved outside of this method n_strains = data_links.occurrence_gcf_strain.shape[1] if self.metcalf_mean is None or self.metcalf_std is None: - self.metcalf_mean, self.metcalf_std = self._cal_mean_std( + self.metcalf_mean, self.metcalf_std = self._calc_mean_std( n_strains, scoring_weights) # TODO CG: read paper and check the logics of this method - def _cal_mean_std( + def _calc_mean_std( self, n_strains: int, scoring_weights: tuple[int, int, int, int] ) -> tuple[np.ndarray, np.ndarray]: sz = (n_strains + 1, n_strains + 1) diff --git a/src/nplinker/scoring/metcalf_scoring.py b/src/nplinker/scoring/metcalf_scoring.py index 4bf53177..c291a511 100644 --- a/src/nplinker/scoring/metcalf_scoring.py +++ b/src/nplinker/scoring/metcalf_scoring.py @@ -107,9 +107,9 @@ def setup(npl: NPLinker): MetcalfScoring.DATALINKS = DataLinks(npl.gcfs, npl.spectra, npl.molfams, npl.strains) MetcalfScoring.LINKFINDER = LinkFinder() - MetcalfScoring.LINKFINDER.cal_score(MetcalfScoring.DATALINKS, + MetcalfScoring.LINKFINDER.calc_score(MetcalfScoring.DATALINKS, link_type=LINK_TYPES[0]) - MetcalfScoring.LINKFINDER.cal_score(MetcalfScoring.DATALINKS, + MetcalfScoring.LINKFINDER.calc_score(MetcalfScoring.DATALINKS, link_type=LINK_TYPES[1]) logger.debug('MetcalfScoring.setup caching results') save_pickled_data((dataset_counts, MetcalfScoring.DATALINKS, @@ -175,10 +175,10 @@ def get_links(self, *objects: tuple[GCF, ...] | tuple[Spectrum, ...] 
scores_list = self.LINKFINDER.get_links(*objects, score_cutoff=np.NINF) if obj_type == 'gcf': - scores_list = self._cal_standardised_score_gen( + scores_list = self._calc_standardised_score_gen( self.LINKFINDER, scores_list) else: - scores_list = self._cal_standardised_score_met( + scores_list = self._calc_standardised_score_met( self.LINKFINDER, scores_list) link_scores: dict[GCF | Spectrum | MolecularFamily, @@ -243,7 +243,7 @@ def get_links(self, *objects: tuple[GCF, ...] | tuple[Spectrum, ...] logger.debug('MetcalfScoring: completed') return link_collection - def _cal_standardised_score_met(self, linkfinder: LinkFinder, + def _calc_standardised_score_met(self, linkfinder: LinkFinder, results: list) -> list[pd.DataFrame]: if linkfinder.metcalf_mean is None or linkfinder.metcalf_std is None: raise ValueError( @@ -279,7 +279,7 @@ def _cal_standardised_score_met(self, linkfinder: LinkFinder, return [scores_df] - def _cal_standardised_score_gen(self, linkfinder: LinkFinder, + def _calc_standardised_score_gen(self, linkfinder: LinkFinder, results: list) -> list[pd.DataFrame]: if linkfinder.metcalf_mean is None or linkfinder.metcalf_std is None: raise ValueError( diff --git a/tests/scoring/test_link_finder.py b/tests/scoring/test_link_finder.py index 3aa2e4e9..94db051f 100644 --- a/tests/scoring/test_link_finder.py +++ b/tests/scoring/test_link_finder.py @@ -25,21 +25,21 @@ def test_init(linkfinder): assert linkfinder.metcalf_std is None -def test_cal_score_raw_score(linkfinder, datalinks): - """Test `cal_score` method for `raw_score_spec_gcf` and `raw_score_mf_gcf`. +def test_calc_score_raw_score(linkfinder, datalinks): + """Test `calc_score` method for `raw_score_spec_gcf` and `raw_score_mf_gcf`. The expected values are calculated manually by using values from `test_init` of `test_data_links.py` and the default scoring weights. 
""" # link type = 'spec-gcf' - linkfinder.cal_score(datalinks, link_type='spec-gcf') + linkfinder.calc_score(datalinks, link_type='spec-gcf') assert_frame_equal( linkfinder.raw_score_spec_gcf, pd.DataFrame([[12, -9, 11], [-9, 12, 11], [1, 1, 21]], index=['spectrum1', 'spectrum2', 'spectrum3'], columns=['gcf1', 'gcf2', 'gcf3'])) # link type = 'mf-gcf' - linkfinder.cal_score(datalinks, link_type='mf-gcf') + linkfinder.calc_score(datalinks, link_type='mf-gcf') assert_frame_equal( linkfinder.raw_score_mf_gcf, pd.DataFrame([[12, -9, 11], [-9, 12, 11], [1, 1, 21]], @@ -47,28 +47,28 @@ def test_cal_score_raw_score(linkfinder, datalinks): columns=['gcf1', 'gcf2', 'gcf3'])) -def test_cal_score_mean_std(linkfinder, datalinks): - """Test `cal_score` method for `metcalf_mean` and `metcalf_std`.""" - linkfinder.cal_score(datalinks, link_type='spec-gcf') +def test_calc_score_mean_std(linkfinder, datalinks): + """Test `calc_score` method for `metcalf_mean` and `metcalf_std`.""" + linkfinder.calc_score(datalinks, link_type='spec-gcf') assert isinstance(linkfinder.metcalf_mean, np.ndarray) assert isinstance(linkfinder.metcalf_std, np.ndarray) assert linkfinder.metcalf_mean.shape == (4, 4 ) # (n_strains+1 , n_strains+1) assert linkfinder.metcalf_mean.shape == (4, 4) - # TODO CG: add tests for values after refactoring _cal_mean_std method + # TODO CG: add tests for values after refactoring _calc_mean_std method # assert linkfinder.metcalf_mean == expected_array def test_get_links_gcf(linkfinder, datalinks, gcfs): """Test `get_links` method for input GCF objects.""" - linkfinder.cal_score(datalinks, link_type='spec-gcf') - linkfinder.cal_score(datalinks, link_type='mf-gcf') + linkfinder.calc_score(datalinks, link_type='spec-gcf') + linkfinder.calc_score(datalinks, link_type='mf-gcf') index_names = ['source', 'target', 'score'] # cutoff = negative infinity (float) links = linkfinder.get_links(*gcfs, score_cutoff=np.NINF) assert len(links) == 2 - # expected values got from 
`test_cal_score_raw_score` + # expected values got from `test_calc_score_raw_score` assert_frame_equal( links[0], pd.DataFrame([['gcf1', 'gcf2', 'gcf3'] * 3, @@ -113,8 +113,8 @@ def test_get_links_gcf(linkfinder, datalinks, gcfs): def test_get_links_spec(linkfinder, datalinks, spectra): """Test `get_links` method for input Spectrum objects.""" - linkfinder.cal_score(datalinks, link_type='spec-gcf') - linkfinder.cal_score(datalinks, link_type='mf-gcf') + linkfinder.calc_score(datalinks, link_type='spec-gcf') + linkfinder.calc_score(datalinks, link_type='mf-gcf') index_names = ['source', 'target', 'score'] # cutoff = negative infinity (float) links = linkfinder.get_links(*spectra, score_cutoff=np.NINF) @@ -142,8 +142,8 @@ def test_get_links_spec(linkfinder, datalinks, spectra): def test_get_links_mf(linkfinder, datalinks, mfs): """Test `get_links` method for input MolecularFamily objects.""" - linkfinder.cal_score(datalinks, link_type='spec-gcf') - linkfinder.cal_score(datalinks, link_type='mf-gcf') + linkfinder.calc_score(datalinks, link_type='spec-gcf') + linkfinder.calc_score(datalinks, link_type='mf-gcf') index_names = ['source', 'target', 'score'] # cutoff = negative infinity (float) links = linkfinder.get_links(*mfs, score_cutoff=np.NINF) diff --git a/tests/scoring/test_metcalf_scoring.py b/tests/scoring/test_metcalf_scoring.py index e9398e5e..e5643c00 100644 --- a/tests/scoring/test_metcalf_scoring.py +++ b/tests/scoring/test_metcalf_scoring.py @@ -22,8 +22,8 @@ def datalinks(gcfs, spectra, mfs, strains) -> DataLinks: def linkfinder(datalinks) -> LinkFinder: """LinkFinder object. 
See `test_link_finder.py` for its values.""" linkfinder = LinkFinder() - linkfinder.cal_score(datalinks, link_type='spec-gcf') - linkfinder.cal_score(datalinks, link_type='mf-gcf') + linkfinder.calc_score(datalinks, link_type='spec-gcf') + linkfinder.calc_score(datalinks, link_type='mf-gcf') return linkfinder diff --git a/tests/scoring/test_nplinker_scoring.py b/tests/scoring/test_nplinker_scoring.py index fc0656c3..c3821493 100644 --- a/tests/scoring/test_nplinker_scoring.py +++ b/tests/scoring/test_nplinker_scoring.py @@ -20,8 +20,8 @@ def datalinks(gcfs, spectra, mfs, strains) -> DataLinks: def linkfinder(datalinks) -> LinkFinder: """LinkFinder object. See `test_link_finder.py` for its values.""" linkfinder = LinkFinder() - linkfinder.cal_score(datalinks, link_type='spec-gcf') - linkfinder.cal_score(datalinks, link_type='mf-gcf') + linkfinder.calc_score(datalinks, link_type='spec-gcf') + linkfinder.calc_score(datalinks, link_type='mf-gcf') return linkfinder From ab5d02593aa7bf1cc388784e4604a5bd44239adf Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 20 Jun 2023 14:36:47 +0200 Subject: [PATCH 89/95] remove resolved TODO comment --- src/nplinker/scoring/linking/link_finder.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/nplinker/scoring/linking/link_finder.py b/src/nplinker/scoring/linking/link_finder.py index 39300f58..7a400bd9 100644 --- a/src/nplinker/scoring/linking/link_finder.py +++ b/src/nplinker/scoring/linking/link_finder.py @@ -111,9 +111,6 @@ def _calc_mean_std( variance[n, m] = expected_sq return mean, np.sqrt(variance) - # TODO CG: the data type of returned should be improved for faster - # processing. Maybe using dict instead of pd.DataFrame? - # like that in the get_links method of the MetcalfScoring class def get_links(self, *objects: tuple[GCF, ...] | tuple[Spectrum, ...] 
| tuple[MolecularFamily, ...], From e4a0b3ead8fc415bb3ae41a3b2e45af58732355f Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 20 Jun 2023 15:07:02 +0200 Subject: [PATCH 90/95] move shared fixtures to conftest.py --- tests/scoring/conftest.py | 52 +++++++++++++++++++++++++ tests/scoring/test_data_links.py | 7 ---- tests/scoring/test_link_finder.py | 7 ---- tests/scoring/test_metcalf_scoring.py | 50 ------------------------ tests/scoring/test_nplinker_scoring.py | 53 -------------------------- 5 files changed, 52 insertions(+), 117 deletions(-) diff --git a/tests/scoring/conftest.py b/tests/scoring/conftest.py index 73606506..b64a5694 100644 --- a/tests/scoring/conftest.py +++ b/tests/scoring/conftest.py @@ -2,8 +2,13 @@ from nplinker.genomics import GCF from nplinker.metabolomics.molecular_family import MolecularFamily from nplinker.metabolomics.spectrum import Spectrum +from nplinker.nplinker import NPLinker +from nplinker.scoring import MetcalfScoring +from nplinker.scoring.linking import DataLinks +from nplinker.scoring.linking import LinkFinder from nplinker.strain_collection import StrainCollection from nplinker.strains import Strain +from .. import DATA_DIR @fixture(scope='session') @@ -55,3 +60,50 @@ def mfs(spectra) -> tuple[MolecularFamily, MolecularFamily, MolecularFamily]: mf3 = MolecularFamily('mf3') mf3.add_spectrum(spectra[2]) return mf1, mf2, mf3 + + +@fixture(scope='module') +def datalinks(gcfs, spectra, mfs, strains) -> DataLinks: + """DataLinks object. See `test_data_links.py` for its values.""" + return DataLinks(gcfs, spectra, mfs, strains) + + +@fixture(scope='module') +def linkfinder(datalinks) -> LinkFinder: + """LinkFinder object. 
See `test_link_finder.py` for its values.""" + linkfinder = LinkFinder() + linkfinder.calc_score(datalinks, link_type='spec-gcf') + linkfinder.calc_score(datalinks, link_type='mf-gcf') + return linkfinder + + +@fixture(scope='module') +def npl(gcfs, spectra, mfs, strains, tmp_path_factory) -> NPLinker: + """Constructed NPLinker object. + + This NPLinker object does not do loading `npl.load_data()`, instead we + manually set its attributes to the values we want to test. + + The config file `nplinker_demo1.toml` does not affect the tests, just + making sure the NPLinker object can be created successfully. + """ + npl = NPLinker(str(DATA_DIR / 'nplinker_demo1.toml')) + npl._gcfs = gcfs + npl._spectra = spectra + npl._molfams = mfs + npl._strains = strains + npl._gcf_lookup = {gcf.gcf_id: gcf for gcf in gcfs} + npl._mf_lookup = {mf.family_id: mf for mf in mfs} + npl._spec_lookup = {spec.spectrum_id: spec for spec in spectra} + # tmp path to store 'metcalf/metcalf_scores.pckl' file + # Must use `tmp_path_factory` (session scope) instead of `tmp_path` (function scope) + npl._loader._root = tmp_path_factory.mktemp('npl_test') + return npl + + +@fixture(scope='module') +def mc(npl) -> MetcalfScoring: + """MetcalfScoring object.""" + mc = MetcalfScoring(npl) + mc.setup(npl) + return mc diff --git a/tests/scoring/test_data_links.py b/tests/scoring/test_data_links.py index c652ea2c..22a9c21a 100644 --- a/tests/scoring/test_data_links.py +++ b/tests/scoring/test_data_links.py @@ -1,14 +1,7 @@ import pandas as pd from pandas.util.testing import assert_frame_equal import pytest -from pytest import fixture from nplinker.metabolomics.singleton_family import SingletonFamily -from nplinker.scoring.linking import DataLinks - - -@fixture(scope='module') -def datalinks(gcfs, spectra, mfs, strains) -> DataLinks: - return DataLinks(gcfs, spectra, mfs, strains) def test_init(datalinks): diff --git a/tests/scoring/test_link_finder.py b/tests/scoring/test_link_finder.py index 
94db051f..30d84980 100644 --- a/tests/scoring/test_link_finder.py +++ b/tests/scoring/test_link_finder.py @@ -3,7 +3,6 @@ from pandas.util.testing import assert_frame_equal import pytest from pytest import fixture -from nplinker.scoring.linking import DataLinks from nplinker.scoring.linking import LinkFinder @@ -12,12 +11,6 @@ def linkfinder() -> LinkFinder: return LinkFinder() -@fixture(scope='module') -def datalinks(gcfs, spectra, mfs, strains): - """DataLinks object. See `test_data_links.py` for its actual values.""" - return DataLinks(gcfs, spectra, mfs, strains) - - def test_init(linkfinder): assert_frame_equal(linkfinder.raw_score_spec_gcf, pd.DataFrame()) assert_frame_equal(linkfinder.raw_score_mf_gcf, pd.DataFrame()) diff --git a/tests/scoring/test_metcalf_scoring.py b/tests/scoring/test_metcalf_scoring.py index e5643c00..471c073c 100644 --- a/tests/scoring/test_metcalf_scoring.py +++ b/tests/scoring/test_metcalf_scoring.py @@ -2,61 +2,11 @@ from numpy.testing import assert_array_equal from pandas.util.testing import assert_frame_equal import pytest -from pytest import fixture -from nplinker.nplinker import NPLinker from nplinker.scoring import LinkCollection from nplinker.scoring import MetcalfScoring from nplinker.scoring import ObjectLink from nplinker.scoring.linking import DataLinks from nplinker.scoring.linking import LinkFinder -from .. import DATA_DIR - - -@fixture(scope='module') -def datalinks(gcfs, spectra, mfs, strains) -> DataLinks: - """DataLinks object. See `test_data_links.py` for its values.""" - return DataLinks(gcfs, spectra, mfs, strains) - - -@fixture(scope='module') -def linkfinder(datalinks) -> LinkFinder: - """LinkFinder object. 
See `test_link_finder.py` for its values.""" - linkfinder = LinkFinder() - linkfinder.calc_score(datalinks, link_type='spec-gcf') - linkfinder.calc_score(datalinks, link_type='mf-gcf') - return linkfinder - - -@fixture(scope='module') -def npl(gcfs, spectra, mfs, strains, tmp_path_factory) -> NPLinker: - """Constructed NPLinker object. - - This NPLinker object does not do loading `npl.load_data()`, instead we - manually set its attributes to the values we want to test. - - The config file `nplinker_demo1.toml` does not affect the tests, just - making sure the NPLinker object can be created succesfully. - """ - npl = NPLinker(str(DATA_DIR / 'nplinker_demo1.toml')) - npl._gcfs = gcfs - npl._spectra = spectra - npl._molfams = mfs - npl._strains = strains - npl._gcf_lookup = {gcf.gcf_id: gcf for gcf in gcfs} - npl._mf_lookup = {mf.family_id: mf for mf in mfs} - npl._spec_lookup = {spec.spectrum_id: spec for spec in spectra} - # tmp path to store 'metcalf/metcalf_scores.pckl' file - # Must use `tmp_path_factory` (session scope) instead of `tmp_path` (function scope) - npl._loader._root = tmp_path_factory.mktemp('npl_test') - return npl - - -@fixture(scope='module') -def mc(npl) -> MetcalfScoring: - """MetcalfScoring object.""" - mc = MetcalfScoring(npl) - mc.setup(npl) - return mc def test_init(npl): diff --git a/tests/scoring/test_nplinker_scoring.py b/tests/scoring/test_nplinker_scoring.py index c3821493..77743dbb 100644 --- a/tests/scoring/test_nplinker_scoring.py +++ b/tests/scoring/test_nplinker_scoring.py @@ -1,60 +1,7 @@ import numpy as np import pytest -from pytest import fixture -from nplinker.nplinker import NPLinker from nplinker.scoring import LinkCollection -from nplinker.scoring import MetcalfScoring from nplinker.scoring import ObjectLink -from nplinker.scoring.linking import DataLinks -from nplinker.scoring.linking import LinkFinder -from .. 
import DATA_DIR - - -@fixture(scope='module') -def datalinks(gcfs, spectra, mfs, strains) -> DataLinks: - """DataLinks object. See `test_data_links.py` for its values.""" - return DataLinks(gcfs, spectra, mfs, strains) - - -@fixture(scope='module') -def linkfinder(datalinks) -> LinkFinder: - """LinkFinder object. See `test_link_finder.py` for its values.""" - linkfinder = LinkFinder() - linkfinder.calc_score(datalinks, link_type='spec-gcf') - linkfinder.calc_score(datalinks, link_type='mf-gcf') - return linkfinder - - -@fixture(scope='module') -def npl(gcfs, spectra, mfs, strains, tmp_path_factory) -> NPLinker: - """Constructed NPLinker object. - - This NPLinker object does not do loading `npl.load_data()`, instead we - manually set its attributes to the values we want to test. - - The config file `nplinker_demo1.toml` does not affect the tests, just - making sure the NPLinker object can be created succesfully. - """ - npl = NPLinker(str(DATA_DIR / 'nplinker_demo1.toml')) - npl._gcfs = gcfs - npl._spectra = spectra - npl._molfams = mfs - npl._strains = strains - npl._gcf_lookup = {gcf.gcf_id: gcf for gcf in gcfs} - npl._mf_lookup = {mf.family_id: mf for mf in mfs} - npl._spec_lookup = {spec.spectrum_id: spec for spec in spectra} - # tmp path to store 'metcalf/metcalf_scores.pckl' file - # Must use `tmp_path_factory` (session scope) instead of `tmp_path` (function scope) - npl._loader._root = tmp_path_factory.mktemp('npl_test') - return npl - - -@fixture(scope='module') -def mc(npl) -> MetcalfScoring: - """MetcalfScoring object.""" - mc = MetcalfScoring(npl) - mc.setup(npl) - return mc def test_get_links_gcf_standardised_false(npl, mc, gcfs, spectra, mfs, From ade54a36e3d00627edef533344fae2d99efbdd47 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 21 Jun 2023 09:05:21 +0200 Subject: [PATCH 91/95] remove unnecessary type hints --- src/nplinker/nplinker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nplinker/nplinker.py 
b/src/nplinker/nplinker.py index 12b266e7..6e58981f 100644 --- a/src/nplinker/nplinker.py +++ b/src/nplinker/nplinker.py @@ -283,7 +283,7 @@ def load_data(self, new_bigscape_cutoff=None, met_only=False): self._bgcs = self._loader.bgcs self._gcfs = self._loader.gcfs self._mibig_bgc_dict = self._loader.mibig_bgc_dict - self._strains: StrainCollection = self._loader.strains + self._strains = self._loader.strains self._product_types = self._loader.product_types self._chem_classes = self._loader.chem_classes self._class_matches = self._loader.class_matches From 78aa794dcb072378cc1966cd4ba6544a81f138c7 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 21 Jun 2023 09:56:44 +0200 Subject: [PATCH 92/95] update docstrings for cooccurrences --- src/nplinker/scoring/linking/data_links.py | 26 +++++++++++++++------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/src/nplinker/scoring/linking/data_links.py b/src/nplinker/scoring/linking/data_links.py index 53a62ba1..30e7c1b8 100644 --- a/src/nplinker/scoring/linking/data_links.py +++ b/src/nplinker/scoring/linking/data_links.py @@ -10,6 +10,7 @@ from .utils import calc_correlation_matrix from .utils import isinstance_all + if TYPE_CHECKING: from nplinker.strain_collection import StrainCollection from nplinker.strains import Strain @@ -42,21 +43,30 @@ def __init__(self, gcfs: Sequence[GCF], spectra: Sequence[Spectrum], occurrence_mf_strain(pd.DataFrame): A DataFrame to store occurrence of molecular families with respect to strains. cooccurrence_spec_gcf(pd.DataFrame): A DataFrame to store co-occurrence - of spectra<->gcfs. + of the presence of spectra and the presence of gcfs with respect + to strains. cooccurrence_spec_notgcf(pd.DataFrame): A DataFrame to store co-occurrence - of spectra<->not gcfs. + of the presence of spectra and the absence of gcfs with respect + to strains. "notgcf" means the absence of gcfs. 
cooccurrence_notspec_gcf(pd.DataFrame): A DataFrame to store co-occurrence - of not spectra<->gcfs. + of the absence of spectra and the presence of gcfs with respect + to strains. "notspec" means the absence of spectra. cooccurrence_notspec_notgcf(pd.DataFrame): A DataFrame to store co-occurrence - of not spectra<->not gcfs. + of the absence of spectra and the absence of gcfs with respect + to strains. cooccurrence_mf_gcf(pd.DataFrame): A DataFrame to store co-occurrence - of molecular families<->gcfs. + of the presence of molecular families and the presence of gcfs + with respect to strains. cooccurrence_mf_notgcf(pd.DataFrame): A DataFrame to store co-occurrence - of molecular families<->not gcfs. + of the presence of molecular families and the absence of gcfs + with respect to strains. "notgcf" means the absence of gcfs. cooccurrence_notmf_gcf(pd.DataFrame): A DataFrame to store co-occurrence - of not molecular families<->gcfs. + of the absence of molecular families and the presence of gcfs + with respect to strains. "notmf" means the absence of molecular + families. cooccurrence_notmf_notgcf(pd.DataFrame): A DataFrame to store co-occurrence - of not molecular families<->not gcfs. + of the absence of molecular families and the absence of gcfs + with respect to strains. 
""" self._strains = strains From 09c049f98e6d2c6d038cd87258f79ccedbe90bf0 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 20 Jun 2023 14:17:37 +0200 Subject: [PATCH 93/95] use uuid for singleton molecular families #144 --- src/nplinker/class_info/chem_classes.py | 5 +++-- .../gnps/gnps_molecular_family_loader.py | 4 ++-- src/nplinker/metabolomics/singleton_family.py | 3 ++- src/nplinker/scoring/np_class_scoring.py | 4 ++-- .../test_gnps_molecular_family_loader.py | 15 ++++++++++----- 5 files changed, 19 insertions(+), 12 deletions(-) diff --git a/src/nplinker/class_info/chem_classes.py b/src/nplinker/class_info/chem_classes.py index d2f184a3..eb0e0c99 100644 --- a/src/nplinker/class_info/chem_classes.py +++ b/src/nplinker/class_info/chem_classes.py @@ -401,8 +401,8 @@ class prediction for a level. When no class is present, instead of Tuple it will for molfam in molfams: fid = molfam.family_id # the key spectra = molfam.spectra - # if singleton family, format like '-1_spectrum-id' - if fid == '-1': + # if singleton family, format like 'fid_spectrum-id' + if fid.startswith('singleton-'): spec_id = spectra[0].spectrum_id fid += f'_{spec_id}' len_molfam = len(spectra) @@ -555,6 +555,7 @@ def _read_cf_classes(self, mne_dir): nr_nodes = line.pop(0) # todo: make it easier to query classes of singleton families # if singleton family, format like '-1_spectrum-id' like canopus results + # CG: Note that the singleton families id is "singleton-" + uuid. 
if nr_nodes == '1': component = f'-1_{cluster}' class_info = [] diff --git a/src/nplinker/metabolomics/gnps/gnps_molecular_family_loader.py b/src/nplinker/metabolomics/gnps/gnps_molecular_family_loader.py index 0a3b7659..acbe5547 100644 --- a/src/nplinker/metabolomics/gnps/gnps_molecular_family_loader.py +++ b/src/nplinker/metabolomics/gnps/gnps_molecular_family_loader.py @@ -19,9 +19,9 @@ def __init__(self, file: str | PathLike): self._families: list[MolecularFamily | SingletonFamily] = [] for family_id, spectra_ids in _load_molecular_families(file).items(): - if family_id == '-1': + if family_id == '-1': # the "-1" is from GNPS result for spectrum_id in spectra_ids: - family = SingletonFamily() + family = SingletonFamily() ## uuid as family id family.spectra_ids = set([spectrum_id]) self._families.append(family) else: diff --git a/src/nplinker/metabolomics/singleton_family.py b/src/nplinker/metabolomics/singleton_family.py index 36f03e49..94457049 100644 --- a/src/nplinker/metabolomics/singleton_family.py +++ b/src/nplinker/metabolomics/singleton_family.py @@ -1,10 +1,11 @@ +import uuid from .molecular_family import MolecularFamily class SingletonFamily(MolecularFamily): def __init__(self): - super().__init__('-1') + super().__init__("singleton-" + str(uuid.uuid4())) def __str__(self): return f"Singleton molecular family (id={self.id})" diff --git a/src/nplinker/scoring/np_class_scoring.py b/src/nplinker/scoring/np_class_scoring.py index 79d1a71f..84975344 100644 --- a/src/nplinker/scoring/np_class_scoring.py +++ b/src/nplinker/scoring/np_class_scoring.py @@ -271,7 +271,7 @@ def _get_met_classes(self, spec_like, method='mix'): spectra_classes_names_inds else: # molfam fam_id = spec_like.family_id - if fam_id == '-1': # account for singleton families + if fam_id.startswith("singleton-"): # account for singleton families fam_id += f'_{spec_like.spectra[0].spectrum_id}' all_classes = self.npl.chem_classes.canopus.molfam_classes.get( fam_id) @@ -289,7 +289,7 @@ def 
_get_met_classes(self, spec_like, method='mix'): spectra_classes(spec_like.spectrum_id) else: # molfam fam_id = spec_like.family_id - if fam_id == '-1': # account for singleton families + if fam_id.startswith("singleton"): # account for singleton families fam_id += f'_{spec_like.spectra[0].spectrum_id}' spec_like_classes = self.npl.chem_classes.molnetenhancer. \ molfam_classes.get(fam_id) diff --git a/tests/metabolomics/test_gnps_molecular_family_loader.py b/tests/metabolomics/test_gnps_molecular_family_loader.py index 1174e28b..6e9cd531 100644 --- a/tests/metabolomics/test_gnps_molecular_family_loader.py +++ b/tests/metabolomics/test_gnps_molecular_family_loader.py @@ -12,17 +12,22 @@ def test_has_molecular_families(filename): sut = GNPSMolecularFamilyLoader(filename) actual = sut.families() assert len(actual) == 25769 - mf_ids = [mf.family_id for mf in actual[:30]] - assert mf_ids == [ + + assert [mf.family_id for mf in actual[:29]] == [ '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '20', '21', '22', '23', '24', '26', '28', - '30', '31', '32', '33', '-1' + '30', '31', '32', '33' ] - num_spec_ids = [len(mf.spectra_ids) for mf in actual[:30]] - assert num_spec_ids == [ + for i in range(30, 25769): + assert actual[i].family_id.startswith('singleton-') + + assert [len(mf.spectra_ids) for mf in actual[:30]] == [ 19, 48, 3, 3, 11, 4, 9, 3, 15, 3, 5, 2, 3, 3, 5, 3, 14, 4, 2, 2, 12, 2, 3, 5, 2, 4, 2, 2, 2, 1 ] + for i in range(30, 25769): + assert len(actual[i].spectra_ids) == 1 + assert actual[0].spectra_ids == set( ('13170', '13662', '15316', '15364', '16341', '17201', '17270', '18120', '18172', '18748', '18831', '19005', '19673', '19719', From 1ba71062464b9383f70275c877b3812711b6bdc9 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 21 Jun 2023 12:02:55 +0200 Subject: [PATCH 94/95] add TODO comment for GNPSLoader --- src/nplinker/loader.py | 1 + src/nplinker/metabolomics/spectrum.py | 1 + 2 files changed, 2 
insertions(+) diff --git a/src/nplinker/loader.py b/src/nplinker/loader.py index 0c1e0172..b25952c4 100644 --- a/src/nplinker/loader.py +++ b/src/nplinker/loader.py @@ -674,6 +674,7 @@ def _load_genomics(self): return True + # TODO CG: replace deprecated load_dataset with GPNSLoader def _load_metabolomics(self): spec_dict, self.spectra, self.molfams, unknown_strains = load_dataset( self.strains, diff --git a/src/nplinker/metabolomics/spectrum.py b/src/nplinker/metabolomics/spectrum.py index 0002843d..12663871 100644 --- a/src/nplinker/metabolomics/spectrum.py +++ b/src/nplinker/metabolomics/spectrum.py @@ -47,6 +47,7 @@ def __init__(self, # this is a dict indexed by Strain objects (the strains found in this Spectrum), with # the values being dicts of the form {growth_medium: peak intensity} for the parent strain self.growth_media = {} + # TODO CG: self.family_id should be removed, used in deprecated make_families method self.family_id = '-1' self.family = None # a dict indexed by filename, or "gnps" From 0f1352a0881297ec04211e2cbd309c37ff101248 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Mon, 3 Jul 2023 09:21:59 +0200 Subject: [PATCH 95/95] update type hints for `*args` parameter --- src/nplinker/scoring/metcalf_scoring.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/nplinker/scoring/metcalf_scoring.py b/src/nplinker/scoring/metcalf_scoring.py index c291a511..53e6f062 100644 --- a/src/nplinker/scoring/metcalf_scoring.py +++ b/src/nplinker/scoring/metcalf_scoring.py @@ -122,8 +122,7 @@ def setup(npl: NPLinker): def datalinks(self) -> DataLinks: return MetcalfScoring.DATALINKS - def get_links(self, *objects: tuple[GCF, ...] | tuple[Spectrum, ...] - | tuple[MolecularFamily, ...], + def get_links(self, *objects: GCF | Spectrum | MolecularFamily, link_collection: LinkCollection) -> LinkCollection: """Get links for the given objects and add them to the given LinkCollection. 
@@ -131,7 +130,7 @@ def get_links(self, *objects: tuple[GCF, ...] | tuple[Spectrum, ...] be GCF, Spectrum or MolecularFamily objects. Args: - objects(tuple): The objects to get links for. Must be GCF, Spectrum + objects: The objects to get links for. Must be GCF, Spectrum or MolecularFamily objects. link_collection: The LinkCollection object to add the links to.