From 742d4b583dce7183a2f7ab2f7eb0db1ab863386b Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 5 Apr 2023 18:07:21 +0200 Subject: [PATCH 01/95] change BGC attributes type from list to tuple The following BGC attributes are updated: - product_prediction - mibig_bgc_class - smiles --- .../genomics/antismash/antismash_loader.py | 8 ++----- src/nplinker/genomics/bgc.py | 21 ++++++++++--------- src/nplinker/genomics/mibig/mibig_loader.py | 2 +- src/nplinker/genomics/mibig/mibig_metadata.py | 10 ++++----- .../antismash/test_antismash_loader.py | 4 ++-- tests/genomics/test_bgc.py | 6 +++--- tests/genomics/test_gcf.py | 8 +++---- tests/genomics/test_genomics.py | 8 +++---- tests/genomics/test_mibig_loader.py | 2 +- tests/genomics/test_mibig_metadata.py | 2 +- 10 files changed, 34 insertions(+), 37 deletions(-) diff --git a/src/nplinker/genomics/antismash/antismash_loader.py b/src/nplinker/genomics/antismash/antismash_loader.py index 1f9ade80..e4bdcf6b 100644 --- a/src/nplinker/genomics/antismash/antismash_loader.py +++ b/src/nplinker/genomics/antismash/antismash_loader.py @@ -117,10 +117,6 @@ def _parse_bgcs(bgc_files: dict[str, str]) -> dict[str, BGC]: def parse_bgc_genbank(file: str) -> BGC: """Parse a single BGC gbk file to BGC object. - Note: - If product info is not available in gbk file, the product of BGC - object (bgc.product_prediction) is set to empty list. 
- Args: file(str): Path to BGC gbk file @@ -143,7 +139,7 @@ def parse_bgc_genbank(file: str) -> BGC: f"Not found product prediction in antiSMASH Genbank file {file}") # init BGC - bgc = BGC(bgc_id=fname, product_prediction=product_prediction) + bgc = BGC(fname, *product_prediction) bgc.description = description bgc.antismash_id = antismash_id bgc.antismash_file = file @@ -166,7 +162,7 @@ def _parse_antismash_genbank(record: SeqRecord.SeqRecord) -> dict: # space is not allowed in SMILES spec # biopython generates space when reading multi-line SMILES from .gbk if smiles is not None: - smiles = [i.replace(' ', '') for i in smiles] + smiles = tuple(i.replace(' ', '') for i in smiles) features["smiles"] = smiles return features diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py index b1b1e2a9..16caee08 100644 --- a/src/nplinker/genomics/bgc.py +++ b/src/nplinker/genomics/bgc.py @@ -16,7 +16,7 @@ class BGC(): def __init__(self, bgc_id: str, - product_prediction: list[str]): + *product_prediction: str): """Class to model BGC (biosynthetic gene cluster) data. BGC data include both annotations and sequence data. This class is @@ -32,18 +32,18 @@ def __init__(self, Args: bgc_id(str): BGC identifier, e.g. MIBiG accession, GenBank accession. - product_prediction(list[str]): list of BGC's (predicted) natural - products or product classes. + product_prediction(tuple[str]): BGC's (predicted) natural products + or product classes. Attributes: bgc_id(str): BGC identifier, e.g. MIBiG accession, GenBank accession. - product_prediction(list[str]): List of BGC's (predicted) natural - products or product classes. + product_prediction(tuple[str]): A tuple of (predicted) natural + products or product classes of the BGC. For antiSMASH's GenBank data, the feature `region /product` gives product information. For MIBiG metadata, its biosynthetic class provides such info. - mibig_bgc_class(list[str] | None): List of MIBiG biosynthetic classes to - which the BGC belongs. 
+ mibig_bgc_class(tuple[str] | None): A tuple of MIBiG biosynthetic + classes to which the BGC belongs. Defaults to None. MIBiG defines 6 major biosynthetic classes for natural products, including "NRP", "Polyketide", "RiPP", "Terpene", "Saccharide" @@ -52,7 +52,8 @@ def __init__(self, More details see the publication: https://doi.org/10.1186/s40793-018-0318-y. description(str | None): Brief description of the BGC. Defaults to None. - smiles(list[str] | None): SMILES formula of the BGC's product. + smiles(tuple[str] | None): A tuple of SMILES formulas of the BGC's + products. Defaults to None. antismash_file(str | None): The path to the antiSMASH GenBank file. Defaults to None. @@ -72,9 +73,9 @@ def __init__(self, self.bgc_id = bgc_id self.product_prediction = product_prediction - self.mibig_bgc_class: list[str] | None = None + self.mibig_bgc_class: tuple[str] | None = None self.description: str | None = None - self.smiles: list[str] | None = None + self.smiles: tuple[str] | None = None # antismash related attributes self.antismash_file: str | None = None diff --git a/src/nplinker/genomics/mibig/mibig_loader.py b/src/nplinker/genomics/mibig/mibig_loader.py index a4f665c1..0be9f889 100644 --- a/src/nplinker/genomics/mibig/mibig_loader.py +++ b/src/nplinker/genomics/mibig/mibig_loader.py @@ -122,7 +122,7 @@ def parse_bgc_metadata_json(file: str) -> BGC: BGC: :class:`nplinker.genomics.BGC` object """ metadata = MibigMetadata(file) - mibig_bgc = BGC(metadata.mibig_accession, metadata.biosyn_class) + mibig_bgc = BGC(metadata.mibig_accession, *metadata.biosyn_class) mibig_bgc.mibig_bgc_class = metadata.biosyn_class mibig_bgc.strain = Strain(metadata.mibig_accession) return mibig_bgc diff --git a/src/nplinker/genomics/mibig/mibig_metadata.py b/src/nplinker/genomics/mibig/mibig_metadata.py index c3921f37..94c85315 100644 --- a/src/nplinker/genomics/mibig/mibig_metadata.py +++ b/src/nplinker/genomics/mibig/mibig_metadata.py @@ -21,7 +21,7 @@ def __init__(self, file) -> 
None: self.metadata = json.load(f) self._mibig_accession: str - self._biosyn_class: list[str] + self._biosyn_class: tuple[str] self._parse_metadata() @property @@ -30,7 +30,7 @@ def mibig_accession(self) -> str: return self._mibig_accession @property - def biosyn_class(self) -> list[str]: + def biosyn_class(self) -> tuple[str]: """Get the value of metadata item 'biosyn_class'. The 'biosyn_class' is biosynthetic class(es), namely the type of @@ -50,8 +50,8 @@ def _parse_metadata(self) -> None: if 'general_params' in self.metadata: self._mibig_accession = self.metadata['general_params'][ 'mibig_accession'] - self._biosyn_class = self.metadata['general_params'][ - 'biosyn_class'] + self._biosyn_class = tuple(self.metadata['general_params'][ + 'biosyn_class']) else: # versionā‰„2.0 self._mibig_accession = self.metadata['cluster']['mibig_accession'] - self._biosyn_class = self.metadata['cluster']['biosyn_class'] + self._biosyn_class = tuple(self.metadata['cluster']['biosyn_class']) diff --git a/tests/genomics/antismash/test_antismash_loader.py b/tests/genomics/antismash/test_antismash_loader.py index f3f30820..027e2c7a 100644 --- a/tests/genomics/antismash/test_antismash_loader.py +++ b/tests/genomics/antismash/test_antismash_loader.py @@ -67,12 +67,12 @@ def test_parse_bgc_genbank(): bgc = parse_bgc_genbank(gbk_file) assert isinstance(bgc, BGC) assert bgc.bgc_id == "NZ_AZWB01000005.region001" - assert bgc.product_prediction == ["NRPS", "lanthipeptide"] + assert bgc.product_prediction == ("NRPS", "lanthipeptide") assert "Salinispora pacifica CNT029 B170DRAFT_scaffold" in bgc.description assert bgc.antismash_id == "NZ_AZWB01000005" assert bgc.antismash_file == gbk_file assert bgc.antismash_region == "1" - assert bgc.smiles == ["NC([*])C(=O)NC([*])C(=O)NC(CO)C(=O)NC(Cc1ccccc1)C(=O)NCC(=O)O"] + assert bgc.smiles == ("NC([*])C(=O)NC([*])C(=O)NC(CO)C(=O)NC(Cc1ccccc1)C(=O)NCC(=O)O",) def test_parse_bgc_genbank_error(): gbk_file = str(DATA_DIR / "fake_antismash.region001.gbk") 
diff --git a/tests/genomics/test_bgc.py b/tests/genomics/test_bgc.py index 7434ff8d..83d7da63 100644 --- a/tests/genomics/test_bgc.py +++ b/tests/genomics/test_bgc.py @@ -5,9 +5,9 @@ def test_default(): - bgc = BGC("BGC0000001", ["Polyketide"]) + bgc = BGC("BGC0000001", "Polyketide") assert bgc.bgc_id == "BGC0000001" - assert bgc.product_prediction == ["Polyketide"] + assert bgc.product_prediction == ("Polyketide",) assert bgc.is_mibig() is True assert bgc.parents == set() assert bgc.bigscape_classes == set() @@ -19,7 +19,7 @@ def test_default(): def test_add_and_detach_parent(): - bgc = BGC("BGC0000001", ["Polyketide"]) + bgc = BGC("BGC0000001", "Polyketide") gcf = GCF("1") bgc.add_parent(gcf) assert bgc.parents == {gcf} diff --git a/tests/genomics/test_gcf.py b/tests/genomics/test_gcf.py index 0ccb20ca..a40f4728 100644 --- a/tests/genomics/test_gcf.py +++ b/tests/genomics/test_gcf.py @@ -7,14 +7,14 @@ @pytest.fixture() def bgc_with_strain(): - bgc = BGC("S0001", ["NPR"]) + bgc = BGC("S0001", "NPR") bgc.strain = Strain("strain001") yield bgc @pytest.fixture() def bgc_without_strain(): - bgc = BGC("S002", ["NPR"]) + bgc = BGC("S002", "NPR") yield bgc @@ -59,8 +59,8 @@ def test_has_strain(bgc_with_strain): assert gcf.has_strain("strain002") is False def test_has_mibig_only(): - mibig_bgc = BGC("BGC0000001", ["NPR"]) - nonmibig_bgc = BGC("S0001", ["NPR"]) + mibig_bgc = BGC("BGC0000001", "NPR") + nonmibig_bgc = BGC("S0001", "NPR") gcf = GCF("1") gcf.add_bgc(mibig_bgc) assert gcf.has_mibig_only() is True diff --git a/tests/genomics/test_genomics.py b/tests/genomics/test_genomics.py index a48c9d6c..dfde9e6c 100644 --- a/tests/genomics/test_genomics.py +++ b/tests/genomics/test_genomics.py @@ -38,10 +38,10 @@ def bgc_genome_mapping() -> dict[str, str]: @pytest.fixture def bgc_list() -> list[BGC]: return [ - BGC("SAMPLE0001", ["NPR"]), - BGC("SAMPLE0002", ["Alkaloid"]), - BGC("BGC0000001", ["Polyketide"]), - BGC("BGC0000002", ["Terpene"]) + BGC("SAMPLE0001", "NPR"), + 
BGC("SAMPLE0002", "Alkaloid"), + BGC("BGC0000001", "Polyketide"), + BGC("BGC0000002", "Terpene") ] diff --git a/tests/genomics/test_mibig_loader.py b/tests/genomics/test_mibig_loader.py index bdeea051..c122c56e 100644 --- a/tests/genomics/test_mibig_loader.py +++ b/tests/genomics/test_mibig_loader.py @@ -79,4 +79,4 @@ def test_parse_bgc_metadata_json(): bgc = parse_bgc_metadata_json(str(json_file)) assert isinstance(bgc, BGC) assert bgc.bgc_id == "BGC0000001" - assert bgc.mibig_bgc_class == ["Polyketide"] + assert bgc.mibig_bgc_class == ("Polyketide",) diff --git a/tests/genomics/test_mibig_metadata.py b/tests/genomics/test_mibig_metadata.py index 0e2096e1..08607d0e 100644 --- a/tests/genomics/test_mibig_metadata.py +++ b/tests/genomics/test_mibig_metadata.py @@ -23,4 +23,4 @@ def test_mibig_accession(self, metadata): assert metadata.mibig_accession == "BGC0000001" def test_biosyn_class(self, metadata): - assert metadata.biosyn_class == ["Polyketide"] + assert metadata.biosyn_class == ("Polyketide",) From f1cac59f4a071d042fbb372625c1bda2252173ae Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 5 Apr 2023 18:13:37 +0200 Subject: [PATCH 02/95] use positional-only parameter in BGC and GCF Parameters before "/" are positional-only parameters, see https://docs.python.org/3/glossary.html#term-parameter. 
--- src/nplinker/genomics/bgc.py | 5 +---- src/nplinker/genomics/gcf.py | 2 +- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py index 16caee08..1ec6be9f 100644 --- a/src/nplinker/genomics/bgc.py +++ b/src/nplinker/genomics/bgc.py @@ -4,7 +4,6 @@ from nplinker.logconfig import LogConfig from .aa_pred import predict_aa - if TYPE_CHECKING: from ..strains import Strain from .gcf import GCF @@ -14,9 +13,7 @@ class BGC(): - def __init__(self, - bgc_id: str, - *product_prediction: str): + def __init__(self, bgc_id: str, /, *product_prediction: str): """Class to model BGC (biosynthetic gene cluster) data. BGC data include both annotations and sequence data. This class is diff --git a/src/nplinker/genomics/gcf.py b/src/nplinker/genomics/gcf.py index e0c68c13..2a9e2b02 100644 --- a/src/nplinker/genomics/gcf.py +++ b/src/nplinker/genomics/gcf.py @@ -13,7 +13,7 @@ class GCF(): - def __init__(self, gcf_id: str) -> None: + def __init__(self, gcf_id: str, /) -> None: """Class to model gene cluster family (GCF). 
GCF is a group of similar BGCs and generated by clustering BGCs with From 18e77a6b08e7ac39955c779ed6314b40622e8154 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 5 Apr 2023 15:33:10 +0200 Subject: [PATCH 03/95] update BGC's `__eq__` and `__hash__` --- src/nplinker/genomics/bgc.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py index 1ec6be9f..7eb7921a 100644 --- a/src/nplinker/genomics/bgc.py +++ b/src/nplinker/genomics/bgc.py @@ -91,11 +91,14 @@ def __str__(self): self.__class__.__name__, self.bgc_id, self.strain, self.antismash_id, self.antismash_region) - def __eq__(self, other): - return self.bgc_id == other.bgc_id - - def __hash__(self): - return hash(self.bgc_id) + def __eq__(self, other) -> bool: + if isinstance(other, BGC): + return (self.bgc_id == other.bgc_id + and self.product_prediction == other.product_prediction) + return NotImplemented + + def __hash__(self) -> int: + return hash((self.bgc_id, self.product_prediction)) def add_parent(self, gcf: GCF) -> None: """Add a parent GCF to the BGC. 
From da2cf41b6528fd281288b47e30c413d48fe98f03 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 5 Apr 2023 16:10:20 +0200 Subject: [PATCH 04/95] update GCF's `__eq__` and `__hash__` --- src/nplinker/genomics/gcf.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/nplinker/genomics/gcf.py b/src/nplinker/genomics/gcf.py index 2a9e2b02..c3d9cc9c 100644 --- a/src/nplinker/genomics/gcf.py +++ b/src/nplinker/genomics/gcf.py @@ -3,7 +3,6 @@ from nplinker.logconfig import LogConfig from nplinker.strain_collection import StrainCollection - if TYPE_CHECKING: from nplinker.strains import Strain from .bgc import BGC @@ -46,10 +45,17 @@ def __str__(self): def __repr__(self): return str(self) - def __eq__(self, other): - return self.gcf_id == other.gcf_id + def __eq__(self, other) -> bool: + if isinstance(other, GCF): + return (self.gcf_id == other.gcf_id and self.bgcs == other.bgcs) + return NotImplemented + + def __hash__(self) -> int: + """Hash the GCF object. - def __hash__(self): + Note that GCF class is a mutable container. We only hash the GCF id to + avoid the hash value changes when `self._bgcs` is updated. + """ return hash(self.gcf_id) @property From 2111e5b83329650841644c3e3f7d3a9a4b6d1898 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 11 Apr 2023 10:21:24 +0200 Subject: [PATCH 05/95] Update gcf.py --- src/nplinker/genomics/gcf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nplinker/genomics/gcf.py b/src/nplinker/genomics/gcf.py index c3d9cc9c..5ea61486 100644 --- a/src/nplinker/genomics/gcf.py +++ b/src/nplinker/genomics/gcf.py @@ -51,7 +51,7 @@ def __eq__(self, other) -> bool: return NotImplemented def __hash__(self) -> int: - """Hash the GCF object. + """Hash function for GCF. Note that GCF class is a mutable container. We only hash the GCF id to avoid the hash value changes when `self._bgcs` is updated. 
From 07364767216fc70e06a50e4fec034d6463c168e9 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 11 Apr 2023 10:02:45 +0200 Subject: [PATCH 06/95] update Spectrum's `__eq__` and `__hash__` --- src/nplinker/metabolomics/spectrum.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/nplinker/metabolomics/spectrum.py b/src/nplinker/metabolomics/spectrum.py index c65824ce..78a8e8ce 100644 --- a/src/nplinker/metabolomics/spectrum.py +++ b/src/nplinker/metabolomics/spectrum.py @@ -121,11 +121,16 @@ def __str__(self): def __repr__(self): return str(self) - def __eq__(self, other): - return self.id == other.id - - def __hash__(self): - return hash(self.id) + def __eq__(self, other) -> bool: + if isinstance(other, Spectrum): + return (self.id == other.id + and self.spectrum_id == other.spectrum_id + and self.precursor_mz == other.precursor_mz + and self.parent_mz == other.parent_mz) + return NotImplemented + + def __hash__(self) -> int: + return hash((self.id, self.spectrum_id, self.precursor_mz, self.parent_mz)) def __cmp__(self, other): if self.parent_mz >= other.parent_mz: From fda12085f0da0ceaaa37dab9d822d2e5184072bc Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 11 Apr 2023 10:09:14 +0200 Subject: [PATCH 07/95] update MolecularFamily `__eq__` and `__hash__` --- src/nplinker/metabolomics/molecular_family.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/nplinker/metabolomics/molecular_family.py b/src/nplinker/metabolomics/molecular_family.py index 1eec71fc..d56f5b82 100644 --- a/src/nplinker/metabolomics/molecular_family.py +++ b/src/nplinker/metabolomics/molecular_family.py @@ -51,7 +51,16 @@ def __str__(self) -> str: self.family_id, len(self.spectra)) def __eq__(self, other: Self) -> bool: - return bool(self.id == other.id) + if isinstance(other, MolecularFamily): + return (self.id == other.id + and self.family_id == other.family_id + and set(self.spectra) == set(other.spectra)) + 
return NotImplemented def __hash__(self) -> int: - return hash(self.id) + """Hash function for MolecularFamily. + + Note that MolecularFamily is a mutable container, so here we hash on + the id and family_id only. + """ + return hash((self.id, self.family_id)) From 0a7cceeb965526a18ec1889bf46fd3b6d03b459f Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 11 Apr 2023 10:21:30 +0200 Subject: [PATCH 08/95] Update molecular_family.py --- src/nplinker/metabolomics/molecular_family.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/nplinker/metabolomics/molecular_family.py b/src/nplinker/metabolomics/molecular_family.py index d56f5b82..a55a4df9 100644 --- a/src/nplinker/metabolomics/molecular_family.py +++ b/src/nplinker/metabolomics/molecular_family.py @@ -61,6 +61,7 @@ def __hash__(self) -> int: """Hash function for MolecularFamily. Note that MolecularFamily is a mutable container, so here we hash on - the id and family_id only. + the id and family_id only to avoid the hash value changing when + `self.spectra` is updated. """ return hash((self.id, self.family_id)) From 6a373e1929b2e2ba3af3ba6b58eb68697bdbd806 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 11 Apr 2023 10:18:29 +0200 Subject: [PATCH 09/95] update Strain `__eq__` and `__hash__` --- src/nplinker/strains.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/nplinker/strains.py b/src/nplinker/strains.py index de41a8b0..661c7268 100644 --- a/src/nplinker/strains.py +++ b/src/nplinker/strains.py @@ -26,10 +26,17 @@ def __str__(self) -> str: return f'Strain({self.id}) [{len(self._aliases)} aliases]' def __eq__(self, other) -> bool: - return (isinstance(other, Strain) and self.id == other.id - and self._aliases == other._aliases) + if isinstance(other, Strain): + return (self.id == other.id + and self.aliases == other.aliases) + return NotImplemented def __hash__(self) -> int: + """Hash function for Strain. 
+ + Note that Strain is a mutable container, so here we hash on only the id + to avoid the hash value changes when `self._aliases` is updated. + """ return hash(self.id) @property From 33ed3c60156fa394c8f2ae077568df469dc3667f Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 11 Apr 2023 10:18:46 +0200 Subject: [PATCH 10/95] update StrainCollection `__eq__` --- src/nplinker/strain_collection.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/nplinker/strain_collection.py b/src/nplinker/strain_collection.py index 5169f02a..00603e96 100644 --- a/src/nplinker/strain_collection.py +++ b/src/nplinker/strain_collection.py @@ -35,9 +35,11 @@ def __len__(self) -> int: return len(self._strains) def __eq__(self, other) -> bool: - return (self._strains == other._strains - and self._strain_dict_id == other._strain_dict_id - and self._strain_dict_index == other._strain_dict_index) + if isinstance(other, StrainCollection): + return (self._strains == other._strains + and self._strain_dict_id == other._strain_dict_id + and self._strain_dict_index == other._strain_dict_index) + return NotImplemented def __contains__(self, strain: str | Strain) -> bool: if isinstance(strain, str): From f6bae56d3389a9071bb357a13570979e593dada7 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 11 Apr 2023 10:27:25 +0200 Subject: [PATCH 11/95] add TODO comments to ObjectLink --- src/nplinker/scoring/object_link.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/nplinker/scoring/object_link.py b/src/nplinker/scoring/object_link.py index eb380db5..a88e543d 100644 --- a/src/nplinker/scoring/object_link.py +++ b/src/nplinker/scoring/object_link.py @@ -48,6 +48,8 @@ def __getitem__(self, name): def __hash__(self): # return the nplinker internal ID as hash value (for set/dict etc) + # TODO: hashable object should also have `__eq__` defined, see #136. 
+ # this implementation is not ideal as the hash value is not unique return hash(self.source.id) def __str__(self): From f81accf6ec888f039160dce338bdaa905898ba19 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 11 Apr 2023 11:07:52 +0200 Subject: [PATCH 12/95] add parameter type check for `add_alias` --- src/nplinker/strains.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/nplinker/strains.py b/src/nplinker/strains.py index 661c7268..65887ced 100644 --- a/src/nplinker/strains.py +++ b/src/nplinker/strains.py @@ -54,6 +54,8 @@ def add_alias(self, alias: str) -> None: Args: alias(str): The alias to add to the list of known aliases. """ + if not isinstance(alias, str): + raise TypeError(f'Expected str, got {type(alias)}') if len(alias) == 0: logger.warning( 'Refusing to add an empty-string alias to strain {%s}', self) From 53dad8206060b2f7e56a2a1e41758d15b4227217 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 11 Apr 2023 11:50:58 +0200 Subject: [PATCH 13/95] add `__contains__` to Strain class --- src/nplinker/strains.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/nplinker/strains.py b/src/nplinker/strains.py index 65887ced..76caf37c 100644 --- a/src/nplinker/strains.py +++ b/src/nplinker/strains.py @@ -1,5 +1,6 @@ from __future__ import annotations from .logconfig import LogConfig +from typing import Iterator logger = LogConfig.getLogger(__name__) @@ -39,6 +40,11 @@ def __hash__(self) -> int: """ return hash(self.id) + def __contains__(self, alias: str) -> bool: + if not isinstance(alias, str): + raise TypeError(f'Expected str, got {type(alias)}') + return alias in self._aliases + @property def aliases(self) -> set[str]: """Get the set of known aliases. 
From 45d876537e8134ffcc4a9aca4d8dd1aec8dc8dff Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 11 Apr 2023 11:52:02 +0200 Subject: [PATCH 14/95] update `lookup` method of StrainCollection --- src/nplinker/strain_collection.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/nplinker/strain_collection.py b/src/nplinker/strain_collection.py index 00603e96..741090f2 100644 --- a/src/nplinker/strain_collection.py +++ b/src/nplinker/strain_collection.py @@ -9,7 +9,6 @@ from .utils import list_dirs from .utils import list_files - logger = LogConfig.getLogger(__name__) @@ -18,6 +17,7 @@ class StrainCollection(): def __init__(self): """A collection of Strain objects.""" self._strains: list[Strain] = [] + # dict of strain name (id and alias) to strain object self._strain_dict_id: dict[str, Strain] = {} self._strain_dict_index: dict[int, Strain] = {} @@ -122,9 +122,10 @@ def lookup(self, name: str) -> Strain: Raises: KeyError: If the strain name is not found. """ - if name not in self._strain_dict_id: - raise KeyError(f"Strain {name} not found in strain collection.") - return self._strain_dict_id[name] + for strain in self: + if name == strain.id or name in strain: + return strain + raise KeyError(f"Strain {name} not found in strain collection.") def add_from_file(self, file: str | PathLike) -> None: """Add strains from a strain mapping file. 
From 7cb75e0ad208ffa4affddfb764c69ae13f1c1025 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 11 Apr 2023 14:15:39 +0200 Subject: [PATCH 15/95] update `__contains__` in StrainCollection --- src/nplinker/strain_collection.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/src/nplinker/strain_collection.py b/src/nplinker/strain_collection.py index 741090f2..deaa47ed 100644 --- a/src/nplinker/strain_collection.py +++ b/src/nplinker/strain_collection.py @@ -9,6 +9,7 @@ from .utils import list_dirs from .utils import list_files + logger = LogConfig.getLogger(__name__) @@ -41,14 +42,20 @@ def __eq__(self, other) -> bool: and self._strain_dict_index == other._strain_dict_index) return NotImplemented - def __contains__(self, strain: str | Strain) -> bool: - if isinstance(strain, str): - value = strain in self._strain_dict_id - elif isinstance(strain, Strain): - value = strain.id in self._strain_dict_id + def __contains__(self, item: str | Strain) -> bool: + """Check if the strain collection contains the given strain. + + The given strain could be a Strain object, or a strain id or alias. 
+ """ + if isinstance(item, str): + for strain in self: + if item == strain.id or item in strain: + return True + elif isinstance(item, Strain): + return item in self._strains else: - raise TypeError(f"Expected Strain or str, got {type(strain)}") - return value + raise TypeError(f"Expected Strain or str, got {type(item)}") + return False def __iter__(self) -> Iterator[Strain]: return iter(self._strains) From f797ee50cb532be1d2ce9288e221d05c95967387 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 11 Apr 2023 14:19:40 +0200 Subject: [PATCH 16/95] remove from __eq__ --- src/nplinker/strain_collection.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/nplinker/strain_collection.py b/src/nplinker/strain_collection.py index deaa47ed..6ccb9830 100644 --- a/src/nplinker/strain_collection.py +++ b/src/nplinker/strain_collection.py @@ -38,7 +38,6 @@ def __len__(self) -> int: def __eq__(self, other) -> bool: if isinstance(other, StrainCollection): return (self._strains == other._strains - and self._strain_dict_id == other._strain_dict_id and self._strain_dict_index == other._strain_dict_index) return NotImplemented From 8a3fa1bafa59e24d08343255f6ac124543d0e445 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Thu, 13 Apr 2023 09:04:17 +0200 Subject: [PATCH 17/95] update `__eq__` logic for Strain --- src/nplinker/strains.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/nplinker/strains.py b/src/nplinker/strains.py index 76caf37c..54338027 100644 --- a/src/nplinker/strains.py +++ b/src/nplinker/strains.py @@ -28,8 +28,7 @@ def __str__(self) -> str: def __eq__(self, other) -> bool: if isinstance(other, Strain): - return (self.id == other.id - and self.aliases == other.aliases) + return self.id == other.id return NotImplemented def __hash__(self) -> int: From 88754005f724579c845fc596e197a3d24b5a336a Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Thu, 13 Apr 2023 09:11:31 +0200 Subject: [PATCH 18/95] rename `_strain_dict_id` to 
`_strain_dict_name` in StrainCollection --- src/nplinker/strain_collection.py | 16 ++++++++-------- tests/test_strain_collection.py | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/nplinker/strain_collection.py b/src/nplinker/strain_collection.py index 6ccb9830..35e63daf 100644 --- a/src/nplinker/strain_collection.py +++ b/src/nplinker/strain_collection.py @@ -19,7 +19,7 @@ def __init__(self): """A collection of Strain objects.""" self._strains: list[Strain] = [] # dict of strain name (id and alias) to strain object - self._strain_dict_id: dict[str, Strain] = {} + self._strain_dict_name: dict[str, Strain] = {} self._strain_dict_index: dict[int, Strain] = {} def __repr__(self) -> str: @@ -68,17 +68,17 @@ def add(self, strain: Strain) -> None: strain(Strain): The strain to add. """ # if the strain exists, merge the aliases - if strain.id in self._strain_dict_id: + if strain.id in self._strain_dict_name: existing: Strain = self.lookup(strain.id) for alias in strain.aliases: existing.add_alias(alias) - self._strain_dict_id[alias] = existing + self._strain_dict_name[alias] = existing else: self._strain_dict_index[len(self)] = strain self._strains.append(strain) - self._strain_dict_id[strain.id] = strain + self._strain_dict_name[strain.id] = strain for alias in strain.aliases: - self._strain_dict_id[alias] = strain + self._strain_dict_name[alias] = strain def remove(self, strain: Strain): """Remove a strain from the collection. @@ -86,12 +86,12 @@ def remove(self, strain: Strain): Args: strain(Strain): The strain to remove. 
""" - if strain.id in self._strain_dict_id: + if strain.id in self._strain_dict_name: self._strains.remove(strain) # remove from dict id - del self._strain_dict_id[strain.id] + del self._strain_dict_name[strain.id] for alias in strain.aliases: - del self._strain_dict_id[alias] + del self._strain_dict_name[alias] def filter(self, strain_set: set[Strain]): """ diff --git a/tests/test_strain_collection.py b/tests/test_strain_collection.py index 26ab2767..bd900a0f 100644 --- a/tests/test_strain_collection.py +++ b/tests/test_strain_collection.py @@ -54,7 +54,7 @@ def test_remove(collection: StrainCollection, strain: Strain): assert strain in collection collection.remove(strain) with pytest.raises(KeyError): - _ = collection._strain_dict_id[strain.id] + _ = collection._strain_dict_name[strain.id] assert strain not in collection # TODO: issue #90 # with pytest.raises(KeyError): From 2f7753978439a0b148f73a53ca60ec7e89b6d625 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Thu, 13 Apr 2023 11:48:51 +0200 Subject: [PATCH 19/95] add comments to `get_common_strains` --- src/nplinker/nplinker.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/nplinker/nplinker.py b/src/nplinker/nplinker.py index 1019abe5..006e57e2 100644 --- a/src/nplinker/nplinker.py +++ b/src/nplinker/nplinker.py @@ -493,13 +493,17 @@ def get_common_strains(self, objects_a, objects_b, filter_no_shared=True): # this is a dict with structure: # (Spectrum/MolecularFamily, GCF) => list of strain indices - common_strains = self._datalinks.common_strains( + common_strains_index_dict = self._datalinks.common_strains( objects_a, objects_b, filter_no_shared) + common_strains = {} # replace the lists of strain indices with actual strain objects - for objpair in common_strains.keys(): - common_strains[objpair] = [ - self._strains.lookup_index(x) for x in common_strains[objpair] + # TODO: bug here, the index value of common_strains_index_dict is + # not the same as the index value of 
self._strains + # Solution: lookup with strain.id instead of index + for key in common_strains_index_dict: + common_strains[key] = [ + self._strains.lookup_index(x) for x in common_strains_index_dict[key] ] return common_strains From dae421d7af40ac56313b6b6e3bfe4c1755a4c490 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Thu, 13 Apr 2023 11:55:23 +0200 Subject: [PATCH 20/95] add comments and rename variables for DataLinks --- src/nplinker/scoring/linking/data_linking.py | 48 ++++++++++++-------- 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/src/nplinker/scoring/linking/data_linking.py b/src/nplinker/scoring/linking/data_linking.py index a6f181f0..bec8b6d2 100644 --- a/src/nplinker/scoring/linking/data_linking.py +++ b/src/nplinker/scoring/linking/data_linking.py @@ -28,9 +28,8 @@ # import packages import numpy as np import pandas as pd - -from nplinker.metabolomics.molecular_family import MolecularFamily from nplinker.genomics.gcf import GCF +from nplinker.metabolomics.molecular_family import MolecularFamily from nplinker.metabolomics.spectrum import Spectrum from .data_linking_functions import calc_correlation_matrix @@ -55,6 +54,8 @@ def __init__(self): # matrices that store co-occurences with respect to strains # values = 1 where gcf/spec/fam occur in strain # values = 0 where gcf/spec/fam do not occur in strain + + # 2D array [gcf: int, strain: int] self.M_gcf_strain = [] self.M_spec_strain = [] self.M_fam_strain = [] @@ -209,10 +210,10 @@ def matrix_strain_gcf(self, gcf_list, strain_list): # Collect co-ocurences in M_spec_strain matrix M_gcf_strain = np.zeros((len(gcf_list), len(strain_list))) - for i, strain in enumerate(strain_list): - for m, gcf in enumerate(gcf_list): + for i, gcf in enumerate(gcf_list): + for j, strain in enumerate(strain_list): if gcf.has_strain(strain): - M_gcf_strain[m, i] = 1 + M_gcf_strain[i, j] = 1 self.M_gcf_strain = M_gcf_strain # extend mapping tables: @@ -304,7 +305,7 @@ def data_family_mapping(self, 
include_singletons=False): self.mapping_fam["no of members"] = num_members return self.family_members - def common_strains(self, objects_a, objects_b, filter_no_shared=False): + def common_strains(self, metabolome_objects, gcfs, filter_no_shared=False) -> dict: """ Obtain the set of common strains between all pairs from the lists objects_a and objects_b. @@ -320,11 +321,11 @@ def common_strains(self, objects_a, objects_b, filter_no_shared=False): # TODO make this work for BGCs too? - is_list_a = isinstance(objects_a, list) - is_list_b = isinstance(objects_b, list) + is_list_a = isinstance(metabolome_objects, list) + is_list_b = isinstance(gcfs, list) - type_a = type(objects_a[0]) if is_list_a else type(objects_a) - type_b = type(objects_b[0]) if is_list_b else type(objects_b) + type_a = type(metabolome_objects[0]) if is_list_a else type(metabolome_objects) + type_b = type(gcfs[0]) if is_list_b else type(gcfs) if type_a == type_b: raise Exception('Must supply objects with different types!') @@ -333,33 +334,40 @@ def common_strains(self, objects_a, objects_b, filter_no_shared=False): if type_a == GCF: type_a, type_b = type_b, type_a is_list_a, is_list_b = is_list_b, is_list_a - objects_a, objects_b = objects_b, objects_a + metabolome_objects, gcfs = gcfs, metabolome_objects if not is_list_a: - objects_a = [objects_a] + metabolome_objects = [metabolome_objects] if not is_list_b: - objects_b = [objects_b] + gcfs = [gcfs] # retrieve object IDs # TODO: issue #103 stop using gcf.id, but note that the ids_b should be # a list of int - ids_b = [gcf.id for gcf in objects_b] + gcfs_id_list = [gcf.id for gcf in gcfs] # these might be MolFams or Spectra, either way they'll have a .id attribute - ids_a = [obj.id for obj in objects_a] + ids_a = [obj.id for obj in metabolome_objects] data_a = self.M_spec_strain if type_a == Spectrum else self.M_fam_strain - data_b = self.M_gcf_strain results = {} - for a, obj_a in enumerate(objects_a): - for b, obj_b in enumerate(objects_b): + for 
i, obj_meta in enumerate(metabolome_objects): + for j, obj_gcf in enumerate(gcfs): # just AND both arrays and extract the indices with positive results + + # self.M_gcf_strain is a 2D np.array [index of gcf_list, index of strain_list] + # TODO: bug here. self.M_gcf_strain use the enumerate count of the gcf_list as index + # it's wrong to assume that enumerate count is same as the gcf.id. + # self.M_gcf_strain should use dataframe + # TODO: Bug: the result is 1D array of the enumerated count of strain_list + # for the common strains. Same here, the enumerate count is not + # the same as the strain.id result = np.where( - np.logical_and(data_a[ids_a[a]], data_b[ids_b[b]]))[0] + np.logical_and(data_a[ids_a[i]], self.M_gcf_strain[gcfs_id_list[j]]))[0] # if we want to exclude results with no shared strains if (filter_no_shared and len(result) > 0) or not filter_no_shared: - results[(obj_a, obj_b)] = result + results[(obj_meta, obj_gcf)] = result return results From 504862ea44ed7e81f5e3496915967ee002320077 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Fri, 14 Apr 2023 11:14:25 +0200 Subject: [PATCH 21/95] add comment about `met_only` parameter --- src/nplinker/nplinker.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/nplinker/nplinker.py b/src/nplinker/nplinker.py index 006e57e2..8c78f47f 100644 --- a/src/nplinker/nplinker.py +++ b/src/nplinker/nplinker.py @@ -260,6 +260,7 @@ def load_data(self, new_bigscape_cutoff=None, met_only=False): Returns: bool: True if successful, False otherwise """ + # TODO: the met_only is useless, remove it. 
NPlinker will stop working if met_only=True # typical case where load_data is being called with no params if new_bigscape_cutoff is None: logger.debug( From 8b160e66b7658000fc028ffd929f974f138caa42 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 18 Apr 2023 14:39:43 +0200 Subject: [PATCH 22/95] add todo comments to LinkFinder --- src/nplinker/scoring/linking/link_finder.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/nplinker/scoring/linking/link_finder.py b/src/nplinker/scoring/linking/link_finder.py index 2e58acd9..cf3563a5 100644 --- a/src/nplinker/scoring/linking/link_finder.py +++ b/src/nplinker/scoring/linking/link_finder.py @@ -453,6 +453,7 @@ def get_links(self, if main_score == 'likescore': likescores = [ + # TODO CG: use dataframe instead of numpy array self.likescores_spec_gcf[:, input_ids], self.likescores_fam_gcf[:, input_ids] ] From 8e9557c6c30ef78848373cce78ab1617a4c4a3cf Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 18 Apr 2023 15:25:59 +0200 Subject: [PATCH 23/95] add comments to GNPSSpectrumLoader to figure out how `spectrum_id` is set --- src/nplinker/metabolomics/gnps/gnps_spectrum_loader.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/nplinker/metabolomics/gnps/gnps_spectrum_loader.py b/src/nplinker/metabolomics/gnps/gnps_spectrum_loader.py index 59acd06f..91a83171 100644 --- a/src/nplinker/metabolomics/gnps/gnps_spectrum_loader.py +++ b/src/nplinker/metabolomics/gnps/gnps_spectrum_loader.py @@ -18,7 +18,7 @@ def __init__(self, file: str | PathLike): ms1, ms2, metadata = LoadMGF(name_field='scans').load_spectra([str(file)]) logger.info('%d molecules parsed from MGF file', len(ms1)) self._spectra = _mols_to_spectra(ms2, metadata) - + def spectra(self) -> list[Spectrum]: """Get the spectra loaded from the file. @@ -26,7 +26,7 @@ def spectra(self) -> list[Spectrum]: list[Spectrum]: the loaded spectra as a list of `Spectrum` objects. 
""" return self._spectra - + def _mols_to_spectra(ms2: list, metadata: dict[str, dict[str, str]]) -> list[Spectrum]: """Function to convert ms2 object and metadata to `Spectrum` objects. @@ -39,13 +39,15 @@ def _mols_to_spectra(ms2: list, metadata: dict[str, dict[str, str]]) -> list[Spe list[Spectrum]: List of mass spectra obtained from ms2 and metadata. """ ms2_dict = {} + # an example of m: + # (118.487999, 0.0, 18.753, , 'spectra.mgf', 0.0) for m in ms2: - if not m[3] in ms2_dict: + if not m[3] in ms2_dict: # m[3] is `nplinker.parsers.mgf.MS1` object ms2_dict[m[3]] = [] ms2_dict[m[3]].append((m[0], m[2])) spectra = [] - for i, m in enumerate(ms2_dict.keys()): + for i, m in enumerate(ms2_dict.keys()): # m is `nplinker.parsers.mgf.MS1` object new_spectrum = Spectrum(i, ms2_dict[m], int(m.name), metadata[m.name]['precursormass'], metadata[m.name]['parentmass']) From 9257e0b561fedd0710ee8b7d070d3fc9dea22d3f Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 18 Apr 2023 15:35:08 +0200 Subject: [PATCH 24/95] change Spectrum.spectrum_id from type int to str --- src/nplinker/annotations.py | 17 ++++++++--------- src/nplinker/class_info/chem_classes.py | 4 ++-- .../metabolomics/gnps/gnps_annotation_loader.py | 16 ++++++++-------- .../gnps/gnps_molecular_family_loader.py | 16 ++++++++-------- .../metabolomics/gnps/gnps_spectrum_loader.py | 2 +- src/nplinker/metabolomics/metabolomics.py | 14 +++++++------- src/nplinker/metabolomics/molecular_family.py | 4 ++-- src/nplinker/metabolomics/spectrum.py | 6 +++--- src/nplinker/scoring/np_class_scoring.py | 2 +- src/nplinker/scoring/rosetta/spec_lib.py | 2 +- tests/conftest.py | 2 +- 11 files changed, 42 insertions(+), 43 deletions(-) diff --git a/src/nplinker/annotations.py b/src/nplinker/annotations.py index 970576a7..6449b6d2 100644 --- a/src/nplinker/annotations.py +++ b/src/nplinker/annotations.py @@ -14,10 +14,9 @@ import csv import os - from deprecated import deprecated - -from nplinker.metabolomics.spectrum import 
Spectrum, GNPS_KEY +from nplinker.metabolomics.spectrum import GNPS_KEY +from nplinker.metabolomics.spectrum import Spectrum from .logconfig import LogConfig @@ -61,14 +60,14 @@ def create_gnps_annotation(spec: Spectrum, gnps_anno: dict): @deprecated(version="1.3.3", reason="Use GNPSAnnotationLoader class instead.") -def load_annotations(root: str | os.PathLike, config: str | os.PathLike, spectra: list[Spectrum], spec_dict: dict[int, Spectrum]) -> list[Spectrum]: +def load_annotations(root: str | os.PathLike, config: str | os.PathLike, spectra: list[Spectrum], spec_dict: dict[str, Spectrum]) -> list[Spectrum]: """Load the annotations from the GNPS annotation file present in root to the spectra. Args: root(str | os.PathLike): Path to the downloaded and extracted GNPS results. config(str | os.PathLike): Path to config file for custom file locations. spectra(list[Spectrum]): List of spectra to annotate. - spec_dict(dict[int, Spectrum]): Dictionary mapping to spectra passed in `spectra` variable. + spec_dict(dict[str, Spectrum]): Dictionary mapping to spectra passed in `spectra` variable. Raises: Exception: Raises exception if custom annotation config file has invalid content. @@ -76,7 +75,7 @@ def load_annotations(root: str | os.PathLike, config: str | os.PathLike, spectra Returns: list[Spectrum]: List of annotated spectra. 
""" - + if not os.path.exists(root): logger.debug(f'Annotation directory not found ({root})') return spectra @@ -89,7 +88,7 @@ def load_annotations(root: str | os.PathLike, config: str | os.PathLike, spectra logger.debug('Found {} annotations .tsv files in {}'.format( len(annotation_files), root)) - + for af in annotation_files: with open(af) as f: rdr = csv.reader(f, delimiter='\t') @@ -105,7 +104,7 @@ def load_annotations(root: str | os.PathLike, config: str | os.PathLike, spectra # each line should be a different spec ID here for line in rdr: # read the scan ID column and get the corresponding Spectrum object - scan_id = int(line[scans_index]) + scan_id = line[scans_index] if scan_id not in spec_dict: logger.warning( 'Unknown spectrum ID found in GNPS annotation file (ID={})' @@ -147,7 +146,7 @@ def load_annotations(root: str | os.PathLike, config: str | os.PathLike, spectra # note that might have multiple lines for the same spec ID! spec_annotations = {} for line in rdr: - scan_id = int(line[spec_id_index]) + scan_id = line[spec_id_index] if scan_id not in spec_dict: logger.warning( 'Unknown spectrum ID found in annotation file "{}", ID is "{}"' diff --git a/src/nplinker/class_info/chem_classes.py b/src/nplinker/class_info/chem_classes.py index 8aeb0190..00a9cdb4 100644 --- a/src/nplinker/class_info/chem_classes.py +++ b/src/nplinker/class_info/chem_classes.py @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +from collections import Counter import glob import os -from collections import Counter from canopus import Canopus from canopus.classifications_to_gnps import analyse_canopus from ..logconfig import LogConfig @@ -409,7 +409,7 @@ class prediction for a level. 
When no class is present, instead of Tuple it will classes_per_spectra = [] for spec in spectra: - spec_classes = self.spectra_classes.get(str(spec.spectrum_id)) + spec_classes = self.spectra_classes.get(spec.spectrum_id) if spec_classes: # account for spectra without prediction classes_per_spectra.append(spec_classes) diff --git a/src/nplinker/metabolomics/gnps/gnps_annotation_loader.py b/src/nplinker/metabolomics/gnps/gnps_annotation_loader.py index 13cb5028..aade4de2 100644 --- a/src/nplinker/metabolomics/gnps/gnps_annotation_loader.py +++ b/src/nplinker/metabolomics/gnps/gnps_annotation_loader.py @@ -2,9 +2,9 @@ from os import PathLike from pathlib import Path from typing import Any - from nplinker.metabolomics.abc import AnnotationLoaderBase + GNPS_URL_FORMAT = 'https://metabolomics-usi.ucsd.edu/{}/?usi=mzspec:GNPSLIBRARY:{}' class GNPSAnnotationLoader(AnnotationLoaderBase): @@ -15,28 +15,28 @@ def __init__(self, file: str | PathLike): file(str | PathLike): The GNPS annotation file. """ self._file = Path(file) - self._annotations : dict[int, dict] = dict() + self._annotations : dict[str, dict] = {} with open(self._file, mode='rt', encoding='UTF-8') as f: header = f.readline().split('\t') dict_reader = csv.DictReader(f, header, delimiter='\t') for row in dict_reader: - scan_id = int(row.pop('#Scan#')) + scan_id = row.pop('#Scan#') self._annotations[scan_id] = row - + # also insert useful URLs for t in ['png', 'json', 'svg', 'spectrum']: self._annotations[scan_id][f'{t}_url'] = GNPS_URL_FORMAT.format(t, row['SpectrumID']) - - def get_annotations(self) -> dict[int, dict]: + + def get_annotations(self) -> dict[str, dict]: """Get annotations. Returns: - dict[int, dict]: Spectra indices are keys and values are the annotations for this spectrum. + dict[str, dict]: Spectra indices are keys and values are the annotations for this spectrum. 
Examples: >>> print(loader.annotations()[100]) """ - return self._annotations \ No newline at end of file + return self._annotations diff --git a/src/nplinker/metabolomics/gnps/gnps_molecular_family_loader.py b/src/nplinker/metabolomics/gnps/gnps_molecular_family_loader.py index 47783e15..729b34b8 100644 --- a/src/nplinker/metabolomics/gnps/gnps_molecular_family_loader.py +++ b/src/nplinker/metabolomics/gnps/gnps_molecular_family_loader.py @@ -17,7 +17,7 @@ def __init__(self, file: str | PathLike): file(str | PathLike): str or PathLike object pointing towards the GNPS molecular families file to load. """ self._families: list[MolecularFamily | SingletonFamily] = [] - + for family_id, spectra_ids in _load_molecular_families(file).items(): if family_id == -1: for spectrum_id in spectra_ids: @@ -28,32 +28,32 @@ def __init__(self, file: str | PathLike): family = MolecularFamily(family_id) family.spectra_ids = spectra_ids self._families.append(family) - + def families(self) -> list[MolecularFamily]: return self._families -def _load_molecular_families(file: str | PathLike) -> dict[int, set[int]]: +def _load_molecular_families(file: str | PathLike) -> dict[int, set[str]]: """Load ids of molecular families and corresponding spectra from GNPS output file. Args: file(str | PathLike): path to the GNPS file to load molecular families. Returns: - dict[int, set[int]]: Mapping from molecular family/cluster id to the spectra ids. + dict[int, set[str]]: Mapping from molecular family/cluster id to the spectra ids. 
""" logger.debug('loading edges file: %s', file) families: dict = {} - + with open(file, mode='rt', encoding='utf-8') as f: reader = csv.reader(f, delimiter='\t') headers = next(reader) cid1_index, cid2_index, fam_index = _sniff_column_indices(file, headers) for line in reader: - spec1_id = int(line[cid1_index]) - spec2_id = int(line[cid2_index]) + spec1_id = line[cid1_index] + spec2_id = line[cid2_index] family_id = int(line[fam_index]) if families.get(family_id) is None: @@ -84,5 +84,5 @@ def _sniff_column_indices(file: str | PathLike, headers: list[str]) -> tuple[int except ValueError as ve: message = f'Unknown or missing column(s) in edges file: {file}' raise Exception(message) from ve - + return cid1_index,cid2_index,fam_index diff --git a/src/nplinker/metabolomics/gnps/gnps_spectrum_loader.py b/src/nplinker/metabolomics/gnps/gnps_spectrum_loader.py index 91a83171..0a1431cf 100644 --- a/src/nplinker/metabolomics/gnps/gnps_spectrum_loader.py +++ b/src/nplinker/metabolomics/gnps/gnps_spectrum_loader.py @@ -48,7 +48,7 @@ def _mols_to_spectra(ms2: list, metadata: dict[str, dict[str, str]]) -> list[Spe spectra = [] for i, m in enumerate(ms2_dict.keys()): # m is `nplinker.parsers.mgf.MS1` object - new_spectrum = Spectrum(i, ms2_dict[m], int(m.name), + new_spectrum = Spectrum(i, ms2_dict[m], m.name, metadata[m.name]['precursormass'], metadata[m.name]['parentmass']) new_spectrum.metadata = metadata[m.name] diff --git a/src/nplinker/metabolomics/metabolomics.py b/src/nplinker/metabolomics/metabolomics.py index 1c4cde5d..c9b2c929 100644 --- a/src/nplinker/metabolomics/metabolomics.py +++ b/src/nplinker/metabolomics/metabolomics.py @@ -35,7 +35,7 @@ def _mols_to_spectra(ms2, metadata): spectra = [] for i, m in enumerate(ms2_dict.keys()): - new_spectrum = Spectrum(i, ms2_dict[m], int(m.name), + new_spectrum = Spectrum(i, ms2_dict[m], m.name, metadata[m.name]['precursormass'], metadata[m.name]['parentmass']) new_spectrum.metadata = metadata[m.name] @@ -53,15 +53,15 @@ def 
_mols_to_spectra(ms2, metadata): return spectra @deprecated(version="1.3.3", reason="Use the GNPSMolecularFamilyLoader class instead.") -def load_edges(edges_file: str, spec_dict: dict[int, Spectrum]): +def load_edges(edges_file: str, spec_dict: dict[str, Spectrum]): """Insert information about the molecular family into the spectra. Args: edges_file(str): File containing the molecular families. - spec_dict(dict[int, Spectrum]): Dictionary with mapping from spectra_id to Spectrum. + spec_dict(dict[str, Spectrum]): Dictionary with mapping from spectra_id to Spectrum. Raises: - Exception: Raises exception if the edges file doesn't contain the correct columns. + Exception: Raises exception if the edges file doesn't contain the correct columns. """ logger.debug('loading edges file: {} [{} spectra from MGF]'.format( edges_file, len(spec_dict))) @@ -145,7 +145,7 @@ def load_dataset(strains, @deprecated(version="1.3.3", reason="Use the GNPSSpectrumLoader class instead.") -def load_spectra(mgf_file: str | PathLike, edges_file: str | PathLike) -> dict[int, Spectrum]: +def load_spectra(mgf_file: str | PathLike, edges_file: str | PathLike) -> dict[str, Spectrum]: """Wrapper function to load spectra and init the molecular family links. Args: @@ -153,14 +153,14 @@ def load_spectra(mgf_file: str | PathLike, edges_file: str | PathLike) -> dict[i edges_file(str | PathLike): File storing the molecular family information in .selfloop or .pairsinfo format. Returns: - dict[int, Spectrum]: Indexed dict of mass spectra. + dict[str, Spectrum]: Indexed dict of mass spectra. 
""" ms1, ms2, metadata = LoadMGF(name_field='scans').load_spectra([str(mgf_file)]) logger.info('%d molecules parsed from MGF file', len(ms1)) spectra = _mols_to_spectra(ms2, metadata) # above returns a list, create a dict indexed by spectrum_id to make # the rest of the parsing a bit simpler - spec_dict: dict[int, Spectrum] = {spec.spectrum_id: spec for spec in spectra} + spec_dict: dict[str, Spectrum] = {spec.spectrum_id: spec for spec in spectra} load_edges(edges_file, spec_dict) return spec_dict diff --git a/src/nplinker/metabolomics/molecular_family.py b/src/nplinker/metabolomics/molecular_family.py index a55a4df9..02b8496a 100644 --- a/src/nplinker/metabolomics/molecular_family.py +++ b/src/nplinker/metabolomics/molecular_family.py @@ -1,9 +1,9 @@ from typing_extensions import Self - from nplinker.metabolomics.spectrum import Spectrum from nplinker.strain_collection import StrainCollection from nplinker.strains import Strain + class MolecularFamily(): def __init__(self, family_id: int): @@ -16,7 +16,7 @@ def __init__(self, family_id: int): self.family_id: int = family_id self.spectra: list[Spectrum] = [] self.family = None - self.spectra_ids: set[int] = set() + self.spectra_ids: set[str] = set() # def has_strain(self, strain): # for spectrum in self.spectra: diff --git a/src/nplinker/metabolomics/spectrum.py b/src/nplinker/metabolomics/spectrum.py index 78a8e8ce..b6fa62a3 100644 --- a/src/nplinker/metabolomics/spectrum.py +++ b/src/nplinker/metabolomics/spectrum.py @@ -2,6 +2,7 @@ from nplinker.strain_collection import StrainCollection from nplinker.utils import sqrt_normalise + GNPS_KEY = 'gnps' JCAMP = '##TITLE={}\\n' +\ @@ -22,7 +23,7 @@ class Spectrum(): def __init__(self, id, peaks, - spectrum_id, + spectrum_id: str, precursor_mz, parent_mz=None, rt=None): @@ -34,8 +35,7 @@ def __init__(self, intensity for mz, intensity in self.peaks) self.total_ms2_intensity = sum( intensity for mz, intensity in self.peaks) - assert (isinstance(spectrum_id, int)) - 
self.spectrum_id = spectrum_id # == metadata.get('cluster_index') + self.spectrum_id = spectrum_id # MS1.name self.rt = rt self.precursor_mz = precursor_mz self.parent_mz = parent_mz diff --git a/src/nplinker/scoring/np_class_scoring.py b/src/nplinker/scoring/np_class_scoring.py index 51335c00..2340d9f1 100644 --- a/src/nplinker/scoring/np_class_scoring.py +++ b/src/nplinker/scoring/np_class_scoring.py @@ -261,7 +261,7 @@ def _get_met_classes(self, spec_like, method='mix'): # list of list of tuples/None - todo: add to spectrum object? # take only 'best' (first) classification per ontology level all_classes = self.npl.chem_classes.canopus. \ - spectra_classes.get(str(spec_like.spectrum_id)) + spectra_classes.get(spec_like.spectrum_id) if all_classes: spec_like_classes = [ cls_per_lvl for lvl in all_classes diff --git a/src/nplinker/scoring/rosetta/spec_lib.py b/src/nplinker/scoring/rosetta/spec_lib.py index 27bb9723..1a332a5e 100644 --- a/src/nplinker/scoring/rosetta/spec_lib.py +++ b/src/nplinker/scoring/rosetta/spec_lib.py @@ -13,8 +13,8 @@ # limitations under the License. 
from sortedcontainers import SortedList -from ...logconfig import LogConfig from nplinker.metabolomics.gnps.gnps_spectrum_loader import GNPSSpectrumLoader +from ...logconfig import LogConfig from .rosetta_functions import fast_cosine diff --git a/tests/conftest.py b/tests/conftest.py index cac0255c..123e45d4 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -36,7 +36,7 @@ def prepare_data(): @pytest.fixture -def spec_dict() -> dict[int, Spectrum]: +def spec_dict() -> dict[str, Spectrum]: mgf_file = DATA_DIR / "spectra.mgf" edges_file = DATA_DIR / "edges.pairsinfo" return load_spectra(mgf_file, edges_file) From 43f6d80d5c180155855522838f64002c59631ab4 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 19 Apr 2023 10:42:03 +0200 Subject: [PATCH 25/95] update spec_dict --- src/nplinker/metabolomics/load_gnps.py | 27 ++++++++++--------- src/nplinker/metabolomics/metabolomics.py | 8 +++--- .../test_gnps_annotation_loader.py | 4 +-- tests/scoring/test_data_links.py | 2 +- 4 files changed, 21 insertions(+), 20 deletions(-) diff --git a/src/nplinker/metabolomics/load_gnps.py b/src/nplinker/metabolomics/load_gnps.py index 0266df56..14a5ec2c 100644 --- a/src/nplinker/metabolomics/load_gnps.py +++ b/src/nplinker/metabolomics/load_gnps.py @@ -4,10 +4,11 @@ from typing import Any from deprecated import deprecated from nplinker.logconfig import LogConfig +from nplinker.metabolomics.gnps.gnps_format import gnps_format_from_file_mapping +from nplinker.metabolomics.gnps.gnps_format import GNPSFormat from nplinker.metabolomics.spectrum import Spectrum from nplinker.strain_collection import StrainCollection from nplinker.strains import Strain -from nplinker.metabolomics.gnps.gnps_format import gnps_format_from_file_mapping ,GNPSFormat logger = LogConfig.getLogger(__name__) @@ -102,7 +103,7 @@ def _parse_mzxml_header(hdr: str, strains: StrainCollection, md_table: dict[str, the strain_mappings file. 
finally it also tries to extract the growth medium label as given in the metadata_table file, again using the strain label which should match between the two files. - >>> + >>> """ @@ -160,14 +161,14 @@ def _parse_mzxml_header(hdr: str, strains: StrainCollection, md_table: dict[str, return (strain_name, growth_medium, strain_name not in strains) -def _load_clusterinfo_old(gnps_format: str, strains: StrainCollection, file: str, spec_dict: dict[int, Spectrum]) -> dict[str, int]: +def _load_clusterinfo_old(gnps_format: str, strains: StrainCollection, file: str, spec_dict: dict[str, Spectrum]) -> dict[str, int]: """ Load info about clusters from old style GNPS files. Args: gnps_format(str): Identifier for the GNPS format of the file. Has to be one of [GNPS_FORMAT_OLD_ALLFILES, GNPS_FORMAT_OLD_UNIQUEFILES] strains(StrainCollection): StrainCollection in which to search for the detected strains. file(str): Path to file from which to load the cluster information. - spec_dict(dict[int, Spectrum]): Dictionary with already loaded spectra into which the metadata read from the file will be inserted. + spec_dict(dict[str, Spectrum]): Dictionary with already loaded spectra into which the metadata read from the file will be inserted. Raises: Exception: Raises exception if not supported GNPS format was detected. 
@@ -194,7 +195,7 @@ def _load_clusterinfo_old(gnps_format: str, strains: StrainCollection, file: str for line in reader: # get the values of the important columns - clu_index = int(line[clu_index_index]) + clu_index = line[clu_index_index] if gnps_format == GNPSFormat.UniqueFiles: mzxmls = line[mzxml_index].split('|') else: @@ -314,7 +315,7 @@ def _parse_metadata_table(file: str) -> dict[str, dict[str, str|None]]: def _load_clusterinfo_fbmn(strains: StrainCollection, nodes_file: str, extra_nodes_file: str, - md_table_file: str, spec_dict: dict[int, Spectrum], ext_metadata_parsing: bool) -> tuple[dict[int, dict[str, str|None]], dict[str, int]]: + md_table_file: str, spec_dict: dict[str, Spectrum], ext_metadata_parsing: bool) -> tuple[dict[str, dict[str, str|None]], dict[str, str]]: """Load the clusterinfo from a feature based molecular networking run output from GNPS. Args: @@ -322,11 +323,11 @@ def _load_clusterinfo_fbmn(strains: StrainCollection, nodes_file: str, extra_nod nodes_file(str): File from which to load the cluster information. extra_nodes_file(str): Unknown. md_table_file(str): Path to metadata table. Deprecated. - spec_dict(dict[int, Spectrum]): Dictionary with already loaded spectra. + spec_dict(dict[str, Spectrum]): Dictionary with already loaded spectra. ext_metadata_parsing(bool): Whether to use extended metadata parsing. Returns: - tuple[dict[int, dict], dict[str, int]]: Spectra info mapping from spectrum id to all columns in the nodes file and unknown strain mapping from file identifier to spectrum id. + tuple[dict[str, dict], dict[str, str]]: Spectra info mapping from spectrum id to all columns in the nodes file and unknown strain mapping from file identifier to spectrum id. 
""" spec_info = {} @@ -347,7 +348,7 @@ def _load_clusterinfo_fbmn(strains: StrainCollection, nodes_file: str, extra_nod tmp = {} for i, v in enumerate(line): tmp[headers[i]] = v - spec_info[int(line[ci_index])] = tmp + spec_info[line[ci_index]] = tmp with open(extra_nodes_file) as f: reader = csv.reader(f, delimiter=',') @@ -358,7 +359,7 @@ def _load_clusterinfo_fbmn(strains: StrainCollection, nodes_file: str, extra_nod # nodes_file to the "row ID" from this file, and update the per-row dict # with the extra columns from this file for line in reader: - ri = int(line[ri_index]) + ri = line[ri_index] tmp = {} for i, v in enumerate(line): tmp[headers[i]] = v @@ -437,7 +438,7 @@ def _load_clusterinfo_fbmn(strains: StrainCollection, nodes_file: str, extra_nod @deprecated(version="1.3.3", reason="Use the GNPSFileMappingLoader class instead.") def load_gnps(strains: StrainCollection, nodes_file: str, quant_table_file: str, metadata_table_file: str, - ext_metadata_parsing: bool, spec_dict: dict[int, Spectrum]) -> dict[str, int]: + ext_metadata_parsing: bool, spec_dict: dict[str, Spectrum]) -> dict[str, int]: """Wrapper function to load information from GNPS outputs. Args: @@ -446,13 +447,13 @@ def load_gnps(strains: StrainCollection, nodes_file: str, quant_table_file: str, quant_table_file(str): Path to the quantification table. metadata_table_file(str): Path to the metadata table. ext_metadata_parsing(bool): Whether to use extended metadata parsing. - spec_dict(dict[int, Spectrum]): Mapping from int to spectra loaded from file. + spec_dict(dict[str, Spectrum]): Mapping from int to spectra loaded from file. Raises: Exception: Raises exception if an unknown GNPS format is encountered. Returns: - dict[str, int]: Returns a mapping from unknown strains which are found to spectra ids which occur in these unknown strains. + dict[str, int]: Returns a mapping from unknown strains which are found to spectra ids which occur in these unknown strains. 
""" gnps_format = gnps_format_from_file_mapping(nodes_file, quant_table_file is not None) diff --git a/src/nplinker/metabolomics/metabolomics.py b/src/nplinker/metabolomics/metabolomics.py index c9b2c929..041fbe2e 100644 --- a/src/nplinker/metabolomics/metabolomics.py +++ b/src/nplinker/metabolomics/metabolomics.py @@ -53,7 +53,7 @@ def _mols_to_spectra(ms2, metadata): return spectra @deprecated(version="1.3.3", reason="Use the GNPSMolecularFamilyLoader class instead.") -def load_edges(edges_file: str, spec_dict: dict[str, Spectrum]): +def load_edges(edges_file: str | PathLike, spec_dict: dict[str, Spectrum]): """Insert information about the molecular family into the spectra. Args: @@ -79,8 +79,8 @@ def load_edges(edges_file: str, spec_dict: dict[str, Spectrum]): edges_file)) for line in reader: - spec1_id = int(line[cid1_index]) - spec2_id = int(line[cid2_index]) + spec1_id = line[cid1_index] + spec2_id = line[cid2_index] cosine = float(line[cos_index]) family = int(line[fam_index]) @@ -134,7 +134,7 @@ def load_dataset(strains, # spec_dict = {spec.spectrum_id: spec for spec in spectra} # add edges info to the spectra - molfams = make_families(spec_dict.values()) + molfams = make_families(list(spec_dict.values())) # molfams = GNPSMolecularFamilyLoader(edges_file).families() unknown_strains = load_gnps(strains, nodes_file, quant_table_file, diff --git a/tests/metabolomics/test_gnps_annotation_loader.py b/tests/metabolomics/test_gnps_annotation_loader.py index 352be137..24f7cf99 100644 --- a/tests/metabolomics/test_gnps_annotation_loader.py +++ b/tests/metabolomics/test_gnps_annotation_loader.py @@ -32,7 +32,7 @@ def test_reads_all_annotations(file, expected): assert len(sut.get_annotations()) == expected -def test_annotations_are_equal(spec_dict: dict[int, Spectrum]): +def test_annotations_are_equal(spec_dict: dict[str, Spectrum]): annotations_dir = DATA_DIR / "annotations" annotations_file = annotations_dir / "gnps_annotations.tsv" @@ -44,7 +44,7 @@ def 
test_annotations_are_equal(spec_dict: dict[int, Spectrum]): spectra, spec_dict ) - expected: dict[int, dict] = {} + expected = {} for x in sut: if x.has_annotations(): expected[x.spectrum_id] = x.gnps_annotations diff --git a/tests/scoring/test_data_links.py b/tests/scoring/test_data_links.py index 004db682..e794a29b 100644 --- a/tests/scoring/test_data_links.py +++ b/tests/scoring/test_data_links.py @@ -9,7 +9,7 @@ @pytest.fixture -def spec_with_families(spec_dict) -> dict[int, Spectrum]: +def spec_with_families(spec_dict) -> dict[str, Spectrum]: make_families(spec_dict.values()) return spec_dict From 7536e42e3d2c8e439a93b36775affe03b0b086d3 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 19 Apr 2023 11:20:51 +0200 Subject: [PATCH 26/95] Update tests --- tests/metabolomics/test_load_gnps.py | 10 +++++----- tests/metabolomics/test_spectrum.py | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/metabolomics/test_load_gnps.py b/tests/metabolomics/test_load_gnps.py index f3c4961b..354895c6 100644 --- a/tests/metabolomics/test_load_gnps.py +++ b/tests/metabolomics/test_load_gnps.py @@ -1,13 +1,13 @@ from itertools import chain - import pytest -from nplinker.metabolomics.load_gnps import _messy_strain_naming_lookup, _parse_mzxml_header -from nplinker.metabolomics.gnps.gnps_format import gnps_format_from_file_mapping, GNPSFormat +from nplinker.metabolomics.gnps.gnps_format import gnps_format_from_file_mapping +from nplinker.metabolomics.gnps.gnps_format import GNPSFormat from nplinker.metabolomics.load_gnps import _load_clusterinfo_old +from nplinker.metabolomics.load_gnps import _messy_strain_naming_lookup +from nplinker.metabolomics.load_gnps import _parse_mzxml_header from nplinker.metabolomics.load_gnps import load_gnps from nplinker.strain_collection import StrainCollection from nplinker.utils import get_headers - from .. 
import DATA_DIR @@ -51,7 +51,7 @@ def test_load_clusterinfo_old(spec_dict): for spectrum_id, spec in spec_dict.items(): metadata = spec.metadata assert len(metadata.get('files')) > 1 - assert isinstance(metadata.get('cluster_index'), int) + assert isinstance(metadata.get('cluster_index'), str) assert spectrum_id == metadata.get('cluster_index') diff --git a/tests/metabolomics/test_spectrum.py b/tests/metabolomics/test_spectrum.py index bcb82d36..30473f90 100644 --- a/tests/metabolomics/test_spectrum.py +++ b/tests/metabolomics/test_spectrum.py @@ -1,13 +1,13 @@ import pytest - from nplinker.metabolomics.spectrum import Spectrum + @pytest.fixture def spectrum() -> Spectrum: spec = Spectrum( 1, peaks=[[10, 100], [20, 150]], - spectrum_id=2, + spectrum_id="2", precursor_mz=30, parent_mz=50, rt= 100 From a0cf97ab23a99b515e5f7fda782918868631bd4d Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Fri, 28 Apr 2023 11:34:18 +0200 Subject: [PATCH 27/95] update `__eq__` in MolecularFamily --- src/nplinker/metabolomics/molecular_family.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/nplinker/metabolomics/molecular_family.py b/src/nplinker/metabolomics/molecular_family.py index 02b8496a..4de13e3e 100644 --- a/src/nplinker/metabolomics/molecular_family.py +++ b/src/nplinker/metabolomics/molecular_family.py @@ -53,8 +53,7 @@ def __str__(self) -> str: def __eq__(self, other: Self) -> bool: if isinstance(other, MolecularFamily): return (self.id == other.id - and self.family_id == other.family_id - and set(self.spectra) == set(other.spectra)) + and self.family_id == other.family_id) return NotImplemented def __hash__(self) -> int: From b53af96a0ca2c0228febaae8d739735aa32c3681 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Fri, 28 Apr 2023 12:09:49 +0200 Subject: [PATCH 28/95] change `MolecularFamily.family_id` from type int to str --- src/nplinker/class_info/chem_classes.py | 2 +- .../gnps/gnps_molecular_family_loader.py | 8 +-- 
src/nplinker/metabolomics/metabolomics.py | 6 +-- src/nplinker/metabolomics/molecular_family.py | 13 ++--- src/nplinker/metabolomics/singleton_family.py | 4 +- src/nplinker/metabolomics/spectrum.py | 2 +- src/nplinker/scoring/np_class_scoring.py | 4 +- .../test_gnps_molecular_family_loader.py | 49 +++++++------------ 8 files changed, 34 insertions(+), 54 deletions(-) diff --git a/src/nplinker/class_info/chem_classes.py b/src/nplinker/class_info/chem_classes.py index 00a9cdb4..d2f184a3 100644 --- a/src/nplinker/class_info/chem_classes.py +++ b/src/nplinker/class_info/chem_classes.py @@ -399,7 +399,7 @@ class prediction for a level. When no class is present, instead of Tuple it will molfam_classes = {} for molfam in molfams: - fid = str(molfam.family_id) # the key + fid = molfam.family_id # the key spectra = molfam.spectra # if singleton family, format like '-1_spectrum-id' if fid == '-1': diff --git a/src/nplinker/metabolomics/gnps/gnps_molecular_family_loader.py b/src/nplinker/metabolomics/gnps/gnps_molecular_family_loader.py index 729b34b8..0a3b7659 100644 --- a/src/nplinker/metabolomics/gnps/gnps_molecular_family_loader.py +++ b/src/nplinker/metabolomics/gnps/gnps_molecular_family_loader.py @@ -19,7 +19,7 @@ def __init__(self, file: str | PathLike): self._families: list[MolecularFamily | SingletonFamily] = [] for family_id, spectra_ids in _load_molecular_families(file).items(): - if family_id == -1: + if family_id == '-1': for spectrum_id in spectra_ids: family = SingletonFamily() family.spectra_ids = set([spectrum_id]) @@ -33,14 +33,14 @@ def families(self) -> list[MolecularFamily]: return self._families -def _load_molecular_families(file: str | PathLike) -> dict[int, set[str]]: +def _load_molecular_families(file: str | PathLike) -> dict[str, set[str]]: """Load ids of molecular families and corresponding spectra from GNPS output file. Args: file(str | PathLike): path to the GNPS file to load molecular families. 
Returns: - dict[int, set[str]]: Mapping from molecular family/cluster id to the spectra ids. + dict[str, set[str]]: Mapping from molecular family/cluster id to the spectra ids. """ logger.debug('loading edges file: %s', file) @@ -54,7 +54,7 @@ def _load_molecular_families(file: str | PathLike) -> dict[int, set[str]]: for line in reader: spec1_id = line[cid1_index] spec2_id = line[cid2_index] - family_id = int(line[fam_index]) + family_id = line[fam_index] if families.get(family_id) is None: families[family_id] = set([spec1_id, spec2_id]) diff --git a/src/nplinker/metabolomics/metabolomics.py b/src/nplinker/metabolomics/metabolomics.py index 041fbe2e..79b1c7d5 100644 --- a/src/nplinker/metabolomics/metabolomics.py +++ b/src/nplinker/metabolomics/metabolomics.py @@ -82,13 +82,13 @@ def load_edges(edges_file: str | PathLike, spec_dict: dict[str, Spectrum]): spec1_id = line[cid1_index] spec2_id = line[cid2_index] cosine = float(line[cos_index]) - family = int(line[fam_index]) + family = line[fam_index] if spec1_id in spec_dict and spec2_id in spec_dict: spec1 = spec_dict[spec1_id] spec2 = spec_dict[spec2_id] - if family != -1: # singletons + if family != '-1': # singletons spec1.family_id = family spec2.family_id = family @@ -181,7 +181,7 @@ def make_families(spectra: list[Spectrum]) -> list[MolecularFamily]: fams, singles = 0, 0 for spectrum in spectra: family_id = spectrum.family_id - if family_id == -1: # singleton + if family_id == '-1': # singleton new_family = SingletonFamily() new_family.id = family_index family_index += 1 diff --git a/src/nplinker/metabolomics/molecular_family.py b/src/nplinker/metabolomics/molecular_family.py index 4de13e3e..13d8430e 100644 --- a/src/nplinker/metabolomics/molecular_family.py +++ b/src/nplinker/metabolomics/molecular_family.py @@ -6,25 +6,18 @@ class MolecularFamily(): - def __init__(self, family_id: int): + def __init__(self, family_id: str): """Class to model molecular families. 
Args: - family_id(int): Id for the molecular family. + family_id(str): Id for the molecular family. """ self.id: int = -1 - self.family_id: int = family_id + self.family_id: str = family_id self.spectra: list[Spectrum] = [] self.family = None self.spectra_ids: set[str] = set() - # def has_strain(self, strain): - # for spectrum in self.spectra: - # if spectrum.has_strain(strain): - # return True - - # return False - @property def strains(self) -> StrainCollection: """Get strains of spectra in the molecular family. diff --git a/src/nplinker/metabolomics/singleton_family.py b/src/nplinker/metabolomics/singleton_family.py index a742b807..36f03e49 100644 --- a/src/nplinker/metabolomics/singleton_family.py +++ b/src/nplinker/metabolomics/singleton_family.py @@ -1,10 +1,10 @@ from .molecular_family import MolecularFamily + class SingletonFamily(MolecularFamily): def __init__(self): - super().__init__(-1) + super().__init__('-1') def __str__(self): return f"Singleton molecular family (id={self.id})" - diff --git a/src/nplinker/metabolomics/spectrum.py b/src/nplinker/metabolomics/spectrum.py index b6fa62a3..0002843d 100644 --- a/src/nplinker/metabolomics/spectrum.py +++ b/src/nplinker/metabolomics/spectrum.py @@ -47,7 +47,7 @@ def __init__(self, # this is a dict indexed by Strain objects (the strains found in this Spectrum), with # the values being dicts of the form {growth_medium: peak intensity} for the parent strain self.growth_media = {} - self.family_id = -1 + self.family_id = '-1' self.family = None # a dict indexed by filename, or "gnps" self.annotations = {} diff --git a/src/nplinker/scoring/np_class_scoring.py b/src/nplinker/scoring/np_class_scoring.py index 2340d9f1..304651dc 100644 --- a/src/nplinker/scoring/np_class_scoring.py +++ b/src/nplinker/scoring/np_class_scoring.py @@ -270,7 +270,7 @@ def _get_met_classes(self, spec_like, method='mix'): spec_like_classes_names_inds = self.npl.chem_classes.canopus. 
\ spectra_classes_names_inds else: # molfam - fam_id = str(spec_like.family_id) + fam_id = spec_like.family_id if fam_id == '-1': # account for singleton families fam_id += f'_{spec_like.spectra[0].spectrum_id}' all_classes = self.npl.chem_classes.canopus.molfam_classes.get( @@ -288,7 +288,7 @@ def _get_met_classes(self, spec_like, method='mix'): spec_like_classes = self.npl.chem_classes.molnetenhancer. \ spectra_classes(spec_like.spectrum_id) else: # molfam - fam_id = str(spec_like.family_id) + fam_id = spec_like.family_id if fam_id == '-1': # account for singleton families fam_id += f'_{spec_like.spectra[0].spectrum_id}' spec_like_classes = self.npl.chem_classes.molnetenhancer. \ diff --git a/tests/metabolomics/test_gnps_molecular_family_loader.py b/tests/metabolomics/test_gnps_molecular_family_loader.py index e8ac05a1..1174e28b 100644 --- a/tests/metabolomics/test_gnps_molecular_family_loader.py +++ b/tests/metabolomics/test_gnps_molecular_family_loader.py @@ -1,42 +1,29 @@ import os -import numpy import pytest from nplinker.metabolomics.gnps.gnps_molecular_family_loader import \ GNPSMolecularFamilyLoader -from nplinker.metabolomics.metabolomics import make_families -from nplinker.metabolomics.molecular_family import MolecularFamily from .. 
import DATA_DIR -@pytest.fixture -def molecular_families(spec_dict) -> list[MolecularFamily]: - return make_families(spec_dict.values()) - -@pytest.mark.parametrize("filename", [ - os.path.join(DATA_DIR, "edges.pairsinfo"), - DATA_DIR / "edges.pairsinfo" -]) +@pytest.mark.parametrize( + "filename", + [os.path.join(DATA_DIR, "edges.pairsinfo"), DATA_DIR / "edges.pairsinfo"]) def test_has_molecular_families(filename): sut = GNPSMolecularFamilyLoader(filename) actual = sut.families() assert len(actual) == 25769 - assert len(actual[0].spectra_ids) == 19 - - -def test_families_are_identical(spec_dict, molecular_families): - filename = os.path.join(DATA_DIR, "edges.pairsinfo") - actual = GNPSMolecularFamilyLoader(filename).families() - - actual.sort(key= lambda x: min(x.spectra_ids)) - - for i, x in enumerate(actual): - x.id = i - for spec_id in x.spectra_ids: - x.add_spectrum(spec_dict[spec_id]) - - - for x in molecular_families: - for spec in x.spectra: - x.spectra_ids.add(spec.spectrum_id) - - numpy.testing.assert_array_equal(actual, molecular_families) + mf_ids = [mf.family_id for mf in actual[:30]] + assert mf_ids == [ + '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', + '14', '15', '16', '17', '18', '20', '21', '22', '23', '24', '26', '28', + '30', '31', '32', '33', '-1' + ] + num_spec_ids = [len(mf.spectra_ids) for mf in actual[:30]] + assert num_spec_ids == [ + 19, 48, 3, 3, 11, 4, 9, 3, 15, 3, 5, 2, 3, 3, 5, 3, 14, 4, 2, 2, 12, 2, + 3, 5, 2, 4, 2, 2, 2, 1 + ] + assert actual[0].spectra_ids == set( + ('13170', '13662', '15316', '15364', '16341', '17201', '17270', + '18120', '18172', '18748', '18831', '19005', '19673', '19719', + '20320', '20738', '21163', '21593', '23566')) From ca848fd0bac2d77096f48e67975bfc0ba0a781c0 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Mon, 1 May 2023 11:06:27 +0200 Subject: [PATCH 29/95] add method `has_strain` to MolecularFamily --- src/nplinker/metabolomics/molecular_family.py | 14 +++++++++++++- 1 file 
changed, 13 insertions(+), 1 deletion(-) diff --git a/src/nplinker/metabolomics/molecular_family.py b/src/nplinker/metabolomics/molecular_family.py index 13d8430e..57cd686a 100644 --- a/src/nplinker/metabolomics/molecular_family.py +++ b/src/nplinker/metabolomics/molecular_family.py @@ -15,9 +15,9 @@ def __init__(self, family_id: str): self.id: int = -1 self.family_id: str = family_id self.spectra: list[Spectrum] = [] - self.family = None self.spectra_ids: set[str] = set() + # TODO: change property to attibute @property def strains(self) -> StrainCollection: """Get strains of spectra in the molecular family. @@ -31,6 +31,18 @@ def strains(self) -> StrainCollection: strains.add(strain) return strains + def has_strain(self, strain: str | Strain) -> bool: + """Check if the given strain exists. + + Args: + strain(str | Strain): strain id or `Strain` object. + + Returns: + bool: True when the given strain exist. + """ + return strain in self.strains + + # TODO: update the logics, mf should also be added to the spectrum object def add_spectrum(self, spectrum: Spectrum): """Add a spectrum to the spectra list. 
From 583fa3384a8ae0d663a83a9eef354c8884343f1f Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Mon, 1 May 2023 11:06:50 +0200 Subject: [PATCH 30/95] Update metcalf_scoring.py --- src/nplinker/scoring/metcalf_scoring.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/nplinker/scoring/metcalf_scoring.py b/src/nplinker/scoring/metcalf_scoring.py index 65c45a5f..79e90fc4 100644 --- a/src/nplinker/scoring/metcalf_scoring.py +++ b/src/nplinker/scoring/metcalf_scoring.py @@ -7,10 +7,11 @@ from nplinker.metabolomics.spectrum import Spectrum from nplinker.pickler import load_pickled_data from nplinker.pickler import save_pickled_data -from nplinker.scoring.methods import ScoringMethod -from nplinker.scoring.object_link import ObjectLink from nplinker.scoring.linking.data_linking import DataLinks from nplinker.scoring.linking.link_finder import LinkFinder +from nplinker.scoring.methods import ScoringMethod +from nplinker.scoring.object_link import ObjectLink + logger = LogConfig.getLogger(__name__) @@ -80,8 +81,7 @@ def setup(npl): MetcalfScoring.DATALINKS.load_data(npl._spectra, npl._gcfs, npl._strains, npl.molfams) # TODO fix crash with this set to True, see https://github.com/sdrogers/nplinker/issues/57 - MetcalfScoring.DATALINKS.find_correlations( - include_singletons=False) + MetcalfScoring.DATALINKS.find_correlations() MetcalfScoring.LINKFINDER = LinkFinder() MetcalfScoring.LINKFINDER.metcalf_scoring(MetcalfScoring.DATALINKS, type='spec-gcf') From 6ebaf0dca5cdb32378ffaafaacac34b1d0c1852d Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Mon, 1 May 2023 16:34:06 +0200 Subject: [PATCH 31/95] change array to dataframe in DataLinks 1. Change array to dataframe: - self.M_gcf_strain -> self.gcf_strain_occurrence - self. M_spec_strain -> self.spec_strain_occurrence - self. M_fam_strain -> mf_strain_occurrence 2. update relevant methods to get the new dataframes 3. 
update logics of method `common_strains` using the new dataframes --- src/nplinker/scoring/linking/data_linking.py | 362 ++++++++----------- 1 file changed, 144 insertions(+), 218 deletions(-) diff --git a/src/nplinker/scoring/linking/data_linking.py b/src/nplinker/scoring/linking/data_linking.py index bec8b6d2..651390ee 100644 --- a/src/nplinker/scoring/linking/data_linking.py +++ b/src/nplinker/scoring/linking/data_linking.py @@ -1,17 +1,3 @@ -# Copyright 2021 The NPLinker Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- # Methods to find correlations between spectra/molecular families and # gene clusters/families (BGCs/GCFs) # @@ -23,21 +9,21 @@ # spec stands for spectrum # fam stands for molecular family +from __future__ import annotations from collections import Counter -from typing import Sequence -# import packages +from typing import Sequence, TYPE_CHECKING import numpy as np import pandas as pd from nplinker.genomics.gcf import GCF +from nplinker.logconfig import LogConfig from nplinker.metabolomics.molecular_family import MolecularFamily +from nplinker.metabolomics.singleton_family import SingletonFamily from nplinker.metabolomics.spectrum import Spectrum from .data_linking_functions import calc_correlation_matrix -SCORING_METHODS = ['metcalf', 'likescore', 'hg'] - -from nplinker.logconfig import LogConfig - +if TYPE_CHECKING: + from nplinker.strain_collection import StrainCollection logger = LogConfig.getLogger(__name__) @@ -51,21 +37,17 @@ class DataLinks(): """ def __init__(self): - # matrices that store co-occurences with respect to strains - # values = 1 where gcf/spec/fam occur in strain - # values = 0 where gcf/spec/fam do not occur in strain - - # 2D array [gcf: int, strain: int] - self.M_gcf_strain = [] - self.M_spec_strain = [] - self.M_fam_strain = [] + # DataFrame to store occurence of objects with respect to strains + # values = 1 where gcf/spec/fam occur in strain, 0 otherwise + self.gcf_strain_occurrence = pd.DataFrame() + self.spec_strain_occurrence = pd.DataFrame() + self.mf_strain_occurrence = pd.DataFrame() # mappings (lookup lists to map between different ids and categories self.mapping_spec = pd.DataFrame() self.mapping_gcf = pd.DataFrame() self.mapping_fam = pd.DataFrame() # labels for strain-family matrix self.mapping_strain = pd.DataFrame() - self.family_members = [] # correlation matrices for spectra <-> GCFs self.M_spec_gcf = [ @@ -79,48 +61,53 @@ def __init__(self): self.M_fam_notgcf = [] self.M_notfam_gcf = [] - def get_spec_pos(self, spec_id): 
- # get the position in the arrays of a spectrum - row = self.mapping_spec.loc[self.mapping_spec['original spec-id'] == - float(spec_id)] - return int(row.iloc[0]['spec-id']) - - def get_gcf_pos(self, gcf_id): - #Ā TODO: fix this so the original ID is present in case of re-ordering - pass - - def load_data(self, spectra, gcf_list, strain_list, molfams): + def load_data(self, spectra: Sequence[Spectrum], gcfs: Sequence[GCF], + strains: StrainCollection, + molfams: Sequence[MolecularFamily]): # load data from spectra, GCFs, and strains logger.debug("Create mappings between spectra, gcfs, and strains.") self.collect_mappings_spec(spectra) - # self.collect_mappings_spec_v2(molfams) - self.collect_mappings_gcf(gcf_list) + self.collect_mappings_gcf(gcfs) logger.debug( - "Create co-occurence matrices: spectra<->strains + and gcfs<->strains." + "Create co-occurence matrices: spectra<->strains, gcfs<->strains and mfs<->strains." ) - self.matrix_strain_gcf(gcf_list, strain_list) - self.matrix_strain_spec(spectra, strain_list) - - def find_correlations(self, include_singletons=False): + self._get_gcf_strain_occurrence(gcfs, strains) + self._get_spec_strain_occurrence(spectra, strains) + self._get_mf_strain_occurrence(molfams, strains) + + self._get_mappings_from_occurrence() + + def _get_mappings_from_occurrence(self): + self.mapping_gcf["no of strains"] = np.sum(self.gcf_strain_occurrence, + axis=1) + self.mapping_spec["no of strains"] = np.sum( + self.spec_strain_occurrence, axis=1) + self.mapping_strain["no of spectra"] = np.sum( + self.spec_strain_occurrence, axis=0) + self.mapping_fam["no of strains"] = np.sum(self.mf_strain_occurrence, + axis=1) + + def find_correlations(self): # collect correlations/ co-occurences logger.debug("Create correlation matrices: spectra<->gcfs.") self.correlation_matrices(type='spec-gcf') logger.debug("Create correlation matrices: mol-families<->gcfs.") - self.data_family_mapping(include_singletons=include_singletons) 
self.correlation_matrices(type='fam-gcf') - def collect_mappings_spec(self, obj: Sequence[Spectrum]|Sequence[MolecularFamily]): + def collect_mappings_spec(self, obj: Sequence[Spectrum] + | Sequence[MolecularFamily]): if isinstance(obj[0], Spectrum): mapping_spec = self._collect_mappings_from_spectra(obj) elif isinstance(obj[0], MolecularFamily): mapping_spec = self._collect_mappings_from_molecular_families(obj) # extend mapping tables: - self.mapping_spec["spec-id"] = mapping_spec[:, 0] - self.mapping_spec["original spec-id"] = mapping_spec[:, 1] + # TODO: why do we need the mappings??? + # "spec-id" is defined as the index of the spectrum in the input data self.mapping_spec["fam-id"] = mapping_spec[:, 2] - def _collect_mappings_from_spectra(self, spectra) -> np.ndarray[np.float64]: + def _collect_mappings_from_spectra(self, + spectra) -> np.ndarray[np.float64]: # Collect most import mapping tables from input data mapping_spec = np.zeros((len(spectra), 3)) mapping_spec[:, 0] = np.arange(0, len(spectra)) @@ -131,7 +118,9 @@ def _collect_mappings_from_spectra(self, spectra) -> np.ndarray[np.float64]: return mapping_spec - def _collect_mappings_from_molecular_families(self, molfams: Sequence[MolecularFamily]) -> np.ndarray[np.float64]: + def _collect_mappings_from_molecular_families( + self, + molfams: Sequence[MolecularFamily]) -> np.ndarray[np.float64]: num_spectra = sum(len(x.spectra_ids) for x in molfams) mapping_spec = np.zeros((num_spectra, 3)) mapping_spec[:, 0] = np.arange(0, num_spectra) @@ -201,174 +190,115 @@ def collect_mappings_gcf(self, gcf_list): ]) # extend mapping tables: - self.mapping_gcf["gcf-id"] = np.arange(0, len(bigscape_bestguess)) bigscape_guess, bigscape_guessscore = zip(*bigscape_bestguess) self.mapping_gcf["bgc class"] = bigscape_guess - self.mapping_gcf["bgc class score"] = bigscape_guessscore - def matrix_strain_gcf(self, gcf_list, strain_list): - # Collect co-ocurences in M_spec_strain matrix - M_gcf_strain = np.zeros((len(gcf_list), 
len(strain_list))) + def _get_gcf_strain_occurrence(self, gcfs: Sequence[GCF], + strains: StrainCollection) -> None: + """Get the occurence of strains in gcfs. - for i, gcf in enumerate(gcf_list): - for j, strain in enumerate(strain_list): + The occurence is a DataFrame with gcfs as rows and strains as columns, + where index is `gcf.gcf_id` and column name is `strain.id`. The values + are 1 if the gcf contains the strain and 0 otherwise. + """ + df_gcf_strain = pd.DataFrame(np.zeros((len(gcfs), len(strains))), + index=[gcf.gcf_id for gcf in gcfs], + columns=[strain.id for strain in strains]) + for gcf in gcfs: + for strain in strains: if gcf.has_strain(strain): - M_gcf_strain[i, j] = 1 + df_gcf_strain.loc[gcf.gcf_id, strain.id] = 1 + self.gcf_strain_occurrence = df_gcf_strain - self.M_gcf_strain = M_gcf_strain - # extend mapping tables: - self.mapping_gcf["no of strains"] = np.sum(self.M_gcf_strain, axis=1) - self.mapping_strain["no of gcfs"] = np.sum(self.M_gcf_strain, axis=0) - - def matrix_strain_spec(self, spectra, strain_list): - # Collect co-ocurences in M_strains_specs matrix - - M_spec_strain = np.zeros((len(spectra), len(strain_list))) - for i, spectrum in enumerate(spectra): - for j, s in enumerate(strain_list): - if spectrum.has_strain(s): - M_spec_strain[i, j] = 1 - self.M_spec_strain = M_spec_strain + def _get_spec_strain_occurrence(self, spectra: Sequence[Spectrum], + strains: StrainCollection) -> None: + """Get the occurence of strains in spectra. - # extend mapping tables: - self.mapping_spec["no of strains"] = np.sum(self.M_spec_strain, axis=1) - self.mapping_strain["no of spectra"] = np.sum(self.M_spec_strain, - axis=0) - self.mapping_strain["strain name"] = [str(s) for s in strain_list] - - def data_family_mapping(self, include_singletons=False): - # Create M_fam_strain matrix that gives co-occurences between mol. 
families and strains - # matrix dimensions are: number of families x number of strains - - family_ids = np.unique( - self.mapping_spec["fam-id"]) # get unique family ids - - # if singletons are included, check if there are a non-zero number of them (singleton - # families all have -1 as a family ID number) - if include_singletons and np.where( - self.mapping_spec["fam-id"] == -1)[0].shape[0] > 0: - # in this case the number of unique families is the number of singletons - # plus the number of normal families. the "-1" is (I think) to account for - # the single "-1" entry that will be present in "family_ids". - num_of_singletons = np.where( - self.mapping_spec["fam-id"] == -1)[0].shape[0] - num_unique_fams = num_of_singletons + len(family_ids) - 1 - else: - # if no singletons included or present in the dataset, just take the number - # of regular molfams instead - num_of_singletons = 0 - num_unique_fams = len(family_ids) - - M_fam_strain = np.zeros((num_unique_fams, self.M_spec_strain.shape[1])) - strain_fam_labels = [] - strain_fam_index = [] - - if num_of_singletons > 0: # if singletons exist + included - M_fam_strain[( - num_unique_fams - - num_of_singletons):, :] = self.M_spec_strain[np.where( - self.mapping_spec["fam-id"][:, 0] == -1)[0], :] - - # go through families (except singletons) and collect member strain occurences - self.family_members = [] - for i, fam_id in enumerate( - family_ids[np.where(family_ids != -1)].astype(int)): - family_members = np.where( - np.array(self.mapping_spec["fam-id"]) == fam_id) - self.family_members.append(family_members) - M_fam_strain[i, :] = np.sum(self.M_spec_strain[family_members, :], - axis=1) - strain_fam_labels.append(fam_id) - strain_fam_index.append(i) - - add_singleton_entries = -1 in family_ids - # TODO: i think this breaks stuff below due to mismatches in the number of rows - # in the dataframes and matrices if there are no -1 family ids. - # discovered when trying to write some code to test scoring. 
is this ever - # likely to happen with a real dataset?? - if add_singleton_entries: - strain_fam_labels.append([-1] * num_of_singletons) - strain_fam_index.append(i + 1) - - # only looking for co-occurence, hence only 1 or 0 - M_fam_strain[M_fam_strain > 1] = 1 - - self.M_fam_strain = M_fam_strain - # extend mapping table: - self.mapping_fam["family id"] = strain_fam_index - self.mapping_fam["original family id"] = strain_fam_labels - self.mapping_fam["no of strains"] = np.sum(self.M_fam_strain, axis=1) - num_members = [x[0].shape[0] for x in self.family_members] - # see above - if add_singleton_entries: - num_members.append(num_of_singletons) - self.mapping_fam["no of members"] = num_members - return self.family_members - - def common_strains(self, metabolome_objects, gcfs, filter_no_shared=False) -> dict: + The occurence is a DataFrame with spectra as rows and strains as columns, + where index is `spectrum.spectrum_id` and column name is `strain.id`. + The values are 1 if the spectrum contains the strain and 0 otherwise. """ - Obtain the set of common strains between all pairs from the lists objects_a - and objects_b. - - The two parameters can be either lists or single instances of the 3 supported - object types (GCF, Spectrum, MolecularFamily). It's possible to use a single - object together with a list as well. - - Returns a dict indexed by tuples of (Spectrum/MolecularFamily, GCF), where - the values are lists of strain indices which appear in both objects, which - can then be looked up in NPLinker.strains. 
+ df_spec_strain = pd.DataFrame( + np.zeros((len(spectra), len(strains))), + index=[spectrum.spectrum_id for spectrum in spectra], + columns=[strain.id for strain in strains]) + for spectrum in spectra: + for strain in strains: + if spectrum.has_strain(strain): + df_spec_strain.loc[spectrum.spectrum_id, strain.id] = 1 + self.spec_strain_occurrence = df_spec_strain + + def _get_mf_strain_occurrence(self, mfs: Sequence[MolecularFamily], + strains: StrainCollection) -> None: + """Get the occurence of strains in molecular families. + + The occurence is a DataFrame with molecular families as rows and + strains as columns, where index is `mf.family_id` and column name is + `strain.id`. The values are 1 if the molecular family contains the + strain and 0 otherwise. + + Note that SingletonFamily objects are excluded from given `mfs`. """ + # remove SingletonFamily objects + mfs = [mf for mf in mfs if not isinstance(mf, SingletonFamily)] + + df_mf_strain = pd.DataFrame(np.zeros((len(mfs), len(strains))), + index=[mf.family_id for mf in mfs], + columns=[strain.id for strain in strains]) + for mf in mfs: + for strain in strains: + if mf.has_strain(strain): + df_mf_strain.loc[mf.family_id, strain.id] = 1 + self.mf_strain_occurrence = df_mf_strain + + def get_common_strains( + self, + spectra_or_molfams: Sequence[Spectrum] | Sequence[MolecularFamily], + gcfs: Sequence[GCF], + filter_no_shared: bool = False + ) -> dict[tuple[Spectrum | MolecularFamily, GCF], list[str]]: + """Get common strains between given spectra/molecular families and GCFs. + + Note that SingletonFamily objects are excluded from given `spectra_or_molfams`. + + Args: + spectra_or_molfams(Sequence[Spectrum] | Sequence[MolecularFamily]): + A list of Spectrum or MolecularFamily objects. + gcfs(Sequence[GCF]): A list of GCF objects. 
+ filter_no_shared(bool): If True, return only the pair of + spectrum/molecular family and GCF that have common strains; + otherwise, return all pairs no matter they have common strains + or not. + + Returns: + dict: A dict where the keys are tuples of (Spectrum/MolecularFamily, GCF) + and values are a list of strain ids that appear in both objects. + """ + if not isinstance(spectra_or_molfams[0], (Spectrum, MolecularFamily)): + raise ValueError( + 'Must provide Spectra or MolecularFamilies as the first argument!' + ) + if not isinstance(gcfs[0], GCF): + raise ValueError('Must provide GCFs as the second argument!') - # TODO make this work for BGCs too? - - is_list_a = isinstance(metabolome_objects, list) - is_list_b = isinstance(gcfs, list) - - type_a = type(metabolome_objects[0]) if is_list_a else type(metabolome_objects) - type_b = type(gcfs[0]) if is_list_b else type(gcfs) - - if type_a == type_b: - raise Exception('Must supply objects with different types!') - - # to keep things slightly simpler, ensure the GCFs are always "b" - if type_a == GCF: - type_a, type_b = type_b, type_a - is_list_a, is_list_b = is_list_b, is_list_a - metabolome_objects, gcfs = gcfs, metabolome_objects - - if not is_list_a: - metabolome_objects = [metabolome_objects] - if not is_list_b: - gcfs = [gcfs] - - # retrieve object IDs - # TODO: issue #103 stop using gcf.id, but note that the ids_b should be - # a list of int - gcfs_id_list = [gcf.id for gcf in gcfs] - # these might be MolFams or Spectra, either way they'll have a .id attribute - ids_a = [obj.id for obj in metabolome_objects] - - data_a = self.M_spec_strain if type_a == Spectrum else self.M_fam_strain - + # Assume that 3 occurrence dataframes have same df.columns (strain ids) + strain_ids = self.gcf_strain_occurrence.columns results = {} - for i, obj_meta in enumerate(metabolome_objects): - for j, obj_gcf in enumerate(gcfs): - # just AND both arrays and extract the indices with positive results - - # self.M_gcf_strain is a 2D 
np.array [index of gcf_list, index of strain_list] - # TODO: bug here. self.M_gcf_strain use the enumerate count of the gcf_list as index - # it's wrong to assume that enumerate count is same as the gcf.id. - # self.M_gcf_strain should use dataframe - # TODO: Bug: the result is 1D array of the enumerated count of strain_list - # for the common strains. Same here, the enumerate count is not - # the same as the strain.id - result = np.where( - np.logical_and(data_a[ids_a[i]], self.M_gcf_strain[gcfs_id_list[j]]))[0] - # if we want to exclude results with no shared strains - if (filter_no_shared - and len(result) > 0) or not filter_no_shared: - results[(obj_meta, obj_gcf)] = result - + for obj in spectra_or_molfams: + if isinstance(obj, SingletonFamily): + continue + for gcf in gcfs: + if isinstance(obj, Spectrum): + shared_strains = strain_ids[np.logical_and( + self.spec_strain_occurrence.loc[obj.spectrum_id], + self.gcf_strain_occurrence.loc[gcf.gcf_id])] + else: + shared_strains = strain_ids[np.logical_and( + self.mf_strain_occurrence.loc[obj.family_id], + self.gcf_strain_occurrence.loc[gcf.gcf_id])] + if filter_no_shared and len(shared_strains) == 0: + continue + results[(obj, gcf)] = shared_strains.to_list() return results def correlation_matrices(self, type='spec-gcf'): @@ -384,9 +314,9 @@ def correlation_matrices(self, type='spec-gcf'): # Make selection for scenario spec<->gcf or fam<->gcf if type == 'spec-gcf': - M_type1_strain = self.M_spec_strain + M_type1_strain = self.spec_strain_occurrence elif type == 'fam-gcf': - M_type1_strain = self.M_fam_strain + M_type1_strain = self.mf_strain_occurrence elif type == 'spec-bgc' or type == 'fam-bgc': raise Exception("Given types are not yet supported... ") else: @@ -394,12 +324,11 @@ def correlation_matrices(self, type='spec-gcf'): "Wrong correlation 'type' given. Must be one of 'spec-gcf', 'fam-gcf', ..." 
) - logger.debug( - f"Calculating correlation matrices of type: {type}") + logger.debug(f"Calculating correlation matrices of type: {type}") # Calculate correlation matrix from co-occurence matrices M_type1_gcf, M_type1_notgcf, M_nottype1_gcf, M_nottype1_notgcf = calc_correlation_matrix( - M_type1_strain, self.M_gcf_strain) + M_type1_strain, self.gcf_strain_occurrence) # return results: if type == 'spec-gcf': @@ -414,6 +343,3 @@ def correlation_matrices(self, type='spec-gcf'): self.M_notfam_notgcf = M_nottype1_notgcf else: raise Exception("No correct correlation matrix was created.") - - # class data_links OUTPUT functions - # TODO add output functions (e.g. to search for mappings of individual specs, gcfs etc.) From 2cd8e01b98b7fe83fd819a8933afabeb2e0d4d31 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Mon, 1 May 2023 16:37:02 +0200 Subject: [PATCH 32/95] update references of the new dataframes from DataLinks --- src/nplinker/process_output.py | 2 +- src/nplinker/scoring/linking/link_finder.py | 24 +++++++++---------- .../scoring/linking/link_likelihood.py | 6 ++--- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/nplinker/process_output.py b/src/nplinker/process_output.py index 42420867..9892a583 100644 --- a/src/nplinker/process_output.py +++ b/src/nplinker/process_output.py @@ -51,7 +51,7 @@ def get_sig_spec(data_link, sig_links, scores, gcf_pos, min_n_strains=2): # Check if there are *any* strains in the GCF #Ā No strains = MiBIG # Can also filter if only (e.g. 
2 strains) - strain_sum = data_link.M_gcf_strain[gcf_pos, :].sum() + strain_sum = data_link.gcf_strain_occurrence[gcf_pos, :].sum() if strain_sum < min_n_strains: return [] col = sig_links[:, gcf_pos] #Ā get the column diff --git a/src/nplinker/scoring/linking/link_finder.py b/src/nplinker/scoring/linking/link_finder.py index cf3563a5..71aaa2e6 100644 --- a/src/nplinker/scoring/linking/link_finder.py +++ b/src/nplinker/scoring/linking/link_finder.py @@ -18,8 +18,8 @@ # import packages for plotting # TODO move plotting to separate module? try: - import seaborn as sns from matplotlib import pyplot as plt + import seaborn as sns except ImportError: print( 'Warning: plotting functionality will not be available (missing matplotlib and/or seaborn)' @@ -104,7 +104,7 @@ def metcalf_scoring(self, # Compute the expected values for all possible values of spec and gcf strains # we need the total number of strains - _, n_strains = data_links.M_gcf_strain.shape + _, n_strains = data_links.gcf_strain_occurrence.shape if self.metcalf_expected is None: sz = (n_strains + 1, n_strains + 1) self.metcalf_expected = np.zeros(sz) @@ -177,7 +177,7 @@ def hg_scoring(self, data_links, type='spec-gcf'): if type == 'spec-gcf': num_strains = np.ones( - data_links.M_spec_gcf.shape) * data_links.M_gcf_strain.shape[1] + data_links.M_spec_gcf.shape) * data_links.gcf_strain_occurrence.shape[1] overlap_counts = data_links.M_spec_gcf gcf_counts = overlap_counts + data_links.M_notspec_gcf spec_counts = overlap_counts + data_links.M_spec_notgcf @@ -189,7 +189,7 @@ def hg_scoring(self, data_links, type='spec-gcf'): self.hg_spec_gcf = hg_scores elif type == 'fam-gcf': num_strains = np.ones( - data_links.M_fam_gcf.shape) * data_links.M_gcf_strain.shape[1] + data_links.M_fam_gcf.shape) * data_links.gcf_strain_occurrence.shape[1] overlap_counts = data_links.M_fam_gcf gcf_counts = overlap_counts + data_links.M_notfam_gcf fam_counts = overlap_counts + data_links.M_fam_notgcf @@ -350,7 +350,7 @@ def 
select_link_candidates(self, P_str = np.array(data_links.mapping_strain["no of spectra"]) P_str = P_str / np.sum(P_str) - num_strains = data_links.M_gcf_strain.shape[1] + num_strains = data_links.gcf_strain_occurrence.shape[1] # Calculate the hypergeometric probability (as before) for i in range(link_candidates.shape[1]): @@ -361,11 +361,11 @@ def select_link_candidates(self, # Calculate the GCF specific probability for i in range(link_candidates.shape[1]): - id_spec = int(link_candidates[0, i]) - id_gcf = int(link_candidates[1, i]) + id_spec = link_candidates[0, i] + id_gcf = link_candidates[1, i] # find set of strains which contain GCF with id link_candidates[1,i] - XG = np.where(data_links.M_gcf_strain[id_gcf, :] == 1)[0] + XG = np.where(data_links.gcf_strain_occurrence.loc[id_gcf, :] == 1)[0] link_candidates[10, i] = pair_prob_approx(P_str, XG, @@ -374,11 +374,11 @@ def select_link_candidates(self, # Calculate the link specific probability # Find strains where GCF and spectra/family co-occur if type == 'spec-gcf': - XGS = np.where((data_links.M_gcf_strain[id_gcf, :] == 1) & - (data_links.M_spec_strain[id_spec, :] == 1))[0] + XGS = np.where((data_links.gcf_strain_occurrence[id_gcf, :] == 1) & + (data_links.spec_strain_occurrence[id_spec, :] == 1))[0] elif type == 'fam-gcf': - XGS = np.where((data_links.M_gcf_strain[id_gcf, :] == 1) - & (data_links.M_fam_strain[id_spec, :] == 1))[0] + XGS = np.where((data_links.gcf_strain_occurrence[id_gcf, :] == 1) + & (data_links.mf_strain_occurrence[id_spec, :] == 1))[0] link_candidates[11, i] = link_prob(P_str, XGS, int(Nx_list[id_gcf]), int(Ny_list[id_spec]), num_strains) diff --git a/src/nplinker/scoring/linking/link_likelihood.py b/src/nplinker/scoring/linking/link_likelihood.py index fa68f952..90446a18 100644 --- a/src/nplinker/scoring/linking/link_likelihood.py +++ b/src/nplinker/scoring/linking/link_likelihood.py @@ -49,12 +49,12 @@ def calculate_likelihoods(self, data_links, type='spec-gcf'): M_type1_type2 = 
data_links.M_spec_gcf M_type1_nottype2 = data_links.M_spec_notgcf M_nottype1_type2 = data_links.M_notspec_gcf - M_type1_cond = data_links.M_spec_strain + M_type1_cond = data_links.spec_strain_occurrence elif type == 'fam-gcf': M_type1_type2 = data_links.M_fam_gcf M_type1_nottype2 = data_links.M_fam_notgcf M_nottype1_type2 = data_links.M_notfam_gcf - M_type1_cond = data_links.M_fam_strain + M_type1_cond = data_links.mf_strain_occurrence elif type == 'spec-bgc' or type == 'fam-bgc': raise Exception("Given types are not yet supported... ") else: @@ -67,7 +67,7 @@ def calculate_likelihoods(self, data_links, type='spec-gcf'): # Calculate likelihood matrices using calc_likelihood_matrix() P_type2_given_type1, P_type2_not_type1, P_type1_given_type2, \ P_type1_not_type2 = calc_likelihood_matrix(M_type1_cond, - data_links.M_gcf_strain, + data_links.gcf_strain_occurrence, M_type1_type2, M_type1_nottype2, M_nottype1_type2) From 2834345beb34aa3fd4694c651a9cbcdca66d557f Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Mon, 1 May 2023 16:38:20 +0200 Subject: [PATCH 33/95] update logics of `get_links` in NPLinker class --- src/nplinker/nplinker.py | 75 ++++++---------------------------------- 1 file changed, 10 insertions(+), 65 deletions(-) diff --git a/src/nplinker/nplinker.py b/src/nplinker/nplinker.py index 8c78f47f..cc80872b 100644 --- a/src/nplinker/nplinker.py +++ b/src/nplinker/nplinker.py @@ -28,6 +28,7 @@ from .scoring.metcalf_scoring import MetcalfScoring from .scoring.np_class_scoring import NPClassScoring from .scoring.rosetta_scoring import RosettaScoring +from .strain_collection import StrainCollection logger = LogConfig.getLogger(__name__) @@ -289,7 +290,7 @@ def load_data(self, new_bigscape_cutoff=None, met_only=False): self._bgcs = self._loader.bgcs self._gcfs = self._loader.gcfs self._mibig_bgc_dict = self._loader.mibig_bgc_dict - self._strains = self._loader.strains + self._strains: StrainCollection = self._loader.strains self._product_types = 
self._loader.product_types self._chem_classes = self._loader.chem_classes self._class_matches = self._loader.class_matches @@ -433,24 +434,20 @@ def get_links(self, input_objects, scoring_methods, and_mode=True): targets = list( filter(lambda x: not isinstance(x, BGC), link_data.keys())) if len(targets) > 0: - shared_strains = self._datalinks.common_strains([source], - targets, True) - for objpair in shared_strains.keys(): - shared_strains[objpair] = [ - self._strains.lookup_index(x) - for x in shared_strains[objpair] - ] - if isinstance(source, GCF): + shared_strains = self._datalinks.common_strains(targets, [source], True) for target, link in link_data.items(): if (target, source) in shared_strains: - link.shared_strains = shared_strains[(target, - source)] + link.shared_strains = [ + self._strains.lookup(strain_id) for strain_id + in shared_strains[(target, source)]] else: + shared_strains = self._datalinks.common_strains([source], targets, True) for target, link in link_data.items(): if (source, target) in shared_strains: - link.shared_strains = shared_strains[(source, - target)] + link.shared_strains = [ + self._strains.lookup(strain_id) for strain_id + in shared_strains[(source, target)]] logger.debug('Finished calculating shared strain information') @@ -601,55 +598,3 @@ def scoring_method(self, name): self._scoring_methods_setup_complete[name] = True return self._scoring_methods.get(name, None)(self) - - -if __name__ == "__main__": - # can set default logging configuration this way... 
- LogConfig.setLogLevel(logging.DEBUG) - - # initialise NPLinker from the command-line arguments - npl = NPLinker(Args().get_args()) - - # load the dataset - if not npl.load_data(): - print('Failed to load the dataset!') - sys.exit(-1) - - # create a metcalf scoring object - mc = npl.scoring_method('metcalf') - if mc is not None: - # set a scoring cutoff threshold - mc.cutoff = 0.5 - - # pick some GCFs to get links for - test_gcfs = npl.gcfs[:10] - - # tell nplinker to find links for this set of GCFs using metcalf - # scoring - results = npl.get_links(test_gcfs, mc) - - # check if any links were found - if len(results) == 0: - print('No links found!') - sys.exit(0) - - # the "result" object will be a LinkCollection, holding all the information - # returned by the scoring method(s) used - print(f'{len(results)} total links found') - - # display some information about each object and its links - for obj, result in results.links.items(): - print('Results for object: {}, {} total links, {} methods used'. 
- format(obj, len(result), results.method_count)) - - # get links for this object, sorted by metcalf score - sorted_links = results.get_sorted_links(mc, obj) - for link_data in sorted_links: - print(' --> [{}] {} | {} | # shared_strains = {}'.format( - ','.join(method.name for method in link_data.methods), - link_data.target, link_data[mc], - len(link_data.shared_strains))) - - rs = npl.scoring_method('rosetta') - if rs is not None: - print('OK') From ee5ff03f07c31a477555a75d6aef863c8796c3c0 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Mon, 1 May 2023 16:38:35 +0200 Subject: [PATCH 34/95] Update test_nplinker.py - add code to remove cached results --- tests/test_nplinker.py | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/tests/test_nplinker.py b/tests/test_nplinker.py index 3b9171af..4399d24c 100644 --- a/tests/test_nplinker.py +++ b/tests/test_nplinker.py @@ -1,33 +1,39 @@ +import os +from pathlib import Path import pytest from nplinker.nplinker import NPLinker -import os - from . 
import DATA_DIR + @pytest.fixture(scope='module') -def instance() -> NPLinker: +def npl() -> NPLinker: npl = NPLinker(str(DATA_DIR / 'nplinker_demo1.toml')) npl.load_data() + # remove cached results before running tests + root_dir = Path(npl.root_dir) + score_cache = root_dir / 'metcalf' / 'metcalf_scores.pckl' + score_cache.unlink(missing_ok=True) return npl -@pytest.mark.skipif(os.environ.get('CI') == 'true', reason="Skip when running on CI") -def test_load_data(instance: NPLinker): - assert len(instance.bgcs) == 390 - assert len(instance.gcfs) == 113 - assert len(instance.spectra) == 25935 - assert len(instance.molfams) == 25769 +@pytest.mark.skipif(os.environ.get('CI') == 'true', + reason="Skip when running on CI") +def test_load_data(npl: NPLinker): + assert len(npl.bgcs) == 390 + assert len(npl.gcfs) == 113 + assert len(npl.spectra) == 25935 + assert len(npl.molfams) == 25769 -@pytest.mark.skipif(os.environ.get('CI') == 'true' , reason="Skip when running on CI") -def test_get_links(instance: NPLinker): - mc = instance.scoring_method('metcalf') +@pytest.mark.skipif(os.environ.get('CI') == 'true', + reason="Skip when running on CI") +def test_get_links(npl: NPLinker): + mc = npl.scoring_method('metcalf') mc.cutoff = 3.5 mc.standardised = True - actual = instance.get_links(instance.gcfs, mc, and_mode=True) + actual = npl.get_links(npl.gcfs, mc, and_mode=True) assert len(actual) == len(actual.sources) == len(actual.links) == 101 - + actual.filter_links(lambda link: link[mc] > 5.0) assert len(actual.links) == 60 - From ef9441863ad575cf68c79d880bf93651e6817a4e Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Mon, 1 May 2023 17:44:40 +0200 Subject: [PATCH 35/95] move SCORING_METHODS to LinkFinder --- src/nplinker/scoring/linking/link_finder.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/nplinker/scoring/linking/link_finder.py b/src/nplinker/scoring/linking/link_finder.py index 71aaa2e6..b14a6cfd 100644 --- 
a/src/nplinker/scoring/linking/link_finder.py +++ b/src/nplinker/scoring/linking/link_finder.py @@ -4,7 +4,6 @@ from scipy.stats import hypergeom from nplinker.genomics.gcf import GCF from nplinker.metabolomics.spectrum import Spectrum -from nplinker.scoring.linking.data_linking import SCORING_METHODS from .data_linking_functions import pair_prob_approx from .data_linking_functions import pair_prob_hg @@ -12,9 +11,6 @@ # CG: TODO get_links function does not work any more, need to update its logics -# CG: TODO get_links function does not work any more, need to update its logics - - # import packages for plotting # TODO move plotting to separate module? try: @@ -31,6 +27,7 @@ logger = LogConfig.getLogger(__file__) +SCORING_METHODS = ['metcalf', 'likescore', 'hg'] class LinkFinder(): """ From 1edabc5f332cbc1668a4f8509c24060f75c031a9 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Mon, 1 May 2023 17:45:52 +0200 Subject: [PATCH 36/95] update method name to `get_common_strains` --- src/nplinker/nplinker.py | 6 +++--- src/nplinker/scoring/np_class_scoring.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/nplinker/nplinker.py b/src/nplinker/nplinker.py index cc80872b..e47c35fb 100644 --- a/src/nplinker/nplinker.py +++ b/src/nplinker/nplinker.py @@ -435,14 +435,14 @@ def get_links(self, input_objects, scoring_methods, and_mode=True): filter(lambda x: not isinstance(x, BGC), link_data.keys())) if len(targets) > 0: if isinstance(source, GCF): - shared_strains = self._datalinks.common_strains(targets, [source], True) + shared_strains = self._datalinks.get_common_strains(targets, [source], True) for target, link in link_data.items(): if (target, source) in shared_strains: link.shared_strains = [ self._strains.lookup(strain_id) for strain_id in shared_strains[(target, source)]] else: - shared_strains = self._datalinks.common_strains([source], targets, True) + shared_strains = self._datalinks.get_common_strains([source], targets, True) for target, 
link in link_data.items(): if (source, target) in shared_strains: link.shared_strains = [ @@ -491,7 +491,7 @@ def get_common_strains(self, objects_a, objects_b, filter_no_shared=True): # this is a dict with structure: # (Spectrum/MolecularFamily, GCF) => list of strain indices - common_strains_index_dict = self._datalinks.common_strains( + common_strains_index_dict = self._datalinks.get_common_strains( objects_a, objects_b, filter_no_shared) common_strains = {} diff --git a/src/nplinker/scoring/np_class_scoring.py b/src/nplinker/scoring/np_class_scoring.py index 304651dc..b63b7b0a 100644 --- a/src/nplinker/scoring/np_class_scoring.py +++ b/src/nplinker/scoring/np_class_scoring.py @@ -339,7 +339,7 @@ def get_links(self, objects, link_collection): MetcalfScoring.NAME).datalinks # this is a dict with structure: # tup(Spectrum/MolecularFamily, GCF) => array of strain indices - common_strains = self.npl._datalinks.common_strains( + common_strains = self.npl._datalinks.get_common_strains( objects, targets, True) logger.info(f"Calculating NPClassScore for {len(objects)} objects to " f"{len(targets)} targets ({len(common_strains)} pairwise " From d7ad2c05751d3fda2f9e190873cb99fc831ba794 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 2 May 2023 16:39:12 +0200 Subject: [PATCH 37/95] refactor mapping dataframes in DataLinks --- src/nplinker/scoring/linking/data_linking.py | 117 ++----------------- 1 file changed, 9 insertions(+), 108 deletions(-) diff --git a/src/nplinker/scoring/linking/data_linking.py b/src/nplinker/scoring/linking/data_linking.py index 651390ee..e3c566ef 100644 --- a/src/nplinker/scoring/linking/data_linking.py +++ b/src/nplinker/scoring/linking/data_linking.py @@ -43,10 +43,11 @@ def __init__(self): self.spec_strain_occurrence = pd.DataFrame() self.mf_strain_occurrence = pd.DataFrame() - # mappings (lookup lists to map between different ids and categories + # mapping tables, check `_get_mappings_from_occurance` for details + # TODO: these 
mappings could be removed when refactoring LinkFinder self.mapping_spec = pd.DataFrame() self.mapping_gcf = pd.DataFrame() - self.mapping_fam = pd.DataFrame() # labels for strain-family matrix + self.mapping_fam = pd.DataFrame() self.mapping_strain = pd.DataFrame() # correlation matrices for spectra <-> GCFs @@ -64,28 +65,27 @@ def __init__(self): def load_data(self, spectra: Sequence[Spectrum], gcfs: Sequence[GCF], strains: StrainCollection, molfams: Sequence[MolecularFamily]): - # load data from spectra, GCFs, and strains - logger.debug("Create mappings between spectra, gcfs, and strains.") - self.collect_mappings_spec(spectra) - self.collect_mappings_gcf(gcfs) logger.debug( "Create co-occurence matrices: spectra<->strains, gcfs<->strains and mfs<->strains." ) self._get_gcf_strain_occurrence(gcfs, strains) self._get_spec_strain_occurrence(spectra, strains) self._get_mf_strain_occurrence(molfams, strains) - self._get_mappings_from_occurrence() def _get_mappings_from_occurrence(self): + # pd.Series with index = gcf.gcf_id and value = number of strains where gcf occurs self.mapping_gcf["no of strains"] = np.sum(self.gcf_strain_occurrence, axis=1) + # pd.Series with index = spectrum.spectrum_id and value = number of strains where spec occurs self.mapping_spec["no of strains"] = np.sum( self.spec_strain_occurrence, axis=1) - self.mapping_strain["no of spectra"] = np.sum( - self.spec_strain_occurrence, axis=0) + # pd.Series with index = mf.family_id and value = number of strains where mf occurs self.mapping_fam["no of strains"] = np.sum(self.mf_strain_occurrence, axis=1) + # pd.Series with index = strain.id and value = number of spectra in strain + self.mapping_strain["no of spectra"] = np.sum( + self.spec_strain_occurrence, axis=0) def find_correlations(self): # collect correlations/ co-occurences @@ -94,105 +94,6 @@ def find_correlations(self): logger.debug("Create correlation matrices: mol-families<->gcfs.") self.correlation_matrices(type='fam-gcf') - def 
collect_mappings_spec(self, obj: Sequence[Spectrum] - | Sequence[MolecularFamily]): - if isinstance(obj[0], Spectrum): - mapping_spec = self._collect_mappings_from_spectra(obj) - elif isinstance(obj[0], MolecularFamily): - mapping_spec = self._collect_mappings_from_molecular_families(obj) - - # extend mapping tables: - # TODO: why do we need the mappings??? - # "spec-id" is defined as the index of the spectrum in the input data - self.mapping_spec["fam-id"] = mapping_spec[:, 2] - - def _collect_mappings_from_spectra(self, - spectra) -> np.ndarray[np.float64]: - # Collect most import mapping tables from input data - mapping_spec = np.zeros((len(spectra), 3)) - mapping_spec[:, 0] = np.arange(0, len(spectra)) - - for i, spectrum in enumerate(spectra): - mapping_spec[i, 1] = spectrum.id - mapping_spec[i, 2] = spectrum.family.family_id - - return mapping_spec - - def _collect_mappings_from_molecular_families( - self, - molfams: Sequence[MolecularFamily]) -> np.ndarray[np.float64]: - num_spectra = sum(len(x.spectra_ids) for x in molfams) - mapping_spec = np.zeros((num_spectra, 3)) - mapping_spec[:, 0] = np.arange(0, num_spectra) - - inverted_mappings = {} - for item in molfams: - for spectrum_id in item.spectra_ids: - inverted_mappings[spectrum_id] = item.family_id - - for i, key in enumerate(inverted_mappings): - mapping_spec[i, 1] = key - mapping_spec[i, 2] = inverted_mappings[key] - - return mapping_spec - - def collect_mappings_gcf(self, gcf_list): - """ - Find classes of gene cluster (nrps, pksi etc.) 
- collect most likely class (most occuring name, preferentially not "Others") - additional score shows fraction of chosen class among all given ones - """ - - # TODO: not only collect bigclass types but also product predictions - bigscape_bestguess = [] - for i, gcf in enumerate(gcf_list): - #bigscape_class = [] - #for i, bgc in enumerate(gcf_list[i].bgcs): - # # bigscape_class.append(gcf_list[i].bgc_list[m].bigscape_class) - # bigscape_class.append(bgc.bigscape_class) - # class_counter = Counter(bigscape_class) - - # TODO: this might need properly rewritten due to changes in the way GCF/BGC - # objects store bigscape class information and handle hybrid BGCs (see genomics.py). the - # original version i've left above iterates over every BGC in the current GCF - # and extracts its .bigscape_class attribute, but now each BGC can have multiple - # classes if it happens to be a hybrid and i'm not sure the version below - # still makes sense. - # - # doesn't seem to be very important for calculating the metcalf scores though so not urgent for now. 
- bigscape_class = [] - for bgc in gcf.bgcs: - bigscape_class.extend(bgc.bigscape_classes) - class_counter = Counter(bigscape_class) - - # try not to select "Others": - if class_counter.most_common(1)[0][0] is None: - bigscape_bestguess.append(["Others", 0]) - elif class_counter.most_common( - 1)[0][0] == "Others" and class_counter.most_common( - 1)[0][1] < len(bigscape_class): - if class_counter.most_common(2)[1][0] is None: - bigscape_bestguess.append([ - class_counter.most_common(1)[0][0], - class_counter.most_common(1)[0][1] / - len(bigscape_class) - ]) - else: - bigscape_bestguess.append([ - class_counter.most_common(2)[1][0], - class_counter.most_common(2)[1][1] / - len(bigscape_class) - ]) - else: - bigscape_bestguess.append([ - class_counter.most_common(1)[0][0], - class_counter.most_common(1)[0][1] / len(bigscape_class) - ]) - - # extend mapping tables: - bigscape_guess, bigscape_guessscore = zip(*bigscape_bestguess) - self.mapping_gcf["bgc class"] = bigscape_guess - def _get_gcf_strain_occurrence(self, gcfs: Sequence[GCF], strains: StrainCollection) -> None: """Get the occurence of strains in gcfs. 
From 4f8d8115cd4e7da8edc8c29ce35e58681e0a92ce Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 2 May 2023 16:39:42 +0200 Subject: [PATCH 38/95] add TODOs and deprecation to LinkFinder --- src/nplinker/scoring/linking/link_finder.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/nplinker/scoring/linking/link_finder.py b/src/nplinker/scoring/linking/link_finder.py index b14a6cfd..0e61be88 100644 --- a/src/nplinker/scoring/linking/link_finder.py +++ b/src/nplinker/scoring/linking/link_finder.py @@ -1,4 +1,5 @@ +from deprecated import deprecated import numpy as np import pandas as pd from scipy.stats import hypergeom @@ -387,6 +388,7 @@ def select_link_candidates(self, # add other potentially relevant knowdledge # If this will grow to more collected information -> create separate function/class bgc_class = [] + # TODO CG: bgc class should be obtained from GCF object for i in link_candidates_pd["GCF id"].astype(int): bgc_class.append(data_links.mapping_gcf["bgc class"][i]) link_candidates_pd["BGC class"] = bgc_class @@ -574,6 +576,7 @@ def get_links(self, return links + @deprecated(version="1.3.3", reason="The unworkable method will be removed") def create_cytoscape_files(self, data_links, network_filename, From ab7268a96e393593506d9545f27529b68ddc37f0 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 2 May 2023 17:38:30 +0200 Subject: [PATCH 39/95] refactor cooccurrence in DataLinks --- src/nplinker/scoring/linking/data_linking.py | 81 +++++++------------ src/nplinker/scoring/linking/link_finder.py | 52 ++++++------ .../scoring/linking/link_likelihood.py | 12 +-- 3 files changed, 62 insertions(+), 83 deletions(-) diff --git a/src/nplinker/scoring/linking/data_linking.py b/src/nplinker/scoring/linking/data_linking.py index e3c566ef..b188d7b8 100644 --- a/src/nplinker/scoring/linking/data_linking.py +++ b/src/nplinker/scoring/linking/data_linking.py @@ -37,7 +37,7 @@ class DataLinks(): """ def __init__(self): - # DataFrame to store occurence of 
objects with respect to strains + # DataFrame to store occurrence of objects with respect to strains # values = 1 where gcf/spec/fam occur in strain, 0 otherwise self.gcf_strain_occurrence = pd.DataFrame() self.spec_strain_occurrence = pd.DataFrame() @@ -50,17 +50,6 @@ def __init__(self): self.mapping_fam = pd.DataFrame() self.mapping_strain = pd.DataFrame() - # correlation matrices for spectra <-> GCFs - self.M_spec_gcf = [ - ] # = int: Number of strains where spec_x and gcf_y co_occure - self.M_spec_notgcf = [ - ] # = int: Number of strains where spec_x and NOT-gcf_y co_occure - self.M_notspec_gcf = [ - ] # = int: Number of strains where NOT-spec_x and gcf_y co_occure - # and the same for mol.families <-> GCFs - self.M_fam_gcf = [] - self.M_fam_notgcf = [] - self.M_notfam_gcf = [] def load_data(self, spectra: Sequence[Spectrum], gcfs: Sequence[GCF], strains: StrainCollection, @@ -90,9 +79,9 @@ def _get_mappings_from_occurrence(self): def find_correlations(self): # collect correlations/ co-occurences logger.debug("Create correlation matrices: spectra<->gcfs.") - self.correlation_matrices(type='spec-gcf') + self._get_cooccurrence(link_type='spec-gcf') logger.debug("Create correlation matrices: mol-families<->gcfs.") - self.correlation_matrices(type='fam-gcf') + self._get_cooccurrence(link_type='fam-gcf') def _get_gcf_strain_occurrence(self, gcfs: Sequence[GCF], strains: StrainCollection) -> None: @@ -202,45 +191,35 @@ def get_common_strains( results[(obj, gcf)] = shared_strains.to_list() return results - def correlation_matrices(self, type='spec-gcf'): - """ - Collect co-occurrences accros strains: - IF type='spec-gcf': - number of co-occurences of spectra and GCFS - --> Output: M_spec_gcf matrix - IF type='fam-gcf': - number of co-occurences of mol.families and GCFS - --> Output: M_fam_gcf matrix - """ + def _get_cooccurrence(self, link_type: str = 'spec-gcf'): + """Calculate co-occurrence for given link types across strains. 
- # Make selection for scenario spec<->gcf or fam<->gcf - if type == 'spec-gcf': - M_type1_strain = self.spec_strain_occurrence - elif type == 'fam-gcf': - M_type1_strain = self.mf_strain_occurrence - elif type == 'spec-bgc' or type == 'fam-bgc': - raise Exception("Given types are not yet supported... ") + Args: + link_type(str): Type of link to calculate co-occurrence for, + either 'spec-gcf' or 'fam-gcf'. + """ + if link_type == 'spec-gcf': + met_strain_occurrence = self.spec_strain_occurrence + elif link_type == 'fam-gcf': + met_strain_occurrence = self.mf_strain_occurrence else: - raise Exception( - "Wrong correlation 'type' given. Must be one of 'spec-gcf', 'fam-gcf', ..." + raise ValueError( + f"Link type {link_type} is not supported. Use 'spec-gcf' or 'fam-gcf' instead." ) - logger.debug(f"Calculating correlation matrices of type: {type}") - - # Calculate correlation matrix from co-occurence matrices - M_type1_gcf, M_type1_notgcf, M_nottype1_gcf, M_nottype1_notgcf = calc_correlation_matrix( - M_type1_strain, self.gcf_strain_occurrence) - - # return results: - if type == 'spec-gcf': - self.M_spec_gcf = M_type1_gcf - self.M_spec_notgcf = M_type1_notgcf - self.M_notspec_gcf = M_nottype1_gcf - self.M_notspec_notgcf = M_nottype1_notgcf - elif type == 'fam-gcf': - self.M_fam_gcf = M_type1_gcf - self.M_fam_notgcf = M_type1_notgcf - self.M_notfam_gcf = M_nottype1_gcf - self.M_notfam_notgcf = M_nottype1_notgcf + logger.debug(f"Calculating correlation matrices of type: {link_type}") + + met_gcf_corr, met_notgcf_corr, notmet_gcf_corr, notmet_notgcf_corr = calc_correlation_matrix( + met_strain_occurrence, self.gcf_strain_occurrence) + + if link_type == 'spec-gcf': + # co-occurrence of spectrum and GCF across strains + self.cooccurrence_spec_gcf = met_gcf_corr + self.cooccurrence_spec_notgcf = met_notgcf_corr + self.cooccurrence_notspec_gcf = notmet_gcf_corr + self.cooccurrence_notspec_notgcf = notmet_notgcf_corr else: - raise Exception("No correct correlation matrix was 
created.") + self.cooccurrence_fam_gcf = met_gcf_corr + self.cooccurrence_fam_notgcf = met_notgcf_corr + self.cooccurrence_notfam_gcf = notmet_gcf_corr + self.cooccurrence_notfam_notgcf = notmet_notgcf_corr diff --git a/src/nplinker/scoring/linking/link_finder.py b/src/nplinker/scoring/linking/link_finder.py index 0e61be88..fb198457 100644 --- a/src/nplinker/scoring/linking/link_finder.py +++ b/src/nplinker/scoring/linking/link_finder.py @@ -141,19 +141,19 @@ def metcalf_scoring(self, # at expected_metcalf[3,6] and sqrt of the variance in the same position if type == 'spec-gcf': - metcalf_scores = np.zeros(data_links.M_spec_gcf.shape) - metcalf_scores = (data_links.M_spec_gcf * both + - data_links.M_spec_notgcf * type1_not_gcf + - data_links.M_notspec_gcf * gcf_not_type1 + - data_links.M_notspec_notgcf * not_type1_not_gcf) + metcalf_scores = np.zeros(data_links.cooccurrence_spec_gcf.shape) + metcalf_scores = (data_links.cooccurrence_spec_gcf * both + + data_links.cooccurrence_spec_notgcf * type1_not_gcf + + data_links.cooccurrence_notspec_gcf * gcf_not_type1 + + data_links.cooccurrence_notspec_notgcf * not_type1_not_gcf) self.metcalf_spec_gcf = metcalf_scores elif type == 'fam-gcf': - metcalf_scores = np.zeros(data_links.M_fam_gcf.shape) - metcalf_scores = (data_links.M_fam_gcf * both + - data_links.M_fam_notgcf * type1_not_gcf + - data_links.M_notfam_gcf * gcf_not_type1 + - data_links.M_notfam_notgcf * not_type1_not_gcf) + metcalf_scores = np.zeros(data_links.cooccurrence_fam_gcf.shape) + metcalf_scores = (data_links.cooccurrence_fam_gcf * both + + data_links.cooccurrence_fam_notgcf * type1_not_gcf + + data_links.cooccurrence_notfam_gcf * gcf_not_type1 + + data_links.cooccurrence_notfam_notgcf * not_type1_not_gcf) self.metcalf_fam_gcf = metcalf_scores return metcalf_scores @@ -168,17 +168,17 @@ def hg_scoring(self, data_links, type='spec-gcf'): # Instead of "number of strains only in GCF", it requires "number of strains in the # GCF PLUS the number shared between 
the GCF and the other object". # e.g. if a spectrum has 3 strains, a GCF has 1 strain and there is 1 shared strain, - # M_spec_gcf will correctly contain "1", but M_type1_notgcf will contain "2" instead + # cooccurrence_spec_gcf will correctly contain "1", but M_type1_notgcf will contain "2" instead # of "3", because the spectrum only has 2 distinct strains vs the GCF. - # To fix this the M_spec_gcf/M_fam_gcf matrix can just be added onto the others to give + # To fix this the cooccurrence_spec_gcf/cooccurrence_fam_gcf matrix can just be added onto the others to give # the correct totals. if type == 'spec-gcf': num_strains = np.ones( - data_links.M_spec_gcf.shape) * data_links.gcf_strain_occurrence.shape[1] - overlap_counts = data_links.M_spec_gcf - gcf_counts = overlap_counts + data_links.M_notspec_gcf - spec_counts = overlap_counts + data_links.M_spec_notgcf + data_links.cooccurrence_spec_gcf.shape) * data_links.gcf_strain_occurrence.shape[1] + overlap_counts = data_links.cooccurrence_spec_gcf + gcf_counts = overlap_counts + data_links.cooccurrence_notspec_gcf + spec_counts = overlap_counts + data_links.cooccurrence_spec_notgcf hg_scores = hypergeom.sf(overlap_counts, num_strains, gcf_counts, @@ -187,10 +187,10 @@ def hg_scoring(self, data_links, type='spec-gcf'): self.hg_spec_gcf = hg_scores elif type == 'fam-gcf': num_strains = np.ones( - data_links.M_fam_gcf.shape) * data_links.gcf_strain_occurrence.shape[1] - overlap_counts = data_links.M_fam_gcf - gcf_counts = overlap_counts + data_links.M_notfam_gcf - fam_counts = overlap_counts + data_links.M_fam_notgcf + data_links.cooccurrence_fam_gcf.shape) * data_links.gcf_strain_occurrence.shape[1] + overlap_counts = data_links.cooccurrence_fam_gcf + gcf_counts = overlap_counts + data_links.cooccurrence_notfam_gcf + fam_counts = overlap_counts + data_links.cooccurrence_fam_notgcf hg_scores = hypergeom.sf(overlap_counts, num_strains, gcf_counts, @@ -224,19 +224,19 @@ def likelihood_scoring(self, """ if type == 
'spec-gcf': - likelihood_scores = np.zeros(data_links.M_spec_gcf.shape) + likelihood_scores = np.zeros(data_links.cooccurrence_spec_gcf.shape) likelihood_scores = ( likelihoods.P_gcf_given_spec * (1 - likelihoods.P_spec_not_gcf) * - (1 - np.exp(-alpha_weighing * data_links.M_spec_gcf))) + (1 - np.exp(-alpha_weighing * data_links.cooccurrence_spec_gcf))) self.likescores_spec_gcf = likelihood_scores elif type == 'fam-gcf': - likelihood_scores = np.zeros(data_links.M_fam_gcf.shape) + likelihood_scores = np.zeros(data_links.cooccurrence_fam_gcf.shape) likelihood_scores = ( likelihoods.P_gcf_given_fam * (1 - likelihoods.P_fam_not_gcf) * - (1 - np.exp(-alpha_weighing * data_links.M_fam_gcf))) + (1 - np.exp(-alpha_weighing * data_links.cooccurrence_fam_gcf))) self.likescores_fam_gcf = likelihood_scores return likelihood_scores @@ -275,7 +275,7 @@ def select_link_candidates(self, P_gcf_not_type1 = likelihoods.P_gcf_not_spec P_type1_given_gcf = likelihoods.P_spec_given_gcf P_type1_not_gcf = likelihoods.P_spec_not_gcf - M_type1_gcf = data_links.M_spec_gcf + M_type1_gcf = data_links.cooccurrence_spec_gcf metcalf_scores = self.metcalf_spec_gcf likescores = self.likescores_spec_gcf index_names = [ @@ -290,7 +290,7 @@ def select_link_candidates(self, P_gcf_not_type1 = likelihoods.P_gcf_not_fam P_type1_given_gcf = likelihoods.P_fam_given_gcf P_type1_not_gcf = likelihoods.P_fam_not_gcf - M_type1_gcf = data_links.M_fam_gcf + M_type1_gcf = data_links.cooccurrence_fam_gcf metcalf_scores = self.metcalf_fam_gcf likescores = self.likescores_fam_gcf index_names = [ diff --git a/src/nplinker/scoring/linking/link_likelihood.py b/src/nplinker/scoring/linking/link_likelihood.py index 90446a18..c8758d19 100644 --- a/src/nplinker/scoring/linking/link_likelihood.py +++ b/src/nplinker/scoring/linking/link_likelihood.py @@ -46,14 +46,14 @@ def calculate_likelihoods(self, data_links, type='spec-gcf'): # Make selection for scenario spec<->gcf or fam<->gcf if type == 'spec-gcf': - M_type1_type2 = 
data_links.M_spec_gcf - M_type1_nottype2 = data_links.M_spec_notgcf - M_nottype1_type2 = data_links.M_notspec_gcf + M_type1_type2 = data_links.cooccurrence_spec_gcf + M_type1_nottype2 = data_links.cooccurrence_spec_notgcf + M_nottype1_type2 = data_links.cooccurrence_notspec_gcf M_type1_cond = data_links.spec_strain_occurrence elif type == 'fam-gcf': - M_type1_type2 = data_links.M_fam_gcf - M_type1_nottype2 = data_links.M_fam_notgcf - M_nottype1_type2 = data_links.M_notfam_gcf + M_type1_type2 = data_links.cooccurrence_fam_gcf + M_type1_nottype2 = data_links.cooccurrence_fam_notgcf + M_nottype1_type2 = data_links.cooccurrence_notfam_gcf M_type1_cond = data_links.mf_strain_occurrence elif type == 'spec-bgc' or type == 'fam-bgc': raise Exception("Given types are not yet supported... ") From 995d6d21024f922fc7dc437c264e8af06edf5f0e Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 2 May 2023 17:52:49 +0200 Subject: [PATCH 40/95] merge `load_data` and `find_correlations` to init in DataLinks --- src/nplinker/scoring/linking/data_linking.py | 172 +++++++++---------- src/nplinker/scoring/metcalf_scoring.py | 5 +- tests/scoring/test_scoring.py | 8 +- 3 files changed, 80 insertions(+), 105 deletions(-) diff --git a/src/nplinker/scoring/linking/data_linking.py b/src/nplinker/scoring/linking/data_linking.py index b188d7b8..07009975 100644 --- a/src/nplinker/scoring/linking/data_linking.py +++ b/src/nplinker/scoring/linking/data_linking.py @@ -1,14 +1,3 @@ -# Methods to find correlations between spectra/molecular families and -# gene clusters/families (BGCs/GCFs) -# -# (still at very much protoype/exploration stage) -# -# Naming: -# M_* stands for a matrix format -# map_* stands for a simple mapping lookup table -# spec stands for spectrum -# fam stands for molecular family - from __future__ import annotations from collections import Counter from typing import Sequence, TYPE_CHECKING @@ -35,54 +24,78 @@ class DataLinks(): 2) Mappings: Lookup-tables that link different ids 
and categories 3) Correlation matrices that show how often spectra/families and GCFs co-occur """ - - def __init__(self): - # DataFrame to store occurrence of objects with respect to strains - # values = 1 where gcf/spec/fam occur in strain, 0 otherwise - self.gcf_strain_occurrence = pd.DataFrame() - self.spec_strain_occurrence = pd.DataFrame() - self.mf_strain_occurrence = pd.DataFrame() - - # mapping tables, check `_get_mappings_from_occurance` for details - # TODO: these mappings could be removed when refactoring LinkFinder - self.mapping_spec = pd.DataFrame() - self.mapping_gcf = pd.DataFrame() - self.mapping_fam = pd.DataFrame() - self.mapping_strain = pd.DataFrame() - - - def load_data(self, spectra: Sequence[Spectrum], gcfs: Sequence[GCF], - strains: StrainCollection, - molfams: Sequence[MolecularFamily]): + def __init__(self, spectra: Sequence[Spectrum], gcfs: Sequence[GCF], + strains: StrainCollection, + molfams: Sequence[MolecularFamily]): logger.debug( - "Create co-occurence matrices: spectra<->strains, gcfs<->strains and mfs<->strains." + "Create occurrence dataframes: spectra<->strains, gcfs<->strains and mfs<->strains." 
) + # DataFrame to store occurrence of gcfs/spectra/mfs with respect to strains + # values = 1 where gcf/spec/fam occur in strain, 0 otherwise self._get_gcf_strain_occurrence(gcfs, strains) self._get_spec_strain_occurrence(spectra, strains) self._get_mf_strain_occurrence(molfams, strains) - self._get_mappings_from_occurrence() - def _get_mappings_from_occurrence(self): - # pd.Series with index = gcf.gcf_id and value = number of strains where gcf occurs - self.mapping_gcf["no of strains"] = np.sum(self.gcf_strain_occurrence, - axis=1) - # pd.Series with index = spectrum.spectrum_id and value = number of strains where spec occurs - self.mapping_spec["no of strains"] = np.sum( - self.spec_strain_occurrence, axis=1) - # pd.Series with index = mf.family_id and value = number of strains where mf occurs - self.mapping_fam["no of strains"] = np.sum(self.mf_strain_occurrence, - axis=1) - # pd.Series with index = strain.id and value = number of spectra in strain - self.mapping_strain["no of spectra"] = np.sum( - self.spec_strain_occurrence, axis=0) + # DataFrame to store mapping tables, check `_get_mappings_from_occurance` for details + # TODO: these mappings could be removed when refactoring LinkFinder + self._get_mappings_from_occurrence() - def find_correlations(self): - # collect correlations/ co-occurences + # np.array to store co-occurrence of "spectra<->gcf" or "mfs<->gcf" logger.debug("Create correlation matrices: spectra<->gcfs.") self._get_cooccurrence(link_type='spec-gcf') logger.debug("Create correlation matrices: mol-families<->gcfs.") self._get_cooccurrence(link_type='fam-gcf') + def get_common_strains( + self, + spectra_or_molfams: Sequence[Spectrum] | Sequence[MolecularFamily], + gcfs: Sequence[GCF], + filter_no_shared: bool = False + ) -> dict[tuple[Spectrum | MolecularFamily, GCF], list[str]]: + """Get common strains between given spectra/molecular families and GCFs. + + Note that SingletonFamily objects are excluded from given `spectra_or_molfams`. 
+ + Args: + spectra_or_molfams(Sequence[Spectrum] | Sequence[MolecularFamily]): + A list of Spectrum or MolecularFamily objects. + gcfs(Sequence[GCF]): A list of GCF objects. + filter_no_shared(bool): If True, return only the pair of + spectrum/molecular family and GCF that have common strains; + otherwise, return all pairs no matter they have common strains + or not. + + Returns: + dict: A dict where the keys are tuples of (Spectrum/MolecularFamily, GCF) + and values are a list of strain ids that appear in both objects. + """ + if not isinstance(spectra_or_molfams[0], (Spectrum, MolecularFamily)): + raise ValueError( + 'Must provide Spectra or MolecularFamilies as the first argument!' + ) + if not isinstance(gcfs[0], GCF): + raise ValueError('Must provide GCFs as the second argument!') + + # Assume that 3 occurrence dataframes have same df.columns (strain ids) + strain_ids = self.gcf_strain_occurrence.columns + results = {} + for obj in spectra_or_molfams: + if isinstance(obj, SingletonFamily): + continue + for gcf in gcfs: + if isinstance(obj, Spectrum): + shared_strains = strain_ids[np.logical_and( + self.spec_strain_occurrence.loc[obj.spectrum_id], + self.gcf_strain_occurrence.loc[gcf.gcf_id])] + else: + shared_strains = strain_ids[np.logical_and( + self.mf_strain_occurrence.loc[obj.family_id], + self.gcf_strain_occurrence.loc[gcf.gcf_id])] + if filter_no_shared and len(shared_strains) == 0: + continue + results[(obj, gcf)] = shared_strains.to_list() + return results + def _get_gcf_strain_occurrence(self, gcfs: Sequence[GCF], strains: StrainCollection) -> None: """Get the occurence of strains in gcfs. 
@@ -141,55 +154,24 @@ def _get_mf_strain_occurrence(self, mfs: Sequence[MolecularFamily], df_mf_strain.loc[mf.family_id, strain.id] = 1 self.mf_strain_occurrence = df_mf_strain - def get_common_strains( - self, - spectra_or_molfams: Sequence[Spectrum] | Sequence[MolecularFamily], - gcfs: Sequence[GCF], - filter_no_shared: bool = False - ) -> dict[tuple[Spectrum | MolecularFamily, GCF], list[str]]: - """Get common strains between given spectra/molecular families and GCFs. - - Note that SingletonFamily objects are excluded from given `spectra_or_molfams`. - - Args: - spectra_or_molfams(Sequence[Spectrum] | Sequence[MolecularFamily]): - A list of Spectrum or MolecularFamily objects. - gcfs(Sequence[GCF]): A list of GCF objects. - filter_no_shared(bool): If True, return only the pair of - spectrum/molecular family and GCF that have common strains; - otherwise, return all pairs no matter they have common strains - or not. - - Returns: - dict: A dict where the keys are tuples of (Spectrum/MolecularFamily, GCF) - and values are a list of strain ids that appear in both objects. - """ - if not isinstance(spectra_or_molfams[0], (Spectrum, MolecularFamily)): - raise ValueError( - 'Must provide Spectra or MolecularFamilies as the first argument!' 
- ) - if not isinstance(gcfs[0], GCF): - raise ValueError('Must provide GCFs as the second argument!') + def _get_mappings_from_occurrence(self): + self.mapping_spec = pd.DataFrame() + self.mapping_gcf = pd.DataFrame() + self.mapping_fam = pd.DataFrame() + self.mapping_strain = pd.DataFrame() - # Assume that 3 occurrence dataframes have same df.columns (strain ids) - strain_ids = self.gcf_strain_occurrence.columns - results = {} - for obj in spectra_or_molfams: - if isinstance(obj, SingletonFamily): - continue - for gcf in gcfs: - if isinstance(obj, Spectrum): - shared_strains = strain_ids[np.logical_and( - self.spec_strain_occurrence.loc[obj.spectrum_id], - self.gcf_strain_occurrence.loc[gcf.gcf_id])] - else: - shared_strains = strain_ids[np.logical_and( - self.mf_strain_occurrence.loc[obj.family_id], - self.gcf_strain_occurrence.loc[gcf.gcf_id])] - if filter_no_shared and len(shared_strains) == 0: - continue - results[(obj, gcf)] = shared_strains.to_list() - return results + # pd.Series with index = gcf.gcf_id and value = number of strains where gcf occurs + self.mapping_gcf["no of strains"] = np.sum(self.gcf_strain_occurrence, + axis=1) + # pd.Series with index = spectrum.spectrum_id and value = number of strains where spec occurs + self.mapping_spec["no of strains"] = np.sum( + self.spec_strain_occurrence, axis=1) + # pd.Series with index = mf.family_id and value = number of strains where mf occurs + self.mapping_fam["no of strains"] = np.sum(self.mf_strain_occurrence, + axis=1) + # pd.Series with index = strain.id and value = number of spectra in strain + self.mapping_strain["no of spectra"] = np.sum( + self.spec_strain_occurrence, axis=0) def _get_cooccurrence(self, link_type: str = 'spec-gcf'): """Calculate co-occurrence for given link types across strains. 
diff --git a/src/nplinker/scoring/metcalf_scoring.py b/src/nplinker/scoring/metcalf_scoring.py index 79e90fc4..b435635b 100644 --- a/src/nplinker/scoring/metcalf_scoring.py +++ b/src/nplinker/scoring/metcalf_scoring.py @@ -77,11 +77,8 @@ def setup(npl): logger.info( 'MetcalfScoring.setup preprocessing dataset (this may take some time)' ) - MetcalfScoring.DATALINKS = DataLinks() - MetcalfScoring.DATALINKS.load_data(npl._spectra, npl._gcfs, + MetcalfScoring.DATALINKS = DataLinks(npl._spectra, npl._gcfs, npl._strains, npl.molfams) - # TODO fix crash with this set to True, see https://github.com/sdrogers/nplinker/issues/57 - MetcalfScoring.DATALINKS.find_correlations() MetcalfScoring.LINKFINDER = LinkFinder() MetcalfScoring.LINKFINDER.metcalf_scoring(MetcalfScoring.DATALINKS, type='spec-gcf') diff --git a/tests/scoring/test_scoring.py b/tests/scoring/test_scoring.py index fe75611f..e43aa5ac 100644 --- a/tests/scoring/test_scoring.py +++ b/tests/scoring/test_scoring.py @@ -71,9 +71,7 @@ def do_scoring_old(gcfs, spectra, strains, standardised): def do_scoring_new(gcfs, spectra, strains, standardised): - datalinks = DataLinks() - datalinks.load_data(spectra, gcfs, strains) - datalinks.find_correlations() + datalinks = DataLinks(spectra, gcfs, strains) lf = LinkFinder() scores = lf.metcalf_scoring(datalinks) @@ -104,9 +102,7 @@ def do_scoring_old_hg(gcfs, spectra, strains): def do_scoring_new_hg(gcfs, spectra, strains): - datalinks = DataLinks() - datalinks.load_data(spectra, gcfs, strains) - datalinks.find_correlations() + datalinks = DataLinks(spectra, gcfs, strains) lf = LinkFinder() scores = lf.hg_scoring(datalinks) return scores From 742e6e5b917186c9400d431eb891fa1ec601fa9c Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 3 May 2023 10:16:52 +0200 Subject: [PATCH 41/95] refactor DataLinks attributes - Move assignment of attributes to `__init__` - Rename attributes - Replace `fam` or `molfam` with `mf` to refer to molecular family - Add docstrings --- 
src/nplinker/process_output.py | 2 +- src/nplinker/scoring/linking/data_linking.py | 167 +++++++++++------- src/nplinker/scoring/linking/link_finder.py | 80 ++++----- .../scoring/linking/link_likelihood.py | 22 +-- src/nplinker/scoring/metcalf_scoring.py | 8 +- tests/scoring/test_scoring.py | 4 +- 6 files changed, 157 insertions(+), 126 deletions(-) diff --git a/src/nplinker/process_output.py b/src/nplinker/process_output.py index 9892a583..27f23929 100644 --- a/src/nplinker/process_output.py +++ b/src/nplinker/process_output.py @@ -51,7 +51,7 @@ def get_sig_spec(data_link, sig_links, scores, gcf_pos, min_n_strains=2): # Check if there are *any* strains in the GCF #Ā No strains = MiBIG # Can also filter if only (e.g. 2 strains) - strain_sum = data_link.gcf_strain_occurrence[gcf_pos, :].sum() + strain_sum = data_link.occurrence_gcf_strain[gcf_pos, :].sum() if strain_sum < min_n_strains: return [] col = sig_links[:, gcf_pos] #Ā get the column diff --git a/src/nplinker/scoring/linking/data_linking.py b/src/nplinker/scoring/linking/data_linking.py index 07009975..3c79f46f 100644 --- a/src/nplinker/scoring/linking/data_linking.py +++ b/src/nplinker/scoring/linking/data_linking.py @@ -1,5 +1,4 @@ from __future__ import annotations -from collections import Counter from typing import Sequence, TYPE_CHECKING import numpy as np import pandas as pd @@ -18,46 +17,93 @@ class DataLinks(): - """ - DataLinks collects and structures co-occurence data - 1) Co-occurences of spectra, families, and GCFs with respect to strains - 2) Mappings: Lookup-tables that link different ids and categories - 3) Correlation matrices that show how often spectra/families and GCFs co-occur - """ - def __init__(self, spectra: Sequence[Spectrum], gcfs: Sequence[GCF], - strains: StrainCollection, - molfams: Sequence[MolecularFamily]): + + def __init__(self, gcfs: Sequence[GCF], spectra: Sequence[Spectrum], + mfs: Sequence[MolecularFamily], + strains: StrainCollection): + """DataLinks class to store 
occurrence and co-occurrence information. + + Occurrence refers to the presence of a spectrum/gcf/mf in a strain. + Co-occurrence refers to the presence of a spectrum/mf and a gcf in a strain. + + Args: + gcfs(Sequence[GCF]): A list of GCF objects. + spectra(Sequence[Spectrum]): A list of Spectrum objects. + mfs(Sequence[MolecularFamily]): A list of MolecularFamily objects. + strains(StrainCollection): A StrainCollection object. + + Attributes: + occurrence_gcf_strain(pd.DataFrame): A DataFrame to store occurrence of + gcfs with respect to strains. + occurrence_spec_strain(pd.DataFrame): A DataFrame to store occurrence of + spectra with respect to strains. + occurrence_mf_strain(pd.DataFrame): A DataFrame to store occurrence of + molecular families with respect to strains. + cooccurrence_spec_gcf(np.array): A 2D numpy array to store co-occurrence + of spectra<->gcfs. + cooccurrence_spec_notgcf(np.array): A 2D numpy array to store co-occurrence + of spectra<->not gcfs. + cooccurrence_notspec_gcf(np.array): A 2D numpy array to store co-occurrence + of not spectra<->gcfs. + cooccurrence_notspec_notgcf(np.array): A 2D numpy array to store co-occurrence + of not spectra<->not gcfs. + cooccurrence_mf_gcf(np.array): A 2D numpy array to store co-occurrence + of molecular families<->gcfs. + cooccurrence_mf_notgcf(np.array): A 2D numpy array to store co-occurrence + of molecular families<->not gcfs. + cooccurrence_notmf_gcf(np.array): A 2D numpy array to store co-occurrence + of not molecular families<->gcfs. + cooccurrence_notmf_notgcf(np.array): A 2D numpy array to store co-occurrence + of not molecular families<->not gcfs. + mapping_gcf(pd.DataFrame): A DataFrame to store mappings for gcfs. + mapping_spec(pd.DataFrame): A DataFrame to store mappings for spectra. + mapping_mf(pd.DataFrame): A DataFrame to store mappings for molecular families. + mapping_strain(pd.DataFrame): A DataFrame to store mappings for strains. 
+ """ logger.debug( "Create occurrence dataframes: spectra<->strains, gcfs<->strains and mfs<->strains." ) # DataFrame to store occurrence of gcfs/spectra/mfs with respect to strains # values = 1 where gcf/spec/fam occur in strain, 0 otherwise - self._get_gcf_strain_occurrence(gcfs, strains) - self._get_spec_strain_occurrence(spectra, strains) - self._get_mf_strain_occurrence(molfams, strains) + self.occurrence_gcf_strain = self._get_occurrence_gcf_strain( + gcfs, strains) + self.occurrence_spec_strain = self._get_occurrence_spec_strain( + spectra, strains) + self.occurrence_mf_strain = self._get_occurrence_mf_strain( + mfs, strains) # DataFrame to store mapping tables, check `_get_mappings_from_occurance` for details # TODO: these mappings could be removed when refactoring LinkFinder + self.mapping_spec = pd.DataFrame() + self.mapping_gcf = pd.DataFrame() + self.mapping_fam = pd.DataFrame() + self.mapping_strain = pd.DataFrame() self._get_mappings_from_occurrence() # np.array to store co-occurrence of "spectra<->gcf" or "mfs<->gcf" logger.debug("Create correlation matrices: spectra<->gcfs.") - self._get_cooccurrence(link_type='spec-gcf') + (self.cooccurrence_spec_gcf, self.cooccurrence_spec_notgcf, + self.cooccurrence_notspec_gcf, + self.cooccurrence_notspec_notgcf) = self._get_cooccurrence( + link_type='spec-gcf') logger.debug("Create correlation matrices: mol-families<->gcfs.") - self._get_cooccurrence(link_type='fam-gcf') + (self.cooccurrence_mf_gcf, self.cooccurrence_mf_notgcf, + self.cooccurrence_notmf_gcf, + self.cooccurrence_notmf_notgcf) = self._get_cooccurrence( + link_type='mf-gcf') def get_common_strains( self, - spectra_or_molfams: Sequence[Spectrum] | Sequence[MolecularFamily], + spectra_or_mfs: Sequence[Spectrum] | Sequence[MolecularFamily], gcfs: Sequence[GCF], filter_no_shared: bool = False ) -> dict[tuple[Spectrum | MolecularFamily, GCF], list[str]]: """Get common strains between given spectra/molecular families and GCFs. 
- Note that SingletonFamily objects are excluded from given `spectra_or_molfams`. + Note that SingletonFamily objects are excluded from given `spectra_or_mfs`. Args: - spectra_or_molfams(Sequence[Spectrum] | Sequence[MolecularFamily]): + spectra_or_mfs(Sequence[Spectrum] | Sequence[MolecularFamily]): A list of Spectrum or MolecularFamily objects. gcfs(Sequence[GCF]): A list of GCF objects. filter_no_shared(bool): If True, return only the pair of @@ -69,7 +115,7 @@ def get_common_strains( dict: A dict where the keys are tuples of (Spectrum/MolecularFamily, GCF) and values are a list of strain ids that appear in both objects. """ - if not isinstance(spectra_or_molfams[0], (Spectrum, MolecularFamily)): + if not isinstance(spectra_or_mfs[0], (Spectrum, MolecularFamily)): raise ValueError( 'Must provide Spectra or MolecularFamilies as the first argument!' ) @@ -77,27 +123,27 @@ def get_common_strains( raise ValueError('Must provide GCFs as the second argument!') # Assume that 3 occurrence dataframes have same df.columns (strain ids) - strain_ids = self.gcf_strain_occurrence.columns + strain_ids = self.occurrence_gcf_strain.columns results = {} - for obj in spectra_or_molfams: + for obj in spectra_or_mfs: if isinstance(obj, SingletonFamily): continue for gcf in gcfs: if isinstance(obj, Spectrum): shared_strains = strain_ids[np.logical_and( - self.spec_strain_occurrence.loc[obj.spectrum_id], - self.gcf_strain_occurrence.loc[gcf.gcf_id])] + self.occurrence_spec_strain.loc[obj.spectrum_id], + self.occurrence_gcf_strain.loc[gcf.gcf_id])] else: shared_strains = strain_ids[np.logical_and( - self.mf_strain_occurrence.loc[obj.family_id], - self.gcf_strain_occurrence.loc[gcf.gcf_id])] + self.occurrence_mf_strain.loc[obj.family_id], + self.occurrence_gcf_strain.loc[gcf.gcf_id])] if filter_no_shared and len(shared_strains) == 0: continue results[(obj, gcf)] = shared_strains.to_list() return results - def _get_gcf_strain_occurrence(self, gcfs: Sequence[GCF], - strains: 
StrainCollection) -> None: + def _get_occurrence_gcf_strain(self, gcfs: Sequence[GCF], + strains: StrainCollection) -> pd.DataFrame: """Get the occurence of strains in gcfs. The occurence is a DataFrame with gcfs as rows and strains as columns, @@ -111,10 +157,10 @@ def _get_gcf_strain_occurrence(self, gcfs: Sequence[GCF], for strain in strains: if gcf.has_strain(strain): df_gcf_strain.loc[gcf.gcf_id, strain.id] = 1 - self.gcf_strain_occurrence = df_gcf_strain + return df_gcf_strain - def _get_spec_strain_occurrence(self, spectra: Sequence[Spectrum], - strains: StrainCollection) -> None: + def _get_occurrence_spec_strain(self, spectra: Sequence[Spectrum], + strains: StrainCollection) -> pd.DataFrame: """Get the occurence of strains in spectra. The occurence is a DataFrame with spectra as rows and strains as columns, @@ -129,10 +175,10 @@ def _get_spec_strain_occurrence(self, spectra: Sequence[Spectrum], for strain in strains: if spectrum.has_strain(strain): df_spec_strain.loc[spectrum.spectrum_id, strain.id] = 1 - self.spec_strain_occurrence = df_spec_strain + return df_spec_strain - def _get_mf_strain_occurrence(self, mfs: Sequence[MolecularFamily], - strains: StrainCollection) -> None: + def _get_occurrence_mf_strain(self, mfs: Sequence[MolecularFamily], + strains: StrainCollection) -> pd.DataFrame: """Get the occurence of strains in molecular families. 
The occurence is a DataFrame with molecular families as rows and @@ -152,56 +198,41 @@ def _get_mf_strain_occurrence(self, mfs: Sequence[MolecularFamily], for strain in strains: if mf.has_strain(strain): df_mf_strain.loc[mf.family_id, strain.id] = 1 - self.mf_strain_occurrence = df_mf_strain + return df_mf_strain def _get_mappings_from_occurrence(self): - self.mapping_spec = pd.DataFrame() - self.mapping_gcf = pd.DataFrame() - self.mapping_fam = pd.DataFrame() - self.mapping_strain = pd.DataFrame() # pd.Series with index = gcf.gcf_id and value = number of strains where gcf occurs - self.mapping_gcf["no of strains"] = np.sum(self.gcf_strain_occurrence, - axis=1) + self.mapping_gcf["no of strains"] = self.occurrence_gcf_strain.sum( + axis=1) # pd.Series with index = spectrum.spectrum_id and value = number of strains where spec occurs - self.mapping_spec["no of strains"] = np.sum( - self.spec_strain_occurrence, axis=1) + self.mapping_spec["no of strains"] = self.occurrence_spec_strain.sum( + axis=1) # pd.Series with index = mf.family_id and value = number of strains where mf occurs - self.mapping_fam["no of strains"] = np.sum(self.mf_strain_occurrence, - axis=1) + self.mapping_fam["no of strains"] = self.occurrence_mf_strain.sum( + axis=1) # pd.Series with index = strain.id and value = number of spectra in strain - self.mapping_strain["no of spectra"] = np.sum( - self.spec_strain_occurrence, axis=0) + self.mapping_strain["no of spectra"] = self.occurrence_spec_strain.sum( + axis=0) - def _get_cooccurrence(self, link_type: str = 'spec-gcf'): + def _get_cooccurrence( + self, + link_type: str = 'spec-gcf' + ) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: """Calculate co-occurrence for given link types across strains. Args: link_type(str): Type of link to calculate co-occurrence for, - either 'spec-gcf' or 'fam-gcf'. + either 'spec-gcf' or 'mf-gcf'. 
""" if link_type == 'spec-gcf': - met_strain_occurrence = self.spec_strain_occurrence - elif link_type == 'fam-gcf': - met_strain_occurrence = self.mf_strain_occurrence + met_strain_occurrence = self.occurrence_spec_strain + elif link_type == 'mf-gcf': + met_strain_occurrence = self.occurrence_mf_strain else: raise ValueError( - f"Link type {link_type} is not supported. Use 'spec-gcf' or 'fam-gcf' instead." + f"Link type {link_type} is not supported. Use 'spec-gcf' or 'mf-gcf' instead." ) - logger.debug(f"Calculating correlation matrices of type: {link_type}") - - met_gcf_corr, met_notgcf_corr, notmet_gcf_corr, notmet_notgcf_corr = calc_correlation_matrix( - met_strain_occurrence, self.gcf_strain_occurrence) - - if link_type == 'spec-gcf': - # co-occurrence of spectrum and GCF across strains - self.cooccurrence_spec_gcf = met_gcf_corr - self.cooccurrence_spec_notgcf = met_notgcf_corr - self.cooccurrence_notspec_gcf = notmet_gcf_corr - self.cooccurrence_notspec_notgcf = notmet_notgcf_corr - else: - self.cooccurrence_fam_gcf = met_gcf_corr - self.cooccurrence_fam_notgcf = met_notgcf_corr - self.cooccurrence_notfam_gcf = notmet_gcf_corr - self.cooccurrence_notfam_notgcf = notmet_notgcf_corr + return calc_correlation_matrix(met_strain_occurrence, + self.occurrence_gcf_strain) diff --git a/src/nplinker/scoring/linking/link_finder.py b/src/nplinker/scoring/linking/link_finder.py index fb198457..7fcdae03 100644 --- a/src/nplinker/scoring/linking/link_finder.py +++ b/src/nplinker/scoring/linking/link_finder.py @@ -72,17 +72,17 @@ def get_scores(self, method, type_): if method == 'metcalf': if type_ == 'spec-gcf': return self.metcalf_spec_gcf - elif type_ == 'fam-gcf': + elif type_ == 'mf-gcf': return self.metcalf_fam_gcf elif method == 'likescore': if type_ == 'spec-gcf': return self.likescores_spec_gcf - elif type_ == 'fam-gcf': + elif type_ == 'mf-gcf': return self.likescores_fam_gcf elif method == 'hg': if type_ == 'spec-gcf': return self.hg_spec_gcf - elif type_ == 
'fam-gcf': + elif type_ == 'mf-gcf': return self.hg_fam_gcf raise Exception( @@ -102,7 +102,7 @@ def metcalf_scoring(self, # Compute the expected values for all possible values of spec and gcf strains # we need the total number of strains - _, n_strains = data_links.gcf_strain_occurrence.shape + _, n_strains = data_links.occurrence_gcf_strain.shape if self.metcalf_expected is None: sz = (n_strains + 1, n_strains + 1) self.metcalf_expected = np.zeros(sz) @@ -148,12 +148,12 @@ def metcalf_scoring(self, data_links.cooccurrence_notspec_notgcf * not_type1_not_gcf) self.metcalf_spec_gcf = metcalf_scores - elif type == 'fam-gcf': - metcalf_scores = np.zeros(data_links.cooccurrence_fam_gcf.shape) - metcalf_scores = (data_links.cooccurrence_fam_gcf * both + - data_links.cooccurrence_fam_notgcf * type1_not_gcf + - data_links.cooccurrence_notfam_gcf * gcf_not_type1 + - data_links.cooccurrence_notfam_notgcf * not_type1_not_gcf) + elif type == 'mf-gcf': + metcalf_scores = np.zeros(data_links.cooccurrence_mf_gcf.shape) + metcalf_scores = (data_links.cooccurrence_mf_gcf * both + + data_links.cooccurrence_mf_notgcf * type1_not_gcf + + data_links.cooccurrence_notmf_gcf * gcf_not_type1 + + data_links.cooccurrence_notmf_notgcf * not_type1_not_gcf) self.metcalf_fam_gcf = metcalf_scores return metcalf_scores @@ -170,12 +170,12 @@ def hg_scoring(self, data_links, type='spec-gcf'): # e.g. if a spectrum has 3 strains, a GCF has 1 strain and there is 1 shared strain, # cooccurrence_spec_gcf will correctly contain "1", but M_type1_notgcf will contain "2" instead # of "3", because the spectrum only has 2 distinct strains vs the GCF. - # To fix this the cooccurrence_spec_gcf/cooccurrence_fam_gcf matrix can just be added onto the others to give + # To fix this the cooccurrence_spec_gcf/cooccurrence_mf_gcf matrix can just be added onto the others to give # the correct totals. 
if type == 'spec-gcf': num_strains = np.ones( - data_links.cooccurrence_spec_gcf.shape) * data_links.gcf_strain_occurrence.shape[1] + data_links.cooccurrence_spec_gcf.shape) * data_links.occurrence_gcf_strain.shape[1] overlap_counts = data_links.cooccurrence_spec_gcf gcf_counts = overlap_counts + data_links.cooccurrence_notspec_gcf spec_counts = overlap_counts + data_links.cooccurrence_spec_notgcf @@ -185,12 +185,12 @@ def hg_scoring(self, data_links, type='spec-gcf'): spec_counts, loc=1) self.hg_spec_gcf = hg_scores - elif type == 'fam-gcf': + elif type == 'mf-gcf': num_strains = np.ones( - data_links.cooccurrence_fam_gcf.shape) * data_links.gcf_strain_occurrence.shape[1] - overlap_counts = data_links.cooccurrence_fam_gcf - gcf_counts = overlap_counts + data_links.cooccurrence_notfam_gcf - fam_counts = overlap_counts + data_links.cooccurrence_fam_notgcf + data_links.cooccurrence_mf_gcf.shape) * data_links.occurrence_gcf_strain.shape[1] + overlap_counts = data_links.cooccurrence_mf_gcf + gcf_counts = overlap_counts + data_links.cooccurrence_notmf_gcf + fam_counts = overlap_counts + data_links.cooccurrence_mf_notgcf hg_scores = hypergeom.sf(overlap_counts, num_strains, gcf_counts, @@ -232,11 +232,11 @@ def likelihood_scoring(self, self.likescores_spec_gcf = likelihood_scores - elif type == 'fam-gcf': - likelihood_scores = np.zeros(data_links.cooccurrence_fam_gcf.shape) + elif type == 'mf-gcf': + likelihood_scores = np.zeros(data_links.cooccurrence_mf_gcf.shape) likelihood_scores = ( likelihoods.P_gcf_given_fam * (1 - likelihoods.P_fam_not_gcf) * - (1 - np.exp(-alpha_weighing * data_links.cooccurrence_fam_gcf))) + (1 - np.exp(-alpha_weighing * data_links.cooccurrence_mf_gcf))) self.likescores_fam_gcf = likelihood_scores return likelihood_scores @@ -247,11 +247,11 @@ def select_link_candidates(self, P_cutoff=0.8, main_score='likescore', score_cutoff=0, - type='fam-gcf'): + type='mf-gcf'): """ Look for potential best candidate for links between IF type='spec-gcf': GCFs 
and spectra - IF type='fam-gcf': GCFs and mol.families + IF type='mf-gcf': GCFs and mol.families Parameters ---------- @@ -269,7 +269,7 @@ def select_link_candidates(self, score >= score_cutoff """ - # Select scenario: spec<->gcf or fam<->gcf + # Select scenario: spec<->gcf or mf<->gcf if type == 'spec-gcf': P_gcf_given_type1 = likelihoods.P_gcf_given_spec P_gcf_not_type1 = likelihoods.P_gcf_not_spec @@ -285,12 +285,12 @@ def select_link_candidates(self, "link prob specific" ] - elif type == 'fam-gcf': + elif type == 'mf-gcf': P_gcf_given_type1 = likelihoods.P_gcf_given_fam P_gcf_not_type1 = likelihoods.P_gcf_not_fam P_type1_given_gcf = likelihoods.P_fam_given_gcf P_type1_not_gcf = likelihoods.P_fam_not_gcf - M_type1_gcf = data_links.cooccurrence_fam_gcf + M_type1_gcf = data_links.cooccurrence_mf_gcf metcalf_scores = self.metcalf_fam_gcf likescores = self.likescores_fam_gcf index_names = [ @@ -304,7 +304,7 @@ def select_link_candidates(self, raise Exception("Given types are not yet supported... ") else: raise Exception( - "Wrong correlation 'type' given. Must be one of 'spec-gcf', 'fam-gcf'..." + "Wrong correlation 'type' given. Must be one of 'spec-gcf', 'mf-gcf'..." 
) dim1, dim2 = P_gcf_given_type1.shape @@ -341,14 +341,14 @@ def select_link_candidates(self, Nx_list = data_links.mapping_gcf["no of strains"] if type == 'spec-gcf': Ny_list = data_links.mapping_spec["no of strains"] - elif type == 'fam-gcf': + elif type == 'mf-gcf': Ny_list = data_links.mapping_fam["no of strains"] # Calculate probabilities of finding a spectrum in a certain strain P_str = np.array(data_links.mapping_strain["no of spectra"]) P_str = P_str / np.sum(P_str) - num_strains = data_links.gcf_strain_occurrence.shape[1] + num_strains = data_links.occurrence_gcf_strain.shape[1] # Calculate the hypergeometric probability (as before) for i in range(link_candidates.shape[1]): @@ -363,7 +363,7 @@ def select_link_candidates(self, id_gcf = link_candidates[1, i] # find set of strains which contain GCF with id link_candidates[1,i] - XG = np.where(data_links.gcf_strain_occurrence.loc[id_gcf, :] == 1)[0] + XG = np.where(data_links.occurrence_gcf_strain.loc[id_gcf, :] == 1)[0] link_candidates[10, i] = pair_prob_approx(P_str, XG, @@ -372,11 +372,11 @@ def select_link_candidates(self, # Calculate the link specific probability # Find strains where GCF and spectra/family co-occur if type == 'spec-gcf': - XGS = np.where((data_links.gcf_strain_occurrence[id_gcf, :] == 1) & - (data_links.spec_strain_occurrence[id_spec, :] == 1))[0] - elif type == 'fam-gcf': - XGS = np.where((data_links.gcf_strain_occurrence[id_gcf, :] == 1) - & (data_links.mf_strain_occurrence[id_spec, :] == 1))[0] + XGS = np.where((data_links.occurrence_gcf_strain[id_gcf, :] == 1) & + (data_links.occurrence_spec_strain[id_spec, :] == 1))[0] + elif type == 'mf-gcf': + XGS = np.where((data_links.occurrence_gcf_strain[id_gcf, :] == 1) + & (data_links.occurrence_mf_strain[id_spec, :] == 1))[0] link_candidates[11, i] = link_prob(P_str, XGS, int(Nx_list[id_gcf]), int(Ny_list[id_spec]), num_strains) @@ -401,7 +401,7 @@ def select_link_candidates(self, # return results if type == 'spec-gcf': 
self.link_candidates_gcf_spec = link_candidates_pd - elif type == 'fam-gcf': + elif type == 'mf-gcf': self.link_candidates_gcf_fam = link_candidates_pd else: raise Exception("No candidate selection was created.") @@ -580,7 +580,7 @@ def get_links(self, def create_cytoscape_files(self, data_links, network_filename, - link_type='fam-gcf', + link_type='mf-gcf', score_type='metcalf'): """ Create network file for import into Cytoscape. @@ -603,7 +603,7 @@ def create_cytoscape_files(self, import networkx as nx NPlinker_net = nx.Graph() - if link_type == 'fam-gcf': + if link_type == 'mf-gcf': link_candidates = self.link_candidates_gcf_fam type1str = 'family_id' elif link_type == 'spec-gcf': @@ -669,7 +669,7 @@ def plot_candidates(self, P_cutoff=0.8, score_type='likescore', score_cutoff=0, - type='fam-gcf'): + type='mf-gcf'): """ Plot best rated correlations between gcfs and spectra/families plot in form of seaborn clustermap @@ -690,7 +690,7 @@ def plot_candidates(self, selected_ids = np.where( (link_candidates["P(gcf|spec)"] > P_cutoff) & (link_candidates[scorestr] > score_cutoff))[0] - elif type == 'fam-gcf': + elif type == 'mf-gcf': link_candidates = self.link_candidates_gcf_fam selected_ids = np.where( (link_candidates["P(gcf|fam)"] > P_cutoff) @@ -731,7 +731,7 @@ def plot_candidates(self, columns=mapping_gcfs.astype(int)) if type == 'spec-gcf': M_links.index.name = 'spectrum number' - elif type == 'fam-gcf': + elif type == 'mf-gcf': M_links.index.name = 'molecular family number' M_links.columns.name = 'gene cluster family (GCF)' diff --git a/src/nplinker/scoring/linking/link_likelihood.py b/src/nplinker/scoring/linking/link_likelihood.py index c8758d19..830e0676 100644 --- a/src/nplinker/scoring/linking/link_likelihood.py +++ b/src/nplinker/scoring/linking/link_likelihood.py @@ -39,27 +39,27 @@ def calculate_likelihoods(self, data_links, type='spec-gcf'): IF type='spec-gcf': P(GCF_x | spec_y), P(spec_y | GCF_x), P(GCF_x | not spec_y), P(spec_y | not GCF_x) - IF 
type='fam-gcf': + IF type='mf-gcf': P(GCF_x | fam_y), P(fam_y | GCF_x), P(GCF_x | not fam_y), P(fam_y | not GCF_x) """ - # Make selection for scenario spec<->gcf or fam<->gcf + # Make selection for scenario spec<->gcf or mf<->gcf if type == 'spec-gcf': M_type1_type2 = data_links.cooccurrence_spec_gcf M_type1_nottype2 = data_links.cooccurrence_spec_notgcf M_nottype1_type2 = data_links.cooccurrence_notspec_gcf - M_type1_cond = data_links.spec_strain_occurrence - elif type == 'fam-gcf': - M_type1_type2 = data_links.cooccurrence_fam_gcf - M_type1_nottype2 = data_links.cooccurrence_fam_notgcf - M_nottype1_type2 = data_links.cooccurrence_notfam_gcf - M_type1_cond = data_links.mf_strain_occurrence + M_type1_cond = data_links.occurrence_spec_strain + elif type == 'mf-gcf': + M_type1_type2 = data_links.cooccurrence_mf_gcf + M_type1_nottype2 = data_links.cooccurrence_mf_notgcf + M_nottype1_type2 = data_links.cooccurrence_notmf_gcf + M_type1_cond = data_links.occurrence_mf_strain elif type == 'spec-bgc' or type == 'fam-bgc': raise Exception("Given types are not yet supported... ") else: raise Exception( - "Wrong correlation 'type' given. Must be one of 'spec-gcf', 'fam-gcf'..." + "Wrong correlation 'type' given. Must be one of 'spec-gcf', 'mf-gcf'..." 
) logger.debug( @@ -67,7 +67,7 @@ def calculate_likelihoods(self, data_links, type='spec-gcf'): # Calculate likelihood matrices using calc_likelihood_matrix() P_type2_given_type1, P_type2_not_type1, P_type1_given_type2, \ P_type1_not_type2 = calc_likelihood_matrix(M_type1_cond, - data_links.gcf_strain_occurrence, + data_links.occurrence_gcf_strain, M_type1_type2, M_type1_nottype2, M_nottype1_type2) @@ -76,7 +76,7 @@ def calculate_likelihoods(self, data_links, type='spec-gcf'): self.P_gcf_not_spec = P_type2_not_type1 self.P_spec_given_gcf = P_type1_given_type2 self.P_spec_not_gcf = P_type1_not_type2 - elif type == 'fam-gcf': + elif type == 'mf-gcf': self.P_gcf_given_fam = P_type2_given_type1 self.P_gcf_not_fam = P_type2_not_type1 self.P_fam_given_gcf = P_type1_given_type2 diff --git a/src/nplinker/scoring/metcalf_scoring.py b/src/nplinker/scoring/metcalf_scoring.py index b435635b..00c04487 100644 --- a/src/nplinker/scoring/metcalf_scoring.py +++ b/src/nplinker/scoring/metcalf_scoring.py @@ -77,13 +77,13 @@ def setup(npl): logger.info( 'MetcalfScoring.setup preprocessing dataset (this may take some time)' ) - MetcalfScoring.DATALINKS = DataLinks(npl._spectra, npl._gcfs, - npl._strains, npl.molfams) + MetcalfScoring.DATALINKS = DataLinks(npl._gcfs, npl._spectra, + npl._molfams, npl._strains) MetcalfScoring.LINKFINDER = LinkFinder() MetcalfScoring.LINKFINDER.metcalf_scoring(MetcalfScoring.DATALINKS, type='spec-gcf') MetcalfScoring.LINKFINDER.metcalf_scoring(MetcalfScoring.DATALINKS, - type='fam-gcf') + type='mf-gcf') logger.debug('MetcalfScoring.setup caching results') save_pickled_data((dataset_counts, MetcalfScoring.DATALINKS, MetcalfScoring.LINKFINDER), cache_file) @@ -238,7 +238,7 @@ def get_links(self, objects, link_collection): 'MetcalfScoring: input_type=GCF, result_type=Spec/MolFam, inputs={}, results={}' .format(len(objects), results[0].shape)) # for GCF input, results contains two arrays of shape (3, x), - # which contain spec-gcf and fam-gcf links 
respectively + # which contain spec-gcf and mf-gcf links respectively result_gcf_spec, result_gcf_fam = results[0], results[1] for res, type_ in [(result_gcf_spec, Spectrum), diff --git a/tests/scoring/test_scoring.py b/tests/scoring/test_scoring.py index e43aa5ac..8c46a28e 100644 --- a/tests/scoring/test_scoring.py +++ b/tests/scoring/test_scoring.py @@ -71,7 +71,7 @@ def do_scoring_old(gcfs, spectra, strains, standardised): def do_scoring_new(gcfs, spectra, strains, standardised): - datalinks = DataLinks(spectra, gcfs, strains) + datalinks = DataLinks(gcfs, spectra, strains) lf = LinkFinder() scores = lf.metcalf_scoring(datalinks) @@ -102,7 +102,7 @@ def do_scoring_old_hg(gcfs, spectra, strains): def do_scoring_new_hg(gcfs, spectra, strains): - datalinks = DataLinks(spectra, gcfs, strains) + datalinks = DataLinks(gcfs, spectra, strains) lf = LinkFinder() scores = lf.hg_scoring(datalinks) return scores From 66cda2c0b45a2ce3adb06c22c7b91b95be6ff00d Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 3 May 2023 12:14:47 +0200 Subject: [PATCH 42/95] Delete test_data_links.py --- tests/scoring/test_data_links.py | 44 -------------------------------- 1 file changed, 44 deletions(-) delete mode 100644 tests/scoring/test_data_links.py diff --git a/tests/scoring/test_data_links.py b/tests/scoring/test_data_links.py deleted file mode 100644 index e794a29b..00000000 --- a/tests/scoring/test_data_links.py +++ /dev/null @@ -1,44 +0,0 @@ -import os -import pytest -from nplinker.metabolomics.gnps.gnps_molecular_family_loader import \ - GNPSMolecularFamilyLoader -from nplinker.metabolomics.metabolomics import make_families -from nplinker.metabolomics.spectrum import Spectrum -from nplinker.scoring.linking.data_linking import DataLinks -from .. 
import DATA_DIR - - -@pytest.fixture -def spec_with_families(spec_dict) -> dict[str, Spectrum]: - make_families(spec_dict.values()) - return spec_dict - -@pytest.fixture -def molecular_families_gnps(): - filename = os.path.join(DATA_DIR, "edges.pairsinfo") - sut = GNPSMolecularFamilyLoader(filename) - return sut.families() - -def test_collect_mappings_from_spectra(spec_with_families): - sut = DataLinks() - actual = sut._collect_mappings_from_spectra(spec_with_families.values()) - - assert actual.shape == (25935,3) - - -def test_collect_mappings_from_molecular_families(molecular_families_gnps): - sut = DataLinks() - actual = sut._collect_mappings_from_molecular_families(molecular_families_gnps) - - assert actual.shape == (25935,3) - - -def test_mappings_are_equal(spec_with_families, molecular_families_gnps): - sut = DataLinks() - sut._collect_mappings_from_spectra(spec_with_families.values()) - actual = sut.mapping_spec - - sut._collect_mappings_from_molecular_families(molecular_families_gnps) - expected = sut.mapping_spec - - assert actual.eq(expected).all(axis=None) From 7094d0d99da482db64b2803077c46392a65ea69a Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 3 May 2023 11:49:53 +0200 Subject: [PATCH 43/95] update get_common_strains methods - update parameters to be more clear and specific - change strain id in returned dict to strain objects - update docstrings --- src/nplinker/nplinker.py | 104 +++++++------------ src/nplinker/scoring/linking/data_linking.py | 16 +-- src/nplinker/scoring/np_class_scoring.py | 10 +- 3 files changed, 52 insertions(+), 78 deletions(-) diff --git a/src/nplinker/nplinker.py b/src/nplinker/nplinker.py index e47c35fb..bc598723 100644 --- a/src/nplinker/nplinker.py +++ b/src/nplinker/nplinker.py @@ -1,20 +1,8 @@ -# Copyright 2021 The NPLinker Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +from __future__ import annotations import copy import logging import sys +from typing import TYPE_CHECKING from .config import Args from .config import Config from .genomics import BGC @@ -30,6 +18,9 @@ from .scoring.rosetta_scoring import RosettaScoring from .strain_collection import StrainCollection +if TYPE_CHECKING: + from collections.abc import Sequence + from .strains import Strain logger = LogConfig.getLogger(__name__) @@ -274,8 +265,7 @@ def load_data(self, new_bigscape_cutoff=None, met_only=False): else: # CG: only reload genomics data when changing bigscape cutoff # TODO: this part should be removed, reload everything if bigscape data changes. - logger.debug( - f'load_data with new cutoff = {new_bigscape_cutoff}') + logger.debug(f'load_data with new cutoff = {new_bigscape_cutoff}') # 1. change the cutoff (which by itself doesn't do anything) self._loader._bigscape_cutoff = new_bigscape_cutoff # 2. 
reload the strain mappings (MiBIG filtering may have removed strains @@ -435,19 +425,19 @@ def get_links(self, input_objects, scoring_methods, and_mode=True): filter(lambda x: not isinstance(x, BGC), link_data.keys())) if len(targets) > 0: if isinstance(source, GCF): - shared_strains = self._datalinks.get_common_strains(targets, [source], True) + shared_strains = self._datalinks.get_common_strains( + targets, [source], True) for target, link in link_data.items(): if (target, source) in shared_strains: - link.shared_strains = [ - self._strains.lookup(strain_id) for strain_id - in shared_strains[(target, source)]] + link.shared_strains = shared_strains[(target, + source)] else: - shared_strains = self._datalinks.get_common_strains([source], targets, True) + shared_strains = self._datalinks.get_common_strains( + [source], targets, True) for target, link in link_data.items(): if (source, target) in shared_strains: - link.shared_strains = [ - self._strains.lookup(strain_id) for strain_id - in shared_strains[(source, target)]] + link.shared_strains = shared_strains[(source, + target)] logger.debug('Finished calculating shared strain information') @@ -455,55 +445,32 @@ def get_links(self, input_objects, scoring_methods, and_mode=True): len(link_collection))) return link_collection - def get_common_strains(self, objects_a, objects_b, filter_no_shared=True): - """Retrive strains shared by arbitrary pairs of objects. - - Two lists of objects are required as input. Typically one list will be - MolecularFamily or Spectrum objects and the other GCF (which list is which - doesn't matter). + def get_common_strains( + self, + met: Sequence[Spectrum] | Sequence[MolecularFamily], + gcfs: Sequence[GCF], + filter_no_shared: bool = True + ) -> dict[tuple[Spectrum | MolecularFamily, GCF], list[Strain]]: + """Get common strains between given spectra/molecular families and GCFs. - The return value is a dict mapping pairs of objects to lists of Strain objects - shared by that pair. 
This list may be empty if ``filter_no_shared`` is False, - indicating no shared strains were found. - - If ``filter_no_shared`` is True, every entry in the dict with no shared strains - will be removed before it is returned, so the only entries will be those for - which shared strains exist. + Note that SingletonFamily objects are excluded from given molecular families. Args: - objects_a (list): a list of Spectrum/MolecularFamily/GCF objects - objects_b (list): a list of Spectrum/MolecularFamily/GCF objects - filter_no_shared (bool): if True, remove result entries for which no shared strains exist + met(Sequence[Spectrum] | Sequence[MolecularFamily]): + A list of Spectrum or MolecularFamily objects. + gcfs(Sequence[GCF]): A list of GCF objects. + filter_no_shared(bool): If True, the pairs of spectrum/mf and GCF + without common strains will be removed from the returned dict; Returns: - A dict mapping pairs of objects (obj1, obj2) to lists of Strain objects. - - NOTE: The ordering of the pairs is *fixed* to be (metabolomic, genomic). In - other words, if objects_a = [GCF1, GC2, ...] and objects_b = [Spectrum1, - Spectrum2, ...], the object pairs will be (Spectrum1, GCF1), (Spectrum2, - GCF2), and so on. The same applies if objects_a and objects_b are swapped, - the metabolomic objects (Spectrum or MolecularFamily) will be the obj1 - entry in each pair. + dict: A dict where the keys are tuples of (Spectrum/MolecularFamily, GCF) + and values are a list of shared Strain objects. 
""" if not self._datalinks: self._datalinks = self.scoring_method( MetcalfScoring.NAME).datalinks - - # this is a dict with structure: - # (Spectrum/MolecularFamily, GCF) => list of strain indices - common_strains_index_dict = self._datalinks.get_common_strains( - objects_a, objects_b, filter_no_shared) - - common_strains = {} - # replace the lists of strain indices with actual strain objects - # TODO: bug here, the index value of common_strains_index_dict is - # not the same as the index value of self._strains - # Solution: lookup with strain.id instead of index - for key in common_strains_index_dict: - common_strains[key] = [ - self._strains.lookup_index(x) for x in common_strains_index_dict[key] - ] - + common_strains = self._datalinks.get_common_strains( + met, gcfs, filter_no_shared) return common_strains def has_bgc(self, bgc_id): @@ -512,15 +479,18 @@ def has_bgc(self, bgc_id): def lookup_bgc(self, bgc_id): """If BGC ``bgc_id`` exists, return it. Otherwise return None""" - return self._bgcs[self._bgc_lookup[bgc_id]] if self.has_bgc(bgc_id) else None + return self._bgcs[self._bgc_lookup[bgc_id]] if self.has_bgc( + bgc_id) else None def lookup_gcf(self, gcf_id): """If GCF ``gcf_id`` exists, return it. Otherwise return None""" - return self._gcfs[self._gcf_lookup[gcf_id]] if gcf_id in self._gcf_lookup else None + return self._gcfs[ + self._gcf_lookup[gcf_id]] if gcf_id in self._gcf_lookup else None def lookup_spectrum(self, name): """If Spectrum ``name`` exists, return it. 
Otherwise return None""" - return self._spectra[self._spec_lookup[name]] if name in self._spec_lookup else None + return self._spectra[ + self._spec_lookup[name]] if name in self._spec_lookup else None @property def strains(self): diff --git a/src/nplinker/scoring/linking/data_linking.py b/src/nplinker/scoring/linking/data_linking.py index 3c79f46f..daa46915 100644 --- a/src/nplinker/scoring/linking/data_linking.py +++ b/src/nplinker/scoring/linking/data_linking.py @@ -12,6 +12,8 @@ if TYPE_CHECKING: from nplinker.strain_collection import StrainCollection + from nplinker.strains import Strain + logger = LogConfig.getLogger(__name__) @@ -60,6 +62,8 @@ def __init__(self, gcfs: Sequence[GCF], spectra: Sequence[Spectrum], mapping_mf(pd.DataFrame): A DataFrame to store mappings for molecular families. mapping_strain(pd.DataFrame): A DataFrame to store mappings for strains. """ + self._strains = strains + logger.debug( "Create occurrence dataframes: spectra<->strains, gcfs<->strains and mfs<->strains." ) @@ -97,7 +101,7 @@ def get_common_strains( spectra_or_mfs: Sequence[Spectrum] | Sequence[MolecularFamily], gcfs: Sequence[GCF], filter_no_shared: bool = False - ) -> dict[tuple[Spectrum | MolecularFamily, GCF], list[str]]: + ) -> dict[tuple[Spectrum | MolecularFamily, GCF], list[Strain]]: """Get common strains between given spectra/molecular families and GCFs. Note that SingletonFamily objects are excluded from given `spectra_or_mfs`. @@ -106,14 +110,12 @@ def get_common_strains( spectra_or_mfs(Sequence[Spectrum] | Sequence[MolecularFamily]): A list of Spectrum or MolecularFamily objects. gcfs(Sequence[GCF]): A list of GCF objects. - filter_no_shared(bool): If True, return only the pair of - spectrum/molecular family and GCF that have common strains; - otherwise, return all pairs no matter they have common strains - or not. 
+ filter_no_shared(bool): If True, the pairs of spectrum/mf and GCF + without common strains will be removed from the returned dict; Returns: dict: A dict where the keys are tuples of (Spectrum/MolecularFamily, GCF) - and values are a list of strain ids that appear in both objects. + and values are a list of shared Strain objects. """ if not isinstance(spectra_or_mfs[0], (Spectrum, MolecularFamily)): raise ValueError( @@ -139,7 +141,7 @@ def get_common_strains( self.occurrence_gcf_strain.loc[gcf.gcf_id])] if filter_no_shared and len(shared_strains) == 0: continue - results[(obj, gcf)] = shared_strains.to_list() + results[(obj, gcf)] = [self._strains.lookup(strain_id) for strain_id in shared_strains] return results def _get_occurrence_gcf_strain(self, gcfs: Sequence[GCF], diff --git a/src/nplinker/scoring/np_class_scoring.py b/src/nplinker/scoring/np_class_scoring.py index b63b7b0a..79d1a71f 100644 --- a/src/nplinker/scoring/np_class_scoring.py +++ b/src/nplinker/scoring/np_class_scoring.py @@ -337,10 +337,12 @@ def get_links(self, objects, link_collection): if not self.npl._datalinks: self.npl._datalinks = self.npl.scoring_method( MetcalfScoring.NAME).datalinks - # this is a dict with structure: - # tup(Spectrum/MolecularFamily, GCF) => array of strain indices - common_strains = self.npl._datalinks.get_common_strains( - objects, targets, True) + if obj_is_gen: + common_strains = self.npl.get_common_strains( + targets, objects) + else: + common_strains = self.npl.get_common_strains( + objects, targets) logger.info(f"Calculating NPClassScore for {len(objects)} objects to " f"{len(targets)} targets ({len(common_strains)} pairwise " f"interactions that share at least 1 strain). 
This might " From 0b137468bab7fbadd57709751d2aec90c3de4bd1 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 3 May 2023 12:08:57 +0200 Subject: [PATCH 44/95] remove lookup_index method from StrainCollection (#90) - remove method `lookup_index` - remove attribute `_strain_dict_index` --- src/nplinker/strain_collection.py | 18 +----------------- tests/test_strain_collection.py | 13 +------------ 2 files changed, 2 insertions(+), 29 deletions(-) diff --git a/src/nplinker/strain_collection.py b/src/nplinker/strain_collection.py index 35e63daf..16bca126 100644 --- a/src/nplinker/strain_collection.py +++ b/src/nplinker/strain_collection.py @@ -1,5 +1,4 @@ import csv -import os from os import PathLike from pathlib import Path from typing import Iterator @@ -9,7 +8,6 @@ from .utils import list_dirs from .utils import list_files - logger = LogConfig.getLogger(__name__) @@ -20,7 +18,6 @@ def __init__(self): self._strains: list[Strain] = [] # dict of strain name (id and alias) to strain object self._strain_dict_name: dict[str, Strain] = {} - self._strain_dict_index: dict[int, Strain] = {} def __repr__(self) -> str: return str(self) @@ -38,7 +35,7 @@ def __len__(self) -> int: def __eq__(self, other) -> bool: if isinstance(other, StrainCollection): return (self._strains == other._strains - and self._strain_dict_index == other._strain_dict_index) + and self._strain_dict_name == other._strain_dict_name) return NotImplemented def __contains__(self, item: str | Strain) -> bool: @@ -74,7 +71,6 @@ def add(self, strain: Strain) -> None: existing.add_alias(alias) self._strain_dict_name[alias] = existing else: - self._strain_dict_index[len(self)] = strain self._strains.append(strain) self._strain_dict_name[strain.id] = strain for alias in strain.aliases: @@ -88,7 +84,6 @@ def remove(self, strain: Strain): """ if strain.id in self._strain_dict_name: self._strains.remove(strain) - # remove from dict id del self._strain_dict_name[strain.id] for alias in strain.aliases: del 
self._strain_dict_name[alias] @@ -102,17 +97,6 @@ def filter(self, strain_set: set[Strain]): if strain not in strain_set: self.remove(strain) - def lookup_index(self, index: int) -> Strain: - """Return the strain from lookup by index. - - Args: - index(int): Position index from which to retrieve the strain - - Returns: - Strain: Strain identified by the given index. - """ - return self._strain_dict_index[index] - def lookup(self, name: str) -> Strain: """Lookup a strain by name (id or alias). diff --git a/tests/test_strain_collection.py b/tests/test_strain_collection.py index bd900a0f..dcdad3df 100644 --- a/tests/test_strain_collection.py +++ b/tests/test_strain_collection.py @@ -41,13 +41,13 @@ def test_iter(collection: StrainCollection, strain: Strain): for actual in collection: assert actual == strain + def test_add(strain: Strain): sut = StrainCollection() sut.add(strain) assert strain in sut for alias in strain.aliases: assert alias in sut - assert sut._strain_dict_index[0] == strain def test_remove(collection: StrainCollection, strain: Strain): @@ -56,9 +56,6 @@ def test_remove(collection: StrainCollection, strain: Strain): with pytest.raises(KeyError): _ = collection._strain_dict_name[strain.id] assert strain not in collection - # TODO: issue #90 - # with pytest.raises(KeyError): - # collection.lookup_index(0) def test_filter(collection: StrainCollection, strain: Strain): @@ -70,13 +67,6 @@ def test_filter(collection: StrainCollection, strain: Strain): assert len(collection) == 1 -def test_lookup_index(collection: StrainCollection, strain: Strain): - actual = collection.lookup_index(0) - assert actual == strain - with pytest.raises(KeyError): - collection.lookup_index(1) - - def test_lookup(collection: StrainCollection, strain: Strain): assert collection.lookup(strain.id) == strain for alias in strain.aliases: @@ -89,7 +79,6 @@ def test_add_from_file(): sut = StrainCollection() sut.add_from_file(DATA_DIR / "strain_mappings.csv") assert len(sut) == 27 - 
assert len(sut.lookup_index(1).aliases) == 29 def test_save_to_file(collection: StrainCollection, tmp_path): From 96d221041bb58a504b73b9f50dda48e6145d92ba Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 3 May 2023 14:39:54 +0200 Subject: [PATCH 45/95] Remove integer id from GCF --- src/nplinker/genomics/gcf.py | 2 +- src/nplinker/loader.py | 6 ------ src/nplinker/scoring/linking/link_finder.py | 5 ++--- 3 files changed, 3 insertions(+), 10 deletions(-) diff --git a/src/nplinker/genomics/gcf.py b/src/nplinker/genomics/gcf.py index 5ea61486..62669bf5 100644 --- a/src/nplinker/genomics/gcf.py +++ b/src/nplinker/genomics/gcf.py @@ -3,6 +3,7 @@ from nplinker.logconfig import LogConfig from nplinker.strain_collection import StrainCollection + if TYPE_CHECKING: from nplinker.strains import Strain from .bgc import BGC @@ -35,7 +36,6 @@ def __init__(self, gcf_id: str, /) -> None: self.bigscape_class: str | None = None # CG TODO: remove attribute id, see issue 103 # https://github.com/NPLinker/nplinker/issues/103 - self.id: int | None = None self.bgc_ids: set[str] = set() self.strains: StrainCollection = StrainCollection() diff --git a/src/nplinker/loader.py b/src/nplinker/loader.py index d4865630..3f44b4df 100644 --- a/src/nplinker/loader.py +++ b/src/nplinker/loader.py @@ -659,12 +659,6 @@ def _load_genomics(self): antismash_bgc_loader.get_files(), self._bigscape_cutoff) - # CG TODO: remove the gcf.id, see issue 103 - # https://github.com/NPLinker/nplinker/issues/103 - # This is only place to set gcf.id value. 
- for i, gcf in enumerate(self.gcfs): - gcf.id = i - #---------------------------------------------------------------------- # CG: write unknown strains in genomics to file #---------------------------------------------------------------------- diff --git a/src/nplinker/scoring/linking/link_finder.py b/src/nplinker/scoring/linking/link_finder.py index 7fcdae03..5997a08f 100644 --- a/src/nplinker/scoring/linking/link_finder.py +++ b/src/nplinker/scoring/linking/link_finder.py @@ -439,15 +439,14 @@ def get_links(self, query_size = 1 # Check type of input_object: + # TODO CG: replace integer ids with string ids # If GCF: if isinstance(input_object[0], GCF): input_type = "gcf" link_levels = [0, 1] # Get necessary ids - # CG: TODO update the logics here: - # don't use integer gcf.id, use string gcf.gcf_id instead. - input_ids = np.array([gcf.id for gcf in input_object], + input_ids = np.array([gcf.gcf_id for gcf in input_object], dtype=np.int32) if main_score == 'likescore': From 44d2c675b46e1e6225ce3f3e6dcddd3c6d6bdf31 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 3 May 2023 14:51:30 +0200 Subject: [PATCH 46/95] update lookup methods and attributes in NPLikner class --- src/nplinker/nplinker.py | 32 +++++++++++--------------------- 1 file changed, 11 insertions(+), 21 deletions(-) diff --git a/src/nplinker/nplinker.py b/src/nplinker/nplinker.py index bc598723..02799ed0 100644 --- a/src/nplinker/nplinker.py +++ b/src/nplinker/nplinker.py @@ -122,6 +122,7 @@ def __init__(self, userconfig=None): self._class_matches = None self._bgc_lookup = {} + self._gcf_lookup = {} self._spec_lookup = {} self._scoring_methods = {} @@ -286,25 +287,17 @@ def load_data(self, new_bigscape_cutoff=None, met_only=False): self._class_matches = self._loader.class_matches logger.debug('Generating lookup tables: genomics') - self._bgc_lookup = {} - for i, bgc in enumerate(self._bgcs): - self._bgc_lookup[bgc.bgc_id] = i - - self._gcf_lookup = {} - for i, gcf in enumerate(self._gcfs): - 
self._gcf_lookup[gcf.gcf_id] = i + self._bgc_lookup = {bgc.bgc_id: bgc for bgc in self._bgcs} + self._gcf_lookup = {gcf.gcf_id: gcf for gcf in self._gcfs} # don't need to do these two if cutoff changed (indicating genomics data # was reloaded but not metabolomics) if new_bigscape_cutoff is None: logger.debug('Generating lookup tables: metabolomics') - self._spec_lookup = {} - for i, spec in enumerate(self._spectra): - self._spec_lookup[spec.spectrum_id] = i - - self._molfam_lookup = {} - for i, molfam in enumerate(self._molfams): - self._molfam_lookup[molfam.id] = i + self._spec_lookup = { + spec.spectrum_id: spec + for spec in self._spectra + } logger.debug('load_data: completed') return True @@ -479,18 +472,15 @@ def has_bgc(self, bgc_id): def lookup_bgc(self, bgc_id): """If BGC ``bgc_id`` exists, return it. Otherwise return None""" - return self._bgcs[self._bgc_lookup[bgc_id]] if self.has_bgc( - bgc_id) else None + return self._bgc_lookup.get(bgc_id, None) def lookup_gcf(self, gcf_id): """If GCF ``gcf_id`` exists, return it. Otherwise return None""" - return self._gcfs[ - self._gcf_lookup[gcf_id]] if gcf_id in self._gcf_lookup else None + return self._gcf_lookup.get(gcf_id, None) - def lookup_spectrum(self, name): + def lookup_spectrum(self, spectrum_id): """If Spectrum ``name`` exists, return it. 
Otherwise return None""" - return self._spectra[ - self._spec_lookup[name]] if name in self._spec_lookup else None + return self._spec_lookup.get(spectrum_id, None) @property def strains(self): From 92aee38528709994e7814bd579931d69e2bec6fd Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 3 May 2023 15:26:40 +0200 Subject: [PATCH 47/95] change cooccurrence from array to DataFrame in DataLinks --- src/nplinker/scoring/linking/data_linking.py | 47 +++++++++++++------- 1 file changed, 31 insertions(+), 16 deletions(-) diff --git a/src/nplinker/scoring/linking/data_linking.py b/src/nplinker/scoring/linking/data_linking.py index daa46915..0edec183 100644 --- a/src/nplinker/scoring/linking/data_linking.py +++ b/src/nplinker/scoring/linking/data_linking.py @@ -14,15 +14,13 @@ from nplinker.strain_collection import StrainCollection from nplinker.strains import Strain - logger = LogConfig.getLogger(__name__) class DataLinks(): def __init__(self, gcfs: Sequence[GCF], spectra: Sequence[Spectrum], - mfs: Sequence[MolecularFamily], - strains: StrainCollection): + mfs: Sequence[MolecularFamily], strains: StrainCollection): """DataLinks class to store occurrence and co-occurrence information. Occurrence refers to the presence of a spectrum/gcf/mf in a strain. @@ -41,21 +39,21 @@ def __init__(self, gcfs: Sequence[GCF], spectra: Sequence[Spectrum], spectra with respect to strains. occurrence_mf_strain(pd.DataFrame): A DataFrame to store occurrence of molecular families with respect to strains. - cooccurrence_spec_gcf(np.array): A 2D numpy array to store co-occurrence + cooccurrence_spec_gcf(pd.DataFrame): A DataFrame to store co-occurrence of spectra<->gcfs. - cooccurrence_spec_notgcf(np.array): A 2D numpy array to store co-occurrence + cooccurrence_spec_notgcf(pd.DataFrame): A DataFrame to store co-occurrence of spectra<->not gcfs. 
- cooccurrence_notspec_gcf(np.array): A 2D numpy array to store co-occurrence + cooccurrence_notspec_gcf(pd.DataFrame): A DataFrame to store co-occurrence of not spectra<->gcfs. - cooccurrence_notspec_notgcf(np.array): A 2D numpy array to store co-occurrence + cooccurrence_notspec_notgcf(pd.DataFrame): A DataFrame to store co-occurrence of not spectra<->not gcfs. - cooccurrence_mf_gcf(np.array): A 2D numpy array to store co-occurrence + cooccurrence_mf_gcf(pd.DataFrame): A DataFrame to store co-occurrence of molecular families<->gcfs. - cooccurrence_mf_notgcf(np.array): A 2D numpy array to store co-occurrence + cooccurrence_mf_notgcf(pd.DataFrame): A DataFrame to store co-occurrence of molecular families<->not gcfs. - cooccurrence_notmf_gcf(np.array): A 2D numpy array to store co-occurrence + cooccurrence_notmf_gcf(pd.DataFrame): A DataFrame to store co-occurrence of not molecular families<->gcfs. - cooccurrence_notmf_notgcf(np.array): A 2D numpy array to store co-occurrence + cooccurrence_notmf_notgcf(pd.DataFrame): A DataFrame to store co-occurrence of not molecular families<->not gcfs. mapping_gcf(pd.DataFrame): A DataFrame to store mappings for gcfs. mapping_spec(pd.DataFrame): A DataFrame to store mappings for spectra. 
@@ -84,7 +82,7 @@ def __init__(self, gcfs: Sequence[GCF], spectra: Sequence[Spectrum], self.mapping_strain = pd.DataFrame() self._get_mappings_from_occurrence() - # np.array to store co-occurrence of "spectra<->gcf" or "mfs<->gcf" + # DataFrame to store co-occurrence of "spectra<->gcf" or "mfs<->gcf" logger.debug("Create correlation matrices: spectra<->gcfs.") (self.cooccurrence_spec_gcf, self.cooccurrence_spec_notgcf, self.cooccurrence_notspec_gcf, @@ -141,7 +139,10 @@ def get_common_strains( self.occurrence_gcf_strain.loc[gcf.gcf_id])] if filter_no_shared and len(shared_strains) == 0: continue - results[(obj, gcf)] = [self._strains.lookup(strain_id) for strain_id in shared_strains] + results[(obj, gcf)] = [ + self._strains.lookup(strain_id) + for strain_id in shared_strains + ] return results def _get_occurrence_gcf_strain(self, gcfs: Sequence[GCF], @@ -220,7 +221,7 @@ def _get_mappings_from_occurrence(self): def _get_cooccurrence( self, link_type: str = 'spec-gcf' - ) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: + ) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]: """Calculate co-occurrence for given link types across strains. Args: @@ -236,5 +237,19 @@ def _get_cooccurrence( f"Link type {link_type} is not supported. Use 'spec-gcf' or 'mf-gcf' instead." 
) logger.debug(f"Calculating correlation matrices of type: {link_type}") - return calc_correlation_matrix(met_strain_occurrence, - self.occurrence_gcf_strain) + m1, m2, m3, m4 = calc_correlation_matrix(met_strain_occurrence, + self.occurrence_gcf_strain) + df_met_gcf = pd.DataFrame(m1, + index=met_strain_occurrence.index, + columns=self.occurrence_gcf_strain.index) + df_met_notgcf = pd.DataFrame(m2, + index=met_strain_occurrence.index, + columns=self.occurrence_gcf_strain.index) + df_notmet_gcf = pd.DataFrame(m3, + index=met_strain_occurrence.index, + columns=self.occurrence_gcf_strain.index) + df_notmet_notgcf = pd.DataFrame( + m4, + index=met_strain_occurrence.index, + columns=self.occurrence_gcf_strain.index) + return df_met_gcf, df_met_notgcf, df_notmet_gcf, df_notmet_notgcf From d23994034f5a24027f6f49658c6691f5b7a3a125 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 3 May 2023 15:41:56 +0200 Subject: [PATCH 48/95] format link_finder.py --- src/nplinker/scoring/linking/link_finder.py | 56 +++++++++++---------- 1 file changed, 30 insertions(+), 26 deletions(-) diff --git a/src/nplinker/scoring/linking/link_finder.py b/src/nplinker/scoring/linking/link_finder.py index 5997a08f..d0100e05 100644 --- a/src/nplinker/scoring/linking/link_finder.py +++ b/src/nplinker/scoring/linking/link_finder.py @@ -1,4 +1,3 @@ - from deprecated import deprecated import numpy as np import pandas as pd @@ -8,10 +7,8 @@ from .data_linking_functions import pair_prob_approx from .data_linking_functions import pair_prob_hg - # CG: TODO get_links function does not work any more, need to update its logics - # import packages for plotting # TODO move plotting to separate module? 
try: @@ -25,11 +22,11 @@ from nplinker.logconfig import LogConfig from nplinker.metabolomics.molecular_family import MolecularFamily - logger = LogConfig.getLogger(__file__) SCORING_METHODS = ['metcalf', 'likescore', 'hg'] + class LinkFinder(): """ Class to: @@ -142,18 +139,20 @@ def metcalf_scoring(self, if type == 'spec-gcf': metcalf_scores = np.zeros(data_links.cooccurrence_spec_gcf.shape) - metcalf_scores = (data_links.cooccurrence_spec_gcf * both + - data_links.cooccurrence_spec_notgcf * type1_not_gcf + - data_links.cooccurrence_notspec_gcf * gcf_not_type1 + - data_links.cooccurrence_notspec_notgcf * not_type1_not_gcf) + metcalf_scores = ( + data_links.cooccurrence_spec_gcf * both + + data_links.cooccurrence_spec_notgcf * type1_not_gcf + + data_links.cooccurrence_notspec_gcf * gcf_not_type1 + + data_links.cooccurrence_notspec_notgcf * not_type1_not_gcf) self.metcalf_spec_gcf = metcalf_scores elif type == 'mf-gcf': metcalf_scores = np.zeros(data_links.cooccurrence_mf_gcf.shape) - metcalf_scores = (data_links.cooccurrence_mf_gcf * both + - data_links.cooccurrence_mf_notgcf * type1_not_gcf + - data_links.cooccurrence_notmf_gcf * gcf_not_type1 + - data_links.cooccurrence_notmf_notgcf * not_type1_not_gcf) + metcalf_scores = ( + data_links.cooccurrence_mf_gcf * both + + data_links.cooccurrence_mf_notgcf * type1_not_gcf + + data_links.cooccurrence_notmf_gcf * gcf_not_type1 + + data_links.cooccurrence_notmf_notgcf * not_type1_not_gcf) self.metcalf_fam_gcf = metcalf_scores return metcalf_scores @@ -174,8 +173,8 @@ def hg_scoring(self, data_links, type='spec-gcf'): # the correct totals. 
if type == 'spec-gcf': - num_strains = np.ones( - data_links.cooccurrence_spec_gcf.shape) * data_links.occurrence_gcf_strain.shape[1] + num_strains = np.ones(data_links.cooccurrence_spec_gcf.shape + ) * data_links.occurrence_gcf_strain.shape[1] overlap_counts = data_links.cooccurrence_spec_gcf gcf_counts = overlap_counts + data_links.cooccurrence_notspec_gcf spec_counts = overlap_counts + data_links.cooccurrence_spec_notgcf @@ -186,8 +185,8 @@ def hg_scoring(self, data_links, type='spec-gcf'): loc=1) self.hg_spec_gcf = hg_scores elif type == 'mf-gcf': - num_strains = np.ones( - data_links.cooccurrence_mf_gcf.shape) * data_links.occurrence_gcf_strain.shape[1] + num_strains = np.ones(data_links.cooccurrence_mf_gcf.shape + ) * data_links.occurrence_gcf_strain.shape[1] overlap_counts = data_links.cooccurrence_mf_gcf gcf_counts = overlap_counts + data_links.cooccurrence_notmf_gcf fam_counts = overlap_counts + data_links.cooccurrence_mf_notgcf @@ -224,11 +223,13 @@ def likelihood_scoring(self, """ if type == 'spec-gcf': - likelihood_scores = np.zeros(data_links.cooccurrence_spec_gcf.shape) + likelihood_scores = np.zeros( + data_links.cooccurrence_spec_gcf.shape) likelihood_scores = ( likelihoods.P_gcf_given_spec * (1 - likelihoods.P_spec_not_gcf) * - (1 - np.exp(-alpha_weighing * data_links.cooccurrence_spec_gcf))) + (1 - + np.exp(-alpha_weighing * data_links.cooccurrence_spec_gcf))) self.likescores_spec_gcf = likelihood_scores @@ -363,7 +364,8 @@ def select_link_candidates(self, id_gcf = link_candidates[1, i] # find set of strains which contain GCF with id link_candidates[1,i] - XG = np.where(data_links.occurrence_gcf_strain.loc[id_gcf, :] == 1)[0] + XG = np.where( + data_links.occurrence_gcf_strain.loc[id_gcf, :] == 1)[0] link_candidates[10, i] = pair_prob_approx(P_str, XG, @@ -372,11 +374,13 @@ def select_link_candidates(self, # Calculate the link specific probability # Find strains where GCF and spectra/family co-occur if type == 'spec-gcf': - XGS = 
np.where((data_links.occurrence_gcf_strain[id_gcf, :] == 1) & - (data_links.occurrence_spec_strain[id_spec, :] == 1))[0] + XGS = np.where( + (data_links.occurrence_gcf_strain[id_gcf, :] == 1) + & (data_links.occurrence_spec_strain[id_spec, :] == 1))[0] elif type == 'mf-gcf': - XGS = np.where((data_links.occurrence_gcf_strain[id_gcf, :] == 1) - & (data_links.occurrence_mf_strain[id_spec, :] == 1))[0] + XGS = np.where( + (data_links.occurrence_gcf_strain[id_gcf, :] == 1) + & (data_links.occurrence_mf_strain[id_spec, :] == 1))[0] link_candidates[11, i] = link_prob(P_str, XGS, int(Nx_list[id_gcf]), int(Ny_list[id_spec]), num_strains) @@ -520,8 +524,7 @@ def get_links(self, hg_scores[linklevel] >= score_cutoff) else: # should never happen - raise Exception( - f'Unknown scoring type! "{main_score}"') + raise Exception(f'Unknown scoring type! "{main_score}"') else: # TODO is this best way to get same output as above code? # to keep the remainder of the method identical in the case of no cutoff @@ -575,7 +578,8 @@ def get_links(self, return links - @deprecated(version="1.3.3", reason="The unworkable method will be removed") + @deprecated(version="1.3.3", + reason="The unworkable method will be removed") def create_cytoscape_files(self, data_links, network_filename, From 8ed8ce9e7444ab3ddf252d090085358fb11dda35 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 3 May 2023 16:32:59 +0200 Subject: [PATCH 49/95] temp replace array with dataframe in LinkFinder for metcalf scoring --- src/nplinker/scoring/linking/link_finder.py | 46 +++++++-------------- 1 file changed, 14 insertions(+), 32 deletions(-) diff --git a/src/nplinker/scoring/linking/link_finder.py b/src/nplinker/scoring/linking/link_finder.py index d0100e05..3fa9bf88 100644 --- a/src/nplinker/scoring/linking/link_finder.py +++ b/src/nplinker/scoring/linking/link_finder.py @@ -7,6 +7,7 @@ from .data_linking_functions import pair_prob_approx from .data_linking_functions import pair_prob_hg + # CG: TODO get_links 
function does not work any more, need to update its logics # import packages for plotting @@ -22,6 +23,7 @@ from nplinker.logconfig import LogConfig from nplinker.metabolomics.molecular_family import MolecularFamily + logger = LogConfig.getLogger(__file__) SCORING_METHODS = ['metcalf', 'likescore', 'hg'] @@ -46,8 +48,8 @@ def __init__(self): """ # metcalf scores - self.metcalf_spec_gcf = [] - self.metcalf_fam_gcf = [] + self.metcalf_spec_gcf = pd.DataFrame() + self.metcalf_fam_gcf = pd.DataFrame() # likelihood scores self.likescores_spec_gcf = [] @@ -138,7 +140,6 @@ def metcalf_scoring(self, # at expected_metcalf[3,6] and sqrt of the variance in the same position if type == 'spec-gcf': - metcalf_scores = np.zeros(data_links.cooccurrence_spec_gcf.shape) metcalf_scores = ( data_links.cooccurrence_spec_gcf * both + data_links.cooccurrence_spec_notgcf * type1_not_gcf + @@ -147,7 +148,6 @@ def metcalf_scoring(self, self.metcalf_spec_gcf = metcalf_scores elif type == 'mf-gcf': - metcalf_scores = np.zeros(data_links.cooccurrence_mf_gcf.shape) metcalf_scores = ( data_links.cooccurrence_mf_gcf * both + data_links.cooccurrence_mf_notgcf * type1_not_gcf + @@ -155,7 +155,6 @@ def metcalf_scoring(self, data_links.cooccurrence_notmf_notgcf * not_type1_not_gcf) self.metcalf_fam_gcf = metcalf_scores - return metcalf_scores def hg_scoring(self, data_links, type='spec-gcf'): """ @@ -446,13 +445,9 @@ def get_links(self, # TODO CG: replace integer ids with string ids # If GCF: if isinstance(input_object[0], GCF): + input_ids = [gcf.gcf_id for gcf in input_object] input_type = "gcf" link_levels = [0, 1] - - # Get necessary ids - input_ids = np.array([gcf.gcf_id for gcf in input_object], - dtype=np.int32) - if main_score == 'likescore': likescores = [ # TODO CG: use dataframe instead of numpy array @@ -461,48 +456,32 @@ def get_links(self, ] elif main_score == 'metcalf': metcalf_scores = [ - self.metcalf_spec_gcf[:, input_ids], - self.metcalf_fam_gcf[:, input_ids] + 
self.metcalf_spec_gcf.loc[:, input_ids], + self.metcalf_fam_gcf.loc[:, input_ids] ] elif main_score == 'hg': hg_scores = [ self.hg_spec_gcf[:, input_ids], self.hg_fam_gcf[:, input_ids] ] - - # If Spectrum: elif isinstance(input_object[0], Spectrum): - # Get necessary ids - input_ids = np.array([spec.id for spec in input_object], - dtype=np.int32) - + input_ids = [spec.spectrum_id for spec in input_object] input_type = "spec" link_levels = [0] if main_score == 'likescore': likescores = [self.likescores_spec_gcf[input_ids, :], []] elif main_score == 'metcalf': - metcalf_scores = [self.metcalf_spec_gcf[input_ids, :], []] + metcalf_scores = [self.metcalf_spec_gcf.loc[input_ids, :], []] elif main_score == 'hg': hg_scores = [self.hg_spec_gcf[input_ids, :], []] - # If MolecularFamily: elif isinstance(input_object[0], MolecularFamily): - - # Get necessary ids - # TODO: include Singletons, maybe optinal - #input_ids = np.zeros(query_size) - #mapping_fam_id = data_links.mapping_fam["original family id"] - #for i, family in enumerate(input_object): - # input_ids[i] = np.where(mapping_fam_id == int(family.family_id))[0] - #input_ids = input_ids.astype(int) - input_ids = np.array([mf.id for mf in input_object], - dtype=np.int32) - + input_ids = [mf.family_id for mf in input_object] input_type = "fam" link_levels = [1] if main_score == 'likescore': likescores = [[], self.likescores_fam_gcf[input_ids, :]] elif main_score == 'metcalf': - metcalf_scores = [[], self.metcalf_fam_gcf[input_ids, :]] + metcalf_scores = [[], self.metcalf_fam_gcf.loc[input_ids, :]] elif main_score == 'hg': hg_scores = [[], self.hg_fam_gcf[input_ids, :]] else: @@ -517,6 +496,8 @@ def get_links(self, candidate_ids = np.where( likescores[linklevel] >= score_cutoff) elif main_score == 'metcalf': + # get a tuple of arrays of indices where the metcalf score is above the cutoff + # the first array is the row indices, the second is the column indices candidate_ids = np.where( metcalf_scores[linklevel] >= 
score_cutoff) elif main_score == 'hg': @@ -532,6 +513,7 @@ def get_links(self, # currently abusing np.where like this candidate_ids = np.where(metcalf_scores[linklevel] != np.nan) + # TODO: 230503 continue replace array with dataframe link_candidates = np.zeros((3, candidate_ids[0].shape[0])) # this is supposed to construct a (3, x) array, where: From e4bdd958ea457672e39a808b52564a37d955fc89 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Mon, 8 May 2023 10:33:18 +0200 Subject: [PATCH 50/95] refactor `LinkFinder.get_scores` method --- src/nplinker/scoring/linking/link_finder.py | 41 ++++++++++++++------- 1 file changed, 27 insertions(+), 14 deletions(-) diff --git a/src/nplinker/scoring/linking/link_finder.py b/src/nplinker/scoring/linking/link_finder.py index 3fa9bf88..3b6c788d 100644 --- a/src/nplinker/scoring/linking/link_finder.py +++ b/src/nplinker/scoring/linking/link_finder.py @@ -67,26 +67,39 @@ def __init__(self): self.metcalf_expected = None self.metcalf_variance = None - def get_scores(self, method, type_): - if method == 'metcalf': - if type_ == 'spec-gcf': + def get_scores(self, scoring_method: str, link_type: str) -> pd.DataFrame: + """Get the scores for a given method and link type. + + Args: + scoring_method (str): The scoring method to use. Available methods + are 'metcalf', 'likescore', and 'hg'. + link_type (str): The type of link to get scores for. Available + types are 'spec-gcf' and 'mf-gcf'. + + Returns: + pd.DataFrame: The scores for the given method and link type. + + Raises: + ValueError: If the scoring method or link type is unknown. 
+ """ + if scoring_method == 'metcalf': + if link_type == 'spec-gcf': return self.metcalf_spec_gcf - elif type_ == 'mf-gcf': + if link_type == 'mf-gcf': return self.metcalf_fam_gcf - elif method == 'likescore': - if type_ == 'spec-gcf': + if scoring_method == 'likescore': + if link_type == 'spec-gcf': return self.likescores_spec_gcf - elif type_ == 'mf-gcf': + if link_type == 'mf-gcf': return self.likescores_fam_gcf - elif method == 'hg': - if type_ == 'spec-gcf': + if scoring_method == 'hg': + if link_type == 'spec-gcf': return self.hg_spec_gcf - elif type_ == 'mf-gcf': + if link_type == 'mf-gcf': return self.hg_fam_gcf - - raise Exception( - 'Unknown method or type (method="{}", type="{}")'.format( - method, type_)) + raise ValueError( + f'Unknown method or type method="{scoring_method}", type="{link_type}"' + ) def metcalf_scoring(self, data_links, From 14a1aaccc8d1627eda6cacf4f87f2c8905c3e238 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Mon, 8 May 2023 11:31:29 +0200 Subject: [PATCH 51/95] refactor `LinkFinder.metcalf_scoring` method - rename parameter name - wrap parameters for weights to one parameter - extract private method `_cal_mean_std` --- src/nplinker/scoring/linking/link_finder.py | 160 ++++++++++---------- src/nplinker/scoring/metcalf_scoring.py | 12 +- tests/scoring/test_scoring.py | 4 +- 3 files changed, 91 insertions(+), 85 deletions(-) diff --git a/src/nplinker/scoring/linking/link_finder.py b/src/nplinker/scoring/linking/link_finder.py index 3b6c788d..7722c2ca 100644 --- a/src/nplinker/scoring/linking/link_finder.py +++ b/src/nplinker/scoring/linking/link_finder.py @@ -1,3 +1,5 @@ +from __future__ import annotations +from typing import TYPE_CHECKING from deprecated import deprecated import numpy as np import pandas as pd @@ -24,9 +26,13 @@ from nplinker.metabolomics.molecular_family import MolecularFamily +if TYPE_CHECKING: + from .data_linking import DataLinks + logger = LogConfig.getLogger(__file__) -SCORING_METHODS = ['metcalf', 
'likescore', 'hg'] +SCORE_TYPES = ['metcalf', 'likescore', 'hg'] +LINK_TYPES = ['spec-gcf', 'mf-gcf'] class LinkFinder(): @@ -64,15 +70,15 @@ def __init__(self): self.link_candidates_gcf_fam = [] # metcalf caching - self.metcalf_expected = None - self.metcalf_variance = None + self.metcalf_mean = None + self.metcalf_std = None - def get_scores(self, scoring_method: str, link_type: str) -> pd.DataFrame: - """Get the scores for a given method and link type. + def get_scores(self, score_type: str, link_type: str) -> pd.DataFrame: + """Get the scores for given scoring type and link type. Args: - scoring_method (str): The scoring method to use. Available methods - are 'metcalf', 'likescore', and 'hg'. + score_type (str): The type of scoring method to use. Available + scoring methods are 'metcalf', 'likescore', and 'hg'. link_type (str): The type of link to get scores for. Available types are 'spec-gcf' and 'mf-gcf'. @@ -80,94 +86,94 @@ def get_scores(self, scoring_method: str, link_type: str) -> pd.DataFrame: pd.DataFrame: The scores for the given method and link type. Raises: - ValueError: If the scoring method or link type is unknown. + ValueError: If an invalid score type or link type is given. """ - if scoring_method == 'metcalf': + if score_type not in SCORE_TYPES: + raise ValueError( + f'Invalid score type "{score_type}". Must be one of "{SCORE_TYPES}"' + ) + if link_type not in LINK_TYPES: + raise ValueError( + f'Invalid link type "{link_type}". 
Must be one of "{LINK_TYPES}"' + ) + if score_type == 'metcalf': if link_type == 'spec-gcf': return self.metcalf_spec_gcf if link_type == 'mf-gcf': return self.metcalf_fam_gcf - if scoring_method == 'likescore': + if score_type == 'likescore': if link_type == 'spec-gcf': return self.likescores_spec_gcf if link_type == 'mf-gcf': return self.likescores_fam_gcf - if scoring_method == 'hg': + if score_type == 'hg': if link_type == 'spec-gcf': return self.hg_spec_gcf if link_type == 'mf-gcf': return self.hg_fam_gcf - raise ValueError( - f'Unknown method or type method="{scoring_method}", type="{link_type}"' - ) - - def metcalf_scoring(self, - data_links, - both=10, - type1_not_gcf=-10, - gcf_not_type1=0, - not_type1_not_gcf=1, - type='spec-gcf'): - """ - Calculate metcalf scores from DataLinks() co-occurence matrices + + def metcalf_scoring( + self, + data_links: DataLinks, + link_type: str = 'spec-gcf', + scoring_weights: tuple[int, int, int, int] = (10, -10, 0, 1) + ) -> None: + """Calculate metcalf scores. + + Args: + data_links (DataLinks): The DataLinks object to use for scoring. + link_type (str, optional): The type of link to score. Available + types are 'spec-gcf' and 'mf-gcf'. Defaults to 'spec-gcf'. + scoring_weights (tuple[int,int,int,int], optional): The weights to + use for Metcalf scoring. The weights are applied to + '(met_gcf, met_not_gcf, gcf_not_met, not_met_not_gcf)'. + Defaults to (10, -10, 0, 1). 
""" + if link_type == 'spec-gcf': + self.metcalf_spec_gcf = ( + data_links.cooccurrence_spec_gcf * scoring_weights[0] + + data_links.cooccurrence_spec_notgcf * scoring_weights[1] + + data_links.cooccurrence_notspec_gcf * scoring_weights[2] + + data_links.cooccurrence_notspec_notgcf * scoring_weights[3]) + if link_type == 'mf-gcf': + self.metcalf_fam_gcf = ( + data_links.cooccurrence_mf_gcf * scoring_weights[0] + + data_links.cooccurrence_mf_notgcf * scoring_weights[1] + + data_links.cooccurrence_notmf_gcf * scoring_weights[2] + + data_links.cooccurrence_notmf_notgcf * scoring_weights[3]) + if self.metcalf_mean is None or self.metcalf_std is None: + self.metcalf_mean, self.metcalf_std = self._cal_mean_std( + data_links, scoring_weights) + + def _cal_mean_std(self, data_links, scoring_weights): # Compute the expected values for all possible values of spec and gcf strains # we need the total number of strains _, n_strains = data_links.occurrence_gcf_strain.shape - if self.metcalf_expected is None: - sz = (n_strains + 1, n_strains + 1) - self.metcalf_expected = np.zeros(sz) - self.metcalf_variance = np.zeros(sz) - - for n in range(n_strains + 1): - for m in range(n_strains + 1): - max_overlap = min(n, m) - min_overlap = max( - 0, - n + m - n_strains) # minimum possible strain overlap - expected_value = 0 - expected_sq = 0 - for o in range(min_overlap, max_overlap + 1): - o_prob = hypergeom.pmf(o, n_strains, n, m) - # compute metcalf for n strains in type 1 and m in gcf - score = o * both - score += type1_not_gcf * (n - o) - score += gcf_not_type1 * (m - o) - score += not_type1_not_gcf * (n_strains - (n + m - o)) - expected_value += o_prob * score - expected_sq += o_prob * (score**2) - - self.metcalf_expected[n, m] = expected_value - expected_sq = expected_sq - expected_value**2 - if expected_sq < 1e-09: - expected_sq = 1 - self.metcalf_variance[n, m] = expected_sq - - self.metcalf_variance_sqrt = np.sqrt(self.metcalf_variance) - - # now, we would like an option to take 
any actual score an subtract the - # expected value and then divide by the square root of the variance - #Ā e.g. if we have a score computed between a type 1 object that has - # 3 strains, and a gcf with 6 strains, we should use the expected value - # at expected_metcalf[3,6] and sqrt of the variance in the same position - - if type == 'spec-gcf': - metcalf_scores = ( - data_links.cooccurrence_spec_gcf * both + - data_links.cooccurrence_spec_notgcf * type1_not_gcf + - data_links.cooccurrence_notspec_gcf * gcf_not_type1 + - data_links.cooccurrence_notspec_notgcf * not_type1_not_gcf) - self.metcalf_spec_gcf = metcalf_scores - - elif type == 'mf-gcf': - metcalf_scores = ( - data_links.cooccurrence_mf_gcf * both + - data_links.cooccurrence_mf_notgcf * type1_not_gcf + - data_links.cooccurrence_notmf_gcf * gcf_not_type1 + - data_links.cooccurrence_notmf_notgcf * not_type1_not_gcf) - - self.metcalf_fam_gcf = metcalf_scores + sz = (n_strains + 1, n_strains + 1) + mean = np.zeros(sz) + variance = np.zeros(sz) + for n in range(n_strains + 1): + for m in range(n_strains + 1): + max_overlap = min(n, m) + min_overlap = max(0, n + m - n_strains) + expected_value = 0 + expected_sq = 0 + for o in range(min_overlap, max_overlap + 1): + o_prob = hypergeom.pmf(o, n_strains, n, m) + # compute metcalf for n strains in type 1 and m in gcf + score = o * scoring_weights[0] + score += scoring_weights[1] * (n - o) + score += scoring_weights[2] * (m - o) + score += scoring_weights[3] * (n_strains - (n + m - o)) + expected_value += o_prob * score + expected_sq += o_prob * (score**2) + mean[n, m] = expected_value + expected_sq = expected_sq - expected_value**2 + if expected_sq < 1e-09: + expected_sq = 1 + variance[n, m] = expected_sq + return mean, np.sqrt(variance) def hg_scoring(self, data_links, type='spec-gcf'): """ diff --git a/src/nplinker/scoring/metcalf_scoring.py b/src/nplinker/scoring/metcalf_scoring.py index 00c04487..4daa06f0 100644 --- a/src/nplinker/scoring/metcalf_scoring.py +++ 
b/src/nplinker/scoring/metcalf_scoring.py @@ -81,9 +81,9 @@ def setup(npl): npl._molfams, npl._strains) MetcalfScoring.LINKFINDER = LinkFinder() MetcalfScoring.LINKFINDER.metcalf_scoring(MetcalfScoring.DATALINKS, - type='spec-gcf') + link_type='spec-gcf') MetcalfScoring.LINKFINDER.metcalf_scoring(MetcalfScoring.DATALINKS, - type='mf-gcf') + link_type='mf-gcf') logger.debug('MetcalfScoring.setup caching results') save_pickled_data((dataset_counts, MetcalfScoring.DATALINKS, MetcalfScoring.LINKFINDER), cache_file) @@ -117,8 +117,8 @@ def _metcalf_postprocess_met(self, linkfinder, results, input_type): gen_strains = len(gcf.strains) # lookup expected + variance values based on strain counts - expected = linkfinder.metcalf_expected[met_strains][gen_strains] - variance_sqrt = linkfinder.metcalf_variance_sqrt[met_strains][ + expected = linkfinder.metcalf_mean[met_strains][gen_strains] + variance_sqrt = linkfinder.metcalf_std[met_strains][ gen_strains] # calculate the final score based on the basic Metcalf score for these two @@ -163,9 +163,9 @@ def _metcalf_postprocess_gen(self, linkfinder, results, input_type): met_strains = len(met_obj.strains) # lookup expected + variance values based on strain counts - expected = linkfinder.metcalf_expected[met_strains][ + expected = linkfinder.metcalf_mean[met_strains][ gen_strains] - variance_sqrt = linkfinder.metcalf_variance_sqrt[met_strains][ + variance_sqrt = linkfinder.metcalf_std[met_strains][ gen_strains] # calculate the final score based on the basic Metcalf score for these two diff --git a/tests/scoring/test_scoring.py b/tests/scoring/test_scoring.py index 8c46a28e..2825ad9e 100644 --- a/tests/scoring/test_scoring.py +++ b/tests/scoring/test_scoring.py @@ -84,8 +84,8 @@ def do_scoring_new(gcfs, spectra, strains, standardised): # (note that spectrum = type 1 object here) met_strains = len(spec.strains) gen_strains = len(gcf.strains) - expected = lf.metcalf_expected[met_strains][gen_strains] - variance = 
lf.metcalf_variance[met_strains][gen_strains] + expected = lf.metcalf_mean[met_strains][gen_strains] + variance = lf.metcalf_std[met_strains][gen_strains] scores[j][i] = (scores[j][i] - expected) / np.sqrt(variance) return scores From d2d6a1033dfed33efbb1c911245b068070e1e5ee Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Mon, 8 May 2023 12:12:27 +0200 Subject: [PATCH 52/95] refactor get_links --- src/nplinker/scoring/linking/link_finder.py | 234 ++++++++------------ src/nplinker/scoring/metcalf_scoring.py | 4 +- 2 files changed, 95 insertions(+), 143 deletions(-) diff --git a/src/nplinker/scoring/linking/link_finder.py b/src/nplinker/scoring/linking/link_finder.py index 7722c2ca..b409d8d6 100644 --- a/src/nplinker/scoring/linking/link_finder.py +++ b/src/nplinker/scoring/linking/link_finder.py @@ -339,7 +339,7 @@ def select_link_candidates(self, else: raise Exception( 'Wrong scoring type given. Must be one of: {}'.format( - SCORING_METHODS)) + SCORE_TYPES)) logger.debug(candidate_ids[0].shape[0], " candidates selected with ", index_names[2], " >= ", P_cutoff, " and a link score >= ", @@ -429,156 +429,108 @@ def select_link_candidates(self, raise Exception("No candidate selection was created.") return link_candidates_pd - def get_links(self, - data_links, - input_object, - main_score='likescore', - score_cutoff=0.5): - """ - Output likely links for 'input_object' + @staticmethod + def _isinstance(*objects, obj_type=GCF) -> bool: + return all(isinstance(x, obj_type) for x in objects) - Parameters - ---------- - input_objects: object() - Object or list of objects of either class: spectra, families, or GCFs - main_score: str - Which main score to use ('metcalf', 'likescore') - score_cutoff: - Thresholds to conly consider candidates for which: - score >= score_cutoff - """ + def get_links( + self, + *objects: tuple[GCF, ...] | tuple[Spectrum, ...] 
+ | tuple[MolecularFamily, ...], + score_type: str = 'likescore', + score_cutoff: float = 0.5 + ) -> list[tuple[tuple[str], tuple[str], tuple[float]]]: + """Get scores for links between objects. - if main_score not in SCORING_METHODS: - raise Exception( - 'Wrong scoring type given. Must be one of: {}'.format( - SCORING_METHODS)) + Args: + objects(tuple): GCF, Spectrum or MolecularFamily objects. + score_type(str): Type of score to use for link calculation. + Must be one of 'likescore', 'metcalf' and 'hg'. + score_cutoff(float): Minimum score to consider a link (ā‰„score_cutoff). + Default is 0.5. + Returns: + list: List of tuples containing the ids of the linked objects and the score. + The tuple contains three tuples: + - the first tuple contains the ids of the input/source objects, + - the second tuple contains the ids of the target objects, + - the third tuple contains the scores. - # Check if input is list: - if isinstance(input_object, list): - query_size = len(input_object) - else: - input_object = [input_object] - query_size = 1 - - # Check type of input_object: - # TODO CG: replace integer ids with string ids - # If GCF: - if isinstance(input_object[0], GCF): - input_ids = [gcf.gcf_id for gcf in input_object] - input_type = "gcf" - link_levels = [0, 1] - if main_score == 'likescore': - likescores = [ - # TODO CG: use dataframe instead of numpy array - self.likescores_spec_gcf[:, input_ids], - self.likescores_fam_gcf[:, input_ids] - ] - elif main_score == 'metcalf': - metcalf_scores = [ - self.metcalf_spec_gcf.loc[:, input_ids], - self.metcalf_fam_gcf.loc[:, input_ids] - ] - elif main_score == 'hg': - hg_scores = [ - self.hg_spec_gcf[:, input_ids], self.hg_fam_gcf[:, - input_ids] - ] - elif isinstance(input_object[0], Spectrum): - input_ids = [spec.spectrum_id for spec in input_object] - input_type = "spec" - link_levels = [0] - if main_score == 'likescore': - likescores = [self.likescores_spec_gcf[input_ids, :], []] - elif main_score == 'metcalf': - 
metcalf_scores = [self.metcalf_spec_gcf.loc[input_ids, :], []] - elif main_score == 'hg': - hg_scores = [self.hg_spec_gcf[input_ids, :], []] - elif isinstance(input_object[0], MolecularFamily): - input_ids = [mf.family_id for mf in input_object] - input_type = "fam" - link_levels = [1] - if main_score == 'likescore': - likescores = [[], self.likescores_fam_gcf[input_ids, :]] - elif main_score == 'metcalf': - metcalf_scores = [[], self.metcalf_fam_gcf.loc[input_ids, :]] - elif main_score == 'hg': - hg_scores = [[], self.hg_fam_gcf[input_ids, :]] + Raises: + TypeError: If input objects are not GCF, Spectrum or MolecularFamily objects. + ValueError: If score type is not one of 'likescore', 'metcalf' and 'hg'. + """ + if self._isinstance(*objects, GCF): + obj_type = 'gcf' + elif self._isinstance(*objects, Spectrum): + obj_type = 'spec' + elif self._isinstance(*objects, MolecularFamily): + obj_type = 'fam' else: - raise Exception( - "Input_object must be Spectrum, MolecularFamily, or GCF object (single or list)." + raise TypeError( + 'Input objects must be GCF, Spectrum or MolecularFamily objects.' + ) + + if score_type not in SCORE_TYPES: + raise ValueError( + f'Invalid score type "{score_type}". Must be one of "{SCORE_TYPES}"' ) links = [] - for linklevel in link_levels: - if score_cutoff is not None: - if main_score == 'likescore': - candidate_ids = np.where( - likescores[linklevel] >= score_cutoff) - elif main_score == 'metcalf': - # get a tuple of arrays of indices where the metcalf score is above the cutoff - # the first array is the row indices, the second is the column indices - candidate_ids = np.where( - metcalf_scores[linklevel] >= score_cutoff) - elif main_score == 'hg': - candidate_ids = np.where( - hg_scores[linklevel] >= score_cutoff) - else: - # should never happen - raise Exception(f'Unknown scoring type! "{main_score}"') - else: - # TODO is this best way to get same output as above code? 
- # to keep the remainder of the method identical in the case of no cutoff - # being supplied, while still returning all the candidate links, I'm - # currently abusing np.where like this - candidate_ids = np.where(metcalf_scores[linklevel] != np.nan) - - # TODO: 230503 continue replace array with dataframe - link_candidates = np.zeros((3, candidate_ids[0].shape[0])) - - # this is supposed to construct a (3, x) array, where: - # - 1st index gives list of source/input object IDs - # - 2nd index gives list of destination/link object IDs - # - 3rd index gives list of scores for the link between the given pair of objects - # - x = number of links found - - # if there is only a single object given as input, things are pretty simple here: - if query_size == 1: - # first, can set every index of the input object ID array to the - # single object ID we've been give - link_candidates[0, :] = input_ids - # then, based on input type copy the other object IDs from candidate_ids - if input_type == 'gcf': - link_candidates[1, :] = candidate_ids[0].astype(int) - else: - link_candidates[1, :] = candidate_ids[1].astype(int) - else: - # if there is a list of input objects, things are slightly more complex - # - the "input IDs" element of the output array now needs to be set by - # a lookup into the original input_ids array based on candidate_ids - # - the "output IDs" element is taken directly from the other element - # of candidate IDs - if input_type == 'gcf': - link_candidates[0, :] = input_ids[candidate_ids[1].astype( - int)] - link_candidates[1, :] = candidate_ids[0].astype(int) - else: - link_candidates[0, :] = input_ids[candidate_ids[0].astype( - int)] - link_candidates[1, :] = candidate_ids[1].astype(int) - - # finally, copy in the actual scores too - if main_score == 'likescore': - link_candidates[2, :] = likescores[linklevel][candidate_ids] - elif main_score == 'metcalf': - link_candidates[ - 2, :] = metcalf_scores[linklevel][candidate_ids] - elif main_score == 'hg': - 
link_candidates[2, :] = hg_scores[linklevel][candidate_ids] - - links.append(link_candidates) + # TODO CG: replace integer ids with string ids for `hg` and `likescore` + if obj_type == 'gcf': + obj_ids = [gcf.gcf_id for gcf in objects] + if score_type == 'likescore': + all_scores = (self.likescores_spec_gcf[:, obj_ids], + self.likescores_fam_gcf[:, obj_ids]) + if score_type == 'metcalf': + all_scores = (self.metcalf_spec_gcf.loc[:, obj_ids], + self.metcalf_fam_gcf.loc[:, obj_ids]) + if score_type == 'hg': + all_scores = (self.hg_spec_gcf[:, obj_ids], + self.hg_fam_gcf[:, obj_ids]) + for scores in all_scores: + links.append(self._get_scores_source_gcf(scores, score_cutoff)) + + if obj_type == 'spec': + obj_ids = [spec.spectrum_id for spec in objects] + if score_type == 'likescore': + all_scores = self.likescores_spec_gcf[obj_ids, :] + if score_type == 'metcalf': + all_scores = self.metcalf_spec_gcf.loc[obj_ids, :] + if score_type == 'hg': + all_scores = self.hg_spec_gcf[obj_ids, :] + links.append(self._get_scores_source_met(all_scores, score_cutoff)) + + if obj_type == 'fam': + obj_ids = [mf.family_id for mf in objects] + if score_type == 'likescore': + all_scores = self.likescores_fam_gcf[obj_ids, :] + if score_type == 'metcalf': + all_scores = self.metcalf_fam_gcf.loc[obj_ids, :] + if score_type == 'hg': + all_scores = self.hg_fam_gcf[obj_ids:] + links.append(self._get_scores_source_met(all_scores, score_cutoff)) return links + def _get_scores_source_gcf(self, scores, score_cutoff): + candidate_met_gcf_indexes = np.where(scores >= score_cutoff) + src_obj_ids = scores.columns[candidate_met_gcf_indexes[1]].to_list() + target_obj_ids = scores.index[candidate_met_gcf_indexes[0]].to_list() + scores_candidate = scores.to_numpy()[candidate_met_gcf_indexes].tolist( + ) + return tuple(src_obj_ids), tuple(target_obj_ids), tuple( + scores_candidate) + + def _get_scores_source_met(self, scores, score_cutoff): + candidate_met_gcf_indexes = np.where(scores >= score_cutoff) + 
src_obj_ids = scores.index[candidate_met_gcf_indexes[0]].to_list() + target_obj_ids = scores.columns[candidate_met_gcf_indexes[1]].to_list() + scores_candidate = scores.to_numpy()[candidate_met_gcf_indexes].tolist( + ) + return tuple(src_obj_ids), tuple(target_obj_ids), tuple( + scores_candidate) + @deprecated(version="1.3.3", reason="The unworkable method will be removed") def create_cytoscape_files(self, diff --git a/src/nplinker/scoring/metcalf_scoring.py b/src/nplinker/scoring/metcalf_scoring.py index 4daa06f0..9fb76fe6 100644 --- a/src/nplinker/scoring/metcalf_scoring.py +++ b/src/nplinker/scoring/metcalf_scoring.py @@ -203,13 +203,13 @@ def get_links(self, objects, link_collection): logger.debug('MetcalfScoring: standardised = {}'.format( self.standardised)) if not self.standardised: - results = linkfinder.get_links(datalinks, objects, self.name, + results = linkfinder.get_links(objects, self.name, self.cutoff) else: # get the basic Metcalf scores BUT ignore the cutoff value here by setting # it to None. The actual user-supplied cutoff value is applied further down # once the standardised scores for these results have been calculated. 
- results = linkfinder.get_links(datalinks, objects, self.name, None) + results = linkfinder.get_links(objects, self.name, None) # The "results" object varies slightly depending on the input provided # to the LinkFinder class: From 2078488f41dadaa74255486ead4367d34c5bc738 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 9 May 2023 10:14:11 +0200 Subject: [PATCH 53/95] remove unused methods and scorings from LinkFinder - remove unused `likescore` and `hg` scoring types - remove all unused methods --- src/nplinker/scoring/linking/link_finder.py | 574 +------------------- 1 file changed, 13 insertions(+), 561 deletions(-) diff --git a/src/nplinker/scoring/linking/link_finder.py b/src/nplinker/scoring/linking/link_finder.py index b409d8d6..a97775bd 100644 --- a/src/nplinker/scoring/linking/link_finder.py +++ b/src/nplinker/scoring/linking/link_finder.py @@ -1,29 +1,12 @@ from __future__ import annotations from typing import TYPE_CHECKING -from deprecated import deprecated import numpy as np import pandas as pd from scipy.stats import hypergeom from nplinker.genomics.gcf import GCF -from nplinker.metabolomics.spectrum import Spectrum -from .data_linking_functions import pair_prob_approx -from .data_linking_functions import pair_prob_hg - - -# CG: TODO get_links function does not work any more, need to update its logics - -# import packages for plotting -# TODO move plotting to separate module? 
-try: - from matplotlib import pyplot as plt - import seaborn as sns -except ImportError: - print( - 'Warning: plotting functionality will not be available (missing matplotlib and/or seaborn)' - ) - from nplinker.logconfig import LogConfig from nplinker.metabolomics.molecular_family import MolecularFamily +from nplinker.metabolomics.spectrum import Spectrum if TYPE_CHECKING: @@ -31,7 +14,6 @@ logger = LogConfig.getLogger(__file__) -SCORE_TYPES = ['metcalf', 'likescore', 'hg'] LINK_TYPES = ['spec-gcf', 'mf-gcf'] @@ -52,67 +34,15 @@ def __init__(self): Separate tables will exist for different linking scenarios, such as gcfs <-> spectra OR gcf <-> mol.families """ - # metcalf scores - self.metcalf_spec_gcf = pd.DataFrame() - self.metcalf_fam_gcf = pd.DataFrame() - - # likelihood scores - self.likescores_spec_gcf = [] - self.likescores_fam_gcf = [] - - # hg scores - self.hg_spec_gcf = [] - self.hg_fam_gcf = [] - - # link candidate tables - self.link_candidates_gcf_spec = [] - self.link_candidates_gcf_fam = [] + self.raw_score_spec_gcf = pd.DataFrame() + self.raw_score_fam_gcf = pd.DataFrame() # metcalf caching self.metcalf_mean = None self.metcalf_std = None - def get_scores(self, score_type: str, link_type: str) -> pd.DataFrame: - """Get the scores for given scoring type and link type. - - Args: - score_type (str): The type of scoring method to use. Available - scoring methods are 'metcalf', 'likescore', and 'hg'. - link_type (str): The type of link to get scores for. Available - types are 'spec-gcf' and 'mf-gcf'. - - Returns: - pd.DataFrame: The scores for the given method and link type. - - Raises: - ValueError: If an invalid score type or link type is given. - """ - if score_type not in SCORE_TYPES: - raise ValueError( - f'Invalid score type "{score_type}". Must be one of "{SCORE_TYPES}"' - ) - if link_type not in LINK_TYPES: - raise ValueError( - f'Invalid link type "{link_type}". 
Must be one of "{LINK_TYPES}"' - ) - if score_type == 'metcalf': - if link_type == 'spec-gcf': - return self.metcalf_spec_gcf - if link_type == 'mf-gcf': - return self.metcalf_fam_gcf - if score_type == 'likescore': - if link_type == 'spec-gcf': - return self.likescores_spec_gcf - if link_type == 'mf-gcf': - return self.likescores_fam_gcf - if score_type == 'hg': - if link_type == 'spec-gcf': - return self.hg_spec_gcf - if link_type == 'mf-gcf': - return self.hg_fam_gcf - - def metcalf_scoring( + def cal_score( self, data_links: DataLinks, link_type: str = 'spec-gcf', @@ -130,13 +60,13 @@ def metcalf_scoring( Defaults to (10, -10, 0, 1). """ if link_type == 'spec-gcf': - self.metcalf_spec_gcf = ( + self.raw_score_spec_gcf = ( data_links.cooccurrence_spec_gcf * scoring_weights[0] + data_links.cooccurrence_spec_notgcf * scoring_weights[1] + data_links.cooccurrence_notspec_gcf * scoring_weights[2] + data_links.cooccurrence_notspec_notgcf * scoring_weights[3]) if link_type == 'mf-gcf': - self.metcalf_fam_gcf = ( + self.raw_score_fam_gcf = ( data_links.cooccurrence_mf_gcf * scoring_weights[0] + data_links.cooccurrence_mf_notgcf * scoring_weights[1] + data_links.cooccurrence_notmf_gcf * scoring_weights[2] + @@ -175,277 +105,16 @@ def _cal_mean_std(self, data_links, scoring_weights): variance[n, m] = expected_sq return mean, np.sqrt(variance) - def hg_scoring(self, data_links, type='spec-gcf'): - """ - Calculate metcalf scores from DataLinks() co-occurence matrices - """ - - # NOTE:can't use the correlation matrices directly for this scoring method because - # it seems to require more inclusive counts of the strains in each object. - # Instead of "number of strains only in GCF", it requires "number of strains in the - # GCF PLUS the number shared between the GCF and the other object". - # e.g. 
if a spectrum has 3 strains, a GCF has 1 strain and there is 1 shared strain, - # cooccurrence_spec_gcf will correctly contain "1", but M_type1_notgcf will contain "2" instead - # of "3", because the spectrum only has 2 distinct strains vs the GCF. - # To fix this the cooccurrence_spec_gcf/cooccurrence_mf_gcf matrix can just be added onto the others to give - # the correct totals. - - if type == 'spec-gcf': - num_strains = np.ones(data_links.cooccurrence_spec_gcf.shape - ) * data_links.occurrence_gcf_strain.shape[1] - overlap_counts = data_links.cooccurrence_spec_gcf - gcf_counts = overlap_counts + data_links.cooccurrence_notspec_gcf - spec_counts = overlap_counts + data_links.cooccurrence_spec_notgcf - hg_scores = hypergeom.sf(overlap_counts, - num_strains, - gcf_counts, - spec_counts, - loc=1) - self.hg_spec_gcf = hg_scores - elif type == 'mf-gcf': - num_strains = np.ones(data_links.cooccurrence_mf_gcf.shape - ) * data_links.occurrence_gcf_strain.shape[1] - overlap_counts = data_links.cooccurrence_mf_gcf - gcf_counts = overlap_counts + data_links.cooccurrence_notmf_gcf - fam_counts = overlap_counts + data_links.cooccurrence_mf_notgcf - hg_scores = hypergeom.sf(overlap_counts, - num_strains, - gcf_counts, - fam_counts, - loc=1) - self.hg_fam_gcf = hg_scores - - return hg_scores - - def likelihood_scoring(self, - data_links, - likelihoods, - alpha_weighing=0.5, - type='spec-gcf'): - """ - Calculate likelihood scores from DataLinks() co-occurence matrices. 
- - Idea: - Score reflect the directionality BGC-->compound-->spectrum, which - suggests that the most relevant likelihoods are: - P(gcf|type1) - If type1 is the result of only one particular gene cluster, - this value should be high (close or equal to 1) - P(type1|not gcf) - Following the same logic, this value should be very - small or 0 ("no gene cluster, no compound") - - Score: - Score = P(gcf|type1) * (1 - P(type1|not gcf) * weighing function - - weighing function is here a function of the number of strains they co-occur. - weighing function = (1 - exp(-alpha_weighing * num_of_co-occurrences) - """ - - if type == 'spec-gcf': - likelihood_scores = np.zeros( - data_links.cooccurrence_spec_gcf.shape) - likelihood_scores = ( - likelihoods.P_gcf_given_spec * - (1 - likelihoods.P_spec_not_gcf) * - (1 - - np.exp(-alpha_weighing * data_links.cooccurrence_spec_gcf))) - - self.likescores_spec_gcf = likelihood_scores - - elif type == 'mf-gcf': - likelihood_scores = np.zeros(data_links.cooccurrence_mf_gcf.shape) - likelihood_scores = ( - likelihoods.P_gcf_given_fam * (1 - likelihoods.P_fam_not_gcf) * - (1 - np.exp(-alpha_weighing * data_links.cooccurrence_mf_gcf))) - - self.likescores_fam_gcf = likelihood_scores - return likelihood_scores - - def select_link_candidates(self, - data_links, - likelihoods, - P_cutoff=0.8, - main_score='likescore', - score_cutoff=0, - type='mf-gcf'): - """ - Look for potential best candidate for links between - IF type='spec-gcf': GCFs and spectra - IF type='mf-gcf': GCFs and mol.families - - Parameters - ---------- - data_links: DataLinks() object - containing co-occurence matrices - likelihood: LinkLikelihood() object - containing co-occurence likelihoods - P_cutoff: float - Thresholds to conly consider candidates for which: - P_gcf_given_type1 >= P_cutoff - main_score: str - Which main score to use ('metcalf', 'likescore') - score_cutoff: - Thresholds to conly consider candidates for which: - score >= score_cutoff - """ - - # Select 
scenario: spec<->gcf or mf<->gcf - if type == 'spec-gcf': - P_gcf_given_type1 = likelihoods.P_gcf_given_spec - P_gcf_not_type1 = likelihoods.P_gcf_not_spec - P_type1_given_gcf = likelihoods.P_spec_given_gcf - P_type1_not_gcf = likelihoods.P_spec_not_gcf - M_type1_gcf = data_links.cooccurrence_spec_gcf - metcalf_scores = self.metcalf_spec_gcf - likescores = self.likescores_spec_gcf - index_names = [ - "spectrum_id", "GCF id", "P(gcf|spec)", "P(spec|gcf)", - "P(gcf|not spec)", "P(spec|not gcf)", "co-occur in # strains", - "metcalf score", "likelihood score", "HG prob", "link prob", - "link prob specific" - ] - - elif type == 'mf-gcf': - P_gcf_given_type1 = likelihoods.P_gcf_given_fam - P_gcf_not_type1 = likelihoods.P_gcf_not_fam - P_type1_given_gcf = likelihoods.P_fam_given_gcf - P_type1_not_gcf = likelihoods.P_fam_not_gcf - M_type1_gcf = data_links.cooccurrence_mf_gcf - metcalf_scores = self.metcalf_fam_gcf - likescores = self.likescores_fam_gcf - index_names = [ - "family_id", "GCF id", "P(gcf|fam)", "P(fam|gcf)", - "P(gcf|not fam)", "P(fam|not gcf)", "co-occur in # strains", - "metcalf score", "likelihood score", "HG prob", "link prob", - "link prob specific" - ] - - elif type == 'spec-bgc' or type == 'fam-bgc': - raise Exception("Given types are not yet supported... ") - else: - raise Exception( - "Wrong correlation 'type' given. Must be one of 'spec-gcf', 'mf-gcf'..." - ) - - dim1, dim2 = P_gcf_given_type1.shape - - # PRE-SELECTION: - # Select candidates with P_gcf_given_spec >= P_cutoff AND score >= score_cutoff - if main_score == 'likescore': - candidate_ids = np.where((P_gcf_given_type1[:, :] >= P_cutoff) - & (likescores >= score_cutoff)) - elif main_score == 'metcalf': - candidate_ids = np.where((P_gcf_given_type1[:, :] >= P_cutoff) - & (metcalf_scores >= score_cutoff)) - else: - raise Exception( - 'Wrong scoring type given. 
Must be one of: {}'.format( - SCORE_TYPES)) - - logger.debug(candidate_ids[0].shape[0], " candidates selected with ", - index_names[2], " >= ", P_cutoff, " and a link score >= ", - score_cutoff, ".") - - link_candidates = np.zeros((12, candidate_ids[0].shape[0])) - link_candidates[0, :] = candidate_ids[0] # spectrum/fam number - link_candidates[1, :] = candidate_ids[1] # gcf id - link_candidates[2, :] = P_gcf_given_type1[candidate_ids] - link_candidates[3, :] = P_type1_given_gcf[candidate_ids] - link_candidates[4, :] = P_gcf_not_type1[candidate_ids] - link_candidates[5, :] = P_type1_not_gcf[candidate_ids] - link_candidates[6, :] = M_type1_gcf[candidate_ids] - link_candidates[7, :] = metcalf_scores[candidate_ids] - link_candidates[8, :] = likescores[candidate_ids] - - # Calculate probability to find similar link by chance - Nx_list = data_links.mapping_gcf["no of strains"] - if type == 'spec-gcf': - Ny_list = data_links.mapping_spec["no of strains"] - elif type == 'mf-gcf': - Ny_list = data_links.mapping_fam["no of strains"] - - # Calculate probabilities of finding a spectrum in a certain strain - P_str = np.array(data_links.mapping_strain["no of spectra"]) - P_str = P_str / np.sum(P_str) - - num_strains = data_links.occurrence_gcf_strain.shape[1] - - # Calculate the hypergeometric probability (as before) - for i in range(link_candidates.shape[1]): - link_candidates[9, i] = pair_prob_hg( - link_candidates[6, i], num_strains, - Nx_list[link_candidates[1, i]], - Ny_list[int(link_candidates[0, i])]) - - # Calculate the GCF specific probability - for i in range(link_candidates.shape[1]): - id_spec = link_candidates[0, i] - id_gcf = link_candidates[1, i] - - # find set of strains which contain GCF with id link_candidates[1,i] - XG = np.where( - data_links.occurrence_gcf_strain.loc[id_gcf, :] == 1)[0] - - link_candidates[10, - i] = pair_prob_approx(P_str, XG, - int(Ny_list[id_spec]), - int(link_candidates[6, i])) - # Calculate the link specific probability - # Find 
strains where GCF and spectra/family co-occur - if type == 'spec-gcf': - XGS = np.where( - (data_links.occurrence_gcf_strain[id_gcf, :] == 1) - & (data_links.occurrence_spec_strain[id_spec, :] == 1))[0] - elif type == 'mf-gcf': - XGS = np.where( - (data_links.occurrence_gcf_strain[id_gcf, :] == 1) - & (data_links.occurrence_mf_strain[id_spec, :] == 1))[0] - link_candidates[11, - i] = link_prob(P_str, XGS, int(Nx_list[id_gcf]), - int(Ny_list[id_spec]), num_strains) - - # Transform into pandas Dataframe (to avoid index confusions): - link_candidates_pd = pd.DataFrame(link_candidates.transpose(1, 0), - columns=index_names) - - # add other potentially relevant knowdledge - # If this will grow to more collected information -> create separate function/class - bgc_class = [] - # TODO CG: bgc class should be obtained from GCF object - for i in link_candidates_pd["GCF id"].astype(int): - bgc_class.append(data_links.mapping_gcf["bgc class"][i]) - link_candidates_pd["BGC class"] = bgc_class - - # Change some columns to int - link_candidates_pd.iloc[:, [0, 1, 6, 7]] = link_candidates_pd.iloc[:, [ - 0, 1, 6, 7 - ]].astype(int) - - # return results - if type == 'spec-gcf': - self.link_candidates_gcf_spec = link_candidates_pd - elif type == 'mf-gcf': - self.link_candidates_gcf_fam = link_candidates_pd - else: - raise Exception("No candidate selection was created.") - return link_candidates_pd - - @staticmethod - def _isinstance(*objects, obj_type=GCF) -> bool: - return all(isinstance(x, obj_type) for x in objects) - def get_links( self, *objects: tuple[GCF, ...] | tuple[Spectrum, ...] | tuple[MolecularFamily, ...], - score_type: str = 'likescore', score_cutoff: float = 0.5 ) -> list[tuple[tuple[str], tuple[str], tuple[float]]]: """Get scores for links between objects. Args: objects(tuple): GCF, Spectrum or MolecularFamily objects. - score_type(str): Type of score to use for link calculation. - Must be one of 'likescore', 'metcalf' and 'hg'. 
score_cutoff(float): Minimum score to consider a link (≄score_cutoff). Default is 0.5.
 
         Returns:
@@ -457,7 +126,6 @@ def get_links(
 
         Raises:
             TypeError: If input objects are not GCF, Spectrum or MolecularFamily objects.
-            ValueError: If score type is not one of 'likescore', 'metcalf' and 'hg'.
         """
         if self._isinstance(*objects, GCF):
             obj_type = 'gcf'
@@ -470,49 +138,26 @@ def get_links(
                 'Input objects must be GCF, Spectrum or MolecularFamily objects.'
             )
 
-        if score_type not in SCORE_TYPES:
-            raise ValueError(
-                f'Invalid score type "{score_type}". Must be one of "{SCORE_TYPES}"'
-            )
-
         links = []
-        # TODO CG: replace integer ids with string ids for `hg` and `likescore`
         if obj_type == 'gcf':
             obj_ids = [gcf.gcf_id for gcf in objects]
-            if score_type == 'likescore':
-                all_scores = (self.likescores_spec_gcf[:, obj_ids],
-                              self.likescores_fam_gcf[:, obj_ids])
-            if score_type == 'metcalf':
-                all_scores = (self.metcalf_spec_gcf.loc[:, obj_ids],
-                              self.metcalf_fam_gcf.loc[:, obj_ids])
-            if score_type == 'hg':
-                all_scores = (self.hg_spec_gcf[:, obj_ids],
-                              self.hg_fam_gcf[:, obj_ids])
+            all_scores = (self.raw_score_spec_gcf.loc[:, obj_ids],
+                          self.raw_score_fam_gcf.loc[:, obj_ids])
             for scores in all_scores:
                 links.append(self._get_scores_source_gcf(scores, score_cutoff))
-
         if obj_type == 'spec':
             obj_ids = [spec.spectrum_id for spec in objects]
-            if score_type == 'likescore':
-                all_scores = self.likescores_spec_gcf[obj_ids, :]
-            if score_type == 'metcalf':
-                all_scores = self.metcalf_spec_gcf.loc[obj_ids, :]
-            if score_type == 'hg':
-                all_scores = self.hg_spec_gcf[obj_ids, :]
+            all_scores = self.raw_score_spec_gcf.loc[obj_ids, :]
             links.append(self._get_scores_source_met(all_scores, score_cutoff))
-
         if obj_type == 'fam':
             obj_ids = [mf.family_id for mf in objects]
-            if score_type == 'likescore':
-                all_scores = self.likescores_fam_gcf[obj_ids, :]
-            if score_type == 'metcalf':
-                all_scores = self.metcalf_fam_gcf.loc[obj_ids, :]
-            if score_type == 'hg':
-                all_scores = 
self.hg_fam_gcf[obj_ids:] + all_scores = self.raw_score_fam_gcf.loc[obj_ids, :] links.append(self._get_scores_source_met(all_scores, score_cutoff)) - return links + def _isinstance(self, *objects, obj_type=GCF) -> bool: + return all(isinstance(x, obj_type) for x in objects) + def _get_scores_source_gcf(self, scores, score_cutoff): candidate_met_gcf_indexes = np.where(scores >= score_cutoff) src_obj_ids = scores.columns[candidate_met_gcf_indexes[1]].to_list() @@ -530,196 +175,3 @@ def _get_scores_source_met(self, scores, score_cutoff): ) return tuple(src_obj_ids), tuple(target_obj_ids), tuple( scores_candidate) - - @deprecated(version="1.3.3", - reason="The unworkable method will be removed") - def create_cytoscape_files(self, - data_links, - network_filename, - link_type='mf-gcf', - score_type='metcalf'): - """ - Create network file for import into Cytoscape. - Network file will be generated using networkx. - The type of network created here is a bipartite network. - mass spec side --> bipartite = 0 - gene cluster side --> bipartite = 1 - Output format is a graphml file. - - Parameters - ---------- - data_links: DataLinks() object - containing co-occurence matrices - likelihood: LinkLikelihood() object - containing co-occurence likelihoods - network_filename: str - Filename to save generated model as graphml file. - """ - - import networkx as nx - NPlinker_net = nx.Graph() - - if link_type == 'mf-gcf': - link_candidates = self.link_candidates_gcf_fam - type1str = 'family_id' - elif link_type == 'spec-gcf': - link_candidates = self.link_candidates_gcf_spec - type1str = 'spectrum_id' - else: - raise Exception("Wrong link-type given.") - - # Select score type - if score_type == 'metcalf': - scorestr = 'metcalf score' - elif score_type == 'likescore': - scorestr = 'likelihood score' - else: - raise Exception( - "Wrong score_type given. Must be one of: 'metcalf', 'likescore' ." 
- ) - - # Add nodes (all partners from link_candidate table): - # mass spec side --> bipartite = 0 - type1_names = [] - for type1 in link_candidates[type1str].astype(int): - type1_names.append(type1str + str(type1)) - - type1_names_unique = list(set(type1_names)) - NPlinker_net.add_nodes_from(type1_names_unique, bipartite=0) - - # gene cluster side --> bipartite = 1 - type2_names = [] - for type2 in link_candidates['GCF id']: - type2_names.append("GCF_" + str(type2)) - - type2_names_unique = list(set(type2_names)) - NPlinker_net.add_nodes_from(type2_names_unique, bipartite=1) - - # Add edges: - for i in range(0, link_candidates.shape[0]): - NPlinker_net.add_edge(type1_names[i], - type2_names[i], - weight=float(link_candidates[scorestr][i])) - - # Add edges between molecular family members - if link_type == 'spec-gcf': - type1_unique = (np.unique(link_candidates[type1str])).astype(int) - map_spec_fam = data_links.mapping_spec["fam-id"] - for type1 in type1_unique: - if map_spec_fam[type1] > 0: # if no singleton - members = data_links.family_members[int( - map_spec_fam[type1])][0] - - # select other family members if among link candidates - members_present = [ - x for x in list(members) if x in list(type1_unique) - ] - for member in members_present: - NPlinker_net.add_edge(type1str + str(type1), - type1str + str(member)) - - # export graph for drawing (e.g. using Cytoscape) - nx.write_graphml(NPlinker_net, network_filename) - - def plot_candidates(self, - P_cutoff=0.8, - score_type='likescore', - score_cutoff=0, - type='mf-gcf'): - """ - Plot best rated correlations between gcfs and spectra/families - plot in form of seaborn clustermap - """ - - # Select score type - if score_type == 'metcalf': - scorestr = 'metcalf score' - elif score_type == 'likescore': - scorestr = 'likelihood score' - else: - raise Exception( - "Wrong score_type given. Must be one of: 'metcalf', 'likescore' ." 
- ) - - if type == 'spec-gcf': - link_candidates = self.link_candidates_gcf_spec - selected_ids = np.where( - (link_candidates["P(gcf|spec)"] > P_cutoff) - & (link_candidates[scorestr] > score_cutoff))[0] - elif type == 'mf-gcf': - link_candidates = self.link_candidates_gcf_fam - selected_ids = np.where( - (link_candidates["P(gcf|fam)"] > P_cutoff) - & (link_candidates[scorestr] > score_cutoff))[0] - else: - raise Exception("Wrong correlation 'type' given.") - - mapping_fams = np.unique(link_candidates.iloc[selected_ids, 0]) - mapping_gcfs = np.unique(link_candidates.iloc[selected_ids, 1]) - unique_fams = len(mapping_fams) - unique_gcfs = len(mapping_gcfs) - - M_links = np.zeros((unique_fams, unique_gcfs)) - - # define colors for different BGC classes - bigscape_classes_dict = { - "Others": "C0", - "NRPS": "C1", - "PKS-NRP_Hybrids": "C2", - "PKSother": "C3", - "PKSI": "C4", - "RiPPs": "C5", - "Saccharides": "C6", - "Terpene": "C7" - } - col_colors = [] - - # Create matrix with relevant link scores - # TODO replace by better numpy method... 
- for i in range(0, link_candidates.shape[0]): - x = np.where(mapping_fams == link_candidates.iloc[i, 0])[0] - y = np.where(mapping_gcfs == link_candidates.iloc[i, 1])[0] - M_links[x, y] = link_candidates[scorestr][i] - - # make pandas dataframe from numpy array - M_links = pd.DataFrame(M_links, - index=mapping_fams.astype(int), - columns=mapping_gcfs.astype(int)) - if type == 'spec-gcf': - M_links.index.name = 'spectrum number' - elif type == 'mf-gcf': - M_links.index.name = 'molecular family number' - M_links.columns.name = 'gene cluster family (GCF)' - - # add color label representing gene cluster class - for bgc_class in link_candidates["BGC class"]: - if bgc_class is None: - col_colors.append((0, 0, 0)) - else: - col_colors.append(bigscape_classes_dict[bgc_class]) - - # bgc_type_colors = pd.Series(mapping_gcfs.astype(int), index=M_links.columns).map(col_colors) - graph = sns.clustermap( - M_links, - metric="correlation", - method="weighted", - cmap= - "Reds", #sns.cubehelix_palette(8, start=0, rot=.3, dark=0, light=1), - vmin=0, - vmax=np.max(link_candidates[scorestr]), - col_cluster=True, - col_colors=col_colors, - robust=True) - graph.fig.suptitle('Correlation map') - - # Rotate labels - plt.setp(graph.ax_heatmap.xaxis.get_majorticklabels(), rotation=90) - plt.setp(graph.ax_heatmap.yaxis.get_majorticklabels(), rotation=0) - - # Make labels smaller - plt.setp(graph.ax_heatmap.xaxis.get_majorticklabels(), fontsize=7) - plt.setp(graph.ax_heatmap.yaxis.get_majorticklabels(), fontsize=7) - - plt.ylabel("scoring index") - - return M_links From 5d9a9169fd5ca0a403b6bedeb86662ce298b1a53 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 9 May 2023 11:23:19 +0200 Subject: [PATCH 54/95] refactor returned type of `LinkFinder.get_links` method --- src/nplinker/scoring/linking/data_linking.py | 3 +- src/nplinker/scoring/linking/link_finder.py | 118 ++++++++++--------- 2 files changed, 64 insertions(+), 57 deletions(-) diff --git 
a/src/nplinker/scoring/linking/data_linking.py b/src/nplinker/scoring/linking/data_linking.py index 0edec183..e0849421 100644 --- a/src/nplinker/scoring/linking/data_linking.py +++ b/src/nplinker/scoring/linking/data_linking.py @@ -9,13 +9,14 @@ from nplinker.metabolomics.spectrum import Spectrum from .data_linking_functions import calc_correlation_matrix - if TYPE_CHECKING: from nplinker.strain_collection import StrainCollection from nplinker.strains import Strain logger = LogConfig.getLogger(__name__) +LINK_TYPES = ['spec-gcf', 'mf-gcf'] + class DataLinks(): diff --git a/src/nplinker/scoring/linking/link_finder.py b/src/nplinker/scoring/linking/link_finder.py index a97775bd..8bf16e1c 100644 --- a/src/nplinker/scoring/linking/link_finder.py +++ b/src/nplinker/scoring/linking/link_finder.py @@ -7,6 +7,7 @@ from nplinker.logconfig import LogConfig from nplinker.metabolomics.molecular_family import MolecularFamily from nplinker.metabolomics.spectrum import Spectrum +from .data_linking import LINK_TYPES if TYPE_CHECKING: @@ -14,19 +15,8 @@ logger = LogConfig.getLogger(__file__) -LINK_TYPES = ['spec-gcf', 'mf-gcf'] - class LinkFinder(): - """ - Class to: - 1) Score potential links based on collected information from: - DataLinks, LinkLikelihood (and potentially other resources) - Different scores can be used for this! - - 2) Rank and output selected candidates - 3) Create output plots and tables - """ def __init__(self): """ @@ -58,7 +48,14 @@ def cal_score( use for Metcalf scoring. The weights are applied to '(met_gcf, met_not_gcf, gcf_not_met, not_met_not_gcf)'. Defaults to (10, -10, 0, 1). + + Raises: + ValueError: If an invalid link type is provided. """ + if link_type not in LINK_TYPES: + raise ValueError( + f'Invalid link type: {link_type}. 
Must be one of {LINK_TYPES}')
+
         if link_type == 'spec-gcf':
             self.raw_score_spec_gcf = (
                 data_links.cooccurrence_spec_gcf * scoring_weights[0] +
@@ -105,12 +102,10 @@ def _cal_mean_std(self, data_links, scoring_weights):
             variance[n, m] = expected_sq
         return mean, np.sqrt(variance)
 
-    def get_links(
-        self,
-        *objects: tuple[GCF, ...] | tuple[Spectrum, ...]
-        | tuple[MolecularFamily, ...],
-        score_cutoff: float = 0.5
-    ) -> list[tuple[tuple[str], tuple[str], tuple[float]]]:
+    def get_links(self,
+                  *objects: tuple[GCF, ...] | tuple[Spectrum, ...]
+                  | tuple[MolecularFamily, ...],
+                  score_cutoff: float = 0.5) -> list[pd.DataFrame]:
         """Get scores for links between objects.
 
         Args:
@@ -118,60 +113,71 @@ def get_links(
             score_cutoff(float): Minimum score to consider a link (≄score_cutoff). Default is 0.5.
 
         Returns:
-            list: List of tuples containing the ids of the linked objects and the score.
-                The tuple contains three tuples:
-                - the first tuple contains the ids of the input/source objects,
-                - the second tuple contains the ids of the target objects,
-                - the third tuple contains the scores.
+            list: List of data frames containing the ids of the linked objects
+                and the score. The data frame contains three rows:
+                - the first row contains the ids of the input/source objects,
+                - the second row contains the ids of the target objects,
+                - the third row contains the scores.
 
         Raises:
             TypeError: If input objects are not GCF, Spectrum or MolecularFamily objects.
         """
-        if self._isinstance(*objects, GCF):
+        if self._isinstance(GCF, *objects):
             obj_type = 'gcf'
-        elif self._isinstance(*objects, Spectrum):
+        elif self._isinstance(Spectrum, *objects):
             obj_type = 'spec'
-        elif self._isinstance(*objects, MolecularFamily):
-            obj_type = 'fam'
+        elif self._isinstance(MolecularFamily, *objects):
+            obj_type = 'mf'
         else:
+            types = [type(i) for i in objects]
             raise TypeError(
-                'Input objects must be GCF, Spectrum or MolecularFamily objects.'
+                f'Invalid type "{set(types)}". 
Input objects must be GCF, Spectrum or MolecularFamily objects.' ) links = [] if obj_type == 'gcf': obj_ids = [gcf.gcf_id for gcf in objects] - all_scores = (self.raw_score_spec_gcf.loc[:, obj_ids], - self.raw_score_fam_gcf.loc[:, obj_ids]) - for scores in all_scores: - links.append(self._get_scores_source_gcf(scores, score_cutoff)) + # spec-gcf + scores = self.raw_score_spec_gcf.loc[:, obj_ids] + df = self._get_scores_source_gcf(scores, score_cutoff) + df.name = LINK_TYPES[0] + links.append(df) + # mf-gcf + scores = self.raw_score_fam_gcf.loc[:, obj_ids] + df = self._get_scores_source_gcf(scores, score_cutoff) + df.name = LINK_TYPES[1] + links.append(df) + if obj_type == 'spec': obj_ids = [spec.spectrum_id for spec in objects] - all_scores = self.raw_score_spec_gcf.loc[obj_ids, :] - links.append(self._get_scores_source_met(all_scores, score_cutoff)) - if obj_type == 'fam': + scores = self.raw_score_spec_gcf.loc[obj_ids, :] + df = self._get_scores_source_met(scores, score_cutoff) + df.name = LINK_TYPES[0] + links.append(df) + + if obj_type == 'mf': obj_ids = [mf.family_id for mf in objects] - all_scores = self.raw_score_fam_gcf.loc[obj_ids, :] - links.append(self._get_scores_source_met(all_scores, score_cutoff)) + scores = self.raw_score_fam_gcf.loc[obj_ids, :] + df = self._get_scores_source_met(scores, score_cutoff) + df.name = LINK_TYPES[1] + links.append(df) return links - def _isinstance(self, *objects, obj_type=GCF) -> bool: - return all(isinstance(x, obj_type) for x in objects) - - def _get_scores_source_gcf(self, scores, score_cutoff): - candidate_met_gcf_indexes = np.where(scores >= score_cutoff) - src_obj_ids = scores.columns[candidate_met_gcf_indexes[1]].to_list() - target_obj_ids = scores.index[candidate_met_gcf_indexes[0]].to_list() - scores_candidate = scores.to_numpy()[candidate_met_gcf_indexes].tolist( - ) - return tuple(src_obj_ids), tuple(target_obj_ids), tuple( - scores_candidate) - - def _get_scores_source_met(self, scores, score_cutoff): - 
candidate_met_gcf_indexes = np.where(scores >= score_cutoff) - src_obj_ids = scores.index[candidate_met_gcf_indexes[0]].to_list() - target_obj_ids = scores.columns[candidate_met_gcf_indexes[1]].to_list() - scores_candidate = scores.to_numpy()[candidate_met_gcf_indexes].tolist( - ) - return tuple(src_obj_ids), tuple(target_obj_ids), tuple( - scores_candidate) + def _isinstance(self, _type, *objects) -> bool: + return all(isinstance(x, _type) for x in objects) + + def _get_scores_source_gcf(self, scores, score_cutoff) -> pd.DataFrame: + row_indexes, col_indexes = np.where(scores >= score_cutoff) + src_obj_ids = scores.columns[col_indexes].to_list() + target_obj_ids = scores.index[row_indexes].to_list() + scores_candidate = scores.values[row_indexes, col_indexes].tolist() + return pd.DataFrame([src_obj_ids, target_obj_ids, scores_candidate], + index=['source', 'target', 'score']) + + def _get_scores_source_met(self, scores, score_cutoff) -> pd.DataFrame: + row_indexes, col_indexes = np.where(scores >= score_cutoff) + src_obj_ids = scores.index[row_indexes].to_list() + target_obj_ids = scores.columns[col_indexes].to_list() + scores_candidate = scores.values[row_indexes, col_indexes].tolist() + return pd.DataFrame([src_obj_ids, target_obj_ids, scores_candidate], + index=['source', 'target', 'score']) From 47770c57fd361d6ba00a3e5691a1bf05930fe6ee Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 9 May 2023 12:20:17 +0200 Subject: [PATCH 55/95] add `lookup_mf` method in NPLinker class --- src/nplinker/nplinker.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/nplinker/nplinker.py b/src/nplinker/nplinker.py index 02799ed0..943cb21b 100644 --- a/src/nplinker/nplinker.py +++ b/src/nplinker/nplinker.py @@ -18,6 +18,7 @@ from .scoring.rosetta_scoring import RosettaScoring from .strain_collection import StrainCollection + if TYPE_CHECKING: from collections.abc import Sequence from .strains import Strain @@ -124,6 +125,7 @@ def 
__init__(self, userconfig=None): self._bgc_lookup = {} self._gcf_lookup = {} self._spec_lookup = {} + self._mf_lookup = {} self._scoring_methods = {} config_methods = self._config.config.get('scoring_methods', []) @@ -298,6 +300,7 @@ def load_data(self, new_bigscape_cutoff=None, met_only=False): spec.spectrum_id: spec for spec in self._spectra } + self._mf_lookup = {mf.family_id: mf for mf in self._molfams} logger.debug('load_data: completed') return True @@ -392,8 +395,8 @@ def get_links(self, input_objects, scoring_methods, and_mode=True): objects_for_method = input_objects[i] logger.debug('Calling scoring method {} on {} objects'.format( method.name, len(objects_for_method))) - link_collection = method.get_links(objects_for_method, - link_collection) + link_collection = method.get_links(*objects_for_method, + link_collection=link_collection) if not self._datalinks: logger.debug('Creating internal datalinks object') @@ -482,6 +485,10 @@ def lookup_spectrum(self, spectrum_id): """If Spectrum ``name`` exists, return it. Otherwise return None""" return self._spec_lookup.get(spectrum_id, None) + def lookup_mf(self, mf_id): + """If MolecularFamily `family_id` exists, return it. 
Otherwise return None""" + return self._mf_lookup.get(mf_id, None) + @property def strains(self): """Returns a list of all the strains in the dataset""" From 25606e129fdab7dcca2c68ad39963af571ab47d8 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 9 May 2023 09:40:57 +0200 Subject: [PATCH 56/95] refactor MetcalfScoring class --- src/nplinker/scoring/metcalf_scoring.py | 389 ++++++++++-------------- 1 file changed, 169 insertions(+), 220 deletions(-) diff --git a/src/nplinker/scoring/metcalf_scoring.py b/src/nplinker/scoring/metcalf_scoring.py index 9fb76fe6..8bc9d7d0 100644 --- a/src/nplinker/scoring/metcalf_scoring.py +++ b/src/nplinker/scoring/metcalf_scoring.py @@ -1,6 +1,8 @@ +from __future__ import annotations import os +from typing import TYPE_CHECKING import numpy as np -from nplinker.genomics import BGC +import pandas as pd from nplinker.genomics import GCF from nplinker.logconfig import LogConfig from nplinker.metabolomics.molecular_family import MolecularFamily @@ -11,7 +13,10 @@ from nplinker.scoring.linking.link_finder import LinkFinder from nplinker.scoring.methods import ScoringMethod from nplinker.scoring.object_link import ObjectLink +from .linking.data_linking import LINK_TYPES +if TYPE_CHECKING: + from .link_collection import LinkCollection logger = LogConfig.getLogger(__name__) @@ -22,17 +27,13 @@ class MetcalfScoring(ScoringMethod): LINKFINDER = None NAME = 'metcalf' - # enumeration for accessing results of LinkFinder.get_links, which are (3, num_links) arrays: - # - R_SRC_ID: the ID of an object that was supplied as input to get_links - # - R_DST_ID: the ID of an object that was discovered to have a link to an input object - # - R_SCORE: the score for the link between a pair of objects - R_SRC_ID, R_DST_ID, R_SCORE = range(3) - def __init__(self, npl): super().__init__(npl) self.cutoff = 1.0 self.standardised = True + # TODO CG: not sure why using staticmethod here. 
Check later and refactor if possible + # TODO CG: refactor this method and extract code for cache file to a separate method @staticmethod def setup(npl): logger.info( @@ -46,7 +47,6 @@ def setup(npl): # the metcalf preprocessing can take a long time for large datasets, so it's # better to cache as the data won't change unless the number of objects does - dataset_counts = [ len(npl.bgcs), len(npl.gcfs), @@ -77,246 +77,195 @@ def setup(npl): logger.info( 'MetcalfScoring.setup preprocessing dataset (this may take some time)' ) - MetcalfScoring.DATALINKS = DataLinks(npl._gcfs, npl._spectra, - npl._molfams, npl._strains) + MetcalfScoring.DATALINKS = DataLinks(npl.gcfs, npl.spectra, + npl.molfams, npl.strains) MetcalfScoring.LINKFINDER = LinkFinder() - MetcalfScoring.LINKFINDER.metcalf_scoring(MetcalfScoring.DATALINKS, - link_type='spec-gcf') - MetcalfScoring.LINKFINDER.metcalf_scoring(MetcalfScoring.DATALINKS, - link_type='mf-gcf') + MetcalfScoring.LINKFINDER.cal_score(MetcalfScoring.DATALINKS, + link_type=LINK_TYPES[0]) + MetcalfScoring.LINKFINDER.cal_score(MetcalfScoring.DATALINKS, + link_type=LINK_TYPES[1]) logger.debug('MetcalfScoring.setup caching results') save_pickled_data((dataset_counts, MetcalfScoring.DATALINKS, MetcalfScoring.LINKFINDER), cache_file) logger.info('MetcalfScoring.setup completed') + # TODO CG: is it needed? 
remove it if not @property def datalinks(self): return MetcalfScoring.DATALINKS - def _metcalf_postprocess_met(self, linkfinder, results, input_type): - logger.debug( - 'Postprocessing results for standardised Metcalf scores (met input)' - ) - # results will be links from EITHER Spectrum OR MolFam => GCF here - - # need to know if the metabolomic objects given as input are Spectrum/MolFam - met_objs = self.npl.spectra if input_type == Spectrum else self.npl.molfams - new_src, new_dst, new_sco = [], [], [] - - # go through each pair of input objects and calculate their standardised scores - for i in range(len(results[0][self.R_SRC_ID])): - met_obj = met_objs[int(results[0][self.R_SRC_ID][i])] - # met_obj will now be either a Spectrum or a MolecularFamily, but - # doesn't matter which (in this implementation at least) because they - # both have a .strains attribute which is the only thing we need. For - # Spectra it's the number of strains, for a MolFam it's the total - # number of *unique* strains across all Spectra in that family. 
- met_strains = len(met_obj.strains) - gcf = self.npl.gcfs[int(results[0][self.R_DST_ID][i])] - gen_strains = len(gcf.strains) - - # lookup expected + variance values based on strain counts - expected = linkfinder.metcalf_mean[met_strains][gen_strains] - variance_sqrt = linkfinder.metcalf_std[met_strains][ - gen_strains] - - # calculate the final score based on the basic Metcalf score for these two - # particular objects - final_score = (results[0][self.R_SCORE][i] - - expected) / variance_sqrt - - # finally apply the scoring cutoff and store the result - if self.cutoff is None or (final_score >= self.cutoff): - new_src.append(int(results[0][self.R_SRC_ID][i])) - new_dst.append(int(results[0][self.R_DST_ID][i])) - new_sco.append(final_score) - - # overwrite original "results" with equivalent new data structure - return [np.array([new_src, new_dst, new_sco])] - - def _metcalf_postprocess_gen(self, linkfinder, results, input_type): - logger.debug( - 'Postprocessing results for standardised Metcalf scores (gen input)' - ) - # results will be links from GCF to BOTH Spectrum and MolFams here (first - # element Spectra, second MolFams) - - new_results = [] - met_objs_list = [self.npl.spectra, self.npl.molfams] - - # iterate over the Spectrum results and then the MolFam results - for m, met_objs in enumerate(met_objs_list): - new_src, new_dst, new_sco = [], [], [] - - # go through each pair of input objects and calculate their standardised scores - for i in range(len(results[m][self.R_SRC_ID])): - gcf = self.npl.gcfs[int(results[m][self.R_SRC_ID][i])] - gen_strains = len(gcf.strains) - - # met_obj will now be either a Spectrum or a MolecularFamily, but - # doesn't matter which (in this implementation at least) because they - # both have a .strains attribute which is the only thing we need. For - # Spectra it's the number of strains, for a MolFam it's the total - # number of *unique* strains across all Spectra in that family. 
- met_obj = met_objs[int(results[m][self.R_DST_ID][i])] - met_strains = len(met_obj.strains) - - # lookup expected + variance values based on strain counts - expected = linkfinder.metcalf_mean[met_strains][ - gen_strains] - variance_sqrt = linkfinder.metcalf_std[met_strains][ - gen_strains] - - # calculate the final score based on the basic Metcalf score for these two - # particular objects - final_score = (results[m][self.R_SCORE][i] - - expected) / variance_sqrt - - # finally apply the scoring cutoff and store the result - if self.cutoff is None or (final_score >= self.cutoff): - new_src.append(int(results[m][self.R_SRC_ID][i])) - new_dst.append(int(results[m][self.R_DST_ID][i])) - new_sco.append(final_score) - - # overwrite original "results" with equivalent new data structure - new_results.append(np.array([new_src, new_dst, new_sco])) - - return new_results - - def get_links(self, objects, link_collection): - # enforce constraint that the list must contain a set of identically typed objects - if not all(isinstance(x, type(objects[0])) for x in objects): - raise Exception( - 'MetcalfScoring: uniformly-typed list of objects is required') - - # also can't handle BGCs here, must be one of the other 3 types (GCF/Spectrum/MolecularFamily) - if isinstance(objects[0], BGC): - raise Exception( - 'MetcalfScoring requires input type GCF/Spectrum/MolecularFamily, not BGC' + def get_links(self, *objects, link_collection) -> LinkCollection: + if self._isinstance(GCF, *objects): + obj_type = 'gcf' + elif self._isinstance(Spectrum, *objects): + obj_type = 'spec' + elif self._isinstance(MolecularFamily, *objects): + obj_type = 'mf' + else: + types = [type(i) for i in objects] + raise TypeError( + f'Invalid type {set(types)}. Input objects must be GCF, Spectrum or MolecularFamily objects.' 
) - datalinks = MetcalfScoring.DATALINKS - linkfinder = MetcalfScoring.LINKFINDER - input_type = type(objects[0]) + if self.LINKFINDER is None: + raise ValueError(( + 'LinkFinder object not found. Have you called `MetcalfScoring.setup(npl)`?' + )) - logger.debug('MetcalfScoring: standardised = {}'.format( - self.standardised)) + logger.debug(f'MetcalfScoring: standardised = {self.standardised}') if not self.standardised: - results = linkfinder.get_links(objects, self.name, - self.cutoff) + scores_list = self.LINKFINDER.get_links(*objects, + score_cutoff=self.cutoff) + # TODO CG: verify the logics of standardised score and add unit tests else: - # get the basic Metcalf scores BUT ignore the cutoff value here by setting - # it to None. The actual user-supplied cutoff value is applied further down - # once the standardised scores for these results have been calculated. - results = linkfinder.get_links(objects, self.name, None) - - # The "results" object varies slightly depending on the input provided - # to the LinkFinder class: - # - given Spectra/MolFam input, it will be a single element list containing - # a (3, x) array, where the first row contains source (input) object - # IDs, the second contains destination (linked) object IDs, and the - # third contains regular Metcalf scores for those pairs of objects. - # - however for GCF input, "results" is instead a 2-element list where - # each entry has the same structure as described above, with the first - # entry describing GCF-Spectrum links and the second GCF-MolFam links. 
- - gcf_input = (input_type == GCF) - - if not gcf_input: - results = self._metcalf_postprocess_met( - linkfinder, results, input_type) + # use negative infinity as the score cutoff to ensure we get all links + # the self.cutoff will be applied later in the postprocessing step + scores_list = self.LINKFINDER.get_links(*objects, + score_cutoff=np.NINF) + if obj_type == 'gcf': + scores_list = self._cal_standardised_score_gen( + self.LINKFINDER, scores_list) else: - results = self._metcalf_postprocess_gen( - linkfinder, results, input_type) + scores_list = self._cal_standardised_score_met( + self.LINKFINDER, scores_list) - scores_found = set() - metcalf_results = {} - - if input_type == GCF: + link_scores = {} + if obj_type == 'gcf': logger.debug( - 'MetcalfScoring: input_type=GCF, result_type=Spec/MolFam, inputs={}, results={}' - .format(len(objects), results[0].shape)) - # for GCF input, results contains two arrays of shape (3, x), - # which contain spec-gcf and mf-gcf links respectively - result_gcf_spec, result_gcf_fam = results[0], results[1] - - for res, type_ in [(result_gcf_spec, Spectrum), - (result_gcf_fam, MolecularFamily)]: - if res.shape[1] == 0: - if type_ != MolecularFamily: - logger.debug( - 'Found no links for {} input objects (type {})'. 
- format(len(objects), type_)) - continue # no results - - # for each entry in the results (each Spectrum or MolecularFamily) - for j in range(res.shape[1]): - # extract the ID of the object and get the object itself - obj_id = int(res[self.R_DST_ID, j]) - obj = self.npl._spectra[ - obj_id] if type_ == Spectrum else self.npl._molfams[ - obj_id] - - # retrieve the GCF object too (can use its internal ID to index - # directly into the .gcfs list) - gcf = self.npl._gcfs[int(res[self.R_SRC_ID][j])] - - # record that this GCF has at least one link associated with it - scores_found.add(gcf) - - # save the scores - if gcf not in metcalf_results: - metcalf_results[gcf] = {} - metcalf_results[gcf][obj] = ObjectLink( - gcf, obj, self, res[self.R_SCORE, j]) - + f'MetcalfScoring: input_type=GCF, result_type=Spec/MolFam, ' + f'#inputs={len(objects)}.') + for scores in scores_list: + # when no links found + if scores.shape[1] == 0: + logger.debug( + f'MetcalfScoring: found no "{scores.name}" links') + else: + # when links found + for col_index in range(scores.shape[1]): + gcf = self.npl.lookup_gcf(scores.loc['source', + col_index]) + if scores.name == LINK_TYPES[0]: + met = self.npl.lookup_spectrum( + scores.loc['target', col_index]) + else: + met = self.npl.lookup_mf(scores.loc['target', + col_index]) + if gcf not in link_scores: + link_scores[gcf] = {} + # TODO CG: use id instead of object for gcf, met and self? + link_scores[gcf][met] = ObjectLink( + gcf, met, self, scores.loc['score', col_index]) + logger.debug( + f'MetcalfScoring: found {len(link_scores)} {scores.name} links.' 
+ ) else: logger.debug( - 'MetcalfScoring: input_type=Spec/MolFam, result_type=GCF, inputs={}, results={}' - .format(len(objects), results[0].shape)) - # for non-GCF input, result is a list containing a single array, shape (3, x) - # where x is the total number of links found - results = results[0] - if results.shape[1] == 0: - logger.debug('Found no links for {} input objects'.format( - len(objects))) - link_collection._add_links_from_method(self, metcalf_results) - # can just bail out here in this case - logger.debug('MetcalfScoring: completed') - return link_collection - - # for each entry in the results (each GCF) - for j in range(results.shape[1]): - # extract the ID of the GCF and use that to get the object itself - gcf = self.npl._gcfs[int(results[self.R_DST_ID, j])] - - # retrieve the Spec/MolFam object too (can use its internal ID to index - # directly into the appropriate list) - obj_id = int(results[self.R_SRC_ID, j]) - obj = self.npl._spectra[ - obj_id] if input_type == Spectrum else self.npl._molfams[ - obj_id] - - # record that this Spectrum or MolecularFamily has at least one link associated with it - scores_found.add(obj) - - # save the scores - if obj not in metcalf_results: - metcalf_results[obj] = {} - metcalf_results[obj][gcf] = ObjectLink( - obj, gcf, self, results[self.R_SCORE, j]) - - logger.debug('MetcalfScoring found {} results'.format( - len(metcalf_results))) - link_collection._add_links_from_method(self, metcalf_results) + f'MetcalfScoring: input_type=Spec/MolFam, result_type=GCF, ' + f'#inputs={len(objects)}.') + scores = scores_list[0] + # when no links found + if scores.shape[1] == 0: + logger.debug( + f'MetcalfScoring: found no links "{scores.name}" for input objects' + ) + else: + for col_index in range(scores.shape[1]): + gcf = self.npl.lookup_gcf(scores.loc['target', col_index]) + if scores.name == LINK_TYPES[0]: + met = self.npl.lookup_spectrum(scores.loc['source', + col_index]) + else: + met = 
self.npl.lookup_mf(scores.loc['source', + col_index]) + if met not in link_scores: + link_scores[met] = {} + link_scores[met][gcf] = ObjectLink( + met, gcf, self, scores.loc['score', col_index]) + logger.debug( + f'MetcalfScoring: found {len(link_scores)} {scores.name} links.' + ) + + link_collection._add_links_from_method(self, link_scores) logger.debug('MetcalfScoring: completed') return link_collection + def _isinstance(self, _type, *objects) -> bool: + return all(isinstance(x, _type) for x in objects) + + def _cal_standardised_score_met(self, linkfinder, + results) -> list[pd.DataFrame]: + logger.debug('Calculating standardised Metcalf scores (met input)') + raw_score = results[0] + z_scores = [] + for col_index in range(raw_score.shape[1]): + gcf = self.npl.lookup_gcf(raw_score.loc['target', col_index]) + if raw_score.name == LINK_TYPES[0]: + met = self.npl.lookup_spectrum(raw_score.at['source', + col_index]) + else: + met = self.npl.lookup_mf(raw_score.at['source', col_index]) + + num_gcf_strains = len(gcf.strains) + num_met_strains = len(met.strains) + mean = linkfinder.metcalf_mean[num_met_strains][num_gcf_strains] + sqrt = linkfinder.metcalf_std[num_met_strains][num_gcf_strains] + z_score = (raw_score.at['score', col_index] - mean) / sqrt + z_scores.append(z_score) + + z_scores = np.array(z_scores) + mask = z_scores >= self.cutoff + + scores_df = pd.DataFrame([ + raw_score.loc['source'].values[mask], + raw_score.loc['target'].values[mask], z_scores[mask] + ], + index=raw_score.index) + scores_df.name = raw_score.name + + return [scores_df] + + def _cal_standardised_score_gen(self, linkfinder, + results) -> list[pd.DataFrame]: + logger.debug('Calculating standardised Metcalf scores (gen input)') + postprocessed_scores = [] + for raw_score in results: + z_scores = [] + for col_index in range(raw_score.shape[1]): + gcf = self.npl.lookup_gcf(raw_score.loc['source', col_index]) + if raw_score.name == LINK_TYPES[0]: + met = 
self.npl.lookup_spectrum(raw_score.at['target', + col_index]) + else: + met = self.npl.lookup_mf(raw_score.at['target', col_index]) + + num_gcf_strains = len(gcf.strains) + num_met_strains = len(met.strains) + mean = linkfinder.metcalf_mean[num_met_strains][ + num_gcf_strains] + sqrt = linkfinder.metcalf_std[num_met_strains][num_gcf_strains] + z_score = (raw_score.at['score', col_index] - mean) / sqrt + z_scores.append(z_score) + + z_scores = np.array(z_scores) + mask = z_scores >= self.cutoff + + scores_df = pd.DataFrame([ + raw_score.loc['source'].values[mask], + raw_score.loc['target'].values[mask], z_scores[mask] + ], + index=raw_score.index) + scores_df.name = raw_score.name + postprocessed_scores.append(scores_df) + + return postprocessed_scores + + # TODO CG: refactor this method def format_data(self, data): # for metcalf the data will just be a floating point value (i.e. the score) return f'{data:.4f}' + # TODO CG: refactor this method def sort(self, objects, reverse=True): # sort based on score return sorted(objects, From e22a8ce380579cfc9f33b149abee02d9f94ceefd Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 10 May 2023 08:39:25 +0200 Subject: [PATCH 57/95] add deprecation to LinkLikelihood class --- src/nplinker/scoring/linking/link_likelihood.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/nplinker/scoring/linking/link_likelihood.py b/src/nplinker/scoring/linking/link_likelihood.py index 830e0676..3a7b1e59 100644 --- a/src/nplinker/scoring/linking/link_likelihood.py +++ b/src/nplinker/scoring/linking/link_likelihood.py @@ -1,3 +1,4 @@ +from deprecated import deprecated from nplinker.logconfig import LogConfig from nplinker.scoring.linking.data_linking_functions import \ calc_likelihood_matrix @@ -5,7 +6,7 @@ logger = LogConfig.getLogger(__file__) - +@deprecated(version='1.3.3', reason="It's unused and will be removed in 2.0.0") class LinkLikelihood(): """ Class to: From 79ecb4dd138197c5eb8983147dfbac1b0df9e9ee Mon Sep 
17 00:00:00 2001 From: Cunliang Geng Date: Wed, 10 May 2023 08:59:34 +0200 Subject: [PATCH 58/95] add `__init__.py` to linking module --- src/nplinker/scoring/__init__.py | 10 ++++++++++ src/nplinker/scoring/linking/__init__.py | 10 ++++++++++ src/nplinker/scoring/linking/data_linking.py | 1 + src/nplinker/scoring/linking/link_finder.py | 4 ++-- src/nplinker/scoring/metcalf_scoring.py | 12 ++++++------ tests/scoring/test_data_linking_functions.py | 2 +- tests/scoring/test_scoring.py | 4 ++-- 7 files changed, 32 insertions(+), 11 deletions(-) create mode 100644 src/nplinker/scoring/linking/__init__.py diff --git a/src/nplinker/scoring/__init__.py b/src/nplinker/scoring/__init__.py index e69de29b..0aaefeaf 100644 --- a/src/nplinker/scoring/__init__.py +++ b/src/nplinker/scoring/__init__.py @@ -0,0 +1,10 @@ +import logging +from .link_collection import LinkCollection +from .metcalf_scoring import MetcalfScoring +from .methods import ScoringMethod +from .object_link import ObjectLink + + +logging.getLogger(__name__).addHandler(logging.NullHandler()) + +__all__ = ["LinkCollection", "MetcalfScoring", "ScoringMethod", "ObjectLink"] diff --git a/src/nplinker/scoring/linking/__init__.py b/src/nplinker/scoring/linking/__init__.py new file mode 100644 index 00000000..bb9043df --- /dev/null +++ b/src/nplinker/scoring/linking/__init__.py @@ -0,0 +1,10 @@ +import logging +from .data_linking import DataLinks +from .data_linking import LINK_TYPES +from .data_linking_functions import calc_correlation_matrix +from .link_finder import LinkFinder + + +logging.getLogger(__name__).addHandler(logging.NullHandler()) + +__all__ = ["DataLinks", "LINK_TYPES", "calc_correlation_matrix", "LinkFinder"] diff --git a/src/nplinker/scoring/linking/data_linking.py b/src/nplinker/scoring/linking/data_linking.py index e0849421..c8515aaf 100644 --- a/src/nplinker/scoring/linking/data_linking.py +++ b/src/nplinker/scoring/linking/data_linking.py @@ -9,6 +9,7 @@ from nplinker.metabolomics.spectrum import 
Spectrum from .data_linking_functions import calc_correlation_matrix + if TYPE_CHECKING: from nplinker.strain_collection import StrainCollection from nplinker.strains import Strain diff --git a/src/nplinker/scoring/linking/link_finder.py b/src/nplinker/scoring/linking/link_finder.py index 8bf16e1c..5a3c167c 100644 --- a/src/nplinker/scoring/linking/link_finder.py +++ b/src/nplinker/scoring/linking/link_finder.py @@ -7,11 +7,11 @@ from nplinker.logconfig import LogConfig from nplinker.metabolomics.molecular_family import MolecularFamily from nplinker.metabolomics.spectrum import Spectrum -from .data_linking import LINK_TYPES +from . import LINK_TYPES if TYPE_CHECKING: - from .data_linking import DataLinks + from . import DataLinks logger = LogConfig.getLogger(__file__) diff --git a/src/nplinker/scoring/metcalf_scoring.py b/src/nplinker/scoring/metcalf_scoring.py index 8bc9d7d0..bff24861 100644 --- a/src/nplinker/scoring/metcalf_scoring.py +++ b/src/nplinker/scoring/metcalf_scoring.py @@ -9,14 +9,14 @@ from nplinker.metabolomics.spectrum import Spectrum from nplinker.pickler import load_pickled_data from nplinker.pickler import save_pickled_data -from nplinker.scoring.linking.data_linking import DataLinks -from nplinker.scoring.linking.link_finder import LinkFinder -from nplinker.scoring.methods import ScoringMethod -from nplinker.scoring.object_link import ObjectLink -from .linking.data_linking import LINK_TYPES +from .linking import DataLinks +from .linking import LINK_TYPES +from .linking import LinkFinder +from .methods import ScoringMethod +from .object_link import ObjectLink if TYPE_CHECKING: - from .link_collection import LinkCollection + from . 
import LinkCollection logger = LogConfig.getLogger(__name__) diff --git a/tests/scoring/test_data_linking_functions.py b/tests/scoring/test_data_linking_functions.py index 9f55d1ac..d0cd8c95 100644 --- a/tests/scoring/test_data_linking_functions.py +++ b/tests/scoring/test_data_linking_functions.py @@ -15,7 +15,7 @@ # test functions import numpy as np -from nplinker.scoring.linking.data_linking_functions import calc_correlation_matrix +from nplinker.scoring.linking import calc_correlation_matrix def test_calc_correlation_matrix(): diff --git a/tests/scoring/test_scoring.py b/tests/scoring/test_scoring.py index 2825ad9e..b04f74b1 100644 --- a/tests/scoring/test_scoring.py +++ b/tests/scoring/test_scoring.py @@ -3,8 +3,8 @@ from nplinker.genomics import BGC from nplinker.genomics import GCF from nplinker.metabolomics.spectrum import Spectrum -from nplinker.scoring.linking.data_linking import DataLinks -from nplinker.scoring.linking.link_finder import LinkFinder +from nplinker.scoring.linking import DataLinks +from nplinker.scoring.linking import LinkFinder from nplinker.scoring.linking.misc_deprecated import hg_scoring from nplinker.scoring.linking.misc_deprecated import metcalf_scoring from nplinker.strains import Strain From ac94cd867067a5f9caadee04bde3a5c3724333ca Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 10 May 2023 09:48:24 +0200 Subject: [PATCH 59/95] rename `data_linking.py` to `data_links.py` --- src/nplinker/scoring/linking/__init__.py | 4 ++-- .../scoring/linking/{data_linking.py => data_links.py} | 0 2 files changed, 2 insertions(+), 2 deletions(-) rename src/nplinker/scoring/linking/{data_linking.py => data_links.py} (100%) diff --git a/src/nplinker/scoring/linking/__init__.py b/src/nplinker/scoring/linking/__init__.py index bb9043df..d2419513 100644 --- a/src/nplinker/scoring/linking/__init__.py +++ b/src/nplinker/scoring/linking/__init__.py @@ -1,6 +1,6 @@ import logging -from .data_linking import DataLinks -from .data_linking import 
LINK_TYPES +from .data_links import DataLinks +from .data_links import LINK_TYPES from .data_linking_functions import calc_correlation_matrix from .link_finder import LinkFinder diff --git a/src/nplinker/scoring/linking/data_linking.py b/src/nplinker/scoring/linking/data_links.py similarity index 100% rename from src/nplinker/scoring/linking/data_linking.py rename to src/nplinker/scoring/linking/data_links.py From 8c98e9633dc32cde2e468108c4646a1010633b90 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 10 May 2023 09:49:58 +0200 Subject: [PATCH 60/95] rename `data_linking_functions.py` to `utils.py` --- src/nplinker/scoring/linking/__init__.py | 2 +- src/nplinker/scoring/linking/data_links.py | 2 +- src/nplinker/scoring/linking/link_likelihood.py | 2 +- .../linking/{data_linking_functions.py => utils.py} | 0 tests/scoring/test_data_linking_functions.py | 12 ++++++------ 5 files changed, 9 insertions(+), 9 deletions(-) rename src/nplinker/scoring/linking/{data_linking_functions.py => utils.py} (100%) diff --git a/src/nplinker/scoring/linking/__init__.py b/src/nplinker/scoring/linking/__init__.py index d2419513..5bf1db00 100644 --- a/src/nplinker/scoring/linking/__init__.py +++ b/src/nplinker/scoring/linking/__init__.py @@ -1,7 +1,7 @@ import logging from .data_links import DataLinks from .data_links import LINK_TYPES -from .data_linking_functions import calc_correlation_matrix +from .utils import calc_correlation_matrix from .link_finder import LinkFinder diff --git a/src/nplinker/scoring/linking/data_links.py b/src/nplinker/scoring/linking/data_links.py index c8515aaf..ab4b0dcb 100644 --- a/src/nplinker/scoring/linking/data_links.py +++ b/src/nplinker/scoring/linking/data_links.py @@ -7,7 +7,7 @@ from nplinker.metabolomics.molecular_family import MolecularFamily from nplinker.metabolomics.singleton_family import SingletonFamily from nplinker.metabolomics.spectrum import Spectrum -from .data_linking_functions import calc_correlation_matrix +from .utils import 
calc_correlation_matrix if TYPE_CHECKING: diff --git a/src/nplinker/scoring/linking/link_likelihood.py b/src/nplinker/scoring/linking/link_likelihood.py index 3a7b1e59..b13e1e97 100644 --- a/src/nplinker/scoring/linking/link_likelihood.py +++ b/src/nplinker/scoring/linking/link_likelihood.py @@ -1,6 +1,6 @@ from deprecated import deprecated from nplinker.logconfig import LogConfig -from nplinker.scoring.linking.data_linking_functions import \ +from nplinker.scoring.linking.utils import \ calc_likelihood_matrix diff --git a/src/nplinker/scoring/linking/data_linking_functions.py b/src/nplinker/scoring/linking/utils.py similarity index 100% rename from src/nplinker/scoring/linking/data_linking_functions.py rename to src/nplinker/scoring/linking/utils.py diff --git a/tests/scoring/test_data_linking_functions.py b/tests/scoring/test_data_linking_functions.py index d0cd8c95..200c198e 100644 --- a/tests/scoring/test_data_linking_functions.py +++ b/tests/scoring/test_data_linking_functions.py @@ -37,7 +37,7 @@ def test_calc_correlation_matrix(): [1, 2, 2, 2, 2, 3], [1, 2, 2, 2, 2, 3]])) -from nplinker.scoring.linking.data_linking_functions import calc_likelihood_matrix +from nplinker.scoring.linking.utils import calc_likelihood_matrix def test_calc_likelihood_matrix(): @@ -59,7 +59,7 @@ def test_calc_likelihood_matrix(): assert LBA.shape == (len(A), len(B)) # must have shape len(A), len(B) -from nplinker.scoring.linking.data_linking_functions import pair_prob_hg +from nplinker.scoring.linking.utils import pair_prob_hg def test_pair_prob_hg(): @@ -70,7 +70,7 @@ def test_pair_prob_hg(): assert pair_prob_hg(1, 100, 2, 2) == 98 / 100 * 2 / 99 + 2 / 100 * 98 / 99 -from nplinker.scoring.linking.data_linking_functions import hit_prob_dist +from nplinker.scoring.linking.utils import hit_prob_dist def test_hit_prob_dist(): @@ -82,7 +82,7 @@ def test_hit_prob_dist(): assert pks[0][0] == 0.99**100 -from nplinker.scoring.linking.data_linking_functions import permutation_unique +from 
nplinker.scoring.linking.utils import permutation_unique def test_permutation_unique(): @@ -101,7 +101,7 @@ def test_permutation_unique(): testlist))) == 20 # math.factorial(5)/math.factorial(3) -from nplinker.scoring.linking.data_linking_functions import pair_prob +from nplinker.scoring.linking.utils import pair_prob def test_pair_prob(): @@ -124,7 +124,7 @@ def test_pair_prob(): assert pair_prob(P_str, XG, Ny, hits) < (2 / 90 + 0.00000001) -from nplinker.scoring.linking.data_linking_functions import link_prob +from nplinker.scoring.linking.utils import link_prob def test_link_prob(): From 96dcb66fc5d11e1e6ca58eb0da352cc295f43a16 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 10 May 2023 09:56:45 +0200 Subject: [PATCH 61/95] rename `test_data_linking_functions.py` to `test_linking_utils.py`.py --- .../{test_data_linking_functions.py => test_linking_utils.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/scoring/{test_data_linking_functions.py => test_linking_utils.py} (100%) diff --git a/tests/scoring/test_data_linking_functions.py b/tests/scoring/test_linking_utils.py similarity index 100% rename from tests/scoring/test_data_linking_functions.py rename to tests/scoring/test_linking_utils.py From 18eb841b478edcc2c49abf75744cd5ff4a73d82a Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 10 May 2023 09:54:49 +0200 Subject: [PATCH 62/95] Delete test_scoring.py --- tests/scoring/test_scoring.py | 132 ---------------------------------- 1 file changed, 132 deletions(-) delete mode 100644 tests/scoring/test_scoring.py diff --git a/tests/scoring/test_scoring.py b/tests/scoring/test_scoring.py deleted file mode 100644 index b04f74b1..00000000 --- a/tests/scoring/test_scoring.py +++ /dev/null @@ -1,132 +0,0 @@ -import numpy as np -import pandas as pd -from nplinker.genomics import BGC -from nplinker.genomics import GCF -from nplinker.metabolomics.spectrum import Spectrum -from nplinker.scoring.linking import DataLinks -from 
nplinker.scoring.linking import LinkFinder -from nplinker.scoring.linking.misc_deprecated import hg_scoring -from nplinker.scoring.linking.misc_deprecated import metcalf_scoring -from nplinker.strains import Strain - - -np.random.seed(50) - - -def create_strains(n=10): - return [Strain(f'strain_{x:02d}') for x in range(n)] - - -def create_gcfs(strains, n=3): - gcfs = [] - for i in range(n): - gcf = GCF(f'fake_gcf_{i}') - num_strains = np.random.randint(1, len(strains)) - randoms = list(range(len(strains))) - np.random.shuffle(randoms) - randoms = randoms[:num_strains] - - for j in range(num_strains): - bgc = BGC(j, strains[randoms[j]]) - gcf.add_bgc(bgc) - - gcfs.append(gcf) - - return gcfs - - -def create_spectra(strains, n=3): - spectra = [] - - families = np.random.randint(0, n * 2, n) - - for i in range(n): - # (id, peaks, spectrum_id, precursor_mz, parent_mz=None, rt=None): - spec = Spectrum(i, [(1, 2), (3, 4)], i, np.random.random()) - num_strains = np.random.randint(1, len(strains)) - randoms = list(range(len(strains))) - np.random.shuffle(randoms) - randoms = randoms[:num_strains] - spec.family = int(families[i]) - for j in range(num_strains): - spec.add_strain(strains[randoms[j]], 'foo', 1) - - spectra.append(spec) - - return spectra - - -def do_scoring_old(gcfs, spectra, strains, standardised): - scores = {} - for gcf in gcfs: - scores[gcf] = {} - for spec in spectra: - score = metcalf_scoring(spec, - gcf, - strains, - standardised=standardised) - scores[gcf][spec] = score - - return scores - - -def do_scoring_new(gcfs, spectra, strains, standardised): - datalinks = DataLinks(gcfs, spectra, strains) - lf = LinkFinder() - scores = lf.metcalf_scoring(datalinks) - - # print(lf.metcalf_expected) - if standardised: - # standardised scoring thing - for i, gcf in enumerate(gcfs): - for j, spec in enumerate(spectra): - # get expected score, variance for objects with the current combo of strain counts - # (note that spectrum = type 1 object here) - met_strains = 
len(spec.strains) - gen_strains = len(gcf.strains) - expected = lf.metcalf_mean[met_strains][gen_strains] - variance = lf.metcalf_std[met_strains][gen_strains] - scores[j][i] = (scores[j][i] - expected) / np.sqrt(variance) - return scores - - -def do_scoring_old_hg(gcfs, spectra, strains): - scores = {} - for gcf in gcfs: - scores[gcf] = {} - for spec in spectra: - score, _ = hg_scoring(spec, gcf, strains) - scores[gcf][spec] = score - - return scores - - -def do_scoring_new_hg(gcfs, spectra, strains): - datalinks = DataLinks(gcfs, spectra, strains) - lf = LinkFinder() - scores = lf.hg_scoring(datalinks) - return scores - - -def run_metcalf_test(n_strains=3, n_gcfs=5, n_spectra=4, standardised=False): - strains = create_strains(n_strains) - gcfs = create_gcfs(strains, n_gcfs) - spectra = create_spectra(strains, n_spectra) - - old_scores = do_scoring_old(gcfs, spectra, strains, standardised) - new_scores = do_scoring_new(gcfs, spectra, strains, standardised) - - dfdata = {'nonvec_score': [], 'vec_score': [], 'gcf': [], 'spec': []} - - for i, gcf in enumerate(gcfs): - for j, spec in enumerate(spectra): - dfdata['nonvec_score'].append(old_scores[gcf][spec]) - dfdata['vec_score'].append(new_scores[j][i]) - dfdata['gcf'].append(gcf) - dfdata['spec'].append(spec) - - return pd.DataFrame(data=dfdata) - - -if __name__ == "__main__": - run_metcalf_test() From 4f12f355b04c1ef63ff29a13314140560335bd10 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 10 May 2023 11:38:42 +0200 Subject: [PATCH 63/95] add dtype to DataLinks dataframes --- src/nplinker/scoring/linking/data_links.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/nplinker/scoring/linking/data_links.py b/src/nplinker/scoring/linking/data_links.py index ab4b0dcb..e28b3bb5 100644 --- a/src/nplinker/scoring/linking/data_links.py +++ b/src/nplinker/scoring/linking/data_links.py @@ -157,7 +157,7 @@ def _get_occurrence_gcf_strain(self, gcfs: Sequence[GCF], """ df_gcf_strain = 
pd.DataFrame(np.zeros((len(gcfs), len(strains))), index=[gcf.gcf_id for gcf in gcfs], - columns=[strain.id for strain in strains]) + columns=[strain.id for strain in strains], dtype=int) for gcf in gcfs: for strain in strains: if gcf.has_strain(strain): @@ -175,7 +175,7 @@ def _get_occurrence_spec_strain(self, spectra: Sequence[Spectrum], df_spec_strain = pd.DataFrame( np.zeros((len(spectra), len(strains))), index=[spectrum.spectrum_id for spectrum in spectra], - columns=[strain.id for strain in strains]) + columns=[strain.id for strain in strains], dtype=int) for spectrum in spectra: for strain in strains: if spectrum.has_strain(strain): @@ -198,7 +198,7 @@ def _get_occurrence_mf_strain(self, mfs: Sequence[MolecularFamily], df_mf_strain = pd.DataFrame(np.zeros((len(mfs), len(strains))), index=[mf.family_id for mf in mfs], - columns=[strain.id for strain in strains]) + columns=[strain.id for strain in strains], dtype=int) for mf in mfs: for strain in strains: if mf.has_strain(strain): @@ -243,15 +243,15 @@ def _get_cooccurrence( self.occurrence_gcf_strain) df_met_gcf = pd.DataFrame(m1, index=met_strain_occurrence.index, - columns=self.occurrence_gcf_strain.index) + columns=self.occurrence_gcf_strain.index, dtype=int) df_met_notgcf = pd.DataFrame(m2, index=met_strain_occurrence.index, - columns=self.occurrence_gcf_strain.index) + columns=self.occurrence_gcf_strain.index, dtype=int) df_notmet_gcf = pd.DataFrame(m3, index=met_strain_occurrence.index, - columns=self.occurrence_gcf_strain.index) + columns=self.occurrence_gcf_strain.index, dtype=int) df_notmet_notgcf = pd.DataFrame( m4, index=met_strain_occurrence.index, - columns=self.occurrence_gcf_strain.index) + columns=self.occurrence_gcf_strain.index, dtype=int) return df_met_gcf, df_met_notgcf, df_notmet_gcf, df_notmet_notgcf From e3fdd86f6c43a036a1934964e001e813478ecec4 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 10 May 2023 11:42:17 +0200 Subject: [PATCH 64/95] remove mapping dataframes and relevant 
method from DataLinks Removed: - self.mapping_spec - self.mapping_gcf - self.mapping_fam -self.mapping_strain - _get_mappings_from_occurrence() method --- src/nplinker/scoring/linking/data_links.py | 48 +++++++--------------- 1 file changed, 14 insertions(+), 34 deletions(-) diff --git a/src/nplinker/scoring/linking/data_links.py b/src/nplinker/scoring/linking/data_links.py index e28b3bb5..7d275335 100644 --- a/src/nplinker/scoring/linking/data_links.py +++ b/src/nplinker/scoring/linking/data_links.py @@ -57,10 +57,6 @@ def __init__(self, gcfs: Sequence[GCF], spectra: Sequence[Spectrum], of not molecular families<->gcfs. cooccurrence_notmf_notgcf(pd.DataFrame): A DataFrame to store co-occurrence of not molecular families<->not gcfs. - mapping_gcf(pd.DataFrame): A DataFrame to store mappings for gcfs. - mapping_spec(pd.DataFrame): A DataFrame to store mappings for spectra. - mapping_mf(pd.DataFrame): A DataFrame to store mappings for molecular families. - mapping_strain(pd.DataFrame): A DataFrame to store mappings for strains. 
""" self._strains = strains @@ -76,14 +72,6 @@ def __init__(self, gcfs: Sequence[GCF], spectra: Sequence[Spectrum], self.occurrence_mf_strain = self._get_occurrence_mf_strain( mfs, strains) - # DataFrame to store mapping tables, check `_get_mappings_from_occurance` for details - # TODO: these mappings could be removed when refactoring LinkFinder - self.mapping_spec = pd.DataFrame() - self.mapping_gcf = pd.DataFrame() - self.mapping_fam = pd.DataFrame() - self.mapping_strain = pd.DataFrame() - self._get_mappings_from_occurrence() - # DataFrame to store co-occurrence of "spectra<->gcf" or "mfs<->gcf" logger.debug("Create correlation matrices: spectra<->gcfs.") (self.cooccurrence_spec_gcf, self.cooccurrence_spec_notgcf, @@ -157,7 +145,8 @@ def _get_occurrence_gcf_strain(self, gcfs: Sequence[GCF], """ df_gcf_strain = pd.DataFrame(np.zeros((len(gcfs), len(strains))), index=[gcf.gcf_id for gcf in gcfs], - columns=[strain.id for strain in strains], dtype=int) + columns=[strain.id for strain in strains], + dtype=int) for gcf in gcfs: for strain in strains: if gcf.has_strain(strain): @@ -175,7 +164,8 @@ def _get_occurrence_spec_strain(self, spectra: Sequence[Spectrum], df_spec_strain = pd.DataFrame( np.zeros((len(spectra), len(strains))), index=[spectrum.spectrum_id for spectrum in spectra], - columns=[strain.id for strain in strains], dtype=int) + columns=[strain.id for strain in strains], + dtype=int) for spectrum in spectra: for strain in strains: if spectrum.has_strain(strain): @@ -198,28 +188,14 @@ def _get_occurrence_mf_strain(self, mfs: Sequence[MolecularFamily], df_mf_strain = pd.DataFrame(np.zeros((len(mfs), len(strains))), index=[mf.family_id for mf in mfs], - columns=[strain.id for strain in strains], dtype=int) + columns=[strain.id for strain in strains], + dtype=int) for mf in mfs: for strain in strains: if mf.has_strain(strain): df_mf_strain.loc[mf.family_id, strain.id] = 1 return df_mf_strain - def _get_mappings_from_occurrence(self): - - # pd.Series with 
index = gcf.gcf_id and value = number of strains where gcf occurs - self.mapping_gcf["no of strains"] = self.occurrence_gcf_strain.sum( - axis=1) - # pd.Series with index = spectrum.spectrum_id and value = number of strains where spec occurs - self.mapping_spec["no of strains"] = self.occurrence_spec_strain.sum( - axis=1) - # pd.Series with index = mf.family_id and value = number of strains where mf occurs - self.mapping_fam["no of strains"] = self.occurrence_mf_strain.sum( - axis=1) - # pd.Series with index = strain.id and value = number of spectra in strain - self.mapping_strain["no of spectra"] = self.occurrence_spec_strain.sum( - axis=0) - def _get_cooccurrence( self, link_type: str = 'spec-gcf' @@ -243,15 +219,19 @@ def _get_cooccurrence( self.occurrence_gcf_strain) df_met_gcf = pd.DataFrame(m1, index=met_strain_occurrence.index, - columns=self.occurrence_gcf_strain.index, dtype=int) + columns=self.occurrence_gcf_strain.index, + dtype=int) df_met_notgcf = pd.DataFrame(m2, index=met_strain_occurrence.index, - columns=self.occurrence_gcf_strain.index, dtype=int) + columns=self.occurrence_gcf_strain.index, + dtype=int) df_notmet_gcf = pd.DataFrame(m3, index=met_strain_occurrence.index, - columns=self.occurrence_gcf_strain.index, dtype=int) + columns=self.occurrence_gcf_strain.index, + dtype=int) df_notmet_notgcf = pd.DataFrame( m4, index=met_strain_occurrence.index, - columns=self.occurrence_gcf_strain.index, dtype=int) + columns=self.occurrence_gcf_strain.index, + dtype=int) return df_met_gcf, df_met_notgcf, df_notmet_gcf, df_notmet_notgcf From 7ef4da3473935f9cd22214441b17abe49049641d Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 10 May 2023 11:58:12 +0200 Subject: [PATCH 65/95] Create test_data_links.py --- tests/scoring/test_data_links.py | 174 +++++++++++++++++++++++++++++++ 1 file changed, 174 insertions(+) create mode 100644 tests/scoring/test_data_links.py diff --git a/tests/scoring/test_data_links.py b/tests/scoring/test_data_links.py new file 
mode 100644 index 00000000..e812e24f --- /dev/null +++ b/tests/scoring/test_data_links.py @@ -0,0 +1,174 @@ +import pandas as pd +from pandas.util.testing import assert_frame_equal +from pytest import fixture +from nplinker.genomics import GCF +from nplinker.metabolomics.molecular_family import MolecularFamily +from nplinker.metabolomics.spectrum import Spectrum +from nplinker.scoring.linking import DataLinks +from nplinker.strain_collection import StrainCollection +from nplinker.strains import Strain + + +@fixture(scope='module') +def strains_list(): + return Strain('strain1'), Strain('strain2'), Strain('strain3') + + +@fixture(scope='module') +def strains(strains_list): + strains = StrainCollection() + for strain in strains_list: + strains.add(strain) + return strains + + +@fixture(scope='module') +def gcfs(strains_list): + gcf1 = GCF('gcf1') + gcf1.strains.add(strains_list[0]) + gcf2 = GCF('gcf2') + gcf2.strains.add(strains_list[1]) + gcf3 = GCF('gcf3') + gcf3.strains.add(strains_list[0]) + gcf3.strains.add(strains_list[1]) + return gcf1, gcf2, gcf3 + + +@fixture(scope='module') +def spectra(strains_list): + spectrum1 = Spectrum(1, [(1, 1)], "spectrum1", None) + spectrum1.strains.add(strains_list[0]) + spectrum2 = Spectrum(2, [(1, 1)], "spectrum2", None) + spectrum2.strains.add(strains_list[1]) + spectrum3 = Spectrum(3, [(1, 1)], "spectrum3", None) + spectrum3.strains.add(strains_list[0]) + spectrum3.strains.add(strains_list[1]) + return spectrum1, spectrum2, spectrum3 + + +@fixture(scope='module') +def mfs(spectra): + mf1 = MolecularFamily('mf1') + mf1.add_spectrum(spectra[0]) + mf2 = MolecularFamily('mf2') + mf2.add_spectrum(spectra[1]) + mf3 = MolecularFamily('mf3') + mf3.add_spectrum(spectra[2]) + return mf1, mf2, mf3 + + +@fixture(scope='module') +def datalinks(gcfs, spectra, mfs, strains): + return DataLinks(gcfs, spectra, mfs, strains) + + +def test_init(datalinks): + # test occorrences + col_names = ['strain1', 'strain2', 'strain3'] + assert_frame_equal( 
+ datalinks.occurrence_gcf_strain, + pd.DataFrame([[1, 0, 0], [0, 1, 0], [1, 1, 0]], + index=['gcf1', 'gcf2', 'gcf3'], + columns=col_names)) + assert_frame_equal( + datalinks.occurrence_spec_strain, + pd.DataFrame([[1, 0, 0], [0, 1, 0], [1, 1, 0]], + index=['spectrum1', 'spectrum2', 'spectrum3'], + columns=col_names)) + assert_frame_equal( + datalinks.occurrence_mf_strain, + pd.DataFrame([[1, 0, 0], [0, 1, 0], [1, 1, 0]], + index=['mf1', 'mf2', 'mf3'], + columns=col_names)) + # test co-occorrences spec-gcf + col_names = ['gcf1', 'gcf2', 'gcf3'] + assert_frame_equal( + datalinks.cooccurrence_spec_gcf, + pd.DataFrame([[1, 0, 1], [0, 1, 1], [1, 1, 2]], + index=['spectrum1', 'spectrum2', 'spectrum3'], + columns=col_names)) + assert_frame_equal( + datalinks.cooccurrence_spec_notgcf, + pd.DataFrame([[0, 1, 0], [1, 0, 0], [1, 1, 0]], + index=['spectrum1', 'spectrum2', 'spectrum3'], + columns=col_names)) + assert_frame_equal( + datalinks.cooccurrence_notspec_gcf, + pd.DataFrame([[0, 1, 1], [1, 0, 1], [0, 0, 0]], + index=['spectrum1', 'spectrum2', 'spectrum3'], + columns=col_names)) + assert_frame_equal( + datalinks.cooccurrence_notspec_notgcf, + pd.DataFrame([[2, 1, 1], [1, 2, 1], [1, 1, 1]], + index=['spectrum1', 'spectrum2', 'spectrum3'], + columns=col_names)) + # test co-occorrences mf-gcf + assert_frame_equal( + datalinks.cooccurrence_mf_gcf, + pd.DataFrame([[1, 0, 1], [0, 1, 1], [1, 1, 2]], + index=['mf1', 'mf2', 'mf3'], + columns=col_names)) + assert_frame_equal( + datalinks.cooccurrence_mf_notgcf, + pd.DataFrame([[0, 1, 0], [1, 0, 0], [1, 1, 0]], + index=['mf1', 'mf2', 'mf3'], + columns=col_names)) + assert_frame_equal( + datalinks.cooccurrence_notmf_gcf, + pd.DataFrame([[0, 1, 1], [1, 0, 1], [0, 0, 0]], + index=['mf1', 'mf2', 'mf3'], + columns=col_names)) + assert_frame_equal( + datalinks.cooccurrence_notmf_notgcf, + pd.DataFrame([[2, 1, 1], [1, 2, 1], [1, 1, 1]], + index=['mf1', 'mf2', 'mf3'], + columns=col_names)) + + +def test_get_common_strains_spec(datalinks, 
spectra, gcfs, strains_list): + sut = datalinks.get_common_strains(spectra[:2], gcfs) + expected = { + (spectra[0], gcfs[0]): [strains_list[0]], + (spectra[0], gcfs[1]): [], + (spectra[0], gcfs[2]): [strains_list[0]], + (spectra[1], gcfs[0]): [], + (spectra[1], gcfs[1]): [strains_list[1]], + (spectra[1], gcfs[2]): [strains_list[1]] + } + assert sut == expected + + sut = datalinks.get_common_strains(spectra[:2], + gcfs, + filter_no_shared=True) + expected = { + (spectra[0], gcfs[0]): [strains_list[0]], + (spectra[0], gcfs[2]): [strains_list[0]], + (spectra[1], gcfs[1]): [strains_list[1]], + (spectra[1], gcfs[2]): [strains_list[1]] + } + assert sut == expected + + +def test_get_common_strains_mf(datalinks, mfs, gcfs, strains_list): + sut = datalinks.get_common_strains(mfs[:2], gcfs) + expected = { + (mfs[0], gcfs[0]): [strains_list[0]], + (mfs[0], gcfs[1]): [], + (mfs[0], gcfs[2]): [strains_list[0]], + (mfs[1], gcfs[0]): [], + (mfs[1], gcfs[1]): [strains_list[1]], + (mfs[1], gcfs[2]): [strains_list[1]] + } + assert sut == expected + + sut = datalinks.get_common_strains(mfs[:2], gcfs, filter_no_shared=True) + expected = { + (mfs[0], gcfs[0]): [strains_list[0]], + (mfs[0], gcfs[2]): [strains_list[0]], + (mfs[1], gcfs[1]): [strains_list[1]], + (mfs[1], gcfs[2]): [strains_list[1]] + } + assert sut == expected + + +# TODO: add tests for the DataLinks class From fd01596c8fb839cba11274b72e40c35b660f0e3d Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 10 May 2023 12:06:17 +0200 Subject: [PATCH 66/95] add `conftest.py` for scoring tests --- tests/scoring/conftest.py | 54 +++++++++++++++++++++++++++++ tests/scoring/test_data_links.py | 58 +------------------------------- 2 files changed, 55 insertions(+), 57 deletions(-) create mode 100644 tests/scoring/conftest.py diff --git a/tests/scoring/conftest.py b/tests/scoring/conftest.py new file mode 100644 index 00000000..d627bcf1 --- /dev/null +++ b/tests/scoring/conftest.py @@ -0,0 +1,54 @@ +from pytest import fixture 
+from nplinker.genomics import GCF +from nplinker.metabolomics.molecular_family import MolecularFamily +from nplinker.metabolomics.spectrum import Spectrum +from nplinker.strain_collection import StrainCollection +from nplinker.strains import Strain + + +@fixture +def strains_list(): + return Strain('strain1'), Strain('strain2'), Strain('strain3') + + +@fixture +def strains(strains_list): + strains = StrainCollection() + for strain in strains_list: + strains.add(strain) + return strains + + +@fixture +def gcfs(strains_list): + gcf1 = GCF('gcf1') + gcf1.strains.add(strains_list[0]) + gcf2 = GCF('gcf2') + gcf2.strains.add(strains_list[1]) + gcf3 = GCF('gcf3') + gcf3.strains.add(strains_list[0]) + gcf3.strains.add(strains_list[1]) + return gcf1, gcf2, gcf3 + + +@fixture +def spectra(strains_list): + spectrum1 = Spectrum(1, [(1, 1)], "spectrum1", None) + spectrum1.strains.add(strains_list[0]) + spectrum2 = Spectrum(2, [(1, 1)], "spectrum2", None) + spectrum2.strains.add(strains_list[1]) + spectrum3 = Spectrum(3, [(1, 1)], "spectrum3", None) + spectrum3.strains.add(strains_list[0]) + spectrum3.strains.add(strains_list[1]) + return spectrum1, spectrum2, spectrum3 + + +@fixture +def mfs(spectra): + mf1 = MolecularFamily('mf1') + mf1.add_spectrum(spectra[0]) + mf2 = MolecularFamily('mf2') + mf2.add_spectrum(spectra[1]) + mf3 = MolecularFamily('mf3') + mf3.add_spectrum(spectra[2]) + return mf1, mf2, mf3 diff --git a/tests/scoring/test_data_links.py b/tests/scoring/test_data_links.py index e812e24f..36fc7b04 100644 --- a/tests/scoring/test_data_links.py +++ b/tests/scoring/test_data_links.py @@ -1,63 +1,10 @@ import pandas as pd from pandas.util.testing import assert_frame_equal from pytest import fixture -from nplinker.genomics import GCF -from nplinker.metabolomics.molecular_family import MolecularFamily -from nplinker.metabolomics.spectrum import Spectrum from nplinker.scoring.linking import DataLinks -from nplinker.strain_collection import StrainCollection -from 
nplinker.strains import Strain -@fixture(scope='module') -def strains_list(): - return Strain('strain1'), Strain('strain2'), Strain('strain3') - - -@fixture(scope='module') -def strains(strains_list): - strains = StrainCollection() - for strain in strains_list: - strains.add(strain) - return strains - - -@fixture(scope='module') -def gcfs(strains_list): - gcf1 = GCF('gcf1') - gcf1.strains.add(strains_list[0]) - gcf2 = GCF('gcf2') - gcf2.strains.add(strains_list[1]) - gcf3 = GCF('gcf3') - gcf3.strains.add(strains_list[0]) - gcf3.strains.add(strains_list[1]) - return gcf1, gcf2, gcf3 - - -@fixture(scope='module') -def spectra(strains_list): - spectrum1 = Spectrum(1, [(1, 1)], "spectrum1", None) - spectrum1.strains.add(strains_list[0]) - spectrum2 = Spectrum(2, [(1, 1)], "spectrum2", None) - spectrum2.strains.add(strains_list[1]) - spectrum3 = Spectrum(3, [(1, 1)], "spectrum3", None) - spectrum3.strains.add(strains_list[0]) - spectrum3.strains.add(strains_list[1]) - return spectrum1, spectrum2, spectrum3 - - -@fixture(scope='module') -def mfs(spectra): - mf1 = MolecularFamily('mf1') - mf1.add_spectrum(spectra[0]) - mf2 = MolecularFamily('mf2') - mf2.add_spectrum(spectra[1]) - mf3 = MolecularFamily('mf3') - mf3.add_spectrum(spectra[2]) - return mf1, mf2, mf3 - - -@fixture(scope='module') +@fixture def datalinks(gcfs, spectra, mfs, strains): return DataLinks(gcfs, spectra, mfs, strains) @@ -169,6 +116,3 @@ def test_get_common_strains_mf(datalinks, mfs, gcfs, strains_list): (mfs[1], gcfs[2]): [strains_list[1]] } assert sut == expected - - -# TODO: add tests for the DataLinks class From 9f0b289970582004d1865c294aca5790d5deaab3 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 10 May 2023 14:09:16 +0200 Subject: [PATCH 67/95] update LinkFinder's attribute and private method - refactor method `_cal_mean_std` - rename attribute `raw_score_fam_gcf` to `raw_score_mf_gcf` --- src/nplinker/scoring/linking/link_finder.py | 23 ++++++++++++--------- 1 file changed, 13 
insertions(+), 10 deletions(-) diff --git a/src/nplinker/scoring/linking/link_finder.py b/src/nplinker/scoring/linking/link_finder.py index 5a3c167c..38559e2d 100644 --- a/src/nplinker/scoring/linking/link_finder.py +++ b/src/nplinker/scoring/linking/link_finder.py @@ -16,6 +16,7 @@ logger = LogConfig.getLogger(__file__) +# TODO CG: this class could be merged to MetcalfScoring class class LinkFinder(): def __init__(self): @@ -26,12 +27,13 @@ def __init__(self): """ # metcalf scores self.raw_score_spec_gcf = pd.DataFrame() - self.raw_score_fam_gcf = pd.DataFrame() + self.raw_score_mf_gcf = pd.DataFrame() # metcalf caching self.metcalf_mean = None self.metcalf_std = None + # TODO CG: cal_score method could be integrated to __init__ def cal_score( self, data_links: DataLinks, @@ -63,20 +65,19 @@ def cal_score( data_links.cooccurrence_notspec_gcf * scoring_weights[2] + data_links.cooccurrence_notspec_notgcf * scoring_weights[3]) if link_type == 'mf-gcf': - self.raw_score_fam_gcf = ( + self.raw_score_mf_gcf = ( data_links.cooccurrence_mf_gcf * scoring_weights[0] + data_links.cooccurrence_mf_notgcf * scoring_weights[1] + data_links.cooccurrence_notmf_gcf * scoring_weights[2] + data_links.cooccurrence_notmf_notgcf * scoring_weights[3]) + n_strains = data_links.occurrence_gcf_strain.shape[1] if self.metcalf_mean is None or self.metcalf_std is None: self.metcalf_mean, self.metcalf_std = self._cal_mean_std( - data_links, scoring_weights) + n_strains, scoring_weights) - def _cal_mean_std(self, data_links, scoring_weights): - # Compute the expected values for all possible values of spec and gcf strains - # we need the total number of strains - _, n_strains = data_links.occurrence_gcf_strain.shape + # TODO CG: read paper and check the logics of this method + def _cal_mean_std(self, n_strains, scoring_weights): sz = (n_strains + 1, n_strains + 1) mean = np.zeros(sz) variance = np.zeros(sz) @@ -131,7 +132,7 @@ def get_links(self, else: types = [type(i) for i in objects] raise 
TypeError( - f'Invalid type "{set(types)}". Input objects must be GCF, Spectrum or MolecularFamily objects.' + f'Invalid type {set(types)}. Input objects must be GCF, Spectrum or MolecularFamily objects.' ) links = [] @@ -143,7 +144,7 @@ def get_links(self, df.name = LINK_TYPES[0] links.append(df) # mf-gcf - scores = self.raw_score_fam_gcf.loc[:, obj_ids] + scores = self.raw_score_mf_gcf.loc[:, obj_ids] df = self._get_scores_source_gcf(scores, score_cutoff) df.name = LINK_TYPES[1] links.append(df) @@ -157,7 +158,7 @@ def get_links(self, if obj_type == 'mf': obj_ids = [mf.family_id for mf in objects] - scores = self.raw_score_fam_gcf.loc[obj_ids, :] + scores = self.raw_score_mf_gcf.loc[obj_ids, :] df = self._get_scores_source_met(scores, score_cutoff) df.name = LINK_TYPES[1] links.append(df) @@ -166,6 +167,8 @@ def get_links(self, def _isinstance(self, _type, *objects) -> bool: return all(isinstance(x, _type) for x in objects) + # TODO CG: the returned data could be changed to dict, like the that in + # the get_links method of the MetcalfScoring class def _get_scores_source_gcf(self, scores, score_cutoff) -> pd.DataFrame: row_indexes, col_indexes = np.where(scores >= score_cutoff) src_obj_ids = scores.columns[col_indexes].to_list() From 3e3bdfa504dc335f4e4be227a48da82630f09711 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 10 May 2023 15:47:17 +0200 Subject: [PATCH 68/95] Create test_link_finder.py --- tests/scoring/test_link_finder.py | 168 ++++++++++++++++++++++++++++++ 1 file changed, 168 insertions(+) create mode 100644 tests/scoring/test_link_finder.py diff --git a/tests/scoring/test_link_finder.py b/tests/scoring/test_link_finder.py new file mode 100644 index 00000000..be28dcd3 --- /dev/null +++ b/tests/scoring/test_link_finder.py @@ -0,0 +1,168 @@ +import numpy as np +import pandas as pd +from pandas.util.testing import assert_frame_equal +import pytest +from pytest import fixture +from nplinker.scoring.linking import DataLinks +from 
nplinker.scoring.linking import LinkFinder + + +@fixture +def linkfinder(): + return LinkFinder() + + +@fixture +def datalinks(gcfs, spectra, mfs, strains): + return DataLinks(gcfs, spectra, mfs, strains) + + +def test_init(linkfinder): + assert_frame_equal(linkfinder.raw_score_spec_gcf, pd.DataFrame()) + assert_frame_equal(linkfinder.raw_score_mf_gcf, pd.DataFrame()) + assert linkfinder.metcalf_mean is None + assert linkfinder.metcalf_std is None + + +def test_cal_score(linkfinder, datalinks): + linkfinder.cal_score(datalinks, link_type='spec-gcf') + assert_frame_equal( + linkfinder.raw_score_spec_gcf, + pd.DataFrame([[12, -9, 11], [-9, 12, 11], [1, 1, 21]], + index=['spectrum1', 'spectrum2', 'spectrum3'], + columns=['gcf1', 'gcf2', 'gcf3'])) + + linkfinder.cal_score(datalinks, link_type='mf-gcf') + assert_frame_equal( + linkfinder.raw_score_mf_gcf, + pd.DataFrame([[12, -9, 11], [-9, 12, 11], [1, 1, 21]], + index=['mf1', 'mf2', 'mf3'], + columns=['gcf1', 'gcf2', 'gcf3'])) + + # TODO CG: add tests for values after refactoring _cal_mean_std + assert isinstance(linkfinder.metcalf_mean, np.ndarray) + assert isinstance(linkfinder.metcalf_std, np.ndarray) + assert linkfinder.metcalf_mean.shape == (4, 4 + ) # (n_strains+1 , n_strains+1) + assert linkfinder.metcalf_mean.shape == (4, 4) + # generated metcalf_mean + # array([[ 3., 2., 1., 0.], + # [ -8., -2., 4., 10.], + # [-19., -6., 7., 20.], + # [-30., -10., 10., 30.]]) + # generated metcalf_std + # array([[1. , 1. , 1. , 1. ], + # [1. , 9.89949494, 9.89949494, 1. ], + # [1. , 9.89949494, 9.89949494, 1. ], + # [1. , 1. , 1. , 1. 
]]) + + +def test_get_links_gcf(linkfinder, datalinks, gcfs): + linkfinder.cal_score(datalinks, link_type='spec-gcf') + linkfinder.cal_score(datalinks, link_type='mf-gcf') + index_names = ['source', 'target', 'score'] + # cutoff = negative infinity (float) + links = linkfinder.get_links(*gcfs, score_cutoff=np.NINF) + assert len(links) == 2 + assert_frame_equal( + links[0], + pd.DataFrame([['gcf1', 'gcf2', 'gcf3'] * 3, + [ + *['spectrum1'] * 3, + *['spectrum2'] * 3, + *['spectrum3'] * 3, + ], [12, -9, 11, -9, 12, 11, 1, 1, 21]], + index=index_names)) + assert_frame_equal( + links[1], + pd.DataFrame([['gcf1', 'gcf2', 'gcf3'] * 3, + [ + *['mf1'] * 3, + *['mf2'] * 3, + *['mf3'] * 3, + ], [12, -9, 11, -9, 12, 11, 1, 1, 21]], + index=index_names)) + + # cutoff = 0 + links = linkfinder.get_links(*gcfs, score_cutoff=0) + assert_frame_equal( + links[0], + pd.DataFrame([['gcf1', 'gcf3', 'gcf2', 'gcf3', 'gcf1', 'gcf2', 'gcf3'], + [ + *['spectrum1'] * 2, + *['spectrum2'] * 2, + *['spectrum3'] * 3, + ], [12, 11, 12, 11, 1, 1, 21]], + index=index_names)) + assert_frame_equal( + links[1], + pd.DataFrame([['gcf1', 'gcf3', 'gcf2', 'gcf3', 'gcf1', 'gcf2', 'gcf3'], + [ + *['mf1'] * 2, + *['mf2'] * 2, + *['mf3'] * 3, + ], [12, 11, 12, 11, 1, 1, 21]], + index=index_names)) + + +def test_get_links_spec(linkfinder, datalinks, spectra): + linkfinder.cal_score(datalinks, link_type='spec-gcf') + linkfinder.cal_score(datalinks, link_type='mf-gcf') + index_names = ['source', 'target', 'score'] + # cutoff = negative infinity (float) + links = linkfinder.get_links(*spectra, score_cutoff=np.NINF) + assert len(links) == 1 + assert_frame_equal( + links[0], + pd.DataFrame([[ + *['spectrum1'] * 3, + *['spectrum2'] * 3, + *['spectrum3'] * 3, + ], ['gcf1', 'gcf2', 'gcf3'] * 3, [12, -9, 11, -9, 12, 11, 1, 1, 21]], + index=index_names)) + # cutoff = 0 + links = linkfinder.get_links(*spectra, score_cutoff=0) + assert_frame_equal( + links[0], + pd.DataFrame([[ + *['spectrum1'] * 2, + *['spectrum2'] * 2, + 
*['spectrum3'] * 3, + ], ['gcf1', 'gcf3', 'gcf2', 'gcf3', 'gcf1', 'gcf2', 'gcf3'], + [12, 11, 12, 11, 1, 1, 21]], + index=index_names)) + + +def test_get_links_mf(linkfinder, datalinks, mfs): + linkfinder.cal_score(datalinks, link_type='spec-gcf') + linkfinder.cal_score(datalinks, link_type='mf-gcf') + index_names = ['source', 'target', 'score'] + # cutoff = negative infinity (float) + links = linkfinder.get_links(*mfs, score_cutoff=np.NINF) + assert len(links) == 1 + + assert_frame_equal( + links[0], + pd.DataFrame([[ + *['mf1'] * 3, + *['mf2'] * 3, + *['mf3'] * 3, + ], ['gcf1', 'gcf2', 'gcf3'] * 3, [12, -9, 11, -9, 12, 11, 1, 1, 21]], + index=index_names)) + + links = linkfinder.get_links(*mfs, score_cutoff=0) + assert_frame_equal( + links[0], + pd.DataFrame([[ + *['mf1'] * 2, + *['mf2'] * 2, + *['mf3'] * 3, + ], ['gcf1', 'gcf3', 'gcf2', 'gcf3', 'gcf1', 'gcf2', 'gcf3'], + [12, 11, 12, 11, 1, 1, 21]], + index=index_names)) + + +def test_get_links_exceptions(linkfinder): + with pytest.raises(TypeError) as e: + linkfinder.get_links("") + assert "Invalid type {}" in str(e.value) From 46880a805e900d50a20d2e7130eda902947e4168 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Thu, 11 May 2023 10:44:32 +0200 Subject: [PATCH 69/95] Update vscode plugin autodocstring template - fix indentation bug in autodocsting - remove `Examples:` section --- .vscode/vscode_docstring_google_adapted.mustache | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.vscode/vscode_docstring_google_adapted.mustache b/.vscode/vscode_docstring_google_adapted.mustache index 42a7d1e1..f2d2b35d 100644 --- a/.vscode/vscode_docstring_google_adapted.mustache +++ b/.vscode/vscode_docstring_google_adapted.mustache @@ -26,7 +26,3 @@ Returns: {{typePlaceholder}}: {{descriptionPlaceholder}} {{/returns}} {{/returnsExist}} - -Examples: - >>> {{#placeholder}} - {{/placeholder}} \ No newline at end of file From 31586ddec8acbd6c5284ca5a4d52b00bdf7d6348 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Thu, 11 
May 2023 12:15:53 +0200 Subject: [PATCH 70/95] add scope for fixtures --- tests/scoring/conftest.py | 10 +++++----- tests/scoring/test_data_links.py | 2 +- tests/scoring/test_link_finder.py | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/scoring/conftest.py b/tests/scoring/conftest.py index d627bcf1..6a15f05f 100644 --- a/tests/scoring/conftest.py +++ b/tests/scoring/conftest.py @@ -6,12 +6,12 @@ from nplinker.strains import Strain -@fixture +@fixture(scope='session') def strains_list(): return Strain('strain1'), Strain('strain2'), Strain('strain3') -@fixture +@fixture(scope='session') def strains(strains_list): strains = StrainCollection() for strain in strains_list: @@ -19,7 +19,7 @@ def strains(strains_list): return strains -@fixture +@fixture(scope='session') def gcfs(strains_list): gcf1 = GCF('gcf1') gcf1.strains.add(strains_list[0]) @@ -31,7 +31,7 @@ def gcfs(strains_list): return gcf1, gcf2, gcf3 -@fixture +@fixture(scope='session') def spectra(strains_list): spectrum1 = Spectrum(1, [(1, 1)], "spectrum1", None) spectrum1.strains.add(strains_list[0]) @@ -43,7 +43,7 @@ def spectra(strains_list): return spectrum1, spectrum2, spectrum3 -@fixture +@fixture(scope='session') def mfs(spectra): mf1 = MolecularFamily('mf1') mf1.add_spectrum(spectra[0]) diff --git a/tests/scoring/test_data_links.py b/tests/scoring/test_data_links.py index 36fc7b04..94d1d24c 100644 --- a/tests/scoring/test_data_links.py +++ b/tests/scoring/test_data_links.py @@ -4,7 +4,7 @@ from nplinker.scoring.linking import DataLinks -@fixture +@fixture(scope='module') def datalinks(gcfs, spectra, mfs, strains): return DataLinks(gcfs, spectra, mfs, strains) diff --git a/tests/scoring/test_link_finder.py b/tests/scoring/test_link_finder.py index be28dcd3..871f4e90 100644 --- a/tests/scoring/test_link_finder.py +++ b/tests/scoring/test_link_finder.py @@ -7,12 +7,12 @@ from nplinker.scoring.linking import LinkFinder -@fixture +@fixture(scope='module') def linkfinder(): 
return LinkFinder() -@fixture +@fixture(scope='module') def datalinks(gcfs, spectra, mfs, strains): return DataLinks(gcfs, spectra, mfs, strains) From 22ffd9039940e092bed6f13e4732bf671a8dc96e Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 10 May 2023 17:11:39 +0200 Subject: [PATCH 71/95] Create test_metcalf_scoring.py --- tests/scoring/test_metcalf_scoring.py | 238 ++++++++++++++++++++++++++ 1 file changed, 238 insertions(+) create mode 100644 tests/scoring/test_metcalf_scoring.py diff --git a/tests/scoring/test_metcalf_scoring.py b/tests/scoring/test_metcalf_scoring.py new file mode 100644 index 00000000..8bb715fc --- /dev/null +++ b/tests/scoring/test_metcalf_scoring.py @@ -0,0 +1,238 @@ +import numpy as np +from numpy.testing import assert_array_equal +from pandas.util.testing import assert_frame_equal +import pytest +from pytest import fixture +from nplinker.nplinker import NPLinker +from nplinker.scoring import LinkCollection +from nplinker.scoring import MetcalfScoring +from nplinker.scoring import ObjectLink +from nplinker.scoring.linking import DataLinks +from nplinker.scoring.linking import LinkFinder +from .. import DATA_DIR + + +@fixture(scope='module') +def datalinks(gcfs, spectra, mfs, strains): + """DataLinks object. See `test_data_links.py` for its actual values.""" + return DataLinks(gcfs, spectra, mfs, strains) + + +@fixture(scope='module') +def linkfinder(datalinks): + """LinkFinder object. See `test_link_finder.py` for its actual values.""" + linkfinder = LinkFinder() + linkfinder.cal_score(datalinks, link_type='spec-gcf') + linkfinder.cal_score(datalinks, link_type='mf-gcf') + return linkfinder + + +@fixture(scope='module') +def npl(gcfs, spectra, mfs, strains, tmp_path_factory): + """Constructed NPLinker object. + + This NPLinker object does not do loading `npl.load_data()`, instead we + manually set its attributes to the values we want to test. 
+ + The config file `nplinker_demo1.toml` does not affect the tests, just + making sure the NPLinker object can be created succesfully. + """ + npl = NPLinker(str(DATA_DIR / 'nplinker_demo1.toml')) + npl._gcfs = gcfs + npl._spectra = spectra + npl._molfams = mfs + npl._strains = strains + npl._gcf_lookup = {gcf.gcf_id: gcf for gcf in gcfs} + npl._mf_lookup = {mf.family_id: mf for mf in mfs} + npl._spec_lookup = {spec.spectrum_id: spec for spec in spectra} + # tmp path to store 'metcalf/metcalf_scores.pckl' file + # Must use `tmp_path_factory` (session scope) instead of `tmp_path` (function scope) + npl._loader._root = tmp_path_factory.mktemp('npl_test') + return npl + + +@fixture(scope='module') +def mc(npl): + """MetcalfScoring object.""" + mc = MetcalfScoring(npl) + mc.setup(npl) + return mc + + +def test_init(npl): + mc = MetcalfScoring(npl) + assert mc.npl == npl + assert mc.name == 'metcalf' + assert mc.cutoff == 1.0 + assert mc.standardised is True + assert mc.DATALINKS is None + assert mc.LINKFINDER is None + + +def test_setup(mc, datalinks, linkfinder): + """Test `setup` method when cache file does not exist.""" + assert isinstance(mc.DATALINKS, DataLinks) + assert isinstance(mc.LINKFINDER, LinkFinder) + + assert_frame_equal(mc.DATALINKS.occurrence_gcf_strain, + datalinks.occurrence_gcf_strain) + assert_frame_equal(mc.DATALINKS.cooccurrence_spec_gcf, + datalinks.cooccurrence_spec_gcf) + + assert_frame_equal(mc.LINKFINDER.raw_score_spec_gcf, + linkfinder.raw_score_spec_gcf) + assert_frame_equal(mc.LINKFINDER.raw_score_mf_gcf, + linkfinder.raw_score_mf_gcf) + assert_array_equal(mc.LINKFINDER.metcalf_mean, linkfinder.metcalf_mean) + assert_array_equal(mc.LINKFINDER.metcalf_std, linkfinder.metcalf_std) + + +def test_setup_load_cache(mc, npl, datalinks, linkfinder, caplog): + """Test `setup` method when cache file exists.""" + mc.setup(npl) + assert "MetcalfScoring.setup loading cached data" in caplog.text + assert "MetcalfScoring.setup caching results" not in 
caplog.text + + assert isinstance(mc.DATALINKS, DataLinks) + assert isinstance(mc.LINKFINDER, LinkFinder) + + assert_frame_equal(mc.DATALINKS.occurrence_gcf_strain, + datalinks.occurrence_gcf_strain) + assert_frame_equal(mc.DATALINKS.cooccurrence_spec_gcf, + datalinks.cooccurrence_spec_gcf) + + assert_frame_equal(mc.LINKFINDER.raw_score_spec_gcf, + linkfinder.raw_score_spec_gcf) + assert_frame_equal(mc.LINKFINDER.raw_score_mf_gcf, + linkfinder.raw_score_mf_gcf) + assert_array_equal(mc.LINKFINDER.metcalf_mean, linkfinder.metcalf_mean) + assert_array_equal(mc.LINKFINDER.metcalf_std, linkfinder.metcalf_std) + + +def test_get_links_gcf_standardised_false(mc, gcfs, spectra, mfs): + """Test `get_links` method when input is GCF objects and `standardised` is False.""" + # test raw scores (no standardisation) + mc.standardised = False + + # when cutoff is negative infinity, i.e. taking all scores + mc.cutoff = np.NINF + links = mc.get_links(*gcfs, link_collection=LinkCollection()) + assert isinstance(links, LinkCollection) + links = links.links # dict of link values + assert len(links) == 3 + assert {i.gcf_id for i in links.keys()} == {'gcf1', 'gcf2', 'gcf3'} + assert isinstance(links[gcfs[0]][spectra[0]], ObjectLink) + # check actual values in `test_get_links_gcf` of test_link_finder.py + assert links[gcfs[0]][spectra[0]].data(mc) == 12 + assert links[gcfs[1]][spectra[0]].data(mc) == -9 + assert links[gcfs[2]][spectra[0]].data(mc) == 11 + assert links[gcfs[0]][mfs[0]].data(mc) == 12 + assert links[gcfs[1]][mfs[1]].data(mc) == 12 + assert links[gcfs[2]][mfs[2]].data(mc) == 21 + + # when test cutoff is 0, i.e. 
taking scores >= 0 + mc.cutoff = 0 + links = mc.get_links(*gcfs, link_collection=LinkCollection()) + assert isinstance(links, LinkCollection) + links = links.links + assert {i.gcf_id for i in links.keys()} == {'gcf1', 'gcf2', 'gcf3'} + assert isinstance(links[gcfs[0]][spectra[0]], ObjectLink) + assert links[gcfs[0]][spectra[0]].data(mc) == 12 + assert links[gcfs[1]].get(spectra[0]) is None + assert links[gcfs[2]][spectra[0]].data(mc) == 11 + assert links[gcfs[0]][mfs[0]].data(mc) == 12 + assert links[gcfs[1]][mfs[1]].data(mc) == 12 + assert links[gcfs[2]][mfs[2]].data(mc) == 21 + + +@pytest.mark.skip(reason='To add after refactoring relevant code.') +def test_get_links_gcf_standardised_true(mc, gcfs, spectra, mfs): + """Test `get_links` method when input is GCF objects and `standardised` is True.""" + mc.standardised = True + ... + + +def test_get_links_spec_standardised_false(mc, gcfs, spectra): + """Test `get_links` method when input is Spectrum objects and `standardised` is False.""" + mc.standardised = False + + mc.cutoff = np.NINF + links = mc.get_links(*spectra, link_collection=LinkCollection()) + assert isinstance(links, LinkCollection) + links = links.links # dict of link values + assert len(links) == 3 + assert {i.spectrum_id + for i in links.keys()} == {'spectrum1', 'spectrum2', 'spectrum3'} + assert isinstance(links[spectra[0]][gcfs[0]], ObjectLink) + assert links[spectra[0]][gcfs[0]].data(mc) == 12 + assert links[spectra[0]][gcfs[1]].data(mc) == -9 + assert links[spectra[0]][gcfs[2]].data(mc) == 11 + + mc.cutoff = 0 + links = mc.get_links(*spectra, link_collection=LinkCollection()) + assert isinstance(links, LinkCollection) + links = links.links # dict of link values + assert len(links) == 3 + assert {i.spectrum_id + for i in links.keys()} == {'spectrum1', 'spectrum2', 'spectrum3'} + assert isinstance(links[spectra[0]][gcfs[0]], ObjectLink) + assert links[spectra[0]][gcfs[0]].data(mc) == 12 + assert links[spectra[0]].get(gcfs[1]) is None + assert 
links[spectra[0]][gcfs[2]].data(mc) == 11 + + +@pytest.mark.skip(reason='To add after refactoring relevant code.') +def test_get_links_spec_standardised_true(mc, gcfs, spectra): + """Test `get_links` method when input is Spectrum objects and `standardised` is True.""" + mc.standardised = True + ... + + +def test_get_links_mf_standardised_false(mc, gcfs, mfs): + """Test `get_links` method when input is MolecularFamily objects and `standardised` is False.""" + mc.standardised = False + + mc.cutoff = np.NINF + links = mc.get_links(*mfs, link_collection=LinkCollection()) + assert isinstance(links, LinkCollection) + links = links.links + assert len(links) == 3 + assert {i.family_id for i in links.keys()} == {'mf1', 'mf2', 'mf3'} + assert isinstance(links[mfs[0]][gcfs[0]], ObjectLink) + assert links[mfs[0]][gcfs[0]].data(mc) == 12 + assert links[mfs[0]][gcfs[1]].data(mc) == -9 + assert links[mfs[0]][gcfs[2]].data(mc) == 11 + + mc.cutoff = 0 + links = mc.get_links(*mfs, link_collection=LinkCollection()) + assert isinstance(links, LinkCollection) + links = links.links + assert len(links) == 3 + assert {i.family_id for i in links.keys()} == {'mf1', 'mf2', 'mf3'} + assert isinstance(links[mfs[0]][gcfs[0]], ObjectLink) + assert links[mfs[0]][gcfs[0]].data(mc) == 12 + assert links[mfs[0]].get(gcfs[1]) is None + assert links[mfs[0]][gcfs[2]].data(mc) == 11 + + +@pytest.mark.skip(reason='To add after refactoring relevant code.') +def test_get_links_mf_standardised_true(mc, gcfs, mfs): + """Test `get_links` method when input is MolecularFamily objects and `standardised` is True.""" + mc.standardised = True + ... 
+ + +def test_get_links_invalid_input(mc): + """Test `get_links` method when input is invalid.""" + with pytest.raises(TypeError) as e: + mc.get_links("", link_collection=LinkCollection()) + assert "Invalid type {}" in str(e.value) + + +def test_get_links_no_linkfinder(npl, gcfs): + """Test `get_links` method when no LinkFinder object is found.""" + mc = MetcalfScoring(npl) + mc.LINKFINDER = None + with pytest.raises(ValueError) as e: + mc.get_links(*gcfs, link_collection=LinkCollection()) + assert "LinkFinder object not found." in str(e.value) From 79082a6413be03a8cfe9b5ee45feebcda9fcb8c3 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Thu, 11 May 2023 16:49:07 +0200 Subject: [PATCH 72/95] add docstrings and type hints to `MetcalfScoring` class --- src/nplinker/scoring/metcalf_scoring.py | 63 +++++++++++++++++++++---- 1 file changed, 55 insertions(+), 8 deletions(-) diff --git a/src/nplinker/scoring/metcalf_scoring.py b/src/nplinker/scoring/metcalf_scoring.py index bff24861..5f9d61c8 100644 --- a/src/nplinker/scoring/metcalf_scoring.py +++ b/src/nplinker/scoring/metcalf_scoring.py @@ -15,19 +15,41 @@ from .methods import ScoringMethod from .object_link import ObjectLink + if TYPE_CHECKING: from . import LinkCollection + from ..nplinker import NPLinker logger = LogConfig.getLogger(__name__) class MetcalfScoring(ScoringMethod): + """Metcalf scoring method. + + Attributes: + DATALINKS (DataLinks): The DataLinks object to use for scoring. + LINKFINDER (LinkFinder): The LinkFinder object to use for scoring. + NAME (str): The name of the scoring method. This is set to 'metcalf'. + """ DATALINKS = None LINKFINDER = None NAME = 'metcalf' - def __init__(self, npl): + def __init__(self, npl: NPLinker) -> None: + """Create a MetcalfScoring object. + + Args: + npl (NPLinker): The NPLinker object to use for scoring. + + Attributes: + cutoff (float): The cutoff value to use for scoring. Scores below + this value will be discarded. Defaults to 1.0. 
+ standardised (bool): Whether to use standardised scores. Defaults + to True. + name (str): The name of the scoring method. It's set to a fixed value + 'metcalf'. + """ super().__init__(npl) self.cutoff = 1.0 self.standardised = True @@ -35,7 +57,11 @@ def __init__(self, npl): # TODO CG: not sure why using staticmethod here. Check later and refactor if possible # TODO CG: refactor this method and extract code for cache file to a separate method @staticmethod - def setup(npl): + def setup(npl: NPLinker): + """Setup the MetcalfScoring object. + + DataLinks and LinkFinder objects are created and cached for later use. + """ logger.info( 'MetcalfScoring.setup (bgcs={}, gcfs={}, spectra={}, molfams={}, strains={})' .format(len(npl.bgcs), len(npl.gcfs), len(npl.spectra), @@ -92,10 +118,30 @@ def setup(npl): # TODO CG: is it needed? remove it if not @property - def datalinks(self): + def datalinks(self) -> DataLinks: return MetcalfScoring.DATALINKS - def get_links(self, *objects, link_collection) -> LinkCollection: + def get_links(self, *objects: tuple[GCF, ...] | tuple[Spectrum, ...] + | tuple[MolecularFamily, ...], + link_collection: LinkCollection) -> LinkCollection: + """Get links for the given objects and add them to the given LinkCollection. + + The given objects are treated as input or source objects, which must + be GCF, Spectrum or MolecularFamily objects. + + Args: + objects(tuple): The objects to get links for. Must be GCF, Spectrum + or MolecularFamily objects. + link_collection: The LinkCollection object to add the links to. + + Returns: + LinkCollection: The LinkCollection object with the new links added. + + Raises: + TypeError: If the input objects are not of the correct type. + ValueError: If LinkFinder instance has not been created + (MetcalfScoring object has not been setup). 
+ """ if self._isinstance(GCF, *objects): obj_type = 'gcf' elif self._isinstance(Spectrum, *objects): @@ -191,10 +237,11 @@ def get_links(self, *objects, link_collection) -> LinkCollection: return link_collection def _isinstance(self, _type, *objects) -> bool: + """Check if all objects are of the given type.""" return all(isinstance(x, _type) for x in objects) - def _cal_standardised_score_met(self, linkfinder, - results) -> list[pd.DataFrame]: + def _cal_standardised_score_met(self, linkfinder: LinkFinder, + results: list) -> list[pd.DataFrame]: logger.debug('Calculating standardised Metcalf scores (met input)') raw_score = results[0] z_scores = [] @@ -225,8 +272,8 @@ def _cal_standardised_score_met(self, linkfinder, return [scores_df] - def _cal_standardised_score_gen(self, linkfinder, - results) -> list[pd.DataFrame]: + def _cal_standardised_score_gen(self, linkfinder: LinkFinder, + results: list) -> list[pd.DataFrame]: logger.debug('Calculating standardised Metcalf scores (gen input)') postprocessed_scores = [] for raw_score in results: From 15a0bd1f69048a5fdfba6c478290fe67ecdacb0a Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Thu, 11 May 2023 17:13:05 +0200 Subject: [PATCH 73/95] add util func `isinstance_all` --- src/nplinker/scoring/linking/__init__.py | 8 ++++++-- src/nplinker/scoring/linking/utils.py | 4 ++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src/nplinker/scoring/linking/__init__.py b/src/nplinker/scoring/linking/__init__.py index 5bf1db00..f33d043b 100644 --- a/src/nplinker/scoring/linking/__init__.py +++ b/src/nplinker/scoring/linking/__init__.py @@ -1,10 +1,14 @@ import logging from .data_links import DataLinks from .data_links import LINK_TYPES -from .utils import calc_correlation_matrix from .link_finder import LinkFinder +from .utils import calc_correlation_matrix +from .utils import isinstance_all logging.getLogger(__name__).addHandler(logging.NullHandler()) -__all__ = ["DataLinks", "LINK_TYPES", 
"calc_correlation_matrix", "LinkFinder"] +__all__ = [ + "DataLinks", "LINK_TYPES", "LinkFinder", "calc_correlation_matrix", + "isinstance_all" +] diff --git a/src/nplinker/scoring/linking/utils.py b/src/nplinker/scoring/linking/utils.py index 20824c62..044627ad 100644 --- a/src/nplinker/scoring/linking/utils.py +++ b/src/nplinker/scoring/linking/utils.py @@ -18,6 +18,10 @@ import numpy as np +def isinstance_all(*objects, objtype) -> bool: + """Check if all objects are of the given type.""" + return all(isinstance(x, objtype) for x in objects) + def calc_correlation_matrix(M_type1_cond, M_type2_cond): """ Calculate correlation matrices from co-occurence matrices From f0f570a00d127ab717a9eeeb5d9347dbe5d87de2 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Thu, 11 May 2023 17:17:25 +0200 Subject: [PATCH 74/95] replace `_isinstance` with util func `isinstance_all` --- src/nplinker/scoring/linking/link_finder.py | 10 ++++------ src/nplinker/scoring/metcalf_scoring.py | 11 ++++------- 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/src/nplinker/scoring/linking/link_finder.py b/src/nplinker/scoring/linking/link_finder.py index 38559e2d..10ffa811 100644 --- a/src/nplinker/scoring/linking/link_finder.py +++ b/src/nplinker/scoring/linking/link_finder.py @@ -8,6 +8,7 @@ from nplinker.metabolomics.molecular_family import MolecularFamily from nplinker.metabolomics.spectrum import Spectrum from . import LINK_TYPES +from .utils import isinstance_all if TYPE_CHECKING: @@ -123,11 +124,11 @@ def get_links(self, Raises: TypeError: If input objects are not GCF, Spectrum or MolecularFamily objects. 
""" - if self._isinstance(GCF, *objects): + if isinstance_all(*objects, objtype=GCF): obj_type = 'gcf' - elif self._isinstance(Spectrum, *objects): + elif isinstance_all(*objects, objtype=Spectrum): obj_type = 'spec' - elif self._isinstance(MolecularFamily, *objects): + elif isinstance_all(*objects, objtype=MolecularFamily): obj_type = 'mf' else: types = [type(i) for i in objects] @@ -164,9 +165,6 @@ def get_links(self, links.append(df) return links - def _isinstance(self, _type, *objects) -> bool: - return all(isinstance(x, _type) for x in objects) - # TODO CG: the returned data could be changed to dict, like the that in # the get_links method of the MetcalfScoring class def _get_scores_source_gcf(self, scores, score_cutoff) -> pd.DataFrame: diff --git a/src/nplinker/scoring/metcalf_scoring.py b/src/nplinker/scoring/metcalf_scoring.py index 5f9d61c8..8dfd8432 100644 --- a/src/nplinker/scoring/metcalf_scoring.py +++ b/src/nplinker/scoring/metcalf_scoring.py @@ -10,6 +10,7 @@ from nplinker.pickler import load_pickled_data from nplinker.pickler import save_pickled_data from .linking import DataLinks +from .linking import isinstance_all from .linking import LINK_TYPES from .linking import LinkFinder from .methods import ScoringMethod @@ -142,11 +143,11 @@ def get_links(self, *objects: tuple[GCF, ...] | tuple[Spectrum, ...] ValueError: If LinkFinder instance has not been created (MetcalfScoring object has not been setup). """ - if self._isinstance(GCF, *objects): + if isinstance_all(*objects, objtype=GCF): obj_type = 'gcf' - elif self._isinstance(Spectrum, *objects): + elif isinstance_all(*objects, objtype=Spectrum): obj_type = 'spec' - elif self._isinstance(MolecularFamily, *objects): + elif isinstance_all(*objects, objtype=MolecularFamily): obj_type = 'mf' else: types = [type(i) for i in objects] @@ -236,10 +237,6 @@ def get_links(self, *objects: tuple[GCF, ...] | tuple[Spectrum, ...] 
logger.debug('MetcalfScoring: completed') return link_collection - def _isinstance(self, _type, *objects) -> bool: - """Check if all objects are of the given type.""" - return all(isinstance(x, _type) for x in objects) - def _cal_standardised_score_met(self, linkfinder: LinkFinder, results: list) -> list[pd.DataFrame]: logger.debug('Calculating standardised Metcalf scores (met input)') From 9f4bd2bc477d5797fb78124e2a107f811802eedd Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Thu, 11 May 2023 18:27:54 +0200 Subject: [PATCH 75/95] update validation of args for `DataLinks` --- src/nplinker/scoring/linking/data_links.py | 23 ++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/src/nplinker/scoring/linking/data_links.py b/src/nplinker/scoring/linking/data_links.py index 7d275335..1cde6883 100644 --- a/src/nplinker/scoring/linking/data_links.py +++ b/src/nplinker/scoring/linking/data_links.py @@ -8,6 +8,7 @@ from nplinker.metabolomics.singleton_family import SingletonFamily from nplinker.metabolomics.spectrum import Spectrum from .utils import calc_correlation_matrix +from .utils import isinstance_all if TYPE_CHECKING: @@ -96,7 +97,8 @@ def get_common_strains( Args: spectra_or_mfs(Sequence[Spectrum] | Sequence[MolecularFamily]): - A list of Spectrum or MolecularFamily objects. + A list of Spectrum or MolecularFamily objects and all objects + must be of the same type. gcfs(Sequence[GCF]): A list of GCF objects. filter_no_shared(bool): If True, the pairs of spectrum/mf and GCF without common strains will be removed from the returned dict; @@ -104,13 +106,22 @@ def get_common_strains( Returns: dict: A dict where the keys are tuples of (Spectrum/MolecularFamily, GCF) and values are a list of shared Strain objects. + + Raises: + ValueError: If the first argument is not a list of Spectrum or + MolecularFamily objects, or the second argument is not a list of + GCF objects. 
""" - if not isinstance(spectra_or_mfs[0], (Spectrum, MolecularFamily)): + # Check input arguments + if len(spectra_or_mfs) == 0 or len(gcfs) == 0: + raise ValueError('Empty list for first or second argument.') + if not isinstance_all(*spectra_or_mfs, + objtype=Spectrum) and not isinstance_all( + *spectra_or_mfs, objtype=MolecularFamily): raise ValueError( - 'Must provide Spectra or MolecularFamilies as the first argument!' - ) - if not isinstance(gcfs[0], GCF): - raise ValueError('Must provide GCFs as the second argument!') + 'First argument must be Spectrum or MolecularFamily objects.') + if not isinstance_all(*gcfs, objtype=GCF): + raise ValueError('Second argument must be GCF objects.') # Assume that 3 occurrence dataframes have same df.columns (strain ids) strain_ids = self.occurrence_gcf_strain.columns From 1bd05a86673e088b0b1a88f73988f198f197a07f Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Thu, 11 May 2023 18:29:54 +0200 Subject: [PATCH 76/95] Update test_data_links.py - add docstrings - add more tests --- tests/scoring/test_data_links.py | 70 ++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/tests/scoring/test_data_links.py b/tests/scoring/test_data_links.py index 94d1d24c..c89bc5cc 100644 --- a/tests/scoring/test_data_links.py +++ b/tests/scoring/test_data_links.py @@ -1,6 +1,8 @@ import pandas as pd from pandas.util.testing import assert_frame_equal +import pytest from pytest import fixture +from nplinker.metabolomics.singleton_family import SingletonFamily from nplinker.scoring.linking import DataLinks @@ -10,6 +12,11 @@ def datalinks(gcfs, spectra, mfs, strains): def test_init(datalinks): + """Test that the DataLinks object is initialised correctly. + + Multiple private methods are called in the init method, so we test + that the correct dataframes are created. 
+ """ # test occorrences col_names = ['strain1', 'strain2', 'strain3'] assert_frame_equal( @@ -73,6 +80,7 @@ def test_init(datalinks): def test_get_common_strains_spec(datalinks, spectra, gcfs, strains_list): + """Test get_common_strains method for input spectra and gcfs.""" sut = datalinks.get_common_strains(spectra[:2], gcfs) expected = { (spectra[0], gcfs[0]): [strains_list[0]], @@ -97,6 +105,7 @@ def test_get_common_strains_spec(datalinks, spectra, gcfs, strains_list): def test_get_common_strains_mf(datalinks, mfs, gcfs, strains_list): + """Test get_common_strains method for input mfs and gcfs.""" sut = datalinks.get_common_strains(mfs[:2], gcfs) expected = { (mfs[0], gcfs[0]): [strains_list[0]], @@ -116,3 +125,64 @@ def test_get_common_strains_mf(datalinks, mfs, gcfs, strains_list): (mfs[1], gcfs[2]): [strains_list[1]] } assert sut == expected + + +def test_get_common_strains_sf(datalinks, mfs, gcfs, strains_list): + """Test get_common_strains method for input SingletonFamily.""" + smf = SingletonFamily() + + sut = datalinks.get_common_strains([smf], gcfs) + assert sut == {} + + # the expected are same as `test_get_common_strains_mf` + mfs_mix = (*mfs[:2], smf) + sut = datalinks.get_common_strains(mfs_mix, gcfs) + expected = { + (mfs[0], gcfs[0]): [strains_list[0]], + (mfs[0], gcfs[1]): [], + (mfs[0], gcfs[2]): [strains_list[0]], + (mfs[1], gcfs[0]): [], + (mfs[1], gcfs[1]): [strains_list[1]], + (mfs[1], gcfs[2]): [strains_list[1]] + } + assert sut == expected + + sut = datalinks.get_common_strains(mfs_mix, gcfs, filter_no_shared=True) + expected = { + (mfs[0], gcfs[0]): [strains_list[0]], + (mfs[0], gcfs[2]): [strains_list[0]], + (mfs[1], gcfs[1]): [strains_list[1]], + (mfs[1], gcfs[2]): [strains_list[1]] + } + assert sut == expected + + +@pytest.mark.parametrize( + "first_arg, expected", + [([], "Empty list for first or second argument."), + ((), "Empty list for first or second argument."), + ([1], "First argument must be Spectrum or MolecularFamily 
objects.")]) +def test_get_common_strains_exception_first_arg(datalinks, spectra, mfs, gcfs, + first_arg, expected): + """Test get_common_strains method for invalid 1st arugment.""" + with pytest.raises(ValueError) as e: + datalinks.get_common_strains(first_arg, gcfs) + assert expected in str(e.value) + + # mixed input + with pytest.raises(ValueError) as e: + datalinks.get_common_strains(spectra + mfs, gcfs) + assert "First argument must be Spectrum or MolecularFamily objects." in str( + e.value) + + +@pytest.mark.parametrize("second_arg, expected", + [([], "Empty list for first or second argument."), + ((), "Empty list for first or second argument."), + ([1], "Second argument must be GCF objects.")]) +def test_get_common_strains_exception_second_arg(datalinks, spectra, + second_arg, expected): + """Test get_common_strains method for invalid 2nd argument.""" + with pytest.raises(ValueError) as e: + datalinks.get_common_strains(spectra, second_arg) + assert expected in str(e.value) From a19f43f99aaddb8ba97f77f06e9a5c117e28f50e Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Thu, 11 May 2023 18:39:51 +0200 Subject: [PATCH 77/95] add type hints for returned values to unit tests --- tests/scoring/conftest.py | 13 ++++++++----- tests/scoring/test_data_links.py | 2 +- tests/scoring/test_link_finder.py | 2 +- tests/scoring/test_metcalf_scoring.py | 8 ++++---- 4 files changed, 14 insertions(+), 11 deletions(-) diff --git a/tests/scoring/conftest.py b/tests/scoring/conftest.py index 6a15f05f..73606506 100644 --- a/tests/scoring/conftest.py +++ b/tests/scoring/conftest.py @@ -7,12 +7,12 @@ @fixture(scope='session') -def strains_list(): +def strains_list() -> tuple[Strain, Strain, Strain]: return Strain('strain1'), Strain('strain2'), Strain('strain3') @fixture(scope='session') -def strains(strains_list): +def strains(strains_list) -> StrainCollection: strains = StrainCollection() for strain in strains_list: strains.add(strain) @@ -20,7 +20,7 @@ def strains(strains_list): 
@fixture(scope='session') -def gcfs(strains_list): +def gcfs(strains_list) -> tuple[GCF, GCF, GCF]: gcf1 = GCF('gcf1') gcf1.strains.add(strains_list[0]) gcf2 = GCF('gcf2') @@ -32,7 +32,7 @@ def gcfs(strains_list): @fixture(scope='session') -def spectra(strains_list): +def spectra(strains_list) -> tuple[Spectrum, Spectrum, Spectrum]: spectrum1 = Spectrum(1, [(1, 1)], "spectrum1", None) spectrum1.strains.add(strains_list[0]) spectrum2 = Spectrum(2, [(1, 1)], "spectrum2", None) @@ -44,7 +44,10 @@ def spectra(strains_list): @fixture(scope='session') -def mfs(spectra): +def mfs(spectra) -> tuple[MolecularFamily, MolecularFamily, MolecularFamily]: + """For simplicity, we just use one Spectrum object for each MolecularFamily + object, and notice that they are not SingletonFamily object. + """ mf1 = MolecularFamily('mf1') mf1.add_spectrum(spectra[0]) mf2 = MolecularFamily('mf2') diff --git a/tests/scoring/test_data_links.py b/tests/scoring/test_data_links.py index c89bc5cc..5921c61b 100644 --- a/tests/scoring/test_data_links.py +++ b/tests/scoring/test_data_links.py @@ -7,7 +7,7 @@ @fixture(scope='module') -def datalinks(gcfs, spectra, mfs, strains): +def datalinks(gcfs, spectra, mfs, strains) -> DataLinks: return DataLinks(gcfs, spectra, mfs, strains) diff --git a/tests/scoring/test_link_finder.py b/tests/scoring/test_link_finder.py index 871f4e90..faab635e 100644 --- a/tests/scoring/test_link_finder.py +++ b/tests/scoring/test_link_finder.py @@ -8,7 +8,7 @@ @fixture(scope='module') -def linkfinder(): +def linkfinder() -> LinkFinder: return LinkFinder() diff --git a/tests/scoring/test_metcalf_scoring.py b/tests/scoring/test_metcalf_scoring.py index 8bb715fc..34b59d58 100644 --- a/tests/scoring/test_metcalf_scoring.py +++ b/tests/scoring/test_metcalf_scoring.py @@ -13,13 +13,13 @@ @fixture(scope='module') -def datalinks(gcfs, spectra, mfs, strains): +def datalinks(gcfs, spectra, mfs, strains) -> DataLinks: """DataLinks object. 
See `test_data_links.py` for its actual values.""" return DataLinks(gcfs, spectra, mfs, strains) @fixture(scope='module') -def linkfinder(datalinks): +def linkfinder(datalinks) -> LinkFinder: """LinkFinder object. See `test_link_finder.py` for its actual values.""" linkfinder = LinkFinder() linkfinder.cal_score(datalinks, link_type='spec-gcf') @@ -28,7 +28,7 @@ def linkfinder(datalinks): @fixture(scope='module') -def npl(gcfs, spectra, mfs, strains, tmp_path_factory): +def npl(gcfs, spectra, mfs, strains, tmp_path_factory) -> NPLinker: """Constructed NPLinker object. This NPLinker object does not do loading `npl.load_data()`, instead we @@ -52,7 +52,7 @@ def npl(gcfs, spectra, mfs, strains, tmp_path_factory): @fixture(scope='module') -def mc(npl): +def mc(npl) -> MetcalfScoring: """MetcalfScoring object.""" mc = MetcalfScoring(npl) mc.setup(npl) From 6fc0ef5c48856e6e77adbbd71690d77adaf75789 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Fri, 12 May 2023 11:31:56 +0200 Subject: [PATCH 78/95] update exception types for invalid input --- src/nplinker/scoring/linking/data_links.py | 10 +++--- src/nplinker/scoring/linking/link_finder.py | 4 +++ src/nplinker/scoring/metcalf_scoring.py | 4 +++ tests/scoring/test_data_links.py | 34 +++++++++++++-------- tests/scoring/test_metcalf_scoring.py | 34 ++++++++++++++++----- 5 files changed, 61 insertions(+), 25 deletions(-) diff --git a/src/nplinker/scoring/linking/data_links.py b/src/nplinker/scoring/linking/data_links.py index 1cde6883..733e53f9 100644 --- a/src/nplinker/scoring/linking/data_links.py +++ b/src/nplinker/scoring/linking/data_links.py @@ -108,9 +108,9 @@ def get_common_strains( and values are a list of shared Strain objects. Raises: - ValueError: If the first argument is not a list of Spectrum or - MolecularFamily objects, or the second argument is not a list of - GCF objects. + ValueError: If given `spectra_or_mfs` or `gcfs` is empty. 
+ TypeError: If given `spectra_or_mfs` or `gcfs` is not a list of + Spectrum/MolecularFamily or GCF objects, respectively. """ # Check input arguments if len(spectra_or_mfs) == 0 or len(gcfs) == 0: @@ -118,10 +118,10 @@ def get_common_strains( if not isinstance_all(*spectra_or_mfs, objtype=Spectrum) and not isinstance_all( *spectra_or_mfs, objtype=MolecularFamily): - raise ValueError( + raise TypeError( 'First argument must be Spectrum or MolecularFamily objects.') if not isinstance_all(*gcfs, objtype=GCF): - raise ValueError('Second argument must be GCF objects.') + raise TypeError('Second argument must be GCF objects.') # Assume that 3 occurrence dataframes have same df.columns (strain ids) strain_ids = self.occurrence_gcf_strain.columns diff --git a/src/nplinker/scoring/linking/link_finder.py b/src/nplinker/scoring/linking/link_finder.py index 10ffa811..ddef34aa 100644 --- a/src/nplinker/scoring/linking/link_finder.py +++ b/src/nplinker/scoring/linking/link_finder.py @@ -122,8 +122,12 @@ def get_links(self, - the third row contains the scores. Raises: + ValueError: If input objects are empty. TypeError: If input objects are not GCF, Spectrum or MolecularFamily objects. """ + if len(objects) == 0: + raise ValueError('Empty input objects.') + if isinstance_all(*objects, objtype=GCF): obj_type = 'gcf' elif isinstance_all(*objects, objtype=Spectrum): diff --git a/src/nplinker/scoring/metcalf_scoring.py b/src/nplinker/scoring/metcalf_scoring.py index 8dfd8432..586f1595 100644 --- a/src/nplinker/scoring/metcalf_scoring.py +++ b/src/nplinker/scoring/metcalf_scoring.py @@ -139,10 +139,14 @@ def get_links(self, *objects: tuple[GCF, ...] | tuple[Spectrum, ...] LinkCollection: The LinkCollection object with the new links added. Raises: + ValueError: If the input objects are empty. TypeError: If the input objects are not of the correct type. ValueError: If LinkFinder instance has not been created (MetcalfScoring object has not been setup). 
""" + if len(objects) == 0: + raise ValueError('Empty input objects.') + if isinstance_all(*objects, objtype=GCF): obj_type = 'gcf' elif isinstance_all(*objects, objtype=Spectrum): diff --git a/tests/scoring/test_data_links.py b/tests/scoring/test_data_links.py index 5921c61b..7872266a 100644 --- a/tests/scoring/test_data_links.py +++ b/tests/scoring/test_data_links.py @@ -157,32 +157,40 @@ def test_get_common_strains_sf(datalinks, mfs, gcfs, strains_list): assert sut == expected +def test_get_common_strains_invalid_value(datalinks, spectra, gcfs): + """Test get_common_strains method for empty arguments.""" + with pytest.raises(ValueError) as e: + datalinks.get_common_strains([], gcfs) + assert "Empty list for first or second argument." in str(e.value) + with pytest.raises(ValueError) as e: + datalinks.get_common_strains(spectra, []) + assert "Empty list for first or second argument." in str(e.value) + + @pytest.mark.parametrize( "first_arg, expected", - [([], "Empty list for first or second argument."), - ((), "Empty list for first or second argument."), - ([1], "First argument must be Spectrum or MolecularFamily objects.")]) -def test_get_common_strains_exception_first_arg(datalinks, spectra, mfs, gcfs, - first_arg, expected): + [([1], "First argument must be Spectrum or MolecularFamily objects."), + ([1, 2], "First argument must be Spectrum or MolecularFamily objects."), + ("12", "First argument must be Spectrum or MolecularFamily objects.")]) +def test_get_common_strains_invalid_type_first_arg(datalinks, spectra, mfs, + gcfs, first_arg, expected): """Test get_common_strains method for invalid 1st arugment.""" - with pytest.raises(ValueError) as e: + with pytest.raises(TypeError) as e: datalinks.get_common_strains(first_arg, gcfs) assert expected in str(e.value) # mixed input - with pytest.raises(ValueError) as e: + with pytest.raises(TypeError) as e: datalinks.get_common_strains(spectra + mfs, gcfs) assert "First argument must be Spectrum or MolecularFamily 
objects." in str( e.value) @pytest.mark.parametrize("second_arg, expected", - [([], "Empty list for first or second argument."), - ((), "Empty list for first or second argument."), - ([1], "Second argument must be GCF objects.")]) -def test_get_common_strains_exception_second_arg(datalinks, spectra, - second_arg, expected): + [([1], "Second argument must be GCF objects.")]) +def test_get_common_strains_invalid_type_second_arg(datalinks, spectra, + second_arg, expected): """Test get_common_strains method for invalid 2nd argument.""" - with pytest.raises(ValueError) as e: + with pytest.raises(TypeError) as e: datalinks.get_common_strains(spectra, second_arg) assert expected in str(e.value) diff --git a/tests/scoring/test_metcalf_scoring.py b/tests/scoring/test_metcalf_scoring.py index 34b59d58..e9398e5e 100644 --- a/tests/scoring/test_metcalf_scoring.py +++ b/tests/scoring/test_metcalf_scoring.py @@ -14,13 +14,13 @@ @fixture(scope='module') def datalinks(gcfs, spectra, mfs, strains) -> DataLinks: - """DataLinks object. See `test_data_links.py` for its actual values.""" + """DataLinks object. See `test_data_links.py` for its values.""" return DataLinks(gcfs, spectra, mfs, strains) @fixture(scope='module') def linkfinder(datalinks) -> LinkFinder: - """LinkFinder object. See `test_link_finder.py` for its actual values.""" + """LinkFinder object. 
See `test_link_finder.py` for its values.""" linkfinder = LinkFinder() linkfinder.cal_score(datalinks, link_type='spec-gcf') linkfinder.cal_score(datalinks, link_type='mf-gcf') @@ -122,7 +122,7 @@ def test_get_links_gcf_standardised_false(mc, gcfs, spectra, mfs): assert len(links) == 3 assert {i.gcf_id for i in links.keys()} == {'gcf1', 'gcf2', 'gcf3'} assert isinstance(links[gcfs[0]][spectra[0]], ObjectLink) - # check actual values in `test_get_links_gcf` of test_link_finder.py + # expected values are from `test_get_links_gcf` of test_link_finder.py assert links[gcfs[0]][spectra[0]].data(mc) == 12 assert links[gcfs[1]][spectra[0]].data(mc) == -9 assert links[gcfs[2]][spectra[0]].data(mc) == 11 @@ -222,11 +222,31 @@ def test_get_links_mf_standardised_true(mc, gcfs, mfs): ... -def test_get_links_invalid_input(mc): - """Test `get_links` method when input is invalid.""" +@pytest.mark.parametrize("objects, expected", [([], "Empty input objects"), + ("", "Empty input objects")]) +def test_get_links_invalid_input_value(mc, objects, expected): + with pytest.raises(ValueError) as e: + mc.get_links(*objects, link_collection=LinkCollection()) + assert expected in str(e.value) + + +@pytest.mark.parametrize("objects, expected", + [([1], "Invalid type {}"), + ([1, 2], "Invalid type {}"), + ("12", "Invalid type {}")]) +def test_get_links_invalid_input_type(mc, objects, expected): + with pytest.raises(TypeError) as e: + mc.get_links(*objects, link_collection=LinkCollection()) + assert expected in str(e.value) + + +def test_get_links_invalid_mixed_types(mc, spectra, mfs): + objects = (*spectra, *mfs) with pytest.raises(TypeError) as e: - mc.get_links("", link_collection=LinkCollection()) - assert "Invalid type {}" in str(e.value) + mc.get_links(*objects, link_collection=LinkCollection()) + assert "Invalid type" in str(e.value) + assert ".MolecularFamily" in str(e.value) + assert ".Spectrum" in str(e.value) def test_get_links_no_linkfinder(npl, gcfs): From 
91bcff79212f2128fb04f7063ad8476114171081 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Fri, 12 May 2023 11:54:05 +0200 Subject: [PATCH 79/95] add docstrings and type hints to `LinkFinder` class --- src/nplinker/scoring/linking/link_finder.py | 64 ++++++++++++--------- 1 file changed, 38 insertions(+), 26 deletions(-) diff --git a/src/nplinker/scoring/linking/link_finder.py b/src/nplinker/scoring/linking/link_finder.py index ddef34aa..39bbb555 100644 --- a/src/nplinker/scoring/linking/link_finder.py +++ b/src/nplinker/scoring/linking/link_finder.py @@ -17,37 +17,41 @@ logger = LogConfig.getLogger(__file__) -# TODO CG: this class could be merged to MetcalfScoring class +# TODO CG: this class could be merged to MetcalfScoring class? class LinkFinder(): - def __init__(self): + def __init__(self) -> None: + """Initialise LinkFinder object. + + Attributes: + raw_score_spec_gcf (pd.DataFrame): The raw Metcalf scores for + spectrum-GCF links. + raw_score_mf_gcf (pd.DataFrame): The raw Metcalf scores for + molecular family-GCF links. + metcalf_mean (np.ndarray | None): The mean value used for + standardising Metcalf scores. + metcalf_std (np.ndarray | None): The standard deviation value used + for standardising Metcalf scores. """ - Create tables of prospective link candidates. - Separate tables will exist for different linking scenarios, such as - gcfs <-> spectra OR gcf <-> mol.families - """ - # metcalf scores self.raw_score_spec_gcf = pd.DataFrame() self.raw_score_mf_gcf = pd.DataFrame() - - # metcalf caching self.metcalf_mean = None self.metcalf_std = None - # TODO CG: cal_score method could be integrated to __init__ + # TODO CG: cal_score method could be integrated to __init__? def cal_score( self, data_links: DataLinks, link_type: str = 'spec-gcf', scoring_weights: tuple[int, int, int, int] = (10, -10, 0, 1) ) -> None: - """Calculate metcalf scores. + """Calculate Metcalf scores. Args: - data_links (DataLinks): The DataLinks object to use for scoring. 
- link_type (str, optional): The type of link to score. Available - types are 'spec-gcf' and 'mf-gcf'. Defaults to 'spec-gcf'. - scoring_weights (tuple[int,int,int,int], optional): The weights to + data_links(DataLinks): The DataLinks object to use for scoring. + link_type(str): The type of link to score. Must be 'spec-gcf' or + 'mf-gcf'. Defaults to 'spec-gcf'. + scoring_weights(tuple[int,int,int,int]): The weights to use for Metcalf scoring. The weights are applied to '(met_gcf, met_not_gcf, gcf_not_met, not_met_not_gcf)'. Defaults to (10, -10, 0, 1). @@ -72,13 +76,16 @@ def cal_score( data_links.cooccurrence_notmf_gcf * scoring_weights[2] + data_links.cooccurrence_notmf_notgcf * scoring_weights[3]) + # TODO CG: this part should be moved outside of this method n_strains = data_links.occurrence_gcf_strain.shape[1] if self.metcalf_mean is None or self.metcalf_std is None: self.metcalf_mean, self.metcalf_std = self._cal_mean_std( n_strains, scoring_weights) # TODO CG: read paper and check the logics of this method - def _cal_mean_std(self, n_strains, scoring_weights): + def _cal_mean_std( + self, n_strains: int, scoring_weights: tuple[int, int, int, int] + ) -> tuple[np.ndarray, np.ndarray]: sz = (n_strains + 1, n_strains + 1) mean = np.zeros(sz) variance = np.zeros(sz) @@ -104,22 +111,27 @@ def _cal_mean_std(self, n_strains, scoring_weights): variance[n, m] = expected_sq return mean, np.sqrt(variance) + # TODO CG: the data type of returned should be improved for faster + # processing. Maybe using dict instead of pd.DataFrame? + # like that in the get_links method of the MetcalfScoring class def get_links(self, *objects: tuple[GCF, ...] | tuple[Spectrum, ...] | tuple[MolecularFamily, ...], score_cutoff: float = 0.5) -> list[pd.DataFrame]: - """Get scores for links between objects. + """Get links and scores for given objects. Args: - objects(tuple): GCF, Spectrum or MolecularFamily objects. 
+ objects(tuple): A list of GCF, Spectrum or MolecularFamily objects + and all objects must be of the same type. score_cutoff(float): Minimum score to consider a link (ā‰„score_cutoff). Default is 0.5. Returns: list: List of data frames containing the ids of the linked objects - and the score. The data frame contains three rows: - - the first row contains the ids of the input/source objects, - - the second row contains the ids of the target objects, - - the third row contains the scores. + and the score. The data frame has index names of + 'source', 'target' and 'score': + - the 'source' row contains the ids of the input/source objects, + - the 'target' row contains the ids of the target objects, + - the 'score' row contains the scores. Raises: ValueError: If input objects are empty. @@ -169,9 +181,8 @@ def get_links(self, links.append(df) return links - # TODO CG: the returned data could be changed to dict, like the that in - # the get_links method of the MetcalfScoring class - def _get_scores_source_gcf(self, scores, score_cutoff) -> pd.DataFrame: + def _get_scores_source_gcf(self, scores: pd.DataFrame, + score_cutoff: float) -> pd.DataFrame: row_indexes, col_indexes = np.where(scores >= score_cutoff) src_obj_ids = scores.columns[col_indexes].to_list() target_obj_ids = scores.index[row_indexes].to_list() @@ -179,7 +190,8 @@ def _get_scores_source_gcf(self, scores, score_cutoff) -> pd.DataFrame: return pd.DataFrame([src_obj_ids, target_obj_ids, scores_candidate], index=['source', 'target', 'score']) - def _get_scores_source_met(self, scores, score_cutoff) -> pd.DataFrame: + def _get_scores_source_met(self, scores: pd.DataFrame, + score_cutoff: float) -> pd.DataFrame: row_indexes, col_indexes = np.where(scores >= score_cutoff) src_obj_ids = scores.index[row_indexes].to_list() target_obj_ids = scores.columns[col_indexes].to_list() From 989471aadc8c51f5ccfabc6dd7c9aff0b2fc10d2 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Fri, 12 May 2023 11:54:29 +0200 Subject: 
[PATCH 80/95] add more unit tests for `LinkFinder` --- tests/scoring/test_link_finder.py | 64 ++++++++++++++++++++++--------- 1 file changed, 46 insertions(+), 18 deletions(-) diff --git a/tests/scoring/test_link_finder.py b/tests/scoring/test_link_finder.py index faab635e..3aa2e4e9 100644 --- a/tests/scoring/test_link_finder.py +++ b/tests/scoring/test_link_finder.py @@ -14,6 +14,7 @@ def linkfinder() -> LinkFinder: @fixture(scope='module') def datalinks(gcfs, spectra, mfs, strains): + """DataLinks object. See `test_data_links.py` for its actual values.""" return DataLinks(gcfs, spectra, mfs, strains) @@ -24,14 +25,20 @@ def test_init(linkfinder): assert linkfinder.metcalf_std is None -def test_cal_score(linkfinder, datalinks): +def test_cal_score_raw_score(linkfinder, datalinks): + """Test `cal_score` method for `raw_score_spec_gcf` and `raw_score_mf_gcf`. + + The expected values are calculated manually by using values from `test_init` + of `test_data_links.py` and the default scoring weights. 
+ """ + # link type = 'spec-gcf' linkfinder.cal_score(datalinks, link_type='spec-gcf') assert_frame_equal( linkfinder.raw_score_spec_gcf, pd.DataFrame([[12, -9, 11], [-9, 12, 11], [1, 1, 21]], index=['spectrum1', 'spectrum2', 'spectrum3'], columns=['gcf1', 'gcf2', 'gcf3'])) - + # link type = 'mf-gcf' linkfinder.cal_score(datalinks, link_type='mf-gcf') assert_frame_equal( linkfinder.raw_score_mf_gcf, @@ -39,31 +46,29 @@ def test_cal_score(linkfinder, datalinks): index=['mf1', 'mf2', 'mf3'], columns=['gcf1', 'gcf2', 'gcf3'])) - # TODO CG: add tests for values after refactoring _cal_mean_std + +def test_cal_score_mean_std(linkfinder, datalinks): + """Test `cal_score` method for `metcalf_mean` and `metcalf_std`.""" + linkfinder.cal_score(datalinks, link_type='spec-gcf') assert isinstance(linkfinder.metcalf_mean, np.ndarray) assert isinstance(linkfinder.metcalf_std, np.ndarray) assert linkfinder.metcalf_mean.shape == (4, 4 ) # (n_strains+1 , n_strains+1) assert linkfinder.metcalf_mean.shape == (4, 4) - # generated metcalf_mean - # array([[ 3., 2., 1., 0.], - # [ -8., -2., 4., 10.], - # [-19., -6., 7., 20.], - # [-30., -10., 10., 30.]]) - # generated metcalf_std - # array([[1. , 1. , 1. , 1. ], - # [1. , 9.89949494, 9.89949494, 1. ], - # [1. , 9.89949494, 9.89949494, 1. ], - # [1. , 1. , 1. , 1. 
]]) + # TODO CG: add tests for values after refactoring _cal_mean_std method + # assert linkfinder.metcalf_mean == expected_array def test_get_links_gcf(linkfinder, datalinks, gcfs): + """Test `get_links` method for input GCF objects.""" linkfinder.cal_score(datalinks, link_type='spec-gcf') linkfinder.cal_score(datalinks, link_type='mf-gcf') index_names = ['source', 'target', 'score'] + # cutoff = negative infinity (float) links = linkfinder.get_links(*gcfs, score_cutoff=np.NINF) assert len(links) == 2 + # expected values got from `test_cal_score_raw_score` assert_frame_equal( links[0], pd.DataFrame([['gcf1', 'gcf2', 'gcf3'] * 3, @@ -85,6 +90,7 @@ def test_get_links_gcf(linkfinder, datalinks, gcfs): # cutoff = 0 links = linkfinder.get_links(*gcfs, score_cutoff=0) + assert len(links) == 2 assert_frame_equal( links[0], pd.DataFrame([['gcf1', 'gcf3', 'gcf2', 'gcf3', 'gcf1', 'gcf2', 'gcf3'], @@ -106,6 +112,7 @@ def test_get_links_gcf(linkfinder, datalinks, gcfs): def test_get_links_spec(linkfinder, datalinks, spectra): + """Test `get_links` method for input Spectrum objects.""" linkfinder.cal_score(datalinks, link_type='spec-gcf') linkfinder.cal_score(datalinks, link_type='mf-gcf') index_names = ['source', 'target', 'score'] @@ -134,13 +141,13 @@ def test_get_links_spec(linkfinder, datalinks, spectra): def test_get_links_mf(linkfinder, datalinks, mfs): + """Test `get_links` method for input MolecularFamily objects.""" linkfinder.cal_score(datalinks, link_type='spec-gcf') linkfinder.cal_score(datalinks, link_type='mf-gcf') index_names = ['source', 'target', 'score'] # cutoff = negative infinity (float) links = linkfinder.get_links(*mfs, score_cutoff=np.NINF) assert len(links) == 1 - assert_frame_equal( links[0], pd.DataFrame([[ @@ -149,7 +156,7 @@ def test_get_links_mf(linkfinder, datalinks, mfs): *['mf3'] * 3, ], ['gcf1', 'gcf2', 'gcf3'] * 3, [12, -9, 11, -9, 12, 11, 1, 1, 21]], index=index_names)) - + # cutoff = 0 links = linkfinder.get_links(*mfs, score_cutoff=0) 
assert_frame_equal( links[0], @@ -162,7 +169,28 @@ def test_get_links_mf(linkfinder, datalinks, mfs): index=index_names)) -def test_get_links_exceptions(linkfinder): +@pytest.mark.parametrize("objects, expected", [([], "Empty input objects"), + ("", "Empty input objects")]) +def test_get_links_invalid_value(linkfinder, objects, expected): + with pytest.raises(ValueError) as e: + linkfinder.get_links(*objects) + assert expected in str(e.value) + + +@pytest.mark.parametrize("objects, expected", + [([1], "Invalid type {}"), + ([1, 2], "Invalid type {}"), + ("12", "Invalid type {}")]) +def test_get_links_invalid_type(linkfinder, objects, expected): + with pytest.raises(TypeError) as e: + linkfinder.get_links(*objects) + assert expected in str(e.value) + + +def test_get_links_invalid_mixed_types(linkfinder, spectra, mfs): + objects = (*spectra, *mfs) with pytest.raises(TypeError) as e: - linkfinder.get_links("") - assert "Invalid type {}" in str(e.value) + linkfinder.get_links(*objects) + assert "Invalid type" in str(e.value) + assert ".MolecularFamily" in str(e.value) + assert ".Spectrum" in str(e.value) From 04a5b27a94015c1204012d387e6addb15850edfe Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Fri, 12 May 2023 12:26:40 +0200 Subject: [PATCH 81/95] fix input type bug for `DataLinks.get_common_strains` --- src/nplinker/scoring/linking/data_links.py | 14 +++--- tests/scoring/test_data_links.py | 58 +++++++++++++++++----- 2 files changed, 51 insertions(+), 21 deletions(-) diff --git a/src/nplinker/scoring/linking/data_links.py b/src/nplinker/scoring/linking/data_links.py index 733e53f9..53a62ba1 100644 --- a/src/nplinker/scoring/linking/data_links.py +++ b/src/nplinker/scoring/linking/data_links.py @@ -10,7 +10,6 @@ from .utils import calc_correlation_matrix from .utils import isinstance_all - if TYPE_CHECKING: from nplinker.strain_collection import StrainCollection from nplinker.strains import Strain @@ -87,7 +86,7 @@ def __init__(self, gcfs: Sequence[GCF], spectra: 
Sequence[Spectrum], def get_common_strains( self, - spectra_or_mfs: Sequence[Spectrum] | Sequence[MolecularFamily], + spectra_or_mfs: Sequence[Spectrum | MolecularFamily], gcfs: Sequence[GCF], filter_no_shared: bool = False ) -> dict[tuple[Spectrum | MolecularFamily, GCF], list[Strain]]: @@ -96,9 +95,8 @@ def get_common_strains( Note that SingletonFamily objects are excluded from given `spectra_or_mfs`. Args: - spectra_or_mfs(Sequence[Spectrum] | Sequence[MolecularFamily]): - A list of Spectrum or MolecularFamily objects and all objects - must be of the same type. + spectra_or_mfs(Sequence[Spectrum | MolecularFamily]): + A list of Spectrum and/or MolecularFamily objects. gcfs(Sequence[GCF]): A list of GCF objects. filter_no_shared(bool): If True, the pairs of spectrum/mf and GCF without common strains will be removed from the returned dict; @@ -116,10 +114,10 @@ def get_common_strains( if len(spectra_or_mfs) == 0 or len(gcfs) == 0: raise ValueError('Empty list for first or second argument.') if not isinstance_all(*spectra_or_mfs, - objtype=Spectrum) and not isinstance_all( - *spectra_or_mfs, objtype=MolecularFamily): + objtype=(Spectrum, MolecularFamily)): raise TypeError( - 'First argument must be Spectrum or MolecularFamily objects.') + 'First argument must be Spectrum and/or MolecularFamily objects.' 
+ ) if not isinstance_all(*gcfs, objtype=GCF): raise TypeError('Second argument must be GCF objects.') diff --git a/tests/scoring/test_data_links.py b/tests/scoring/test_data_links.py index 7872266a..c652ea2c 100644 --- a/tests/scoring/test_data_links.py +++ b/tests/scoring/test_data_links.py @@ -127,6 +127,43 @@ def test_get_common_strains_mf(datalinks, mfs, gcfs, strains_list): assert sut == expected +def test_get_common_strains_spec_mf(datalinks, spectra, mfs, gcfs, + strains_list): + """Test get_common_strains method for mixed input of spectra and mfs.""" + mixed_input = (*spectra[:2], *mfs[:2]) + sut = datalinks.get_common_strains(mixed_input, gcfs) + expected = { + (spectra[0], gcfs[0]): [strains_list[0]], + (spectra[0], gcfs[1]): [], + (spectra[0], gcfs[2]): [strains_list[0]], + (spectra[1], gcfs[0]): [], + (spectra[1], gcfs[1]): [strains_list[1]], + (spectra[1], gcfs[2]): [strains_list[1]], + (mfs[0], gcfs[0]): [strains_list[0]], + (mfs[0], gcfs[1]): [], + (mfs[0], gcfs[2]): [strains_list[0]], + (mfs[1], gcfs[0]): [], + (mfs[1], gcfs[1]): [strains_list[1]], + (mfs[1], gcfs[2]): [strains_list[1]] + } + assert sut == expected + + sut = datalinks.get_common_strains(mixed_input, + gcfs, + filter_no_shared=True) + expected = { + (spectra[0], gcfs[0]): [strains_list[0]], + (spectra[0], gcfs[2]): [strains_list[0]], + (spectra[1], gcfs[1]): [strains_list[1]], + (spectra[1], gcfs[2]): [strains_list[1]], + (mfs[0], gcfs[0]): [strains_list[0]], + (mfs[0], gcfs[2]): [strains_list[0]], + (mfs[1], gcfs[1]): [strains_list[1]], + (mfs[1], gcfs[2]): [strains_list[1]] + } + assert sut == expected + + def test_get_common_strains_sf(datalinks, mfs, gcfs, strains_list): """Test get_common_strains method for input SingletonFamily.""" smf = SingletonFamily() @@ -167,24 +204,19 @@ def test_get_common_strains_invalid_value(datalinks, spectra, gcfs): assert "Empty list for first or second argument." 
in str(e.value) -@pytest.mark.parametrize( - "first_arg, expected", - [([1], "First argument must be Spectrum or MolecularFamily objects."), - ([1, 2], "First argument must be Spectrum or MolecularFamily objects."), - ("12", "First argument must be Spectrum or MolecularFamily objects.")]) -def test_get_common_strains_invalid_type_first_arg(datalinks, spectra, mfs, - gcfs, first_arg, expected): +@pytest.mark.parametrize("first_arg, expected", [ + ([1], "First argument must be Spectrum and/or MolecularFamily objects."), + ([1, 2 + ], "First argument must be Spectrum and/or MolecularFamily objects."), + ("12", "First argument must be Spectrum and/or MolecularFamily objects.") +]) +def test_get_common_strains_invalid_type_first_arg(datalinks, gcfs, first_arg, + expected): """Test get_common_strains method for invalid 1st arugment.""" with pytest.raises(TypeError) as e: datalinks.get_common_strains(first_arg, gcfs) assert expected in str(e.value) - # mixed input - with pytest.raises(TypeError) as e: - datalinks.get_common_strains(spectra + mfs, gcfs) - assert "First argument must be Spectrum or MolecularFamily objects." 
in str( - e.value) - @pytest.mark.parametrize("second_arg, expected", [([1], "Second argument must be GCF objects.")]) From 9c7dd499f19a7607523db9f8a6268dbe98c36d3a Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Fri, 12 May 2023 14:08:17 +0200 Subject: [PATCH 82/95] Create test_nplinker_scoring.py --- tests/scoring/test_nplinker_scoring.py | 198 +++++++++++++++++++++++++ 1 file changed, 198 insertions(+) create mode 100644 tests/scoring/test_nplinker_scoring.py diff --git a/tests/scoring/test_nplinker_scoring.py b/tests/scoring/test_nplinker_scoring.py new file mode 100644 index 00000000..fc0656c3 --- /dev/null +++ b/tests/scoring/test_nplinker_scoring.py @@ -0,0 +1,198 @@ +import numpy as np +import pytest +from pytest import fixture +from nplinker.nplinker import NPLinker +from nplinker.scoring import LinkCollection +from nplinker.scoring import MetcalfScoring +from nplinker.scoring import ObjectLink +from nplinker.scoring.linking import DataLinks +from nplinker.scoring.linking import LinkFinder +from .. import DATA_DIR + + +@fixture(scope='module') +def datalinks(gcfs, spectra, mfs, strains) -> DataLinks: + """DataLinks object. See `test_data_links.py` for its values.""" + return DataLinks(gcfs, spectra, mfs, strains) + + +@fixture(scope='module') +def linkfinder(datalinks) -> LinkFinder: + """LinkFinder object. See `test_link_finder.py` for its values.""" + linkfinder = LinkFinder() + linkfinder.cal_score(datalinks, link_type='spec-gcf') + linkfinder.cal_score(datalinks, link_type='mf-gcf') + return linkfinder + + +@fixture(scope='module') +def npl(gcfs, spectra, mfs, strains, tmp_path_factory) -> NPLinker: + """Constructed NPLinker object. + + This NPLinker object does not do loading `npl.load_data()`, instead we + manually set its attributes to the values we want to test. + + The config file `nplinker_demo1.toml` does not affect the tests, just + making sure the NPLinker object can be created succesfully. 
+ """ + npl = NPLinker(str(DATA_DIR / 'nplinker_demo1.toml')) + npl._gcfs = gcfs + npl._spectra = spectra + npl._molfams = mfs + npl._strains = strains + npl._gcf_lookup = {gcf.gcf_id: gcf for gcf in gcfs} + npl._mf_lookup = {mf.family_id: mf for mf in mfs} + npl._spec_lookup = {spec.spectrum_id: spec for spec in spectra} + # tmp path to store 'metcalf/metcalf_scores.pckl' file + # Must use `tmp_path_factory` (session scope) instead of `tmp_path` (function scope) + npl._loader._root = tmp_path_factory.mktemp('npl_test') + return npl + + +@fixture(scope='module') +def mc(npl) -> MetcalfScoring: + """MetcalfScoring object.""" + mc = MetcalfScoring(npl) + mc.setup(npl) + return mc + + +def test_get_links_gcf_standardised_false(npl, mc, gcfs, spectra, mfs, + strains_list): + """Test `get_links` method when input is GCF objects and `standardised` is False.""" + # test raw scores (no standardisation) + mc.standardised = False + + # when cutoff is negative infinity, i.e. taking all scores + mc.cutoff = np.NINF + links = npl.get_links(list(gcfs), mc, and_mode=True) + assert isinstance(links, LinkCollection) + links = links.links # dict of link values + assert len(links) == 3 + assert {i.gcf_id for i in links.keys()} == {'gcf1', 'gcf2', 'gcf3'} + assert isinstance(links[gcfs[0]][spectra[0]], ObjectLink) + # expected values are from `test_get_links_gcf` of test_link_finder.py + assert links[gcfs[0]][spectra[0]].data(mc) == 12 + assert links[gcfs[1]][spectra[0]].data(mc) == -9 + assert links[gcfs[2]][spectra[0]].data(mc) == 11 + assert links[gcfs[0]][mfs[0]].data(mc) == 12 + assert links[gcfs[1]][mfs[1]].data(mc) == 12 + assert links[gcfs[2]][mfs[2]].data(mc) == 21 + # expected values are from `test_get_common_strains_spec` of test_data_links.py + assert links[gcfs[0]][spectra[0]].shared_strains == [strains_list[0]] + assert links[gcfs[1]][spectra[0]].shared_strains == [] + assert links[gcfs[2]][spectra[0]].shared_strains == [strains_list[0]] + assert 
links[gcfs[0]][mfs[0]].shared_strains == [strains_list[0]] + assert links[gcfs[1]][mfs[1]].shared_strains == [strains_list[1]] + assert set(links[gcfs[2]][mfs[2]].shared_strains) == set(strains_list[0:2]) + + # when test cutoff is 0, i.e. taking scores >= 0 + mc.cutoff = 0 + links = npl.get_links(list(gcfs), mc, and_mode=True) + assert isinstance(links, LinkCollection) + links = links.links + assert {i.gcf_id for i in links.keys()} == {'gcf1', 'gcf2', 'gcf3'} + assert isinstance(links[gcfs[0]][spectra[0]], ObjectLink) + # test scores + assert links[gcfs[0]][spectra[0]].data(mc) == 12 + assert links[gcfs[1]].get(spectra[0]) is None + assert links[gcfs[2]][spectra[0]].data(mc) == 11 + assert links[gcfs[0]][mfs[0]].data(mc) == 12 + assert links[gcfs[1]][mfs[1]].data(mc) == 12 + assert links[gcfs[2]][mfs[2]].data(mc) == 21 + # test shared strains + assert links[gcfs[0]][spectra[0]].shared_strains == [strains_list[0]] + assert links[gcfs[2]][spectra[0]].shared_strains == [strains_list[0]] + assert links[gcfs[0]][mfs[0]].shared_strains == [strains_list[0]] + assert links[gcfs[1]][mfs[1]].shared_strains == [strains_list[1]] + assert set(links[gcfs[2]][mfs[2]].shared_strains) == set(strains_list[0:2]) + + +@pytest.mark.skip(reason='To add after refactoring relevant code.') +def test_get_links_gcf_standardised_true(npl, mc, gcfs, spectra, mfs, + strains_list): + """Test `get_links` method when input is GCF objects and `standardised` is True.""" + mc.standardised = True + ... 
+ + +def test_get_links_spec_standardised_false(npl, mc, gcfs, spectra, + strains_list): + """Test `get_links` method when input is Spectrum objects and `standardised` is False.""" + mc.standardised = False + + mc.cutoff = np.NINF + links = npl.get_links(list(spectra), mc, and_mode=True) + assert isinstance(links, LinkCollection) + links = links.links # dict of link values + assert len(links) == 3 + assert {i.spectrum_id + for i in links.keys()} == {'spectrum1', 'spectrum2', 'spectrum3'} + assert isinstance(links[spectra[0]][gcfs[0]], ObjectLink) + assert links[spectra[0]][gcfs[0]].data(mc) == 12 + assert links[spectra[0]][gcfs[1]].data(mc) == -9 + assert links[spectra[0]][gcfs[2]].data(mc) == 11 + assert links[spectra[0]][gcfs[0]].shared_strains == [strains_list[0]] + assert links[spectra[0]][gcfs[1]].shared_strains == [] + assert links[spectra[0]][gcfs[2]].shared_strains == [strains_list[0]] + + mc.cutoff = 0 + links = npl.get_links(list(spectra), mc, and_mode=True) + assert isinstance(links, LinkCollection) + links = links.links # dict of link values + assert len(links) == 3 + assert {i.spectrum_id + for i in links.keys()} == {'spectrum1', 'spectrum2', 'spectrum3'} + assert isinstance(links[spectra[0]][gcfs[0]], ObjectLink) + assert links[spectra[0]][gcfs[0]].data(mc) == 12 + assert links[spectra[0]].get(gcfs[1]) is None + assert links[spectra[0]][gcfs[2]].data(mc) == 11 + assert links[spectra[0]][gcfs[0]].shared_strains == [strains_list[0]] + assert links[spectra[0]][gcfs[2]].shared_strains == [strains_list[0]] + + +@pytest.mark.skip(reason='To add after refactoring relevant code.') +def test_get_links_spec_standardised_true(npl, mc, gcfs, spectra, + strains_list): + """Test `get_links` method when input is Spectrum objects and `standardised` is True.""" + mc.standardised = True + ... 
+ + +def test_get_links_mf_standardised_false(npl, mc, gcfs, mfs, strains_list): + """Test `get_links` method when input is MolecularFamily objects and `standardised` is False.""" + mc.standardised = False + + mc.cutoff = np.NINF + links = npl.get_links(list(mfs), mc, and_mode=True) + assert isinstance(links, LinkCollection) + links = links.links + assert len(links) == 3 + assert {i.family_id for i in links.keys()} == {'mf1', 'mf2', 'mf3'} + assert isinstance(links[mfs[0]][gcfs[0]], ObjectLink) + assert links[mfs[0]][gcfs[0]].data(mc) == 12 + assert links[mfs[0]][gcfs[1]].data(mc) == -9 + assert links[mfs[0]][gcfs[2]].data(mc) == 11 + assert links[mfs[0]][gcfs[0]].shared_strains == [strains_list[0]] + assert links[mfs[0]][gcfs[1]].shared_strains == [] + assert links[mfs[0]][gcfs[2]].shared_strains == [strains_list[0]] + + mc.cutoff = 0 + links = npl.get_links(list(mfs), mc, and_mode=True) + assert isinstance(links, LinkCollection) + links = links.links + assert len(links) == 3 + assert {i.family_id for i in links.keys()} == {'mf1', 'mf2', 'mf3'} + assert isinstance(links[mfs[0]][gcfs[0]], ObjectLink) + assert links[mfs[0]][gcfs[0]].data(mc) == 12 + assert links[mfs[0]].get(gcfs[1]) is None + assert links[mfs[0]][gcfs[2]].data(mc) == 11 + assert links[mfs[0]][gcfs[0]].shared_strains == [strains_list[0]] + assert links[mfs[0]][gcfs[2]].shared_strains == [strains_list[0]] + + +@pytest.mark.skip(reason='To add after refactoring relevant code.') +def test_get_links_mf_standardised_true(npl, mc, gcfs, mfs, strains_list): + """Test `get_links` method when input is MolecularFamily objects and `standardised` is True.""" + mc.standardised = True + ... 
From 60ccbdbdfb3cb18cd9e6fc37c79d76024a1f1247 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Fri, 12 May 2023 14:30:44 +0200 Subject: [PATCH 83/95] add todo comments to `NPLinker` class --- src/nplinker/nplinker.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/nplinker/nplinker.py b/src/nplinker/nplinker.py index 943cb21b..12b266e7 100644 --- a/src/nplinker/nplinker.py +++ b/src/nplinker/nplinker.py @@ -305,6 +305,7 @@ def load_data(self, new_bigscape_cutoff=None, met_only=False): logger.debug('load_data: completed') return True + # TODO CG: refactor this method and update its unit tests def get_links(self, input_objects, scoring_methods, and_mode=True): """Find links for a set of input objects (BGCs/GCFs/Spectra/MolFams) From 9a1af05994caaa4e9c4f80a138eb935316ee7283 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Fri, 12 May 2023 14:31:48 +0200 Subject: [PATCH 84/95] remove local integration tests for scoring part of `NPLinker` - rename `test_nplinker.py` to `test_nplinker_local.py` --- ...est_nplinker.py => test_nplinker_local.py} | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) rename tests/{test_nplinker.py => test_nplinker_local.py} (62%) diff --git a/tests/test_nplinker.py b/tests/test_nplinker_local.py similarity index 62% rename from tests/test_nplinker.py rename to tests/test_nplinker_local.py index 4399d24c..77640e6f 100644 --- a/tests/test_nplinker.py +++ b/tests/test_nplinker_local.py @@ -5,6 +5,11 @@ from . import DATA_DIR +# NOTE: This file only contains tests that run locally and are skipped on CI. +# Basically, only tests related to data loading should be put here. +# For tests on scoring/links, add them to `scoring/test_nplinker_scoring.py`. 
+ + @pytest.fixture(scope='module') def npl() -> NPLinker: npl = NPLinker(str(DATA_DIR / 'nplinker_demo1.toml')) @@ -23,17 +28,3 @@ def test_load_data(npl: NPLinker): assert len(npl.gcfs) == 113 assert len(npl.spectra) == 25935 assert len(npl.molfams) == 25769 - - -@pytest.mark.skipif(os.environ.get('CI') == 'true', - reason="Skip when running on CI") -def test_get_links(npl: NPLinker): - mc = npl.scoring_method('metcalf') - mc.cutoff = 3.5 - mc.standardised = True - - actual = npl.get_links(npl.gcfs, mc, and_mode=True) - assert len(actual) == len(actual.sources) == len(actual.links) == 101 - - actual.filter_links(lambda link: link[mc] > 5.0) - assert len(actual.links) == 60 From b0e44e12109d57390c92aa3a10da2337bd0c2496 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Fri, 12 May 2023 14:38:27 +0200 Subject: [PATCH 85/95] remove unused imports --- src/nplinker/strains.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/nplinker/strains.py b/src/nplinker/strains.py index 54338027..7b07a588 100644 --- a/src/nplinker/strains.py +++ b/src/nplinker/strains.py @@ -1,6 +1,5 @@ from __future__ import annotations from .logconfig import LogConfig -from typing import Iterator logger = LogConfig.getLogger(__name__) From 6ee04e93cfe9b44a1901c2a6aecfc1cecce1653e Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Fri, 12 May 2023 16:24:53 +0200 Subject: [PATCH 86/95] Fix mypy warnings as much as possible --- src/nplinker/genomics/bgc.py | 3 ++- src/nplinker/genomics/genomics.py | 3 ++- src/nplinker/metabolomics/molecular_family.py | 2 +- src/nplinker/scoring/linking/link_finder.py | 2 ++ src/nplinker/scoring/metcalf_scoring.py | 12 +++++++++++- src/nplinker/scoring/methods.py | 6 +++++- 6 files changed, 23 insertions(+), 5 deletions(-) diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py index 7eb7921a..668d159c 100644 --- a/src/nplinker/genomics/bgc.py +++ b/src/nplinker/genomics/bgc.py @@ -4,6 +4,7 @@ from nplinker.logconfig import LogConfig from 
.aa_pred import predict_aa + if TYPE_CHECKING: from ..strains import Strain from .gcf import GCF @@ -121,7 +122,7 @@ def strain(self, strain: Strain) -> None: self._strain = strain @property - def bigscape_classes(self) -> set[str]: + def bigscape_classes(self) -> set[str | None]: """Get BiG-SCAPE's BGC classes. BiG-SCAPE's BGC classes are similar to those defined in MiBIG but have diff --git a/src/nplinker/genomics/genomics.py b/src/nplinker/genomics/genomics.py index c4e694b0..2ed1ab62 100644 --- a/src/nplinker/genomics/genomics.py +++ b/src/nplinker/genomics/genomics.py @@ -245,7 +245,8 @@ def _filter_gcfs( for bgc in bgcs_to_remove: bgcs.remove(bgc) - strains.remove(bgc.strain) + if bgc.strain is not None: + strains.remove(bgc.strain) logger.info( 'Remove GCFs that has only MIBiG BGCs: removing {} GCFs and {} BGCs'. diff --git a/src/nplinker/metabolomics/molecular_family.py b/src/nplinker/metabolomics/molecular_family.py index 57cd686a..c4e584c5 100644 --- a/src/nplinker/metabolomics/molecular_family.py +++ b/src/nplinker/metabolomics/molecular_family.py @@ -55,7 +55,7 @@ def __str__(self) -> str: return 'MolFam(family_id={}, spectra={})'.format( self.family_id, len(self.spectra)) - def __eq__(self, other: Self) -> bool: + def __eq__(self, other) -> bool: if isinstance(other, MolecularFamily): return (self.id == other.id and self.family_id == other.family_id) diff --git a/src/nplinker/scoring/linking/link_finder.py b/src/nplinker/scoring/linking/link_finder.py index 39bbb555..9b194493 100644 --- a/src/nplinker/scoring/linking/link_finder.py +++ b/src/nplinker/scoring/linking/link_finder.py @@ -154,6 +154,8 @@ def get_links(self, links = [] if obj_type == 'gcf': + # TODO CG: the hint and mypy warnings will be gone after renaming all + # string ids to `.id` obj_ids = [gcf.gcf_id for gcf in objects] # spec-gcf scores = self.raw_score_spec_gcf.loc[:, obj_ids] diff --git a/src/nplinker/scoring/metcalf_scoring.py b/src/nplinker/scoring/metcalf_scoring.py index 
586f1595..4bf53177 100644 --- a/src/nplinker/scoring/metcalf_scoring.py +++ b/src/nplinker/scoring/metcalf_scoring.py @@ -181,7 +181,9 @@ def get_links(self, *objects: tuple[GCF, ...] | tuple[Spectrum, ...] scores_list = self._cal_standardised_score_met( self.LINKFINDER, scores_list) - link_scores = {} + link_scores: dict[GCF | Spectrum | MolecularFamily, + dict[GCF | Spectrum | MolecularFamily, + ObjectLink]] = {} if obj_type == 'gcf': logger.debug( f'MetcalfScoring: input_type=GCF, result_type=Spec/MolFam, ' @@ -243,6 +245,10 @@ def get_links(self, *objects: tuple[GCF, ...] | tuple[Spectrum, ...] def _cal_standardised_score_met(self, linkfinder: LinkFinder, results: list) -> list[pd.DataFrame]: + if linkfinder.metcalf_mean is None or linkfinder.metcalf_std is None: + raise ValueError( + 'Metcalf mean and std not found. Have you called `MetcalfScoring.setup(npl)`?' + ) logger.debug('Calculating standardised Metcalf scores (met input)') raw_score = results[0] z_scores = [] @@ -275,6 +281,10 @@ def _cal_standardised_score_met(self, linkfinder: LinkFinder, def _cal_standardised_score_gen(self, linkfinder: LinkFinder, results: list) -> list[pd.DataFrame]: + if linkfinder.metcalf_mean is None or linkfinder.metcalf_std is None: + raise ValueError( + 'Metcalf mean and std not found. Have you called `MetcalfScoring.setup(npl)`?' + ) logger.debug('Calculating standardised Metcalf scores (gen input)') postprocessed_scores = [] for raw_score in results: diff --git a/src/nplinker/scoring/methods.py b/src/nplinker/scoring/methods.py index ff6e7f54..e42b6cdd 100644 --- a/src/nplinker/scoring/methods.py +++ b/src/nplinker/scoring/methods.py @@ -12,9 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import annotations +from typing import TYPE_CHECKING from nplinker.logconfig import LogConfig +if TYPE_CHECKING: + from . 
import LinkCollection logger = LogConfig.getLogger(__name__) @@ -35,7 +39,7 @@ def setup(npl): """Perform any one-off initialisation required (will only be called once)""" pass - def get_links(self, objects, link_collection): + def get_links(self, *objects, link_collection: LinkCollection) -> LinkCollection: """Given a set of objects, return link information""" return link_collection From 88f48b32bea8e4c23e3272f1b8c41bf33c5a49d4 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Mon, 15 May 2023 11:59:49 +0200 Subject: [PATCH 87/95] check strain existence using strain dict --- src/nplinker/strain_collection.py | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/src/nplinker/strain_collection.py b/src/nplinker/strain_collection.py index 08291fc7..42387c9b 100644 --- a/src/nplinker/strain_collection.py +++ b/src/nplinker/strain_collection.py @@ -17,7 +17,7 @@ class StrainCollection(): def __init__(self): """A collection of Strain objects.""" self._strains: list[Strain] = [] - # dict of strain name (id and alias) to strain object + # dict of strain name (id and alias) to primary strain object self._strain_dict_name: dict[str, Strain] = {} def __repr__(self) -> str: @@ -45,14 +45,10 @@ def __contains__(self, item: str | Strain) -> bool: The given strain could be a Strain object, or a strain id or alias. 
""" if isinstance(item, str): - for strain in self: - if item == strain.id or item in strain: - return True - elif isinstance(item, Strain): - return item in self._strains - else: - raise TypeError(f"Expected Strain or str, got {type(item)}") - return False + return item in self._strain_dict_name + if isinstance(item, Strain): + return item.id in self._strain_dict_name + raise TypeError(f"Expected Strain or str, got {type(item)}") def __iter__(self) -> Iterator[Strain]: return iter(self._strains) @@ -101,10 +97,8 @@ def filter(self, strain_set: set[Strain]): def lookup(self, name: str) -> Strain: """Lookup a strain by name (id or alias). - If the name is found, return the strain object; Otherwise, raise a - KeyError. - Args: + name(str): Strain name (id or alias) to lookup. Returns: @@ -113,9 +107,8 @@ def lookup(self, name: str) -> Strain: Raises: KeyError: If the strain name is not found. """ - for strain in self: - if name == strain.id or name in strain: - return strain + if name in self: + return self._strain_dict_name[name] raise KeyError(f"Strain {name} not found in strain collection.") def add_from_file(self, file: str | PathLike) -> None: From 27bcc2dfa881948c1f8a212072d19fb8fb2f2b50 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 20 Jun 2023 14:22:16 +0200 Subject: [PATCH 88/95] change calculate abbrevation from "cal" to "calc" --- src/nplinker/scoring/linking/link_finder.py | 8 +++--- src/nplinker/scoring/metcalf_scoring.py | 12 ++++----- tests/scoring/test_link_finder.py | 30 ++++++++++----------- tests/scoring/test_metcalf_scoring.py | 4 +-- tests/scoring/test_nplinker_scoring.py | 4 +-- 5 files changed, 29 insertions(+), 29 deletions(-) diff --git a/src/nplinker/scoring/linking/link_finder.py b/src/nplinker/scoring/linking/link_finder.py index 9b194493..39300f58 100644 --- a/src/nplinker/scoring/linking/link_finder.py +++ b/src/nplinker/scoring/linking/link_finder.py @@ -38,8 +38,8 @@ def __init__(self) -> None: self.metcalf_mean = None 
self.metcalf_std = None - # TODO CG: cal_score method could be integrated to __init__? - def cal_score( + # TODO CG: calc_score method could be integrated to __init__? + def calc_score( self, data_links: DataLinks, link_type: str = 'spec-gcf', @@ -79,11 +79,11 @@ def cal_score( # TODO CG: this part should be moved outside of this method n_strains = data_links.occurrence_gcf_strain.shape[1] if self.metcalf_mean is None or self.metcalf_std is None: - self.metcalf_mean, self.metcalf_std = self._cal_mean_std( + self.metcalf_mean, self.metcalf_std = self._calc_mean_std( n_strains, scoring_weights) # TODO CG: read paper and check the logics of this method - def _cal_mean_std( + def _calc_mean_std( self, n_strains: int, scoring_weights: tuple[int, int, int, int] ) -> tuple[np.ndarray, np.ndarray]: sz = (n_strains + 1, n_strains + 1) diff --git a/src/nplinker/scoring/metcalf_scoring.py b/src/nplinker/scoring/metcalf_scoring.py index 4bf53177..c291a511 100644 --- a/src/nplinker/scoring/metcalf_scoring.py +++ b/src/nplinker/scoring/metcalf_scoring.py @@ -107,9 +107,9 @@ def setup(npl: NPLinker): MetcalfScoring.DATALINKS = DataLinks(npl.gcfs, npl.spectra, npl.molfams, npl.strains) MetcalfScoring.LINKFINDER = LinkFinder() - MetcalfScoring.LINKFINDER.cal_score(MetcalfScoring.DATALINKS, + MetcalfScoring.LINKFINDER.calc_score(MetcalfScoring.DATALINKS, link_type=LINK_TYPES[0]) - MetcalfScoring.LINKFINDER.cal_score(MetcalfScoring.DATALINKS, + MetcalfScoring.LINKFINDER.calc_score(MetcalfScoring.DATALINKS, link_type=LINK_TYPES[1]) logger.debug('MetcalfScoring.setup caching results') save_pickled_data((dataset_counts, MetcalfScoring.DATALINKS, @@ -175,10 +175,10 @@ def get_links(self, *objects: tuple[GCF, ...] | tuple[Spectrum, ...] 
scores_list = self.LINKFINDER.get_links(*objects, score_cutoff=np.NINF) if obj_type == 'gcf': - scores_list = self._cal_standardised_score_gen( + scores_list = self._calc_standardised_score_gen( self.LINKFINDER, scores_list) else: - scores_list = self._cal_standardised_score_met( + scores_list = self._calc_standardised_score_met( self.LINKFINDER, scores_list) link_scores: dict[GCF | Spectrum | MolecularFamily, @@ -243,7 +243,7 @@ def get_links(self, *objects: tuple[GCF, ...] | tuple[Spectrum, ...] logger.debug('MetcalfScoring: completed') return link_collection - def _cal_standardised_score_met(self, linkfinder: LinkFinder, + def _calc_standardised_score_met(self, linkfinder: LinkFinder, results: list) -> list[pd.DataFrame]: if linkfinder.metcalf_mean is None or linkfinder.metcalf_std is None: raise ValueError( @@ -279,7 +279,7 @@ def _cal_standardised_score_met(self, linkfinder: LinkFinder, return [scores_df] - def _cal_standardised_score_gen(self, linkfinder: LinkFinder, + def _calc_standardised_score_gen(self, linkfinder: LinkFinder, results: list) -> list[pd.DataFrame]: if linkfinder.metcalf_mean is None or linkfinder.metcalf_std is None: raise ValueError( diff --git a/tests/scoring/test_link_finder.py b/tests/scoring/test_link_finder.py index 3aa2e4e9..94db051f 100644 --- a/tests/scoring/test_link_finder.py +++ b/tests/scoring/test_link_finder.py @@ -25,21 +25,21 @@ def test_init(linkfinder): assert linkfinder.metcalf_std is None -def test_cal_score_raw_score(linkfinder, datalinks): - """Test `cal_score` method for `raw_score_spec_gcf` and `raw_score_mf_gcf`. +def test_calc_score_raw_score(linkfinder, datalinks): + """Test `calc_score` method for `raw_score_spec_gcf` and `raw_score_mf_gcf`. The expected values are calculated manually by using values from `test_init` of `test_data_links.py` and the default scoring weights. 
""" # link type = 'spec-gcf' - linkfinder.cal_score(datalinks, link_type='spec-gcf') + linkfinder.calc_score(datalinks, link_type='spec-gcf') assert_frame_equal( linkfinder.raw_score_spec_gcf, pd.DataFrame([[12, -9, 11], [-9, 12, 11], [1, 1, 21]], index=['spectrum1', 'spectrum2', 'spectrum3'], columns=['gcf1', 'gcf2', 'gcf3'])) # link type = 'mf-gcf' - linkfinder.cal_score(datalinks, link_type='mf-gcf') + linkfinder.calc_score(datalinks, link_type='mf-gcf') assert_frame_equal( linkfinder.raw_score_mf_gcf, pd.DataFrame([[12, -9, 11], [-9, 12, 11], [1, 1, 21]], @@ -47,28 +47,28 @@ def test_cal_score_raw_score(linkfinder, datalinks): columns=['gcf1', 'gcf2', 'gcf3'])) -def test_cal_score_mean_std(linkfinder, datalinks): - """Test `cal_score` method for `metcalf_mean` and `metcalf_std`.""" - linkfinder.cal_score(datalinks, link_type='spec-gcf') +def test_calc_score_mean_std(linkfinder, datalinks): + """Test `calc_score` method for `metcalf_mean` and `metcalf_std`.""" + linkfinder.calc_score(datalinks, link_type='spec-gcf') assert isinstance(linkfinder.metcalf_mean, np.ndarray) assert isinstance(linkfinder.metcalf_std, np.ndarray) assert linkfinder.metcalf_mean.shape == (4, 4 ) # (n_strains+1 , n_strains+1) assert linkfinder.metcalf_mean.shape == (4, 4) - # TODO CG: add tests for values after refactoring _cal_mean_std method + # TODO CG: add tests for values after refactoring _calc_mean_std method # assert linkfinder.metcalf_mean == expected_array def test_get_links_gcf(linkfinder, datalinks, gcfs): """Test `get_links` method for input GCF objects.""" - linkfinder.cal_score(datalinks, link_type='spec-gcf') - linkfinder.cal_score(datalinks, link_type='mf-gcf') + linkfinder.calc_score(datalinks, link_type='spec-gcf') + linkfinder.calc_score(datalinks, link_type='mf-gcf') index_names = ['source', 'target', 'score'] # cutoff = negative infinity (float) links = linkfinder.get_links(*gcfs, score_cutoff=np.NINF) assert len(links) == 2 - # expected values got from 
`test_cal_score_raw_score` + # expected values got from `test_calc_score_raw_score` assert_frame_equal( links[0], pd.DataFrame([['gcf1', 'gcf2', 'gcf3'] * 3, @@ -113,8 +113,8 @@ def test_get_links_gcf(linkfinder, datalinks, gcfs): def test_get_links_spec(linkfinder, datalinks, spectra): """Test `get_links` method for input Spectrum objects.""" - linkfinder.cal_score(datalinks, link_type='spec-gcf') - linkfinder.cal_score(datalinks, link_type='mf-gcf') + linkfinder.calc_score(datalinks, link_type='spec-gcf') + linkfinder.calc_score(datalinks, link_type='mf-gcf') index_names = ['source', 'target', 'score'] # cutoff = negative infinity (float) links = linkfinder.get_links(*spectra, score_cutoff=np.NINF) @@ -142,8 +142,8 @@ def test_get_links_spec(linkfinder, datalinks, spectra): def test_get_links_mf(linkfinder, datalinks, mfs): """Test `get_links` method for input MolecularFamily objects.""" - linkfinder.cal_score(datalinks, link_type='spec-gcf') - linkfinder.cal_score(datalinks, link_type='mf-gcf') + linkfinder.calc_score(datalinks, link_type='spec-gcf') + linkfinder.calc_score(datalinks, link_type='mf-gcf') index_names = ['source', 'target', 'score'] # cutoff = negative infinity (float) links = linkfinder.get_links(*mfs, score_cutoff=np.NINF) diff --git a/tests/scoring/test_metcalf_scoring.py b/tests/scoring/test_metcalf_scoring.py index e9398e5e..e5643c00 100644 --- a/tests/scoring/test_metcalf_scoring.py +++ b/tests/scoring/test_metcalf_scoring.py @@ -22,8 +22,8 @@ def datalinks(gcfs, spectra, mfs, strains) -> DataLinks: def linkfinder(datalinks) -> LinkFinder: """LinkFinder object. 
See `test_link_finder.py` for its values.""" linkfinder = LinkFinder() - linkfinder.cal_score(datalinks, link_type='spec-gcf') - linkfinder.cal_score(datalinks, link_type='mf-gcf') + linkfinder.calc_score(datalinks, link_type='spec-gcf') + linkfinder.calc_score(datalinks, link_type='mf-gcf') return linkfinder diff --git a/tests/scoring/test_nplinker_scoring.py b/tests/scoring/test_nplinker_scoring.py index fc0656c3..c3821493 100644 --- a/tests/scoring/test_nplinker_scoring.py +++ b/tests/scoring/test_nplinker_scoring.py @@ -20,8 +20,8 @@ def datalinks(gcfs, spectra, mfs, strains) -> DataLinks: def linkfinder(datalinks) -> LinkFinder: """LinkFinder object. See `test_link_finder.py` for its values.""" linkfinder = LinkFinder() - linkfinder.cal_score(datalinks, link_type='spec-gcf') - linkfinder.cal_score(datalinks, link_type='mf-gcf') + linkfinder.calc_score(datalinks, link_type='spec-gcf') + linkfinder.calc_score(datalinks, link_type='mf-gcf') return linkfinder From ab5d02593aa7bf1cc388784e4604a5bd44239adf Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 20 Jun 2023 14:36:47 +0200 Subject: [PATCH 89/95] remove resolved TODO comment --- src/nplinker/scoring/linking/link_finder.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/nplinker/scoring/linking/link_finder.py b/src/nplinker/scoring/linking/link_finder.py index 39300f58..7a400bd9 100644 --- a/src/nplinker/scoring/linking/link_finder.py +++ b/src/nplinker/scoring/linking/link_finder.py @@ -111,9 +111,6 @@ def _calc_mean_std( variance[n, m] = expected_sq return mean, np.sqrt(variance) - # TODO CG: the data type of returned should be improved for faster - # processing. Maybe using dict instead of pd.DataFrame? - # like that in the get_links method of the MetcalfScoring class def get_links(self, *objects: tuple[GCF, ...] | tuple[Spectrum, ...] 
| tuple[MolecularFamily, ...], From e4a0b3ead8fc415bb3ae41a3b2e45af58732355f Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 20 Jun 2023 15:07:02 +0200 Subject: [PATCH 90/95] move shared fixtures to conftest.py --- tests/scoring/conftest.py | 52 +++++++++++++++++++++++++ tests/scoring/test_data_links.py | 7 ---- tests/scoring/test_link_finder.py | 7 ---- tests/scoring/test_metcalf_scoring.py | 50 ------------------------ tests/scoring/test_nplinker_scoring.py | 53 -------------------------- 5 files changed, 52 insertions(+), 117 deletions(-) diff --git a/tests/scoring/conftest.py b/tests/scoring/conftest.py index 73606506..b64a5694 100644 --- a/tests/scoring/conftest.py +++ b/tests/scoring/conftest.py @@ -2,8 +2,13 @@ from nplinker.genomics import GCF from nplinker.metabolomics.molecular_family import MolecularFamily from nplinker.metabolomics.spectrum import Spectrum +from nplinker.nplinker import NPLinker +from nplinker.scoring import MetcalfScoring +from nplinker.scoring.linking import DataLinks +from nplinker.scoring.linking import LinkFinder from nplinker.strain_collection import StrainCollection from nplinker.strains import Strain +from .. import DATA_DIR @fixture(scope='session') @@ -55,3 +60,50 @@ def mfs(spectra) -> tuple[MolecularFamily, MolecularFamily, MolecularFamily]: mf3 = MolecularFamily('mf3') mf3.add_spectrum(spectra[2]) return mf1, mf2, mf3 + + +@fixture(scope='module') +def datalinks(gcfs, spectra, mfs, strains) -> DataLinks: + """DataLinks object. See `test_data_links.py` for its values.""" + return DataLinks(gcfs, spectra, mfs, strains) + + +@fixture(scope='module') +def linkfinder(datalinks) -> LinkFinder: + """LinkFinder object. 
See `test_link_finder.py` for its values.""" + linkfinder = LinkFinder() + linkfinder.calc_score(datalinks, link_type='spec-gcf') + linkfinder.calc_score(datalinks, link_type='mf-gcf') + return linkfinder + + +@fixture(scope='module') +def npl(gcfs, spectra, mfs, strains, tmp_path_factory) -> NPLinker: + """Constructed NPLinker object. + + This NPLinker object does not do loading `npl.load_data()`, instead we + manually set its attributes to the values we want to test. + + The config file `nplinker_demo1.toml` does not affect the tests, just + making sure the NPLinker object can be created successfully. + """ + npl = NPLinker(str(DATA_DIR / 'nplinker_demo1.toml')) + npl._gcfs = gcfs + npl._spectra = spectra + npl._molfams = mfs + npl._strains = strains + npl._gcf_lookup = {gcf.gcf_id: gcf for gcf in gcfs} + npl._mf_lookup = {mf.family_id: mf for mf in mfs} + npl._spec_lookup = {spec.spectrum_id: spec for spec in spectra} + # tmp path to store 'metcalf/metcalf_scores.pckl' file + # Must use `tmp_path_factory` (session scope) instead of `tmp_path` (function scope) + npl._loader._root = tmp_path_factory.mktemp('npl_test') + return npl + + +@fixture(scope='module') +def mc(npl) -> MetcalfScoring: + """MetcalfScoring object.""" + mc = MetcalfScoring(npl) + mc.setup(npl) + return mc diff --git a/tests/scoring/test_data_links.py b/tests/scoring/test_data_links.py index c652ea2c..22a9c21a 100644 --- a/tests/scoring/test_data_links.py +++ b/tests/scoring/test_data_links.py @@ -1,14 +1,7 @@ import pandas as pd from pandas.util.testing import assert_frame_equal import pytest -from pytest import fixture from nplinker.metabolomics.singleton_family import SingletonFamily -from nplinker.scoring.linking import DataLinks - - -@fixture(scope='module') -def datalinks(gcfs, spectra, mfs, strains) -> DataLinks: - return DataLinks(gcfs, spectra, mfs, strains) def test_init(datalinks): diff --git a/tests/scoring/test_link_finder.py b/tests/scoring/test_link_finder.py index 
94db051f..30d84980 100644 --- a/tests/scoring/test_link_finder.py +++ b/tests/scoring/test_link_finder.py @@ -3,7 +3,6 @@ from pandas.util.testing import assert_frame_equal import pytest from pytest import fixture -from nplinker.scoring.linking import DataLinks from nplinker.scoring.linking import LinkFinder @@ -12,12 +11,6 @@ def linkfinder() -> LinkFinder: return LinkFinder() -@fixture(scope='module') -def datalinks(gcfs, spectra, mfs, strains): - """DataLinks object. See `test_data_links.py` for its actual values.""" - return DataLinks(gcfs, spectra, mfs, strains) - - def test_init(linkfinder): assert_frame_equal(linkfinder.raw_score_spec_gcf, pd.DataFrame()) assert_frame_equal(linkfinder.raw_score_mf_gcf, pd.DataFrame()) diff --git a/tests/scoring/test_metcalf_scoring.py b/tests/scoring/test_metcalf_scoring.py index e5643c00..471c073c 100644 --- a/tests/scoring/test_metcalf_scoring.py +++ b/tests/scoring/test_metcalf_scoring.py @@ -2,61 +2,11 @@ from numpy.testing import assert_array_equal from pandas.util.testing import assert_frame_equal import pytest -from pytest import fixture -from nplinker.nplinker import NPLinker from nplinker.scoring import LinkCollection from nplinker.scoring import MetcalfScoring from nplinker.scoring import ObjectLink from nplinker.scoring.linking import DataLinks from nplinker.scoring.linking import LinkFinder -from .. import DATA_DIR - - -@fixture(scope='module') -def datalinks(gcfs, spectra, mfs, strains) -> DataLinks: - """DataLinks object. See `test_data_links.py` for its values.""" - return DataLinks(gcfs, spectra, mfs, strains) - - -@fixture(scope='module') -def linkfinder(datalinks) -> LinkFinder: - """LinkFinder object. 
See `test_link_finder.py` for its values.""" - linkfinder = LinkFinder() - linkfinder.calc_score(datalinks, link_type='spec-gcf') - linkfinder.calc_score(datalinks, link_type='mf-gcf') - return linkfinder - - -@fixture(scope='module') -def npl(gcfs, spectra, mfs, strains, tmp_path_factory) -> NPLinker: - """Constructed NPLinker object. - - This NPLinker object does not do loading `npl.load_data()`, instead we - manually set its attributes to the values we want to test. - - The config file `nplinker_demo1.toml` does not affect the tests, just - making sure the NPLinker object can be created succesfully. - """ - npl = NPLinker(str(DATA_DIR / 'nplinker_demo1.toml')) - npl._gcfs = gcfs - npl._spectra = spectra - npl._molfams = mfs - npl._strains = strains - npl._gcf_lookup = {gcf.gcf_id: gcf for gcf in gcfs} - npl._mf_lookup = {mf.family_id: mf for mf in mfs} - npl._spec_lookup = {spec.spectrum_id: spec for spec in spectra} - # tmp path to store 'metcalf/metcalf_scores.pckl' file - # Must use `tmp_path_factory` (session scope) instead of `tmp_path` (function scope) - npl._loader._root = tmp_path_factory.mktemp('npl_test') - return npl - - -@fixture(scope='module') -def mc(npl) -> MetcalfScoring: - """MetcalfScoring object.""" - mc = MetcalfScoring(npl) - mc.setup(npl) - return mc def test_init(npl): diff --git a/tests/scoring/test_nplinker_scoring.py b/tests/scoring/test_nplinker_scoring.py index c3821493..77743dbb 100644 --- a/tests/scoring/test_nplinker_scoring.py +++ b/tests/scoring/test_nplinker_scoring.py @@ -1,60 +1,7 @@ import numpy as np import pytest -from pytest import fixture -from nplinker.nplinker import NPLinker from nplinker.scoring import LinkCollection -from nplinker.scoring import MetcalfScoring from nplinker.scoring import ObjectLink -from nplinker.scoring.linking import DataLinks -from nplinker.scoring.linking import LinkFinder -from .. 
import DATA_DIR - - -@fixture(scope='module') -def datalinks(gcfs, spectra, mfs, strains) -> DataLinks: - """DataLinks object. See `test_data_links.py` for its values.""" - return DataLinks(gcfs, spectra, mfs, strains) - - -@fixture(scope='module') -def linkfinder(datalinks) -> LinkFinder: - """LinkFinder object. See `test_link_finder.py` for its values.""" - linkfinder = LinkFinder() - linkfinder.calc_score(datalinks, link_type='spec-gcf') - linkfinder.calc_score(datalinks, link_type='mf-gcf') - return linkfinder - - -@fixture(scope='module') -def npl(gcfs, spectra, mfs, strains, tmp_path_factory) -> NPLinker: - """Constructed NPLinker object. - - This NPLinker object does not do loading `npl.load_data()`, instead we - manually set its attributes to the values we want to test. - - The config file `nplinker_demo1.toml` does not affect the tests, just - making sure the NPLinker object can be created succesfully. - """ - npl = NPLinker(str(DATA_DIR / 'nplinker_demo1.toml')) - npl._gcfs = gcfs - npl._spectra = spectra - npl._molfams = mfs - npl._strains = strains - npl._gcf_lookup = {gcf.gcf_id: gcf for gcf in gcfs} - npl._mf_lookup = {mf.family_id: mf for mf in mfs} - npl._spec_lookup = {spec.spectrum_id: spec for spec in spectra} - # tmp path to store 'metcalf/metcalf_scores.pckl' file - # Must use `tmp_path_factory` (session scope) instead of `tmp_path` (function scope) - npl._loader._root = tmp_path_factory.mktemp('npl_test') - return npl - - -@fixture(scope='module') -def mc(npl) -> MetcalfScoring: - """MetcalfScoring object.""" - mc = MetcalfScoring(npl) - mc.setup(npl) - return mc def test_get_links_gcf_standardised_false(npl, mc, gcfs, spectra, mfs, From ade54a36e3d00627edef533344fae2d99efbdd47 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 21 Jun 2023 09:05:21 +0200 Subject: [PATCH 91/95] remove unnecessary type hints --- src/nplinker/nplinker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nplinker/nplinker.py 
b/src/nplinker/nplinker.py index 12b266e7..6e58981f 100644 --- a/src/nplinker/nplinker.py +++ b/src/nplinker/nplinker.py @@ -283,7 +283,7 @@ def load_data(self, new_bigscape_cutoff=None, met_only=False): self._bgcs = self._loader.bgcs self._gcfs = self._loader.gcfs self._mibig_bgc_dict = self._loader.mibig_bgc_dict - self._strains: StrainCollection = self._loader.strains + self._strains = self._loader.strains self._product_types = self._loader.product_types self._chem_classes = self._loader.chem_classes self._class_matches = self._loader.class_matches From 78aa794dcb072378cc1966cd4ba6544a81f138c7 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 21 Jun 2023 09:56:44 +0200 Subject: [PATCH 92/95] update docstrings for cooccurrences --- src/nplinker/scoring/linking/data_links.py | 26 +++++++++++++++------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/src/nplinker/scoring/linking/data_links.py b/src/nplinker/scoring/linking/data_links.py index 53a62ba1..30e7c1b8 100644 --- a/src/nplinker/scoring/linking/data_links.py +++ b/src/nplinker/scoring/linking/data_links.py @@ -10,6 +10,7 @@ from .utils import calc_correlation_matrix from .utils import isinstance_all + if TYPE_CHECKING: from nplinker.strain_collection import StrainCollection from nplinker.strains import Strain @@ -42,21 +43,30 @@ def __init__(self, gcfs: Sequence[GCF], spectra: Sequence[Spectrum], occurrence_mf_strain(pd.DataFrame): A DataFrame to store occurrence of molecular families with respect to strains. cooccurrence_spec_gcf(pd.DataFrame): A DataFrame to store co-occurrence - of spectra<->gcfs. + of the presence of spectra and the presence of gcfs with respect + to strains. cooccurrence_spec_notgcf(pd.DataFrame): A DataFrame to store co-occurrence - of spectra<->not gcfs. + of the presence of spectra and the absence of gcfs with respect + to strains. "notgcf" means the absence of gcfs. 
cooccurrence_notspec_gcf(pd.DataFrame): A DataFrame to store co-occurrence - of not spectra<->gcfs. + of the absence of spectra and the presence of gcfs with respect + to strains. "notspec" means the absence of spectra. cooccurrence_notspec_notgcf(pd.DataFrame): A DataFrame to store co-occurrence - of not spectra<->not gcfs. + of the absence of spectra and the absence of gcfs with respect + to strains. cooccurrence_mf_gcf(pd.DataFrame): A DataFrame to store co-occurrence - of molecular families<->gcfs. + of the presence of molecular families and the presence of gcfs + with respect to strains. cooccurrence_mf_notgcf(pd.DataFrame): A DataFrame to store co-occurrence - of molecular families<->not gcfs. + of the presence of molecular families and the absence of gcfs + with respect to strains. "notgcf" means the absence of gcfs. cooccurrence_notmf_gcf(pd.DataFrame): A DataFrame to store co-occurrence - of not molecular families<->gcfs. + of the absence of molecular families and the presence of gcfs + with respect to strains. "notmf" means the absence of molecular + families. cooccurrence_notmf_notgcf(pd.DataFrame): A DataFrame to store co-occurrence - of not molecular families<->not gcfs. + of the absence of molecular families and the absence of gcfs + with respect to strains. 
""" self._strains = strains From 09c049f98e6d2c6d038cd87258f79ccedbe90bf0 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 20 Jun 2023 14:17:37 +0200 Subject: [PATCH 93/95] use uuid for singleton molecular families #144 --- src/nplinker/class_info/chem_classes.py | 5 +++-- .../gnps/gnps_molecular_family_loader.py | 4 ++-- src/nplinker/metabolomics/singleton_family.py | 3 ++- src/nplinker/scoring/np_class_scoring.py | 4 ++-- .../test_gnps_molecular_family_loader.py | 15 ++++++++++----- 5 files changed, 19 insertions(+), 12 deletions(-) diff --git a/src/nplinker/class_info/chem_classes.py b/src/nplinker/class_info/chem_classes.py index d2f184a3..eb0e0c99 100644 --- a/src/nplinker/class_info/chem_classes.py +++ b/src/nplinker/class_info/chem_classes.py @@ -401,8 +401,8 @@ class prediction for a level. When no class is present, instead of Tuple it will for molfam in molfams: fid = molfam.family_id # the key spectra = molfam.spectra - # if singleton family, format like '-1_spectrum-id' - if fid == '-1': + # if singleton family, format like 'fid_spectrum-id' + if fid.startswith('singleton-'): spec_id = spectra[0].spectrum_id fid += f'_{spec_id}' len_molfam = len(spectra) @@ -555,6 +555,7 @@ def _read_cf_classes(self, mne_dir): nr_nodes = line.pop(0) # todo: make it easier to query classes of singleton families # if singleton family, format like '-1_spectrum-id' like canopus results + # CG: Note that the singleton families id is "singleton-" + uuid. 
if nr_nodes == '1': component = f'-1_{cluster}' class_info = [] diff --git a/src/nplinker/metabolomics/gnps/gnps_molecular_family_loader.py b/src/nplinker/metabolomics/gnps/gnps_molecular_family_loader.py index 0a3b7659..acbe5547 100644 --- a/src/nplinker/metabolomics/gnps/gnps_molecular_family_loader.py +++ b/src/nplinker/metabolomics/gnps/gnps_molecular_family_loader.py @@ -19,9 +19,9 @@ def __init__(self, file: str | PathLike): self._families: list[MolecularFamily | SingletonFamily] = [] for family_id, spectra_ids in _load_molecular_families(file).items(): - if family_id == '-1': + if family_id == '-1': # the "-1" is from GNPS result for spectrum_id in spectra_ids: - family = SingletonFamily() + family = SingletonFamily() ## uuid as family id family.spectra_ids = set([spectrum_id]) self._families.append(family) else: diff --git a/src/nplinker/metabolomics/singleton_family.py b/src/nplinker/metabolomics/singleton_family.py index 36f03e49..94457049 100644 --- a/src/nplinker/metabolomics/singleton_family.py +++ b/src/nplinker/metabolomics/singleton_family.py @@ -1,10 +1,11 @@ +import uuid from .molecular_family import MolecularFamily class SingletonFamily(MolecularFamily): def __init__(self): - super().__init__('-1') + super().__init__("singleton-" + str(uuid.uuid4())) def __str__(self): return f"Singleton molecular family (id={self.id})" diff --git a/src/nplinker/scoring/np_class_scoring.py b/src/nplinker/scoring/np_class_scoring.py index 79d1a71f..84975344 100644 --- a/src/nplinker/scoring/np_class_scoring.py +++ b/src/nplinker/scoring/np_class_scoring.py @@ -271,7 +271,7 @@ def _get_met_classes(self, spec_like, method='mix'): spectra_classes_names_inds else: # molfam fam_id = spec_like.family_id - if fam_id == '-1': # account for singleton families + if fam_id.startswith("singleton-"): # account for singleton families fam_id += f'_{spec_like.spectra[0].spectrum_id}' all_classes = self.npl.chem_classes.canopus.molfam_classes.get( fam_id) @@ -289,7 +289,7 @@ def 
_get_met_classes(self, spec_like, method='mix'): spectra_classes(spec_like.spectrum_id) else: # molfam fam_id = spec_like.family_id - if fam_id == '-1': # account for singleton families + if fam_id.startswith("singleton"): # account for singleton families fam_id += f'_{spec_like.spectra[0].spectrum_id}' spec_like_classes = self.npl.chem_classes.molnetenhancer. \ molfam_classes.get(fam_id) diff --git a/tests/metabolomics/test_gnps_molecular_family_loader.py b/tests/metabolomics/test_gnps_molecular_family_loader.py index 1174e28b..6e9cd531 100644 --- a/tests/metabolomics/test_gnps_molecular_family_loader.py +++ b/tests/metabolomics/test_gnps_molecular_family_loader.py @@ -12,17 +12,22 @@ def test_has_molecular_families(filename): sut = GNPSMolecularFamilyLoader(filename) actual = sut.families() assert len(actual) == 25769 - mf_ids = [mf.family_id for mf in actual[:30]] - assert mf_ids == [ + + assert [mf.family_id for mf in actual[:29]] == [ '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '20', '21', '22', '23', '24', '26', '28', - '30', '31', '32', '33', '-1' + '30', '31', '32', '33' ] - num_spec_ids = [len(mf.spectra_ids) for mf in actual[:30]] - assert num_spec_ids == [ + for i in range(30, 25769): + assert actual[i].family_id.startswith('singleton-') + + assert [len(mf.spectra_ids) for mf in actual[:30]] == [ 19, 48, 3, 3, 11, 4, 9, 3, 15, 3, 5, 2, 3, 3, 5, 3, 14, 4, 2, 2, 12, 2, 3, 5, 2, 4, 2, 2, 2, 1 ] + for i in range(30, 25769): + assert len(actual[i].spectra_ids) == 1 + assert actual[0].spectra_ids == set( ('13170', '13662', '15316', '15364', '16341', '17201', '17270', '18120', '18172', '18748', '18831', '19005', '19673', '19719', From 1ba71062464b9383f70275c877b3812711b6bdc9 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 21 Jun 2023 12:02:55 +0200 Subject: [PATCH 94/95] add TODO comment for GNPSLoader --- src/nplinker/loader.py | 1 + src/nplinker/metabolomics/spectrum.py | 1 + 2 files changed, 2 
insertions(+) diff --git a/src/nplinker/loader.py b/src/nplinker/loader.py index 0c1e0172..b25952c4 100644 --- a/src/nplinker/loader.py +++ b/src/nplinker/loader.py @@ -674,6 +674,7 @@ def _load_genomics(self): return True + # TODO CG: replace deprecated load_dataset with GPNSLoader def _load_metabolomics(self): spec_dict, self.spectra, self.molfams, unknown_strains = load_dataset( self.strains, diff --git a/src/nplinker/metabolomics/spectrum.py b/src/nplinker/metabolomics/spectrum.py index 0002843d..12663871 100644 --- a/src/nplinker/metabolomics/spectrum.py +++ b/src/nplinker/metabolomics/spectrum.py @@ -47,6 +47,7 @@ def __init__(self, # this is a dict indexed by Strain objects (the strains found in this Spectrum), with # the values being dicts of the form {growth_medium: peak intensity} for the parent strain self.growth_media = {} + # TODO CG: self.family_id should be removed, used in deprecated make_families method self.family_id = '-1' self.family = None # a dict indexed by filename, or "gnps" From 0f1352a0881297ec04211e2cbd309c37ff101248 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Mon, 3 Jul 2023 09:21:59 +0200 Subject: [PATCH 95/95] update type hints for `*args` parameter --- src/nplinker/scoring/metcalf_scoring.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/nplinker/scoring/metcalf_scoring.py b/src/nplinker/scoring/metcalf_scoring.py index c291a511..53e6f062 100644 --- a/src/nplinker/scoring/metcalf_scoring.py +++ b/src/nplinker/scoring/metcalf_scoring.py @@ -122,8 +122,7 @@ def setup(npl: NPLinker): def datalinks(self) -> DataLinks: return MetcalfScoring.DATALINKS - def get_links(self, *objects: tuple[GCF, ...] | tuple[Spectrum, ...] - | tuple[MolecularFamily, ...], + def get_links(self, *objects: GCF | Spectrum | MolecularFamily, link_collection: LinkCollection) -> LinkCollection: """Get links for the given objects and add them to the given LinkCollection. 
@@ -131,7 +130,7 @@ def get_links(self, *objects: tuple[GCF, ...] | tuple[Spectrum, ...] be GCF, Spectrum or MolecularFamily objects. Args: - objects(tuple): The objects to get links for. Must be GCF, Spectrum + objects: The objects to get links for. Must be GCF, Spectrum or MolecularFamily objects. link_collection: The LinkCollection object to add the links to.