NPLinker · CunliangGeng · Jul 3, 2023 · Apr 5, 2023 · Apr 5, 2023 · Apr 5, 2023
diff --git a/src/nplinker/annotations.py b/src/nplinker/annotations.py
@@ -14,10 +14,9 @@
 
 import csv
 import os
-
 from deprecated import deprecated
-
-from nplinker.metabolomics.spectrum import Spectrum, GNPS_KEY
+from nplinker.metabolomics.spectrum import GNPS_KEY
+from nplinker.metabolomics.spectrum import Spectrum
 from .logconfig import LogConfig
 
 
@@ -61,22 +60,22 @@ def create_gnps_annotation(spec: Spectrum, gnps_anno: dict):
 
 
 @deprecated(version="1.3.3", reason="Use GNPSAnnotationLoader class instead.")
-def load_annotations(root: str | os.PathLike, config: str | os.PathLike, spectra: list[Spectrum], spec_dict: dict[int, Spectrum]) -> list[Spectrum]:
+def load_annotations(root: str | os.PathLike, config: str | os.PathLike, spectra: list[Spectrum], spec_dict: dict[str, Spectrum]) -> list[Spectrum]:
     """Load the annotations from the GNPS annotation file present in root to the spectra.
 
     Args:
         root(str | os.PathLike): Path to the downloaded and extracted GNPS results.
         config(str | os.PathLike): Path to config file for custom file locations.
         spectra(list[Spectrum]): List of spectra to annotate.
-        spec_dict(dict[int, Spectrum]): Dictionary mapping to spectra passed in `spectra` variable.
+        spec_dict(dict[str, Spectrum]): Dictionary mapping to spectra passed in `spectra` variable.
 
     Raises:
         Exception: Raises exception if custom annotation config file has invalid content.
 
     Returns:
         list[Spectrum]: List of annotated spectra.
     """
-    
+
     if not os.path.exists(root):
         logger.debug(f'Annotation directory not found ({root})')
         return spectra
@@ -89,7 +88,7 @@ def load_annotations(root: str | os.PathLike, config: str | os.PathLike, spectra
 
     logger.debug('Found {} annotations .tsv files in {}'.format(
         len(annotation_files), root))
-    
+
     for af in annotation_files:
         with open(af) as f:
             rdr = csv.reader(f, delimiter='\t')
@@ -105,7 +104,7 @@ def load_annotations(root: str | os.PathLike, config: str | os.PathLike, spectra
                 # each line should be a different spec ID here
                 for line in rdr:
                     # read the scan ID column and get the corresponding Spectrum object
-                    scan_id = int(line[scans_index])
+                    scan_id = line[scans_index]
                     if scan_id not in spec_dict:
                         logger.warning(
                             'Unknown spectrum ID found in GNPS annotation file (ID={})'
@@ -147,7 +146,7 @@ def load_annotations(root: str | os.PathLike, config: str | os.PathLike, spectra
                 # note that might have multiple lines for the same spec ID!
                 spec_annotations = {}
                 for line in rdr:
-                    scan_id = int(line[spec_id_index])
+                    scan_id = line[spec_id_index]
                     if scan_id not in spec_dict:
                         logger.warning(
                             'Unknown spectrum ID found in annotation file "{}", ID is "{}"'

diff --git a/src/nplinker/class_info/chem_classes.py b/src/nplinker/class_info/chem_classes.py
@@ -12,9 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from collections import Counter
 import glob
 import os
-from collections import Counter
 from canopus import Canopus
 from canopus.classifications_to_gnps import analyse_canopus
 from ..logconfig import LogConfig
@@ -399,17 +399,17 @@ class prediction for a level. When no class is present, instead of Tuple it will
         molfam_classes = {}
 
         for molfam in molfams:
-            fid = str(molfam.family_id)  # the key
+            fid = molfam.family_id  # the key
             spectra = molfam.spectra
-            # if singleton family, format like '-1_spectrum-id'
-            if fid == '-1':
+            # if singleton family, format like 'fid_spectrum-id'
+            if fid.startswith('singleton-'):
                 spec_id = spectra[0].spectrum_id
                 fid += f'_{spec_id}'
             len_molfam = len(spectra)
 
             classes_per_spectra = []
             for spec in spectra:
-                spec_classes = self.spectra_classes.get(str(spec.spectrum_id))
+                spec_classes = self.spectra_classes.get(spec.spectrum_id)
                 if spec_classes:  # account for spectra without prediction
                     classes_per_spectra.append(spec_classes)
 
@@ -555,6 +555,7 @@ def _read_cf_classes(self, mne_dir):
                 nr_nodes = line.pop(0)
                 # todo: make it easier to query classes of singleton families
                 # if singleton family, format like '-1_spectrum-id' like canopus results
+                # CG: Note that a singleton family's id is "singleton-" + uuid.
                 if nr_nodes == '1':
                     component = f'-1_{cluster}'
                 class_info = []

diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py
@@ -4,6 +4,7 @@
 from nplinker.logconfig import LogConfig
 from .aa_pred import predict_aa
 
+
 if TYPE_CHECKING:
     from ..strains import Strain
     from .gcf import GCF
@@ -121,7 +122,7 @@ def strain(self, strain: Strain) -> None:
         self._strain = strain
 
     @property
-    def bigscape_classes(self) -> set[str]:
+    def bigscape_classes(self) -> set[str | None]:
         """Get BiG-SCAPE's BGC classes.
 
         BiG-SCAPE's BGC classes are similar to those defined in MiBIG but have

diff --git a/src/nplinker/genomics/gcf.py b/src/nplinker/genomics/gcf.py
@@ -35,7 +35,6 @@ def __init__(self, gcf_id: str, /) -> None:
         self.bigscape_class: str | None = None
         # CG TODO: remove attribute id, see issue 103
         #    https://github.com/NPLinker/nplinker/issues/103
-        self.id: int | None = None
         self.bgc_ids: set[str] = set()
         self.strains: StrainCollection = StrainCollection()
 

diff --git a/src/nplinker/genomics/genomics.py b/src/nplinker/genomics/genomics.py
@@ -245,7 +245,8 @@ def _filter_gcfs(
 
     for bgc in bgcs_to_remove:
         bgcs.remove(bgc)
-        strains.remove(bgc.strain)
+        if bgc.strain is not None:
+            strains.remove(bgc.strain)
 
     logger.info(
         'Remove GCFs that has only MIBiG BGCs: removing {} GCFs and {} BGCs'.

diff --git a/src/nplinker/loader.py b/src/nplinker/loader.py
@@ -659,12 +659,6 @@ def _load_genomics(self):
             antismash_bgc_loader.get_files(),
             self._bigscape_cutoff)
 
-        # CG TODO: remove the gcf.id, see issue 103
-        #    https://github.com/NPLinker/nplinker/issues/103
-        # This is only place to set gcf.id value.
-        for i, gcf in enumerate(self.gcfs):
-            gcf.id = i
-
         #----------------------------------------------------------------------
         # CG: write unknown strains in genomics to file
         #----------------------------------------------------------------------
@@ -680,6 +674,7 @@ def _load_genomics(self):
 
         return True
 
+    # TODO CG: replace deprecated load_dataset with GNPSLoader
     def _load_metabolomics(self):
         spec_dict, self.spectra, self.molfams, unknown_strains = load_dataset(
             self.strains,

diff --git a/src/nplinker/metabolomics/gnps/gnps_annotation_loader.py b/src/nplinker/metabolomics/gnps/gnps_annotation_loader.py
@@ -2,9 +2,9 @@
 from os import PathLike
 from pathlib import Path
 from typing import Any
-
 from nplinker.metabolomics.abc import AnnotationLoaderBase
 
+
 GNPS_URL_FORMAT = 'https://metabolomics-usi.ucsd.edu/{}/?usi=mzspec:GNPSLIBRARY:{}'
 
 class GNPSAnnotationLoader(AnnotationLoaderBase):
@@ -15,28 +15,28 @@ def __init__(self, file: str | PathLike):
             file(str | PathLike): The GNPS annotation file.
         """
         self._file = Path(file)
-        self._annotations : dict[int, dict] = dict()
+        self._annotations : dict[str, dict] = {}
 
         with open(self._file, mode='rt', encoding='UTF-8') as f:
             header = f.readline().split('\t')
             dict_reader = csv.DictReader(f, header, delimiter='\t')
             for row in dict_reader:
-                scan_id = int(row.pop('#Scan#'))
+                scan_id = row.pop('#Scan#')
                 self._annotations[scan_id] = row
-                
+
                 # also insert useful URLs
                 for t in ['png', 'json', 'svg', 'spectrum']:
                     self._annotations[scan_id][f'{t}_url'] = GNPS_URL_FORMAT.format(t, row['SpectrumID'])
 
-
 
-    def get_annotations(self) -> dict[int, dict]:
+
+    def get_annotations(self) -> dict[str, dict]:
         """Get annotations.
 
         Returns:
-            dict[int, dict]: Spectra indices are keys and values are the annotations for this spectrum.
+            dict[str, dict]: Keys are spectrum ids (the `#Scan#` column values, kept as strings) and values are the annotations for that spectrum.
 
         Examples:
             >>> print(loader.annotations()[100])
             """
-        return self._annotations
+        return self._annotations
diff --git a/src/nplinker/metabolomics/gnps/gnps_molecular_family_loader.py b/src/nplinker/metabolomics/gnps/gnps_molecular_family_loader.py
@@ -17,44 +17,44 @@ def __init__(self, file: str | PathLike):
             file(str | PathLike): str or PathLike object pointing towards the GNPS molecular families file to load.
         """
         self._families: list[MolecularFamily | SingletonFamily] = []
-        
+
         for family_id, spectra_ids in _load_molecular_families(file).items():
-            if family_id == -1:
+            if family_id == '-1': # the "-1" is from GNPS result
                 for spectrum_id in spectra_ids:
-                    family = SingletonFamily()
+                    family = SingletonFamily() ## uuid as family id
                     family.spectra_ids = set([spectrum_id])
                     self._families.append(family)
             else:
                 family = MolecularFamily(family_id)
                 family.spectra_ids = spectra_ids
                 self._families.append(family)
-    
+
     def families(self) -> list[MolecularFamily]:
         return self._families
 
 
-def _load_molecular_families(file: str | PathLike) -> dict[int, set[int]]:
+def _load_molecular_families(file: str | PathLike) -> dict[str, set[str]]:
     """Load ids of molecular families and corresponding spectra from GNPS output file.
 
     Args:
         file(str | PathLike): path to the GNPS file to load molecular families.
 
     Returns:
-        dict[int, set[int]]: Mapping from molecular family/cluster id to the spectra ids.
+        dict[str, set[str]]: Mapping from molecular family/cluster id to the spectra ids.
     """
     logger.debug('loading edges file: %s', file)
 
     families: dict = {}
-    
+
     with open(file, mode='rt', encoding='utf-8') as f:
         reader = csv.reader(f, delimiter='\t')
         headers = next(reader)
         cid1_index, cid2_index, fam_index = _sniff_column_indices(file, headers)
 
         for line in reader:
-            spec1_id = int(line[cid1_index])
-            spec2_id = int(line[cid2_index])
-            family_id = int(line[fam_index])
+            spec1_id = line[cid1_index]
+            spec2_id = line[cid2_index]
+            family_id = line[fam_index]
 
             if families.get(family_id) is None:
                 families[family_id] = set([spec1_id, spec2_id])
@@ -84,5 +84,5 @@ def _sniff_column_indices(file: str | PathLike, headers: list[str]) -> tuple[int
     except ValueError as ve:
         message = f'Unknown or missing column(s) in edges file: {file}'
         raise Exception(message) from ve
-                
+
     return cid1_index,cid2_index,fam_index
diff --git a/src/nplinker/metabolomics/gnps/gnps_spectrum_loader.py b/src/nplinker/metabolomics/gnps/gnps_spectrum_loader.py
@@ -18,15 +18,15 @@ def __init__(self, file: str | PathLike):
         ms1, ms2, metadata = LoadMGF(name_field='scans').load_spectra([str(file)])
         logger.info('%d molecules parsed from MGF file', len(ms1))
         self._spectra = _mols_to_spectra(ms2, metadata)
-    
+
     def spectra(self) -> list[Spectrum]:
         """Get the spectra loaded from the file.
 
         Returns:
             list[Spectrum]: the loaded spectra as a list of `Spectrum` objects.
         """
         return self._spectra
-    
+
 
 def _mols_to_spectra(ms2: list, metadata: dict[str, dict[str, str]]) -> list[Spectrum]:
     """Function to convert ms2 object and metadata to `Spectrum` objects.
@@ -39,14 +39,16 @@ def _mols_to_spectra(ms2: list, metadata: dict[str, dict[str, str]]) -> list[Spe
         list[Spectrum]: List of mass spectra obtained from ms2 and metadata.
     """
     ms2_dict = {}
+    # an example of m:
+    # (118.487999, 0.0, 18.753, <nplinker.parsers.mg...105f2c970>, 'spectra.mgf', 0.0)
     for m in ms2:
-        if not m[3] in ms2_dict:
+        if not m[3] in ms2_dict: # m[3] is `nplinker.parsers.mgf.MS1` object
             ms2_dict[m[3]] = []
         ms2_dict[m[3]].append((m[0], m[2]))
 
     spectra = []
-    for i, m in enumerate(ms2_dict.keys()):
-        new_spectrum = Spectrum(i, ms2_dict[m], int(m.name),
+    for i, m in enumerate(ms2_dict.keys()): # m is `nplinker.parsers.mgf.MS1` object
+        new_spectrum = Spectrum(i, ms2_dict[m], m.name,
                                 metadata[m.name]['precursormass'],
                                 metadata[m.name]['parentmass'])
         new_spectrum.metadata = metadata[m.name]