Fix local test error (#105)

This is a temporary solution. Loading of strains will be refactored later; see issue #104.
* Add attribute `id` to the GCF class and use `gcf.id` in scoring. This is also a temporary solution and will be removed later; see issue #103.
NPLinker · Feb 23, 2023 · cf8a5f4 · cf8a5f4
1 parent a60e9f1
commit cf8a5f4
Show file tree

Hide file tree

Showing 8 changed files with 36 additions and 12 deletions.
diff --git a/notebooks/npclassscore_linking/prospecting/class_linking.py b/notebooks/npclassscore_linking/prospecting/class_linking.py
@@ -2,11 +2,11 @@
 '''
 Initial code for NPClassScore
 '''
-from collections import Counter
-from collections import defaultdict
 import glob
 import os
 import sys
+from collections import Counter
+from collections import defaultdict
 import pandas as pd
 
 

diff --git a/src/nplinker/genomics/antismash/antismash_loader.py b/src/nplinker/genomics/antismash/antismash_loader.py
@@ -5,6 +5,7 @@
 from Bio import SeqRecord
 from nplinker.genomics import BGC
 from nplinker.logconfig import LogConfig
+from nplinker.strains import Strain
 from nplinker.utils import list_dirs
 from nplinker.utils import list_files
 from ..abc import BGCLoaderBase
@@ -135,6 +136,7 @@ def parse_bgc_genbank(file: str) -> BGC:
     bgc.antismash_file = file
     bgc.antismash_region = features.get("region_number")
     bgc.smiles = features.get("smiles")
+    bgc.strain = Strain(fname)
     return bgc
 
 

diff --git a/src/nplinker/genomics/gcf.py b/src/nplinker/genomics/gcf.py
@@ -35,6 +35,9 @@ def __init__(self, gcf_id: str) -> None:
         self._bgcs: set[BGC] = set()
         self.strains: StrainCollection = StrainCollection()
         self.bigscape_class: str | None = None
+        # CG TODO: remove attribute id, see issue 103
+        #    https://github.com/NPLinker/nplinker/issues/103
+        self.id: int | None = None
 
     def __str__(self):
         return f"GCF(id={self.gcf_id}, #bgcs={len(self.bgcs)}, #strains={len(self.strains)})."

diff --git a/src/nplinker/genomics/genomics.py b/src/nplinker/genomics/genomics.py
@@ -27,6 +27,7 @@
 from pathlib import Path
 
 
+
 logger = LogConfig.getLogger(__name__)
 
 CLUSTER_REGION_REGEX = re.compile('(.+?)\\.(cluster|region)(\\d+).gbk$')

diff --git a/src/nplinker/genomics/mibig/mibig_loader.py b/src/nplinker/genomics/mibig/mibig_loader.py
@@ -1,5 +1,6 @@
 import os.path
 from nplinker.logconfig import LogConfig
+from nplinker.strains import Strain
 from nplinker.utils import list_files
 from ..abc import BGCLoaderBase
 from ..bgc import BGC
@@ -109,6 +110,7 @@ def parse_bgc_metadata_json(file: str) -> BGC:
     metadata = MibigMetadata(file)
     mibig_bgc = BGC(metadata.mibig_accession, metadata.biosyn_class)
     mibig_bgc.mibig_bgc_class = metadata.biosyn_class
+    mibig_bgc.strain = Strain(metadata.mibig_accession)
     return mibig_bgc
 
 

diff --git a/src/nplinker/loader.py b/src/nplinker/loader.py
@@ -20,14 +20,15 @@
 from nplinker.class_info.class_matches import ClassMatches
 from nplinker.class_info.runcanopus import run_canopus
 from nplinker.genomics import load_gcfs
+from nplinker.genomics.antismash import AntismashBGCLoader
 from nplinker.genomics.mibig import MibigBGCLoader
 from nplinker.genomics.mibig import download_and_extract_mibig_metadata
 from nplinker.logconfig import LogConfig
 from nplinker.metabolomics.metabolomics import load_dataset
 from nplinker.pairedomics.downloader import Downloader
 from nplinker.pairedomics.runbigscape import run_bigscape
 from nplinker.strain_collection import StrainCollection
-from nplinker.genomics.antismash import AntismashBGCLoader
+
 
 try:
     from importlib.resources import files
@@ -181,12 +182,12 @@ def __init__(self, config_data):
         self.datadir = files('nplinker').joinpath('data')
         self.dataset_id = os.path.split(
             self._root)[-1] if not self._remote_loading else self._platform_id
-        
+
         if self._remote_loading:
             self._downloader = Downloader(self._platform_id)
         else:
             self._downloader = None
-        
+
         self.bgcs, self.gcfs, self.spectra, self.molfams = [], [], [], []
         self.mibig_bgc_dict = {}
         self.product_types = []
@@ -279,7 +280,7 @@ def _init_genomics_paths(self):
         # 11. GEN: <root>/mibig_json / mibig_json_dir=<override>
         self.mibig_json_dir = self._overrides.get(
             self.OR_MIBIG_JSON) or os.path.join(self._root, 'mibig_json')
-    
+
     def _init_paths(self):
         # 1. strain mapping are used for everything else so
         self.strain_mappings_file = self._overrides.get(
@@ -331,7 +332,7 @@ def _validate_paths(self):
                 logger.warning(
                     'Optional file/directory "{}" does not exist or is not readable!'
                     .format(f))
-    
+
     def validate(self):
         # check antismash format is recognised
         if self._antismash_format not in self.ANTISMASH_FMTS:
@@ -610,8 +611,13 @@ def _load_genomics(self):
         self.mibig_bgc_dict = mibig_bgc_loader.get_bgcs()
 
         # add mibig bgc strains
+        # CG TODO: update strain assignment logics,
+        #    see issue 104 https://github.com/NPLinker/nplinker/issues/104
         for bgc in self.mibig_bgc_dict.values():
-            self.strains.add(bgc.strain)
+            if bgc.strain is not None:
+                self.strains.add(bgc.strain)
+            else:
+                logger.warning("No strain specified for BGC %s", bgc.bgc_id)
 
         logger.debug('mibig_bgc_dict has {} entries'.format(
             len(self.mibig_bgc_dict)))
@@ -635,6 +641,12 @@ def _load_genomics(self):
             antismash_bgc_loader.get_files(),
             self._bigscape_cutoff)
 
+        # CG TODO: remove the gcf.id, see issue 103
+        #    https://github.com/NPLinker/nplinker/issues/103
+        # This is only place to set gcf.id value.
+        for i, gcf in enumerate(self.gcfs):
+            gcf.id = i
+
         #----------------------------------------------------------------------
         # CG: write unknown strains in genomics to file
         #----------------------------------------------------------------------

diff --git a/src/nplinker/scoring/linking/data_linking.py b/src/nplinker/scoring/linking/data_linking.py
@@ -129,7 +129,7 @@ def _collect_mappings_from_spectra(self, spectra) -> np.ndarray[np.float64]:
             mapping_spec[i, 2] = spectrum.family.family_id
 
         return mapping_spec
-    
+
     def _collect_mappings_from_molecular_families(self, molfams: Sequence[MolecularFamily]) -> np.ndarray[np.float64]:
         num_spectra = sum(len(x.spectra_ids) for x in molfams)
         mapping_spec = np.zeros((num_spectra, 3))
@@ -143,7 +143,7 @@ def _collect_mappings_from_molecular_families(self, molfams: Sequence[MolecularF
         for i, key in enumerate(inverted_mappings):
             mapping_spec[i, 1] = key
             mapping_spec[i, 2] = inverted_mappings[key]
-        
+
         return mapping_spec
 
     def collect_mappings_gcf(self, gcf_list):

diff --git a/src/nplinker/scoring/linking/link_finder.py b/src/nplinker/scoring/linking/link_finder.py
@@ -8,6 +8,10 @@
 from .data_linking_functions import pair_prob_approx
 from .data_linking_functions import pair_prob_hg
 
+
+# CG: TODO get_links function does not work any more, need to update its logics
+
+
 # CG: TODO get_links function does not work any more, need to update its logics
 
 
@@ -442,8 +446,8 @@ def get_links(self,
             link_levels = [0, 1]
 
             # Get necessary ids
-            # CG: TODO update the logics here
-            #   integer gcf.id has been removed, use string gcf.gcf_id instead.
+            # CG: TODO update the logics here:
+            #   don't use integer gcf.id, use string gcf.gcf_id instead.
             input_ids = np.array([gcf.id for gcf in input_object],
                                  dtype=np.int32)