From cf8a5f40a5d813c71d6a3126d54be78c83f319c1 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Thu, 23 Feb 2023 14:21:29 +0100 Subject: [PATCH] Fix local test error (#105) This is a temporary solution. Loading of strains will be refactored later. See issue 104. * add attribute id to GCF class and use gcf.id in scoring This is a temporary solution, will remove it later. See issue 103 https://github.com/NPLinker/nplinker/issues/103. --- .../prospecting/class_linking.py | 4 ++-- .../genomics/antismash/antismash_loader.py | 2 ++ src/nplinker/genomics/gcf.py | 3 +++ src/nplinker/genomics/genomics.py | 1 + src/nplinker/genomics/mibig/mibig_loader.py | 2 ++ src/nplinker/loader.py | 24 ++++++++++++++----- src/nplinker/scoring/linking/data_linking.py | 4 ++-- src/nplinker/scoring/linking/link_finder.py | 8 +++++-- 8 files changed, 36 insertions(+), 12 deletions(-) diff --git a/notebooks/npclassscore_linking/prospecting/class_linking.py b/notebooks/npclassscore_linking/prospecting/class_linking.py index 395b4a86..7836bd5a 100644 --- a/notebooks/npclassscore_linking/prospecting/class_linking.py +++ b/notebooks/npclassscore_linking/prospecting/class_linking.py @@ -2,11 +2,11 @@ ''' Initial code for NPClassScore ''' -from collections import Counter -from collections import defaultdict import glob import os import sys +from collections import Counter +from collections import defaultdict import pandas as pd diff --git a/src/nplinker/genomics/antismash/antismash_loader.py b/src/nplinker/genomics/antismash/antismash_loader.py index 33617eff..b1d3e7d4 100644 --- a/src/nplinker/genomics/antismash/antismash_loader.py +++ b/src/nplinker/genomics/antismash/antismash_loader.py @@ -5,6 +5,7 @@ from Bio import SeqRecord from nplinker.genomics import BGC from nplinker.logconfig import LogConfig +from nplinker.strains import Strain from nplinker.utils import list_dirs from nplinker.utils import list_files from ..abc import BGCLoaderBase @@ -135,6 +136,7 @@ def parse_bgc_genbank(file: str) 
-> BGC: bgc.antismash_file = file bgc.antismash_region = features.get("region_number") bgc.smiles = features.get("smiles") + bgc.strain = Strain(fname) return bgc diff --git a/src/nplinker/genomics/gcf.py b/src/nplinker/genomics/gcf.py index 004af910..db19d7af 100644 --- a/src/nplinker/genomics/gcf.py +++ b/src/nplinker/genomics/gcf.py @@ -35,6 +35,9 @@ def __init__(self, gcf_id: str) -> None: self._bgcs: set[BGC] = set() self.strains: StrainCollection = StrainCollection() self.bigscape_class: str | None = None + # CG TODO: remove attribute id, see issue 103 + # https://github.com/NPLinker/nplinker/issues/103 + self.id: int | None = None def __str__(self): return f"GCF(id={self.gcf_id}, #bgcs={len(self.bgcs)}, #strains={len(self.strains)})." diff --git a/src/nplinker/genomics/genomics.py b/src/nplinker/genomics/genomics.py index 31cd425e..e3a59a35 100644 --- a/src/nplinker/genomics/genomics.py +++ b/src/nplinker/genomics/genomics.py @@ -27,6 +27,7 @@ from pathlib import Path + logger = LogConfig.getLogger(__name__) CLUSTER_REGION_REGEX = re.compile('(.+?)\\.(cluster|region)(\\d+).gbk$') diff --git a/src/nplinker/genomics/mibig/mibig_loader.py b/src/nplinker/genomics/mibig/mibig_loader.py index a3a982ce..279d91b3 100644 --- a/src/nplinker/genomics/mibig/mibig_loader.py +++ b/src/nplinker/genomics/mibig/mibig_loader.py @@ -1,5 +1,6 @@ import os.path from nplinker.logconfig import LogConfig +from nplinker.strains import Strain from nplinker.utils import list_files from ..abc import BGCLoaderBase from ..bgc import BGC @@ -109,6 +110,7 @@ def parse_bgc_metadata_json(file: str) -> BGC: metadata = MibigMetadata(file) mibig_bgc = BGC(metadata.mibig_accession, metadata.biosyn_class) mibig_bgc.mibig_bgc_class = metadata.biosyn_class + mibig_bgc.strain = Strain(metadata.mibig_accession) return mibig_bgc diff --git a/src/nplinker/loader.py b/src/nplinker/loader.py index a2f357a3..225c0ca8 100644 --- a/src/nplinker/loader.py +++ b/src/nplinker/loader.py @@ -20,6 +20,7 @@ from 
nplinker.class_info.class_matches import ClassMatches from nplinker.class_info.runcanopus import run_canopus from nplinker.genomics import load_gcfs +from nplinker.genomics.antismash import AntismashBGCLoader from nplinker.genomics.mibig import MibigBGCLoader from nplinker.genomics.mibig import download_and_extract_mibig_metadata from nplinker.logconfig import LogConfig @@ -27,7 +28,7 @@ from nplinker.pairedomics.downloader import Downloader from nplinker.pairedomics.runbigscape import run_bigscape from nplinker.strain_collection import StrainCollection -from nplinker.genomics.antismash import AntismashBGCLoader + try: from importlib.resources import files @@ -181,12 +182,12 @@ def __init__(self, config_data): self.datadir = files('nplinker').joinpath('data') self.dataset_id = os.path.split( self._root)[-1] if not self._remote_loading else self._platform_id - + if self._remote_loading: self._downloader = Downloader(self._platform_id) else: self._downloader = None - + self.bgcs, self.gcfs, self.spectra, self.molfams = [], [], [], [] self.mibig_bgc_dict = {} self.product_types = [] @@ -279,7 +280,7 @@ def _init_genomics_paths(self): # 11. GEN: /mibig_json / mibig_json_dir= self.mibig_json_dir = self._overrides.get( self.OR_MIBIG_JSON) or os.path.join(self._root, 'mibig_json') - + def _init_paths(self): # 1. strain mapping are used for everything else so self.strain_mappings_file = self._overrides.get( @@ -331,7 +332,7 @@ def _validate_paths(self): logger.warning( 'Optional file/directory "{}" does not exist or is not readable!' 
.format(f)) - + def validate(self): # check antismash format is recognised if self._antismash_format not in self.ANTISMASH_FMTS: @@ -610,8 +611,13 @@ def _load_genomics(self): self.mibig_bgc_dict = mibig_bgc_loader.get_bgcs() # add mibig bgc strains + # CG TODO: update strain assignment logics, + # see issue 104 https://github.com/NPLinker/nplinker/issues/104 for bgc in self.mibig_bgc_dict.values(): - self.strains.add(bgc.strain) + if bgc.strain is not None: + self.strains.add(bgc.strain) + else: + logger.warning("No strain specified for BGC %s", bgc.bgc_id) logger.debug('mibig_bgc_dict has {} entries'.format( len(self.mibig_bgc_dict))) @@ -635,6 +641,12 @@ def _load_genomics(self): antismash_bgc_loader.get_files(), self._bigscape_cutoff) + # CG TODO: remove the gcf.id, see issue 103 + # https://github.com/NPLinker/nplinker/issues/103 + # This is the only place to set the gcf.id value. + for i, gcf in enumerate(self.gcfs): + gcf.id = i + #---------------------------------------------------------------------- # CG: write unknown strains in genomics to file #---------------------------------------------------------------------- diff --git a/src/nplinker/scoring/linking/data_linking.py b/src/nplinker/scoring/linking/data_linking.py index 7ffe467e..a8c3f3c9 100644 --- a/src/nplinker/scoring/linking/data_linking.py +++ b/src/nplinker/scoring/linking/data_linking.py @@ -129,7 +129,7 @@ def _collect_mappings_from_spectra(self, spectra) -> np.ndarray[np.float64]: mapping_spec[i, 2] = spectrum.family.family_id return mapping_spec - + def _collect_mappings_from_molecular_families(self, molfams: Sequence[MolecularFamily]) -> np.ndarray[np.float64]: num_spectra = sum(len(x.spectra_ids) for x in molfams) mapping_spec = np.zeros((num_spectra, 3)) @@ -143,7 +143,7 @@ def _collect_mappings_from_molecular_families(self, molfams: Sequence[MolecularF for i, key in enumerate(inverted_mappings): mapping_spec[i, 1] = key mapping_spec[i, 2] = inverted_mappings[key] - + return mapping_spec def 
collect_mappings_gcf(self, gcf_list): diff --git a/src/nplinker/scoring/linking/link_finder.py b/src/nplinker/scoring/linking/link_finder.py index 9db4e58c..2e58acd9 100644 --- a/src/nplinker/scoring/linking/link_finder.py +++ b/src/nplinker/scoring/linking/link_finder.py @@ -8,6 +8,10 @@ from .data_linking_functions import pair_prob_approx from .data_linking_functions import pair_prob_hg + +# CG: TODO get_links function does not work any more, need to update its logics + + # CG: TODO get_links function does not work any more, need to update its logics @@ -442,8 +446,8 @@ def get_links(self, link_levels = [0, 1] # Get necessary ids - # CG: TODO update the logics here - # integer gcf.id has been removed, use string gcf.gcf_id instead. + # CG: TODO update the logics here: + # don't use integer gcf.id, use string gcf.gcf_id instead. input_ids = np.array([gcf.id for gcf in input_object], dtype=np.int32)