Skip to content

Commit

Permalink
Fix local test error (#105)
Browse files Browse the repository at this point in the history
This is a temporary solution. Loading of strains will be refactored later. See issue 104.

* add attribute id to GCF class and use gcf.id in scoring

This is a temporary solution and will be removed later. See issue #103.
  • Loading branch information
CunliangGeng authored Feb 23, 2023
1 parent a60e9f1 commit cf8a5f4
Show file tree
Hide file tree
Showing 8 changed files with 36 additions and 12 deletions.
4 changes: 2 additions & 2 deletions notebooks/npclassscore_linking/prospecting/class_linking.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@
'''
Initial code for NPClassScore
'''
from collections import Counter
from collections import defaultdict
import glob
import os
import sys
from collections import Counter
from collections import defaultdict
import pandas as pd


Expand Down
2 changes: 2 additions & 0 deletions src/nplinker/genomics/antismash/antismash_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from Bio import SeqRecord
from nplinker.genomics import BGC
from nplinker.logconfig import LogConfig
from nplinker.strains import Strain
from nplinker.utils import list_dirs
from nplinker.utils import list_files
from ..abc import BGCLoaderBase
Expand Down Expand Up @@ -135,6 +136,7 @@ def parse_bgc_genbank(file: str) -> BGC:
bgc.antismash_file = file
bgc.antismash_region = features.get("region_number")
bgc.smiles = features.get("smiles")
bgc.strain = Strain(fname)
return bgc


Expand Down
3 changes: 3 additions & 0 deletions src/nplinker/genomics/gcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,9 @@ def __init__(self, gcf_id: str) -> None:
self._bgcs: set[BGC] = set()
self.strains: StrainCollection = StrainCollection()
self.bigscape_class: str | None = None
# CG TODO: remove attribute id, see issue 103
# https://github.com/NPLinker/nplinker/issues/103
self.id: int | None = None

def __str__(self):
return f"GCF(id={self.gcf_id}, #bgcs={len(self.bgcs)}, #strains={len(self.strains)})."
Expand Down
1 change: 1 addition & 0 deletions src/nplinker/genomics/genomics.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
from pathlib import Path



logger = LogConfig.getLogger(__name__)

CLUSTER_REGION_REGEX = re.compile('(.+?)\\.(cluster|region)(\\d+).gbk$')
Expand Down
2 changes: 2 additions & 0 deletions src/nplinker/genomics/mibig/mibig_loader.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os.path
from nplinker.logconfig import LogConfig
from nplinker.strains import Strain
from nplinker.utils import list_files
from ..abc import BGCLoaderBase
from ..bgc import BGC
Expand Down Expand Up @@ -109,6 +110,7 @@ def parse_bgc_metadata_json(file: str) -> BGC:
metadata = MibigMetadata(file)
mibig_bgc = BGC(metadata.mibig_accession, metadata.biosyn_class)
mibig_bgc.mibig_bgc_class = metadata.biosyn_class
mibig_bgc.strain = Strain(metadata.mibig_accession)
return mibig_bgc


Expand Down
24 changes: 18 additions & 6 deletions src/nplinker/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,15 @@
from nplinker.class_info.class_matches import ClassMatches
from nplinker.class_info.runcanopus import run_canopus
from nplinker.genomics import load_gcfs
from nplinker.genomics.antismash import AntismashBGCLoader
from nplinker.genomics.mibig import MibigBGCLoader
from nplinker.genomics.mibig import download_and_extract_mibig_metadata
from nplinker.logconfig import LogConfig
from nplinker.metabolomics.metabolomics import load_dataset
from nplinker.pairedomics.downloader import Downloader
from nplinker.pairedomics.runbigscape import run_bigscape
from nplinker.strain_collection import StrainCollection
from nplinker.genomics.antismash import AntismashBGCLoader


try:
from importlib.resources import files
Expand Down Expand Up @@ -181,12 +182,12 @@ def __init__(self, config_data):
self.datadir = files('nplinker').joinpath('data')
self.dataset_id = os.path.split(
self._root)[-1] if not self._remote_loading else self._platform_id

if self._remote_loading:
self._downloader = Downloader(self._platform_id)
else:
self._downloader = None

self.bgcs, self.gcfs, self.spectra, self.molfams = [], [], [], []
self.mibig_bgc_dict = {}
self.product_types = []
Expand Down Expand Up @@ -279,7 +280,7 @@ def _init_genomics_paths(self):
# 11. GEN: <root>/mibig_json / mibig_json_dir=<override>
self.mibig_json_dir = self._overrides.get(
self.OR_MIBIG_JSON) or os.path.join(self._root, 'mibig_json')

def _init_paths(self):
# 1. strain mapping are used for everything else so
self.strain_mappings_file = self._overrides.get(
Expand Down Expand Up @@ -331,7 +332,7 @@ def _validate_paths(self):
logger.warning(
'Optional file/directory "{}" does not exist or is not readable!'
.format(f))

def validate(self):
# check antismash format is recognised
if self._antismash_format not in self.ANTISMASH_FMTS:
Expand Down Expand Up @@ -610,8 +611,13 @@ def _load_genomics(self):
self.mibig_bgc_dict = mibig_bgc_loader.get_bgcs()

# add mibig bgc strains
# CG TODO: update the strain assignment logic,
# see issue 104 https://github.com/NPLinker/nplinker/issues/104
for bgc in self.mibig_bgc_dict.values():
self.strains.add(bgc.strain)
if bgc.strain is not None:
self.strains.add(bgc.strain)
else:
logger.warning("No strain specified for BGC %s", bgc.bgc_id)

logger.debug('mibig_bgc_dict has {} entries'.format(
len(self.mibig_bgc_dict)))
Expand All @@ -635,6 +641,12 @@ def _load_genomics(self):
antismash_bgc_loader.get_files(),
self._bigscape_cutoff)

# CG TODO: remove gcf.id, see issue 103
# https://github.com/NPLinker/nplinker/issues/103
# This is the only place where the gcf.id value is set.
for i, gcf in enumerate(self.gcfs):
gcf.id = i

#----------------------------------------------------------------------
# CG: write unknown strains in genomics to file
#----------------------------------------------------------------------
Expand Down
4 changes: 2 additions & 2 deletions src/nplinker/scoring/linking/data_linking.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ def _collect_mappings_from_spectra(self, spectra) -> np.ndarray[np.float64]:
mapping_spec[i, 2] = spectrum.family.family_id

return mapping_spec

def _collect_mappings_from_molecular_families(self, molfams: Sequence[MolecularFamily]) -> np.ndarray[np.float64]:
num_spectra = sum(len(x.spectra_ids) for x in molfams)
mapping_spec = np.zeros((num_spectra, 3))
Expand All @@ -143,7 +143,7 @@ def _collect_mappings_from_molecular_families(self, molfams: Sequence[MolecularF
for i, key in enumerate(inverted_mappings):
mapping_spec[i, 1] = key
mapping_spec[i, 2] = inverted_mappings[key]

return mapping_spec

def collect_mappings_gcf(self, gcf_list):
Expand Down
8 changes: 6 additions & 2 deletions src/nplinker/scoring/linking/link_finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@
from .data_linking_functions import pair_prob_approx
from .data_linking_functions import pair_prob_hg


# CG: TODO the get_links function no longer works; its logic needs to be updated


# CG: TODO the get_links function no longer works; its logic needs to be updated


Expand Down Expand Up @@ -442,8 +446,8 @@ def get_links(self,
link_levels = [0, 1]

# Get necessary ids
# CG: TODO update the logics here
# integer gcf.id has been removed, use string gcf.gcf_id instead.
# CG: TODO update the logic here:
# don't use the integer gcf.id; use the string gcf.gcf_id instead.
input_ids = np.array([gcf.id for gcf in input_object],
dtype=np.int32)

Expand Down

0 comments on commit cf8a5f4

Please sign in to comment.