Issue 152: replace csv with json for strain mappings file (#158)
* remove methods `add_from_file` and `save_to_file`

* add method `to_json`

* add method `read_json`

* add unit tests for new methods

* add global variable `STRAIN_MAPPINGS_FILENAME`

* remove obsolete method `_init_global_strain_id_mapping`

* fix reading strain mapping file

* change test data from csv to json
CunliangGeng committed Jul 6, 2023
1 parent 0a74dcd commit bea0af6
Showing 11 changed files with 569 additions and 113 deletions.
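For reference, here is a minimal strain_mappings.json in the new format, reconstructed from the `to_json` implementation further down in this diff. The strain id and aliases are made-up placeholders:

    {
        "strain_mappings": [
            {
                "strain_id": "STRAIN_01",
                "strain_alias": ["strain-01", "strain.01"]
            }
        ],
        "version": "1.0"
    }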
2 changes: 1 addition & 1 deletion src/nplinker/genomics/mibig/mibig_loader.py
@@ -31,7 +31,7 @@ def get_bgc_genome_mapping(self) -> dict[str, str]:
         Note that for MIBiG BGC, same value is used for BGC id and genome id.
         Users don't have to provide genome id for MIBiG BGCs in the
-        `strain_mappings.csv` file.
+        `strain_mappings.json` file.
 
         Returns:
             dict[str, str]: key is BGC id/accession, value is
42 changes: 13 additions & 29 deletions src/nplinker/loader.py
@@ -1,18 +1,19 @@
 import glob
 import os
-import sys
 from pathlib import Path
+import sys
 from nplinker.annotations import load_annotations
 from nplinker.class_info.chem_classes import ChemClassPredictions
 from nplinker.class_info.class_matches import ClassMatches
 from nplinker.class_info.runcanopus import run_canopus
 from nplinker.genomics import load_gcfs
 from nplinker.genomics.antismash import AntismashBGCLoader
-from nplinker.genomics.mibig import MibigBGCLoader
 from nplinker.genomics.mibig import download_and_extract_mibig_metadata
+from nplinker.genomics.mibig import MibigBGCLoader
 from nplinker.logconfig import LogConfig
 from nplinker.metabolomics.metabolomics import load_dataset
 from nplinker.pairedomics.downloader import PODPDownloader
+from nplinker.pairedomics.downloader import STRAIN_MAPPINGS_FILENAME
 from nplinker.pairedomics.runbigscape import run_bigscape
 from nplinker.strain_collection import StrainCollection

@@ -109,6 +110,7 @@ def __init__(self, config_data):
         self.bgcs, self.gcfs, self.spectra, self.molfams = [], [], [], []
         self.mibig_bgc_dict = {}
         self.product_types = []
+        self.strains = StrainCollection()
         self.webapp_scoring_cutoff = self._config_webapp.get(
             'tables_metcalf_threshold', self.TABLES_CUTOFF_DEFAULT)

@@ -124,7 +126,7 @@ def validate(self):
"""Download data and build paths for local data"""

# if remote loading mode, need to download the data here
# CG: for PODP workflow, strain_mappings.csv is generated in the download step
# CG: for PODP workflow, strain_mappings.json is generated in the download step
if self._remote_loading:
self._start_downloads()

@@ -134,7 +136,7 @@
         # 1. after downloading (manual preparation), some files already exist, some not
         # 2. get the default, constructed or real path for each file/dir (need refactoring)
         #    - self._config_overrides.get()
-        #    - os.path.join(self._root, 'strain_mappings.csv')
+        #    - os.path.join(self._root, 'strain_mappings.json')
         #    - find_via_glob() --> actually check if the file/dir exists
         # 3. check if (some) file/dir exists
         self._init_paths()
@@ -169,7 +171,7 @@ def load(self):
         # or a complete failure to parse things, so bail out
         if len(self.strains) == 0:
             raise Exception(
-                'Failed to find *ANY* strains, missing strain_mappings.csv?')
+                f'Failed to find *ANY* strains, missing {STRAIN_MAPPINGS_FILENAME}?')
 
         return True

@@ -178,7 +180,7 @@ def _start_downloads(self):
         self._root = downloader.project_file_cache
         logger.debug('remote loading mode, configuring root=%s', self._root)
         # CG: to download both MET and GEN data
-        # CG: Continue to understand how strain_mappings.csv is generated
+        # CG: Continue to understand how strain_mappings.json is generated
         downloader.get(
             self._config_docker.get('run_bigscape', self.RUN_BIGSCAPE_DEFAULT),
             self._config_docker.get('extra_bigscape_parameters',
@@ -188,7 +190,7 @@
     def _init_paths(self):
         # 1. strain mapping are used for everything else so
         self.strain_mappings_file = self._config_overrides.get(
-            self.OR_STRAINS) or os.path.join(self._root, 'strain_mappings.csv')
+            self.OR_STRAINS) or os.path.join(self._root, STRAIN_MAPPINGS_FILENAME)
 
         self._init_metabolomics_paths()

@@ -325,13 +327,6 @@ def _validate_paths(self):

     # TODO: this function should be refactored to Loader class
     def _load_strain_mappings(self):
-        # this file should be a csv file, one line per strain, containing a list
-        # of possible alternative IDs (the first one being the preferred ID).
-        #
-        # this is a per-dataset mapping, and is then merged with the global mapping file
-        # packaged with nplinker itself
-        self._init_global_strain_id_mapping()
-
         # now load the dataset mapping in the same way
         # TODO: what happens in case of clashes (differing primary IDs?)
         # CG: the `if` never happens for PODP pipeline; for non-PODP pipeline,
@@ -340,29 +335,18 @@
         if not os.path.exists(self.strain_mappings_file):
             # create an empty placeholder file and show a warning
             logger.warn(
-                'No strain_mappings.csv file found! Attempting to create one')
+                'No strain_mappings.json file found! Attempting to create one')
             self.strains.generate_strain_mappings(self.strain_mappings_file,
                                                   self.antismash_dir)
             # raise Exception('Unable to load strain_mappings file: {}'.format(self.strain_mappings_file))
         else:
-            self.strains.add_from_file(self.strain_mappings_file)
+            sc = StrainCollection().read_json(self.strain_mappings_file)
+            for strain in sc:
+                self.strains.add(strain)
         logger.info('Loaded dataset strain IDs ({} total)'.format(
             len(self.strains)))
 
         return True
 
-    def _init_global_strain_id_mapping(self):
-        """The global strain mapping is predefined by the NPLinker package.
-
-        See `src/nplinker/strain_id_mapping.csv`
-        """
-        self.strains = StrainCollection()
-        global_strain_id_file = NPLINKER_APP_DATA_DIR.joinpath(
-            'strain_id_mapping.csv')
-        self.strains.add_from_file(global_strain_id_file)
-        logger.info('Loaded global strain IDs ({} total)'.format(
-            len(self.strains)))
-
     # TODO CG: replace deprecated load_dataset with GPNSLoader
     def _load_metabolomics(self):
         spec_dict, self.spectra, self.molfams, unknown_strains = load_dataset(
6 changes: 4 additions & 2 deletions src/nplinker/pairedomics/downloader.py
@@ -24,6 +24,8 @@
 MIBIG_METADATA_URL = 'https://dl.secondarymetabolites.org/mibig/mibig_json_{}.tar.gz'
 MIBIG_BGC_METADATA_URL = 'https://mibig.secondarymetabolites.org/repository/{}/annotations.json'
 
+STRAIN_MAPPINGS_FILENAME = 'strain_mappings.json'
+
 
 class PODPDownloader():
     # TODO: move to independent config file ---C.Geng
@@ -120,7 +122,7 @@ def _init_folder_structure(self, local_cache):

         # init strain mapping filepath
         self.strain_mappings_file = os.path.join(self.project_file_cache,
-                                                 'strain_mappings.csv')
+                                                 STRAIN_MAPPINGS_FILENAME)
 
         # init project paths
         self.all_project_json_file = os.path.join(self.local_cache,
@@ -145,7 +147,7 @@ def get(self, do_bigscape, extra_bigscape_parameters, use_mibig,
         self._parse_genome_labels(self.project_json['genome_metabolome_links'],
                                   self.project_json['genomes'])
 
-        # CG: it generates the strain_mappings.csv file
+        # CG: it generates the strain_mappings.json file
         self.strains.generate_strain_mappings(
             self.strain_mappings_file,
             os.path.join(self.project_file_cache, 'antismash'))
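Downstream code can now build the mappings path from the shared constant instead of a hard-coded name. A minimal sketch, where `project_dir` is a made-up placeholder path standing in for a dataset directory:

    import os
    from nplinker.pairedomics.downloader import STRAIN_MAPPINGS_FILENAME

    project_dir = '/path/to/project'  # placeholder, not a real path
    # same construction as in _init_folder_structure above
    strain_mappings_file = os.path.join(project_dir, STRAIN_MAPPINGS_FILENAME)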
63 changes: 38 additions & 25 deletions src/nplinker/strain_collection.py
@@ -1,4 +1,4 @@
-import csv
+import json
 from os import PathLike
 from pathlib import Path
 from typing import Iterator
@@ -110,36 +110,49 @@ def lookup(self, name: str) -> Strain:
             return self._strain_dict_name[name]
         raise KeyError(f"Strain {name} not found in strain collection.")
 
-    def add_from_file(self, file: str | PathLike) -> None:
-        """Add strains from a strain mapping file.
-
-        A strain mapping file is a csv file with the first column being the
-        id of the strain, and the remaining columns being aliases for the
-        strain.
-
-        Args:
-            file(str | PathLike): Path to strain mapping file (.csv).
-        """
-        with open(file) as f:
-            reader = csv.reader(f)
-            for names in reader:
-                if len(names) == 0:
-                    continue
-                strain = Strain(names[0])
-                for alias in names[1:]:
-                    strain.add_alias(alias)
-                self.add(strain)
-
-    def save_to_file(self, file: str | PathLike) -> None:
-        """Save strains to a strain mapping file (.csv).
-
-        Args:
-            file(str | PathLike): Path to strain mapping file (.csv).
-        """
-        with open(file, 'w') as f:
-            for strain in self:
-                ids = [strain.id] + list(strain.aliases)
-                f.write(','.join(ids) + '\n')
+    @classmethod
+    def read_json(cls, file: str | PathLike) -> 'StrainCollection':
+        """Read a strain mappings JSON file and return a StrainCollection object.
+
+        Args:
+            file(str | PathLike): Path to the strain mappings JSON file.
+
+        Returns:
+            StrainCollection: StrainCollection object.
+        """
+        with open(file, 'r') as f:
+            json_data = json.load(f)
+
+        strain_collection = cls()
+        for data in json_data['strain_mappings']:
+            strain = Strain(data['strain_id'])
+            for alias in data['strain_alias']:
+                strain.add_alias(alias)
+            strain_collection.add(strain)
+        return strain_collection
+
+    def to_json(self, file: str | PathLike | None = None) -> str | None:
+        """Convert the StrainCollection object to a JSON string.
+
+        Args:
+            file(str | PathLike | None): Path to output JSON file. If None,
+                return the JSON string instead.
+
+        Returns:
+            str | None: If `file` is None, return the JSON string. Otherwise,
+                write the JSON string to the given file.
+        """
+        data_list = [{
+            "strain_id": strain.id,
+            "strain_alias": list(strain.aliases)
+        } for strain in self]
+        json_data = {"strain_mappings": data_list, "version": "1.0"}
+
+        if file is not None:
+            with open(file, 'w') as f:
+                json.dump(json_data, f)
+            return None
+        return json.dumps(json_data)
 
     # TODO to move this method to a separate class
     @deprecated(version="1.3.3", reason="This method will be removed")
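A minimal round-trip sketch of the new API. The strain id and alias are hypothetical placeholders; `read_json` is a classmethod, so it can also be called on the class itself rather than an instance:

    from nplinker.strain_collection import StrainCollection
    from nplinker.strains import Strain

    strain = Strain('STRAIN_01')   # hypothetical strain id
    strain.add_alias('strain-01')  # hypothetical alias

    sc = StrainCollection()
    sc.add(strain)

    sc.to_json('strain_mappings.json')  # write JSON to file, returns None
    json_str = sc.to_json()             # no file given: returns the JSON string

    sc2 = StrainCollection.read_json('strain_mappings.json')
    assert len(sc2) == len(sc)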
6 changes: 3 additions & 3 deletions tests/conftest.py
@@ -4,6 +4,7 @@
 from nplinker.metabolomics.metabolomics import load_spectra
 from nplinker.metabolomics.metabolomics import make_families
 from nplinker.metabolomics.spectrum import Spectrum
+from nplinker.pairedomics.downloader import STRAIN_MAPPINGS_FILENAME
 from nplinker.strain_collection import StrainCollection
 from nplinker.strains import Strain
 from nplinker.utils import extract_archive
@@ -44,9 +45,8 @@ def spec_dict() -> dict[str, Spectrum]:

 @pytest.fixture
 def collection_from_file() -> StrainCollection:
-    filename = DATA_DIR / "strain_mappings.csv"
-    sut = StrainCollection()
-    sut.add_from_file(filename)
+    filename = DATA_DIR / STRAIN_MAPPINGS_FILENAME
+    sut = StrainCollection().read_json(filename)
     return sut


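And a sketch of how the updated fixture might be consumed in a test. The strain id below is a made-up placeholder; the real ids live in the new tests/data/strain_mappings.json:

    def test_lookup(collection_from_file):
        # hypothetical id; assumes such a strain exists in the test data
        strain = collection_from_file.lookup('strain_01')
        assert strain.id == 'strain_01'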
27 changes: 0 additions & 27 deletions tests/data/strain_mappings.csv

This file was deleted.

