Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Issue152 replace csv with json for strain mappings file #158

Merged
merged 8 commits into from
Jul 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file.
2 changes: 1 addition & 1 deletion src/nplinker/genomics/mibig/mibig_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def get_bgc_genome_mapping(self) -> dict[str, str]:
Note that for MIBiG BGC, same value is used for BGC id and genome id.
Users don't have to provide genome id for MIBiG BGCs in the
`strain_mappings.csv` file.
`strain_mappings.json` file.
Returns:
dict[str, str]: key is BGC id/accession, value is
Expand Down
42 changes: 13 additions & 29 deletions src/nplinker/loader.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,19 @@
import glob
import os
import sys
from pathlib import Path
import sys
from nplinker.annotations import load_annotations
from nplinker.class_info.chem_classes import ChemClassPredictions
from nplinker.class_info.class_matches import ClassMatches
from nplinker.class_info.runcanopus import run_canopus
from nplinker.genomics import load_gcfs
from nplinker.genomics.antismash import AntismashBGCLoader
from nplinker.genomics.mibig import MibigBGCLoader
from nplinker.genomics.mibig import download_and_extract_mibig_metadata
from nplinker.genomics.mibig import MibigBGCLoader
from nplinker.logconfig import LogConfig
from nplinker.metabolomics.metabolomics import load_dataset
from nplinker.pairedomics.downloader import PODPDownloader
from nplinker.pairedomics.downloader import STRAIN_MAPPINGS_FILENAME
from nplinker.pairedomics.runbigscape import run_bigscape
from nplinker.strain_collection import StrainCollection

Expand Down Expand Up @@ -109,6 +110,7 @@ def __init__(self, config_data):
self.bgcs, self.gcfs, self.spectra, self.molfams = [], [], [], []
self.mibig_bgc_dict = {}
self.product_types = []
self.strains = StrainCollection()
self.webapp_scoring_cutoff = self._config_webapp.get(
'tables_metcalf_threshold', self.TABLES_CUTOFF_DEFAULT)

Expand All @@ -124,7 +126,7 @@ def validate(self):
"""Download data and build paths for local data"""

# if remote loading mode, need to download the data here
# CG: for PODP workflow, strain_mappings.csv is generated in the download step
# CG: for PODP workflow, strain_mappings.json is generated in the download step
if self._remote_loading:
self._start_downloads()

Expand All @@ -134,7 +136,7 @@ def validate(self):
# 1. after downloading (manual preparation), some files already exist, some not
# 2. get the default, constructed or real path for each file/dir (need refactoring)
# - self._config_overrides.get()
# - os.path.join(self._root, 'strain_mappings.csv')
# - os.path.join(self._root, 'strain_mappings.json')
# - find_via_glob() --> actually check if the file/dir exists
# 3. check if (some) file/dir exists
self._init_paths()
Expand Down Expand Up @@ -169,7 +171,7 @@ def load(self):
# or a complete failure to parse things, so bail out
if len(self.strains) == 0:
raise Exception(
'Failed to find *ANY* strains, missing strain_mappings.csv?')
f'Failed to find *ANY* strains, missing {STRAIN_MAPPINGS_FILENAME}?')

return True

Expand All @@ -178,7 +180,7 @@ def _start_downloads(self):
self._root = downloader.project_file_cache
logger.debug('remote loading mode, configuring root=%s', self._root)
# CG: to download both MET and GEN data
# CG: Continue to understand how strain_mappings.csv is generated
# CG: Continue to understand how strain_mappings.json is generated
downloader.get(
self._config_docker.get('run_bigscape', self.RUN_BIGSCAPE_DEFAULT),
self._config_docker.get('extra_bigscape_parameters',
Expand All @@ -188,7 +190,7 @@ def _start_downloads(self):
def _init_paths(self):
# 1. strain mapping are used for everything else so
self.strain_mappings_file = self._config_overrides.get(
self.OR_STRAINS) or os.path.join(self._root, 'strain_mappings.csv')
self.OR_STRAINS) or os.path.join(self._root, STRAIN_MAPPINGS_FILENAME)

self._init_metabolomics_paths()

Expand Down Expand Up @@ -325,13 +327,6 @@ def _validate_paths(self):

# TODO: this function should be refactored to Loader class
def _load_strain_mappings(self):
# this file should be a csv file, one line per strain, containing a list
# of possible alternative IDs (the first one being the preferred ID).
#
# this is a per-dataset mapping, and is then merged with the global mapping file
# packaged with nplinker itself
self._init_global_strain_id_mapping()

# now load the dataset mapping in the same way
# TODO: what happens in case of clashes (differing primary IDs?)
# CG: the `if` never happens for PODP pipeline; for non-PODP pipeline,
Expand All @@ -340,29 +335,18 @@ def _load_strain_mappings(self):
if not os.path.exists(self.strain_mappings_file):
# create an empty placeholder file and show a warning
logger.warn(
'No strain_mappings.csv file found! Attempting to create one')
'No strain_mappings.json file found! Attempting to create one')
self.strains.generate_strain_mappings(self.strain_mappings_file,
self.antismash_dir)
# raise Exception('Unable to load strain_mappings file: {}'.format(self.strain_mappings_file))
else:
self.strains.add_from_file(self.strain_mappings_file)
sc = StrainCollection().read_json(self.strain_mappings_file)
for strain in sc:
self.strains.add(strain)
logger.info('Loaded dataset strain IDs ({} total)'.format(
len(self.strains)))

return True

def _init_global_strain_id_mapping(self):
"""The global strain mapping is predefined by the NPLinker package.

See `src/nplinker/strain_id_mapping.csv`
"""
self.strains = StrainCollection()
global_strain_id_file = NPLINKER_APP_DATA_DIR.joinpath(
'strain_id_mapping.csv')
self.strains.add_from_file(global_strain_id_file)
logger.info('Loaded global strain IDs ({} total)'.format(
len(self.strains)))

# TODO CG: replace deprecated load_dataset with GPNSLoader
def _load_metabolomics(self):
spec_dict, self.spectra, self.molfams, unknown_strains = load_dataset(
Expand Down
6 changes: 4 additions & 2 deletions src/nplinker/pairedomics/downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@
MIBIG_METADATA_URL = 'https://dl.secondarymetabolites.org/mibig/mibig_json_{}.tar.gz'
MIBIG_BGC_METADATA_URL = 'https://mibig.secondarymetabolites.org/repository/{}/annotations.json'

STRAIN_MAPPINGS_FILENAME = 'strain_mappings.json'


class PODPDownloader():
# TODO: move to independent config file ---C.Geng
Expand Down Expand Up @@ -120,7 +122,7 @@ def _init_folder_structure(self, local_cache):

# init strain mapping filepath
self.strain_mappings_file = os.path.join(self.project_file_cache,
'strain_mappings.csv')
STRAIN_MAPPINGS_FILENAME)

# init project paths
self.all_project_json_file = os.path.join(self.local_cache,
Expand All @@ -145,7 +147,7 @@ def get(self, do_bigscape, extra_bigscape_parameters, use_mibig,
self._parse_genome_labels(self.project_json['genome_metabolome_links'],
self.project_json['genomes'])

# CG: it generates the strain_mappings.csv file
# CG: it generates the strain_mappings.json file
self.strains.generate_strain_mappings(
self.strain_mappings_file,
os.path.join(self.project_file_cache, 'antismash'))
Expand Down
63 changes: 38 additions & 25 deletions src/nplinker/strain_collection.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import csv
import json
from os import PathLike
from pathlib import Path
from typing import Iterator
Expand Down Expand Up @@ -110,36 +110,49 @@ def lookup(self, name: str) -> Strain:
return self._strain_dict_name[name]
raise KeyError(f"Strain {name} not found in strain collection.")

def add_from_file(self, file: str | PathLike) -> None:
"""Add strains from a strain mapping file.

A strain mapping file is a csv file with the first column being the
id of the strain, and the remaining columns being aliases for the
strain.
@classmethod
def read_json(cls, file: str | PathLike) -> 'StrainCollection':
"""Read a strain mappings JSON file and return a StrainCollection object.

Args:
file(str | PathLike): Path to strain mapping file (.csv).
file(str | PathLike): Path to the strain mappings JSON file.

Returns:
StrainCollection: StrainCollection object.
"""
with open(file) as f:
reader = csv.reader(f)
for names in reader:
if len(names) == 0:
continue
strain = Strain(names[0])
for alias in names[1:]:
strain.add_alias(alias)
self.add(strain)

def save_to_file(self, file: str | PathLike) -> None:
"""Save strains to a strain mapping file (.csv).
with open(file, 'r') as f:
json_data = json.load(f)

strain_collection = cls()
for data in json_data['strain_mappings']:
strain = Strain(data['strain_id'])
for alias in data['strain_alias']:
strain.add_alias(alias)
strain_collection.add(strain)
return strain_collection

def to_json(self, file: str | PathLike | None = None) -> str | None:
"""Convert the StrainCollection object to a JSON string.

Args:
file(str | PathLike): Path to strain mapping file (.csv).
file(str | PathLike | None): Path to output JSON file. If None,
return the JSON string instead.

Returns:
str | None: If `file` is None, return the JSON string. Otherwise,
write the JSON string to the given file.
"""
with open(file, 'w') as f:
for strain in self:
ids = [strain.id] + list(strain.aliases)
f.write(','.join(ids) + '\n')
data_list = [{
"strain_id": strain.id,
"strain_alias": list(strain.aliases)
} for strain in self]
json_data = {"strain_mappings": data_list, "version": "1.0"}

if file is not None:
with open(file, 'w') as f:
json.dump(json_data, f)
return None
return json.dumps(json_data)

# TODO to move this method to a separate class
@deprecated(version="1.3.3", reason="This method will be removed")
Expand Down
6 changes: 3 additions & 3 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from nplinker.metabolomics.metabolomics import load_spectra
from nplinker.metabolomics.metabolomics import make_families
from nplinker.metabolomics.spectrum import Spectrum
from nplinker.pairedomics.downloader import STRAIN_MAPPINGS_FILENAME
from nplinker.strain_collection import StrainCollection
from nplinker.strains import Strain
from nplinker.utils import extract_archive
Expand Down Expand Up @@ -44,9 +45,8 @@ def spec_dict() -> dict[str, Spectrum]:

@pytest.fixture
def collection_from_file() -> StrainCollection:
filename = DATA_DIR / "strain_mappings.csv"
sut = StrainCollection()
sut.add_from_file(filename)
filename = DATA_DIR / STRAIN_MAPPINGS_FILENAME
sut = StrainCollection().read_json(filename)
return sut


Expand Down
27 changes: 0 additions & 27 deletions tests/data/strain_mappings.csv

This file was deleted.

Loading
Loading