diff --git a/src/nplinker/data/strain_id_mapping.csv b/src/nplinker/data/strain_id_mapping.csv deleted file mode 100644 index e69de29b..00000000 diff --git a/src/nplinker/genomics/mibig/mibig_loader.py b/src/nplinker/genomics/mibig/mibig_loader.py index 827fd709..8ca30bd6 100644 --- a/src/nplinker/genomics/mibig/mibig_loader.py +++ b/src/nplinker/genomics/mibig/mibig_loader.py @@ -31,7 +31,7 @@ def get_bgc_genome_mapping(self) -> dict[str, str]: Note that for MIBiG BGC, same value is used for BGC id and genome id. Users don't have to provide genome id for MIBiG BGCs in the - `strain_mappings.csv` file. + `strain_mappings.json` file. Returns: dict[str, str]: key is BGC id/accession, value is diff --git a/src/nplinker/loader.py b/src/nplinker/loader.py index 6de3bb2b..24cbc128 100644 --- a/src/nplinker/loader.py +++ b/src/nplinker/loader.py @@ -1,18 +1,19 @@ import glob import os -import sys from pathlib import Path +import sys from nplinker.annotations import load_annotations from nplinker.class_info.chem_classes import ChemClassPredictions from nplinker.class_info.class_matches import ClassMatches from nplinker.class_info.runcanopus import run_canopus from nplinker.genomics import load_gcfs from nplinker.genomics.antismash import AntismashBGCLoader -from nplinker.genomics.mibig import MibigBGCLoader from nplinker.genomics.mibig import download_and_extract_mibig_metadata +from nplinker.genomics.mibig import MibigBGCLoader from nplinker.logconfig import LogConfig from nplinker.metabolomics.metabolomics import load_dataset from nplinker.pairedomics.downloader import PODPDownloader +from nplinker.pairedomics.downloader import STRAIN_MAPPINGS_FILENAME from nplinker.pairedomics.runbigscape import run_bigscape from nplinker.strain_collection import StrainCollection @@ -109,6 +110,7 @@ def __init__(self, config_data): self.bgcs, self.gcfs, self.spectra, self.molfams = [], [], [], [] self.mibig_bgc_dict = {} self.product_types = [] + self.strains = StrainCollection() self.webapp_scoring_cutoff = self._config_webapp.get( 'tables_metcalf_threshold', self.TABLES_CUTOFF_DEFAULT) @@ -124,7 +126,7 @@ def validate(self): """Download data and build paths for local data""" # if remote loading mode, need to download the data here - # CG: for PODP workflow, strain_mappings.csv is generated in the download step + # CG: for PODP workflow, strain_mappings.json is generated in the download step if self._remote_loading: self._start_downloads() @@ -134,7 +136,7 @@ def validate(self): # 1. after downloading (manual preparation), some files alreay exist, some not # 2. get the default, constructed or real path for each file/dir (need refactoring) # - self._config_overrides.get() - # - os.path.join(self._root, 'strain_mappings.csv') + # - os.path.join(self._root, 'strain_mappings.json') # - find_via_glob() --> actually check if the file/dir exists # 3. check if (some) file/dir exists self._init_paths() @@ -169,7 +171,7 @@ def load(self): # or a complete failure to parse things, so bail out if len(self.strains) == 0: raise Exception( - 'Failed to find *ANY* strains, missing strain_mappings.csv?') + f'Failed to find *ANY* strains, missing {STRAIN_MAPPINGS_FILENAME}?') return True @@ -178,7 +180,7 @@ def _start_downloads(self): self._root = downloader.project_file_cache logger.debug('remote loading mode, configuring root=%s', self._root) # CG: to download both MET and GEN data - # CG: Continue to understand how strain_mappings.csv is generated + # CG: Continue to understand how strain_mappings.json is generated downloader.get( self._config_docker.get('run_bigscape', self.RUN_BIGSCAPE_DEFAULT), self._config_docker.get('extra_bigscape_parameters', @@ -188,7 +190,7 @@ def _start_downloads(self): def _init_paths(self): # 1. strain mapping are used for everything else so self.strain_mappings_file = self._config_overrides.get( - self.OR_STRAINS) or os.path.join(self._root, 'strain_mappings.csv') + self.OR_STRAINS) or os.path.join(self._root, STRAIN_MAPPINGS_FILENAME) self._init_metabolomics_paths() @@ -325,13 +327,6 @@ def _validate_paths(self): # TODO: this function should be refactored to Loader class def _load_strain_mappings(self): - # this file should be a csv file, one line per strain, containing a list - # of possible alternative IDs (the first one being the preferred ID). - # - # this is a per-dataset mapping, and is then merged with the global mapping file - # packaged with nplinker itself - self._init_global_strain_id_mapping() - # now load the dataset mapping in the same way # TODO: what happens in case of clashes (differing primary IDs?) # CG: the `if` never happens for PODP pipeline; for non-PODP pipeline, @@ -340,29 +335,18 @@ def _load_strain_mappings(self): if not os.path.exists(self.strain_mappings_file): # create an empty placeholder file and show a warning logger.warn( - 'No strain_mappings.csv file found! Attempting to create one') + 'No strain_mappings.json file found! Attempting to create one') self.strains.generate_strain_mappings(self.strain_mappings_file, self.antismash_dir) - # raise Exception('Unable to load strain_mappings file: {}'.format(self.strain_mappings_file)) else: - self.strains.add_from_file(self.strain_mappings_file) + sc = StrainCollection().read_json(self.strain_mappings_file) + for strain in sc: + self.strains.add(strain) logger.info('Loaded dataset strain IDs ({} total)'.format( len(self.strains))) return True - def _init_global_strain_id_mapping(self): - """The global strain mapping is predefined by the NPLinker package. - - See `src/nplinker/strain_id_mapping.csv` - """ - self.strains = StrainCollection() - global_strain_id_file = NPLINKER_APP_DATA_DIR.joinpath( - 'strain_id_mapping.csv') - self.strains.add_from_file(global_strain_id_file) - logger.info('Loaded global strain IDs ({} total)'.format( - len(self.strains))) - # TODO CG: replace deprecated load_dataset with GPNSLoader def _load_metabolomics(self): spec_dict, self.spectra, self.molfams, unknown_strains = load_dataset( diff --git a/src/nplinker/pairedomics/downloader.py b/src/nplinker/pairedomics/downloader.py index b5fb3362..0e9ce528 100644 --- a/src/nplinker/pairedomics/downloader.py +++ b/src/nplinker/pairedomics/downloader.py @@ -24,6 +24,8 @@ MIBIG_METADATA_URL = 'https://dl.secondarymetabolites.org/mibig/mibig_json_{}.tar.gz' MIBIG_BGC_METADATA_URL = 'https://mibig.secondarymetabolites.org/repository/{}/annotations.json' +STRAIN_MAPPINGS_FILENAME = 'strain_mappings.json' + class PODPDownloader(): # TODO: move to independent config file ---C.Geng @@ -120,7 +122,7 @@ def _init_folder_structure(self, local_cache): # init strain mapping filepath self.strain_mappings_file = os.path.join(self.project_file_cache, - 'strain_mappings.csv') + STRAIN_MAPPINGS_FILENAME) # init project paths self.all_project_json_file = os.path.join(self.local_cache, @@ -145,7 +147,7 @@ def get(self, do_bigscape, extra_bigscape_parameters, use_mibig, self._parse_genome_labels(self.project_json['genome_metabolome_links'], self.project_json['genomes']) - # CG: it generates the strain_mappings.csv file + # CG: it generates the strain_mappings.json file self.strains.generate_strain_mappings( self.strain_mappings_file, os.path.join(self.project_file_cache, 'antismash')) diff --git a/src/nplinker/strain_collection.py b/src/nplinker/strain_collection.py index 810c5f81..efdd1e61 100644 --- a/src/nplinker/strain_collection.py +++ b/src/nplinker/strain_collection.py @@ -1,4 +1,4 @@ -import csv +import json from os import PathLike from pathlib import Path from typing import Iterator @@ -110,36 +110,49 @@ def lookup(self, name: str) -> Strain: return self._strain_dict_name[name] raise KeyError(f"Strain {name} not found in strain collection.") - def add_from_file(self, file: str | PathLike) -> None: - """Add strains from a strain mapping file. - - A strain mapping file is a csv file with the first column being the - id of the strain, and the remaining columns being aliases for the - strain. + @classmethod + def read_json(cls, file: str | PathLike) -> 'StrainCollection': + """Read a strain mappings JSON file and return a StrainCollection object. Args: - file(str | PathLike): Path to strain mapping file (.csv). + file(str | PathLike): Path to the strain mappings JSON file. + + Returns: + StrainCollection: StrainCollection object. """ - with open(file) as f: - reader = csv.reader(f) - for names in reader: - if len(names) == 0: - continue - strain = Strain(names[0]) - for alias in names[1:]: - strain.add_alias(alias) - self.add(strain) - - def save_to_file(self, file: str | PathLike) -> None: - """Save strains to a strain mapping file (.csv). + with open(file, 'r') as f: + json_data = json.load(f) + + strain_collection = cls() + for data in json_data['strain_mappings']: + strain = Strain(data['strain_id']) + for alias in data['strain_alias']: + strain.add_alias(alias) + strain_collection.add(strain) + return strain_collection + + def to_json(self, file: str | PathLike | None = None) -> str | None: + """Convert the StrainCollection object to a JSON string. Args: - file(str | PathLike): Path to strain mapping file (.csv). + file(str | PathLike | None): Path to output JSON file. If None, + return the JSON string instead. + + Returns: + str | None: If `file` is None, return the JSON string. Otherwise, + write the JSON string to the given file. """ - with open(file, 'w') as f: - for strain in self: - ids = [strain.id] + list(strain.aliases) - f.write(','.join(ids) + '\n') + data_list = [{ + "strain_id": strain.id, + "strain_alias": list(strain.aliases) + } for strain in self] + json_data = {"strain_mappings": data_list, "version": "1.0"} + + if file is not None: + with open(file, 'w') as f: + json.dump(json_data, f) + return None + return json.dumps(json_data) # TODO to move this method to a separate class @deprecated(version="1.3.3", reason="This method will be removed") diff --git a/tests/conftest.py b/tests/conftest.py index 123e45d4..91e56efc 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -4,6 +4,7 @@ from nplinker.metabolomics.metabolomics import load_spectra from nplinker.metabolomics.metabolomics import make_families from nplinker.metabolomics.spectrum import Spectrum +from nplinker.pairedomics.downloader import STRAIN_MAPPINGS_FILENAME from nplinker.strain_collection import StrainCollection from nplinker.strains import Strain from nplinker.utils import extract_archive @@ -44,9 +45,8 @@ def spec_dict() -> dict[str, Spectrum]: @pytest.fixture def collection_from_file() -> StrainCollection: - filename = DATA_DIR / "strain_mappings.csv" - sut = StrainCollection() - sut.add_from_file(filename) + filename = DATA_DIR / STRAIN_MAPPINGS_FILENAME + sut = StrainCollection().read_json(filename) return sut diff --git a/tests/data/strain_mappings.csv b/tests/data/strain_mappings.csv deleted file mode 100644 index 8ae78b56..00000000 --- a/tests/data/strain_mappings.csv +++ /dev/null @@ -1,27 +0,0 @@ -Salinispora arenicola CNB527,42b.mzXML -Salinispora arenicola CNT005,NZ_AZWU01000037,NZ_KI911492,NZ_AZWU01000047,GCF_000514895,NZ_AZWU01000018,NZ_AZWU01000036,NZ_AZWU01000007,GCF_000514895.1,NZ_AZWU01000017,NZ_AZWU01000052,NZ_AZWU01000030,NZ_KI911493,NZ_AZWU01000054,NZ_AZWU01000031,NZ_KI911490,NZ_AZWU01000064,NZ_AZWU01000029,NZ_AZWU01000023,NZ_AZWU01000020,NZ_AZWU01000039,NZ_AZWU01000019,12c.mzXML,NZ_AZWU01000022,NZ_KI911494,NZ_AZWU01000045,12b.mzXML,NZ_AZWU01000021,NZ_AZWU01000038,NZ_AZWU01000046 -Salinispora arenicola CNS205,NC_009953,13a.mzXML,GCF_000018265,13b.mzXML,GCF_000018265.1 -Salinispora arenicola CNH646,NZ_AZWH01000027,NZ_AZWH01000004,NZ_AZWH01000050,NZ_AZWH01000017,NZ_AZWH01000001,NZ_AZWH01000023,NZ_AZWH01000010,GCF_000514635,NZ_AZWH01000005,NZ_AZWH01000022,NZ_AZWH01000015,GCF_000514635.1,15a.mzXML,NZ_AZWH01000048,15b.mzXML,NZ_AZWH01000019,NZ_AZWH01000008,NZ_AZWH01000009,NZ_AZWH01000024,NZ_AZWH01000035,NZ_AZWH01000040,NZ_AZWH01000016,NZ_AZWH01000014 -Salinispora arenicola CNQ748,NZ_AZWY01000070,NZ_AZWY01000017,NZ_AZWY01000050,NZ_AZWY01000016,NZ_KI911506,NZ_AZWY01000007,GCF_000514975.1,NZ_AZWY01000010,GCF_000514975,NZ_AZWY01000055,NZ_AZWY01000025,NZ_AZWY01000034,NZ_AZWY01000041,16b.mzXML,NZ_AZWY01000063,NZ_AZWY01000045,NZ_AZWY01000048,NZ_AZWY01000058,NZ_AZWY01000020,NZ_AZWY01000032,NZ_AZWY01000006,NZ_AZWY01000021,NZ_AZWY01000040,NZ_AZWY01000011,NZ_AZWY01000023,NZ_AZWY01000009,NZ_AZWY01000053,NZ_AZWY01000008,NZ_AZWY01000005,16a.mzXML,NZ_AZWY01000019 -Salinispora arenicola CNT849,GCF_000373825.1,NZ_KB892511,NZ_KB892473,NZ_KB892499,NZ_KB892514,NZ_KB892480,17a.mzXML,GCF_000373825,NZ_KB892479,NZ_KB892482,NZ_KB892490,NZ_KB892476,NZ_KB892481,NZ_KB892500,NZ_KB892485,NZ_KB892505,NZ_KB892497,NZ_KB892501,NZ_KB892477,NZ_KB892494,NZ_KB892475,NZ_KB892483,17b.mzXML,NZ_KB892478,NZ_KB892474,NZ_KB892507 -Salinispora arenicola CNP193,18a.mzXML,18b.mzXML -Salinispora arenicola CNX508,NZ_AZWT01000002,NZ_AZWT01000016,NZ_AZWT01000034,NZ_AZWT01000032,NZ_AZWT01000018,GCF_000514875,NZ_AZWT01000008,NZ_AZWT01000036,NZ_AZWT01000004,NZ_AZWT01000027,NZ_AZWT01000057,NZ_AZWT01000001,NZ_AZWT01000015,NZ_AZWT01000003,NZ_AZWT01000009,NZ_AZWT01000059,NZ_AZWT01000021,20a.mzXML,NZ_AZWT01000005,GCF_000514875.1,NZ_AZWT01000010,NZ_AZWT01000035,NZ_AZWT01000080,NZ_AZWT01000047,NZ_AZWT01000020 -Salinispora pacifica CNR114,9b.mzXML,NZ_AZWO01000060,NZ_AZWO01000029,GCF_000514775.1,NZ_AZWO01000016,NZ_AZWO01000034,NZ_AZWO01000025,NZ_AZWO01000036,NZ_AZWO01000028,NZ_AZWO01000053,NZ_AZWO01000038,NZ_AZWO01000072,NZ_AZWO01000011,NZ_AZWO01000056,NZ_AZWO01000007,NZ_AZWO01000061,GCF_000514775,NZ_KI911467,NZ_AZWO01000019,NZ_AZWO01000008,NZ_AZWO01000004,NZ_AZWO01000048,9a.mzXML,NZ_AZWO01000014,NZ_KI911468,NZ_AZWO01000027 -Salinispora pacifica CNR894,41a.mzXML -Salinispora pacifica CNR942,NZ_KB894861,NZ_KB894873,NZ_KB894848,NZ_KB894883,NZ_KB894871,NZ_KB894851,NZ_KB894875,NZ_KB894863,GCF_000374665.1,34c.mzXML,GCF_000374665,34b.mzXML,NZ_KB894878,NZ_KB894866,NZ_KB894859,NZ_KB894850,NZ_KB894860,NZ_KB894857,NZ_KB894856,NZ_KB894854,NZ_KB894865 -Salinispora pacifica CNS055,NZ_KB894967,NZ_KB894982,11a.mzXML,NZ_KB894966,NZ_KB895001,NZ_KB894969,NZ_KB894963,NZ_KB894993,NZ_KB894962,NZ_KB894970,NZ_KB895000,NZ_KB894991,NZ_KB894971,NZ_KB894979,NZ_KB894997,11c.mzXML,GCF_000374685,NZ_KB894980,NZ_KB895006,NZ_KB894992,GCF_000374685.1 -Salinispora pacifica CNS237,NZ_AUGH01000017,8a.mzXML,GCF_000424905.1,NZ_KE384268,NZ_AUGH01000011,NZ_AUGH01000032,NZ_AUGH01000028,NZ_KE384271,8b.mzXML,NZ_AUGH01000045,NZ_AUGH01000035,NZ_KE384269,NZ_AUGH01000027,NZ_AUGH01000015,NZ_KE384272,GCF_000424905,NZ_AUGH01000030,NZ_AUGH01000019 -Salinispora pacifica CNS863,22a.mzXML,22b.mzXML -Salinispora pacifica CNT003,26c.mzXML,26a.mzXML -Salinispora pacifica CNT029,NZ_AZWB01000024,GCF_000514515.1,NZ_AZWB01000013,NZ_AZWB01000012,NZ_AZWB01000006,NZ_AZWB01000016,NZ_KI911413,NZ_AZWB01000005,NZ_KI911416,NZ_KI911414,NZ_KI911412,27a.mzXML,GCF_000514515,27b.mzXML -Salinispora pacifica CNT138,19a.mzXML,19c.mzXML -Salinispora pacifica CNT148,25c.mzXML,25b.mzXML -Salinispora pacifica CNT150,23a.mzXML -Salinispora pacifica CNT851,35b.mzXML,35a.mzXML -Salinispora pacifica CNT855,NZ_AZWS01000028,NZ_AZWS01000004,29a.mzXML,NZ_KI911483,NZ_AZWS01000040,NZ_AZWS01000038,NZ_AZWS01000048,NZ_AZWS01000025,GCF_000514855,NZ_AZWS01000012,NZ_AZWS01000024,NZ_AZWS01000036,NZ_AZWS01000051,GCF_000514855.1,NZ_AZWS01000008,NZ_AZWS01000001,NZ_AZWS01000014,NZ_AZWS01000053,NZ_AZWS01000016,NZ_AZWS01000037,NZ_AZWS01000015,NZ_AZWS01000062,NZ_AZWS01000013,NZ_AZWS01000034 -Salinispora pacifica CNY202,7a.mzXML,7b.mzXML -Salinispora pacifica CNY330,10b.mzXML -Salinispora tropica CNB536,2a.mzXML,2b.mzXML -Salinispora tropica CNY012,5a.mzXML,5b.mzXML -Salinispora tropica CNS197,NZ_AZWK01000006,NZ_AZWK01000009,36a.mzXML,NZ_AZWK01000061,NZ_KI911452,NZ_AZWK01000027,NZ_AZWK01000038,GCF_000514695.1,NZ_AZWK01000002,NZ_KI911456,NZ_AZWK01000021,GCF_000514695,36b.mzXML,NZ_AZWK01000022,NZ_AZWK01000003,NZ_AZWK01000019,NZ_KI911458 -Salinispora tropica CNB440,38b.mzXML,38a.mzXML,GCF_000016425,NC_009380,GCF_000016425.1 diff --git a/tests/data/strain_mappings.json b/tests/data/strain_mappings.json new file mode 100644 index 00000000..49d2e055 --- /dev/null +++ b/tests/data/strain_mappings.json @@ -0,0 +1,447 @@ +{ + "strain_mappings": [ + { + "strain_id": "Salinispora arenicola CNB527", + "strain_alias": [ + "42b.mzXML" + ] + }, + { + "strain_id": "Salinispora arenicola CNT005", + "strain_alias": [ + "NZ_AZWU01000037", + "NZ_KI911492", + "NZ_AZWU01000047", + "GCF_000514895", + "NZ_AZWU01000018", + "NZ_AZWU01000036", + "NZ_AZWU01000007", + "GCF_000514895.1", + "NZ_AZWU01000017", + "NZ_AZWU01000052", + "NZ_AZWU01000030", + "NZ_KI911493", + "NZ_AZWU01000054", + "NZ_AZWU01000031", + "NZ_KI911490", + "NZ_AZWU01000064", + "NZ_AZWU01000029", + "NZ_AZWU01000023", + "NZ_AZWU01000020", + "NZ_AZWU01000039", + "NZ_AZWU01000019", + "12c.mzXML", + "NZ_AZWU01000022", + "NZ_KI911494", + "NZ_AZWU01000045", + "12b.mzXML", + "NZ_AZWU01000021", + "NZ_AZWU01000038", + "NZ_AZWU01000046" + ] + }, + { + "strain_id": "Salinispora arenicola CNS205", + "strain_alias": [ + "NC_009953", + "13a.mzXML", + "GCF_000018265", + "13b.mzXML", + "GCF_000018265.1" + ] + }, + { + "strain_id": "Salinispora arenicola CNH646", + "strain_alias": [ + "NZ_AZWH01000027", + "NZ_AZWH01000004", + "NZ_AZWH01000050", + "NZ_AZWH01000017", + "NZ_AZWH01000001", + "NZ_AZWH01000023", + "NZ_AZWH01000010", + "GCF_000514635", + "NZ_AZWH01000005", + "NZ_AZWH01000022", + "NZ_AZWH01000015", + "GCF_000514635.1", + "15a.mzXML", + "NZ_AZWH01000048", + "15b.mzXML", + "NZ_AZWH01000019", + "NZ_AZWH01000008", + "NZ_AZWH01000009", + "NZ_AZWH01000024", + "NZ_AZWH01000035", + "NZ_AZWH01000040", + "NZ_AZWH01000016", + "NZ_AZWH01000014" + ] + }, + { + "strain_id": "Salinispora arenicola CNQ748", + "strain_alias": [ + "NZ_AZWY01000070", + "NZ_AZWY01000017", + "NZ_AZWY01000050", + "NZ_AZWY01000016", + "NZ_KI911506", + "NZ_AZWY01000007", + "GCF_000514975.1", + "NZ_AZWY01000010", + "GCF_000514975", + "NZ_AZWY01000055", + "NZ_AZWY01000025", + "NZ_AZWY01000034", + "NZ_AZWY01000041", + "16b.mzXML", + "NZ_AZWY01000063", + "NZ_AZWY01000045", + "NZ_AZWY01000048", + "NZ_AZWY01000058", + "NZ_AZWY01000020", + "NZ_AZWY01000032", + "NZ_AZWY01000006", + "NZ_AZWY01000021", + "NZ_AZWY01000040", + "NZ_AZWY01000011", + "NZ_AZWY01000023", + "NZ_AZWY01000009", + "NZ_AZWY01000053", + "NZ_AZWY01000008", + "NZ_AZWY01000005", + "16a.mzXML", + "NZ_AZWY01000019" + ] + }, + { + "strain_id": "Salinispora arenicola CNT849", + "strain_alias": [ + "GCF_000373825.1", + "NZ_KB892511", + "NZ_KB892473", + "NZ_KB892499", + "NZ_KB892514", + "NZ_KB892480", + "17a.mzXML", + "GCF_000373825", + "NZ_KB892479", + "NZ_KB892482", + "NZ_KB892490", + "NZ_KB892476", + "NZ_KB892481", + "NZ_KB892500", + "NZ_KB892485", + "NZ_KB892505", + "NZ_KB892497", + "NZ_KB892501", + "NZ_KB892477", + "NZ_KB892494", + "NZ_KB892475", + "NZ_KB892483", + "17b.mzXML", + "NZ_KB892478", + "NZ_KB892474", + "NZ_KB892507" + ] + }, + { + "strain_id": "Salinispora arenicola CNP193", + "strain_alias": [ + "18a.mzXML", + "18b.mzXML" + ] + }, + { + "strain_id": "Salinispora arenicola CNX508", + "strain_alias": [ + "NZ_AZWT01000002", + "NZ_AZWT01000016", + "NZ_AZWT01000034", + "NZ_AZWT01000032", + "NZ_AZWT01000018", + "GCF_000514875", + "NZ_AZWT01000008", + "NZ_AZWT01000036", + "NZ_AZWT01000004", + "NZ_AZWT01000027", + "NZ_AZWT01000057", + "NZ_AZWT01000001", + "NZ_AZWT01000015", + "NZ_AZWT01000003", + "NZ_AZWT01000009", + "NZ_AZWT01000059", + "NZ_AZWT01000021", + "20a.mzXML", + "NZ_AZWT01000005", + "GCF_000514875.1", + "NZ_AZWT01000010", + "NZ_AZWT01000035", + "NZ_AZWT01000080", + "NZ_AZWT01000047", + "NZ_AZWT01000020" + ] + }, + { + "strain_id": "Salinispora pacifica CNR114", + "strain_alias": [ + "9b.mzXML", + "NZ_AZWO01000060", + "NZ_AZWO01000029", + "GCF_000514775.1", + "NZ_AZWO01000016", + "NZ_AZWO01000034", + "NZ_AZWO01000025", + "NZ_AZWO01000036", + "NZ_AZWO01000028", + "NZ_AZWO01000053", + "NZ_AZWO01000038", + "NZ_AZWO01000072", + "NZ_AZWO01000011", + "NZ_AZWO01000056", + "NZ_AZWO01000007", + "NZ_AZWO01000061", + "GCF_000514775", + "NZ_KI911467", + "NZ_AZWO01000019", + "NZ_AZWO01000008", + "NZ_AZWO01000004", + "NZ_AZWO01000048", + "9a.mzXML", + "NZ_AZWO01000014", + "NZ_KI911468", + "NZ_AZWO01000027" + ] + }, + { + "strain_id": "Salinispora pacifica CNR894", + "strain_alias": [ + "41a.mzXML" + ] + }, + { + "strain_id": "Salinispora pacifica CNR942", + "strain_alias": [ + "NZ_KB894861", + "NZ_KB894873", + "NZ_KB894848", + "NZ_KB894883", + "NZ_KB894871", + "NZ_KB894851", + "NZ_KB894875", + "NZ_KB894863", + "GCF_000374665.1", + "34c.mzXML", + "GCF_000374665", + "34b.mzXML", + "NZ_KB894878", + "NZ_KB894866", + "NZ_KB894859", + "NZ_KB894850", + "NZ_KB894860", + "NZ_KB894857", + "NZ_KB894856", + "NZ_KB894854", + "NZ_KB894865" + ] + }, + { + "strain_id": "Salinispora pacifica CNS055", + "strain_alias": [ + "NZ_KB894967", + "NZ_KB894982", + "11a.mzXML", + "NZ_KB894966", + "NZ_KB895001", + "NZ_KB894969", + "NZ_KB894963", + "NZ_KB894993", + "NZ_KB894962", + "NZ_KB894970", + "NZ_KB895000", + "NZ_KB894991", + "NZ_KB894971", + "NZ_KB894979", + "NZ_KB894997", + "11c.mzXML", + "GCF_000374685", + "NZ_KB894980", + "NZ_KB895006", + "NZ_KB894992", + "GCF_000374685.1" + ] + }, + { + "strain_id": "Salinispora pacifica CNS237", + "strain_alias": [ + "NZ_AUGH01000017", + "8a.mzXML", + "GCF_000424905.1", + "NZ_KE384268", + "NZ_AUGH01000011", + "NZ_AUGH01000032", + "NZ_AUGH01000028", + "NZ_KE384271", + "8b.mzXML", + "NZ_AUGH01000045", + "NZ_AUGH01000035", + "NZ_KE384269", + "NZ_AUGH01000027", + "NZ_AUGH01000015", + "NZ_KE384272", + "GCF_000424905", + "NZ_AUGH01000030", + "NZ_AUGH01000019" + ] + }, + { + "strain_id": "Salinispora pacifica CNS863", + "strain_alias": [ + "22a.mzXML", + "22b.mzXML" + ] + }, + { + "strain_id": "Salinispora pacifica CNT003", + "strain_alias": [ + "26c.mzXML", + "26a.mzXML" + ] + }, + { + "strain_id": "Salinispora pacifica CNT029", + "strain_alias": [ + "NZ_AZWB01000024", + "GCF_000514515.1", + "NZ_AZWB01000013", + "NZ_AZWB01000012", + "NZ_AZWB01000006", + "NZ_AZWB01000016", + "NZ_KI911413", + "NZ_AZWB01000005", + "NZ_KI911416", + "NZ_KI911414", + "NZ_KI911412", + "27a.mzXML", + "GCF_000514515", + "27b.mzXML" + ] + }, + { + "strain_id": "Salinispora pacifica CNT138", + "strain_alias": [ + "19a.mzXML", + "19c.mzXML" + ] + }, + { + "strain_id": "Salinispora pacifica CNT148", + "strain_alias": [ + "25c.mzXML", + "25b.mzXML" + ] + }, + { + "strain_id": "Salinispora pacifica CNT150", + "strain_alias": [ + "23a.mzXML" + ] + }, + { + "strain_id": "Salinispora pacifica CNT851", + "strain_alias": [ + "35b.mzXML", + "35a.mzXML" + ] + }, + { + "strain_id": "Salinispora pacifica CNT855", + "strain_alias": [ + "NZ_AZWS01000028", + "NZ_AZWS01000004", + "29a.mzXML", + "NZ_KI911483", + "NZ_AZWS01000040", + "NZ_AZWS01000038", + "NZ_AZWS01000048", + "NZ_AZWS01000025", + "GCF_000514855", + "NZ_AZWS01000012", + "NZ_AZWS01000024", + "NZ_AZWS01000036", + "NZ_AZWS01000051", + "GCF_000514855.1", + "NZ_AZWS01000008", + "NZ_AZWS01000001", + "NZ_AZWS01000014", + "NZ_AZWS01000053", + "NZ_AZWS01000016", + "NZ_AZWS01000037", + "NZ_AZWS01000015", + "NZ_AZWS01000062", + "NZ_AZWS01000013", + "NZ_AZWS01000034" + ] + }, + { + "strain_id": "Salinispora pacifica CNY202", + "strain_alias": [ + "7a.mzXML", + "7b.mzXML" + ] + }, + { + "strain_id": "Salinispora pacifica CNY330", + "strain_alias": [ + "10b.mzXML" + ] + }, + { + "strain_id": "Salinispora tropica CNB536", + "strain_alias": [ + "2a.mzXML", + "2b.mzXML" + ] + }, + { + "strain_id": "Salinispora tropica CNY012", + "strain_alias": [ + "5a.mzXML", + "5b.mzXML" + ] + }, + { + "strain_id": "Salinispora tropica CNS197", + "strain_alias": [ + "NZ_AZWK01000006", + "NZ_AZWK01000009", + "36a.mzXML", + "NZ_AZWK01000061", + "NZ_KI911452", + "NZ_AZWK01000027", + "NZ_AZWK01000038", + "GCF_000514695.1", + "NZ_AZWK01000002", + "NZ_KI911456", + "NZ_AZWK01000021", + "GCF_000514695", + "36b.mzXML", + "NZ_AZWK01000022", + "NZ_AZWK01000003", + "NZ_AZWK01000019", + "NZ_KI911458" + ] + }, + { + "strain_id": "Salinispora tropica CNB440", + "strain_alias": [ + "38b.mzXML", + "38a.mzXML", + "GCF_000016425", + "NC_009380", + "GCF_000016425.1" + ] + } + ], + "version": "1.0" +} diff --git a/tests/pairedomics/test_downloader.py b/tests/pairedomics/test_downloader.py index c0e49ac4..1ade1623 100644 --- a/tests/pairedomics/test_downloader.py +++ b/tests/pairedomics/test_downloader.py @@ -6,10 +6,10 @@ import pytest from pytest_lazyfixture import lazy_fixture from nplinker import utils - -from nplinker.pairedomics.downloader import PODPDownloader -from nplinker.pairedomics.downloader import _generate_gnps_download_url from nplinker.pairedomics.downloader import _execute_download +from nplinker.pairedomics.downloader import _generate_gnps_download_url +from nplinker.pairedomics.downloader import PODPDownloader +from nplinker.pairedomics.downloader import STRAIN_MAPPINGS_FILENAME from .. import DATA_DIR @@ -34,7 +34,7 @@ def test_default(expected: Path): assert sut.local_file_cache == str(expected / 'extracted') assert sut.project_file_cache == str(expected / 'extracted'/ gnps_id) - assert sut.strain_mappings_file == str(expected / 'extracted'/ gnps_id / 'strain_mappings.csv') + assert sut.strain_mappings_file == str(expected / 'extracted'/ gnps_id / STRAIN_MAPPINGS_FILENAME) assert os.path.exists(str(expected / 'extracted'/ gnps_id / 'antismash')) assert os.path.exists(str(expected / 'extracted'/ gnps_id / 'bigscape')) diff --git a/tests/test_loader.py b/tests/test_loader.py index d89c329c..05dcbab6 100644 --- a/tests/test_loader.py +++ b/tests/test_loader.py @@ -2,11 +2,13 @@ import shutil import pytest from nplinker.loader import DatasetLoader +from nplinker.loader import STRAIN_MAPPINGS_FILENAME from nplinker.metabolomics.gnps.gnps_extractor import GNPSExtractor from nplinker.metabolomics.gnps.gnps_spectrum_loader import GNPSSpectrumLoader from nplinker.strain_collection import StrainCollection from . import DATA_DIR + @pytest.fixture def config(): return { @@ -14,7 +16,7 @@ def config(): "root": DATA_DIR / "ProteoSAFe-METABOLOMICS-SNETS-c22f44b1-download_clustered_spectra", "platform_id": "", "overrides": { - "strain_mappings_file": str(DATA_DIR / "strain_mappings.csv") + "strain_mappings_file": str(DATA_DIR / STRAIN_MAPPINGS_FILENAME) } } } @@ -27,7 +29,7 @@ def config_with_new_gnps_extractor(): "root": DATA_DIR / "extracted", "platform_id": "", "overrides": { - "strain_mappings_file": str(DATA_DIR / "strain_mappings.csv") + "strain_mappings_file": str(DATA_DIR / STRAIN_MAPPINGS_FILENAME) } } } @@ -72,7 +74,7 @@ def test_load_metabolomics(config): def test_has_strain_mappings(config): sut = DatasetLoader(config) sut._init_paths() - assert sut.strain_mappings_file == str(DATA_DIR / "strain_mappings.csv") + assert sut.strain_mappings_file == str(DATA_DIR / STRAIN_MAPPINGS_FILENAME) def test_load_strain_mappings(config): @@ -81,7 +83,6 @@ def test_load_strain_mappings(config): sut._load_strain_mappings() actual = sut.strains - expected = StrainCollection() - expected.add_from_file(sut.strain_mappings_file) + expected = StrainCollection().read_json(sut.strain_mappings_file) assert actual == expected diff --git a/tests/test_strain_collection.py b/tests/test_strain_collection.py index dcdad3df..3458c51a 100644 --- a/tests/test_strain_collection.py +++ b/tests/test_strain_collection.py @@ -1,7 +1,7 @@ +import json import pytest from nplinker.strain_collection import StrainCollection from nplinker.strains import Strain -from tests import DATA_DIR @pytest.fixture @@ -75,19 +75,55 @@ def test_lookup(collection: StrainCollection, strain: Strain): collection.lookup("strain_not_exist") -def test_add_from_file(): - sut = StrainCollection() - sut.add_from_file(DATA_DIR / "strain_mappings.csv") - assert len(sut) == 27 - - -def test_save_to_file(collection: StrainCollection, tmp_path): - collection.add(Strain("strain_2")) - path = tmp_path / "test.csv" - collection.save_to_file(path) - assert path.exists() - with open(path) as f: - lines = f.readlines() - assert len(lines) == 2 - assert lines[0].strip() == "strain_1,strain_1_a" - assert lines[1].strip() == "strain_2" +@pytest.fixture +def json_file(tmp_path): + data = { + "strain_mappings": [{ + "strain_id": "strain_1", + "strain_alias": ["alias_1", "alias_2"] + }, { + "strain_id": "strain_2", + "strain_alias": ["alias_3", "alias_4"] + }] + } + file_path = tmp_path / "test.json" + with open(file_path, "w") as f: + json.dump(data, f) + return file_path + + +def test_read_json(json_file): + expected_strain_1 = Strain("strain_1") + expected_strain_1.add_alias("alias_1") + expected_strain_1.add_alias("alias_2") + expected_strain_2 = Strain("strain_2") + expected_strain_2.add_alias("alias_3") + expected_strain_2.add_alias("alias_4") + expected_collection = StrainCollection() + expected_collection.add(expected_strain_1) + expected_collection.add(expected_strain_2) + + actual_collection = StrainCollection.read_json(json_file) + assert actual_collection == expected_collection + + +def test_to_json(collection: StrainCollection, tmp_path): + # tests writing to string + expected_data = { + "strain_mappings": [{ + "strain_id": "strain_1", + "strain_alias": ["strain_1_a"] + }], + "version": + "1.0" + } + expected_json = json.dumps(expected_data) + actual_json = collection.to_json() + assert actual_json == expected_json + + # tests writing to file + file_path = tmp_path / "test.json" + collection.to_json(file_path) + with open(file_path, "r") as f: + actual_data = json.load(f) + assert actual_data == expected_data