diff --git a/bin/install-nplinker-deps b/bin/install-nplinker-deps index 0413f80d..59ca6b49 100755 --- a/bin/install-nplinker-deps +++ b/bin/install-nplinker-deps @@ -121,19 +121,14 @@ pip install -q -U pip setuptools #------------------------------------------------------------------------------ #--- Install BigScape -## Note: DO NOT pip install bigscape until its modular version echo "🔥 Start installing BigScape ..." - # TODO: use original repo when multiprocessing bug fixed [[ -d BiG-SCAPE ]] || git clone https://github.com/medema-group/BiG-SCAPE.git - # [[ -d BiG-SCAPE ]] || git clone https://github.com/CunliangGeng/BiG-SCAPE.git cd BiG-SCAPE git config --add advice.detachedHead false # disable advice git config pull.ff only git checkout master git pull - git checkout de55e9c0cecae9648320308f98d3897f9aef3a0a # tag v1.1.5 - # git checkout dev - # git pull + git checkout 99d07e57882fc5fa6780f8254823fd3d1abf3bc6 # Commits on Jul 21, 2023 pip install -q -U -r requirements.txt chmod 754 bigscape.py chmod 664 domains_color_file.tsv diff --git a/pyproject.toml b/pyproject.toml index 143cae51..67cc543e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,6 +43,7 @@ namespaces = true # enable data directory to be identified [tool.setuptools.package-data] "nplinker.data" = [ "*" ] +"nplinker.schemas" = [ "*" ] [tool.pytest.ini_options] minversion = "6.0" diff --git a/src/nplinker/class_info/class_matches.py b/src/nplinker/class_info/class_matches.py index c1ea9718..f329bfd9 100644 --- a/src/nplinker/class_info/class_matches.py +++ b/src/nplinker/class_info/class_matches.py @@ -245,7 +245,7 @@ def _get_scoring_tables(self): class_matching_tables[chem_key] = {} class_matching_counts[chem_key] = {} # add matching tables as DataFrames - counts_df = pd.DataFrame.from_dict(counts, dtype=int) + counts_df = pd.DataFrame.from_dict(counts) class_matching_tables[bgc_key][chem_key] = ( counts_df / counts_df.sum(axis=0)).fillna(0) class_matching_counts[bgc_key][chem_key] = 
counts_df.fillna(0) diff --git a/src/nplinker/genomics/genomics.py b/src/nplinker/genomics/genomics.py index a77a6417..3c1ec2ee 100644 --- a/src/nplinker/genomics/genomics.py +++ b/src/nplinker/genomics/genomics.py @@ -4,8 +4,10 @@ from os import PathLike from pathlib import Path from deprecated import deprecated +from jsonschema import validate from nplinker.globals import GENOME_BGC_MAPPINGS_FILENAME from nplinker.logconfig import LogConfig +from nplinker.schemas import GENOME_BGC_MAPPINGS_SCHEMA from nplinker.strain_collection import StrainCollection from nplinker.utils import list_dirs from nplinker.utils import list_files @@ -44,7 +46,10 @@ def generate_mappings_genome_id_bgc_id( bgc_ids = [ bgc_id for f in bgc_files if (bgc_id := Path(f).stem) != genome_id ] - genome_bgc_mappings[genome_id] = bgc_ids + if bgc_ids: + genome_bgc_mappings[genome_id] = bgc_ids + else: + logger.warning("No BGC files found in %s", subdir) # sort mappings by genome_id and construct json data genome_bgc_mappings = dict(sorted(genome_bgc_mappings.items())) @@ -52,11 +57,10 @@ def generate_mappings_genome_id_bgc_id( "genome_ID": k, "BGC_ID": v } for k, v in genome_bgc_mappings.items()] - json_data = { - "mappings": json_data, - "count": len(json_data), - "version": "1.0" - } + json_data = {"mappings": json_data, "version": "1.0"} + + # validate json data + validate(instance=json_data, schema=GENOME_BGC_MAPPINGS_SCHEMA) if output_file is None: output_file = bgc_dir / GENOME_BGC_MAPPINGS_FILENAME diff --git a/src/nplinker/pairedomics/podp_antismash_downloader.py b/src/nplinker/pairedomics/podp_antismash_downloader.py index 41e4e78a..5ba8d128 100644 --- a/src/nplinker/pairedomics/podp_antismash_downloader.py +++ b/src/nplinker/pairedomics/podp_antismash_downloader.py @@ -1,16 +1,19 @@ import json +from os import PathLike +from pathlib import Path import re import time from urllib.error import HTTPError -from os import PathLike -from pathlib import Path -import httpx from bs4 import 
BeautifulSoup from bs4 import NavigableString from bs4 import Tag +import httpx +from jsonschema import validate from nplinker.genomics.antismash import download_and_extract_antismash_data from nplinker.globals import GENOME_STATUS_FILENAME from nplinker.logconfig import LogConfig +from nplinker.schemas import GENOME_STATUS_SCHEMA + logger = LogConfig.getLogger(__name__) @@ -64,6 +67,10 @@ def read_json(file: str | PathLike) -> dict[str, 'GenomeStatus']: if Path(file).exists(): with open(file, "r") as f: data = json.load(f) + + # validate json data before using it + validate(data, schema=GENOME_STATUS_SCHEMA) + genome_status_dict = { gs["original_id"]: GenomeStatus(**gs) for gs in data["genome_status"] @@ -90,6 +97,10 @@ def to_json(genome_status_dict: dict[str, 'GenomeStatus'], """ gs_list = [gs._to_dict() for gs in genome_status_dict.values()] json_data = {"genome_status": gs_list, "version": "1.0"} + + # validate json object before dumping + validate(json_data, schema=GENOME_STATUS_SCHEMA) + if file is not None: with open(file, "w") as f: json.dump(json_data, f) @@ -206,8 +217,7 @@ def podp_download_and_extract_antismash_data( logger.warning('Failed to successfully retrieve ANY genome data!') -def get_best_available_genome_id( - genome_id_data: dict[str, str]) -> str | None: +def get_best_available_genome_id(genome_id_data: dict[str, str]) -> str | None: """Get the best available ID from genome_id_data dict. 
Args: diff --git a/src/nplinker/pairedomics/strain_mappings_generator.py b/src/nplinker/pairedomics/strain_mappings_generator.py index ff26dd64..091a37e0 100644 --- a/src/nplinker/pairedomics/strain_mappings_generator.py +++ b/src/nplinker/pairedomics/strain_mappings_generator.py @@ -2,8 +2,11 @@ import logging from os import PathLike from pathlib import Path +from jsonschema import validate from nplinker.metabolomics.gnps.gnps_file_mapping_loader import \ GNPSFileMappingLoader +from nplinker.schemas import GENOME_BGC_MAPPINGS_SCHEMA +from nplinker.schemas import validate_podp_json from nplinker.strain_collection import StrainCollection from nplinker.strains import Strain from .podp_antismash_downloader import GenomeStatus @@ -135,6 +138,8 @@ def extract_mappings_strain_id_original_genome_id( with open(podp_project_json_file, 'r') as f: json_data = json.load(f) + validate_podp_json(json_data) + for record in json_data['genomes']: strain_id = record['genome_label'] genome_id = get_best_available_genome_id(record['genome_ID']) @@ -191,6 +196,10 @@ def extract_mappings_resolved_genome_id_bgc_id( """ with open(genome_bgc_mappings_file, 'r') as f: json_data = json.load(f) + + # validate the JSON data + validate(json_data, GENOME_BGC_MAPPINGS_SCHEMA) + return { mapping["genome_ID"]: set(mapping["BGC_ID"]) for mapping in json_data['mappings'] @@ -263,6 +272,8 @@ def extract_mappings_strain_id_ms_filename( with open(podp_project_json_file, 'r') as f: json_data = json.load(f) + validate_podp_json(json_data) + # Extract mappings strain id <-> metabolomics filename for record in json_data['genome_metabolome_links']: strain_id = record['genome_label'] diff --git a/src/nplinker/schemas/__init__.py b/src/nplinker/schemas/__init__.py new file mode 100644 index 00000000..f758aa6f --- /dev/null +++ b/src/nplinker/schemas/__init__.py @@ -0,0 +1,23 @@ +import json +import logging +from pathlib import Path +from .utils import PODP_ADAPTED_SCHEMA +from .utils import validate_podp_json 
+ + +logging.getLogger(__name__).addHandler(logging.NullHandler()) + +__all__ = [ + 'GENOME_STATUS_SCHEMA', 'GENOME_BGC_MAPPINGS_SCHEMA', + 'STRAIN_MAPPINGS_SCHEMA', 'PODP_ADAPTED_SCHEMA', 'validate_podp_json' +] + +SCHEMA_DIR = Path(__file__).parent +with open(SCHEMA_DIR / "genome_status_schema.json", 'r') as f: + GENOME_STATUS_SCHEMA = json.load(f) + +with open(SCHEMA_DIR / "genome_bgc_mappings_schema.json", 'r') as f: + GENOME_BGC_MAPPINGS_SCHEMA = json.load(f) + +with open(SCHEMA_DIR / "strain_mappings_schema.json", 'r') as f: + STRAIN_MAPPINGS_SCHEMA = json.load(f) diff --git a/src/nplinker/schemas/genome_bgc_mappings_schema.json b/src/nplinker/schemas/genome_bgc_mappings_schema.json new file mode 100644 index 00000000..cb373855 --- /dev/null +++ b/src/nplinker/schemas/genome_bgc_mappings_schema.json @@ -0,0 +1,53 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://raw.githubusercontent.com/NPLinker/nplinker/main/src/nplinker/schemas/genome_bgc_mappings_schema.json", + "title": "Mappings from genome ID to BGC IDs", + "description": "A list of mappings from genome ID to BGC (biosynthetic gene cluster) IDs", + "type": "object", + "required": [ + "mappings", + "version" + ], + "properties": { + "mappings": { + "type": "array", + "title": "Mappings from genome ID to BGC IDs", + "description": "A list of mappings from genome ID to BGC IDs", + "items": { + "type": "object", + "required": [ + "genome_ID", + "BGC_ID" + ], + "properties": { + "genome_ID": { + "type": "string", + "title": "Genome ID", + "description": "The genome ID used in BGC database such as antiSMASH", + "minLength": 1 + }, + "BGC_ID": { + "type": "array", + "title": "BGC ID", + "description": "A list of BGC IDs", + "items": { + "type": "string", + "minLength": 1 + }, + "minItems": 1, + "uniqueItems": true + } + } + }, + "minItems": 1, + "uniqueItems": true + }, + "version": { + "type": "string", + "enum": [ + "1.0" + ] + } + }, + "additionalProperties": false +} 
diff --git a/src/nplinker/schemas/genome_status_schema.json b/src/nplinker/schemas/genome_status_schema.json new file mode 100644 index 00000000..470c74f4 --- /dev/null +++ b/src/nplinker/schemas/genome_status_schema.json @@ -0,0 +1,59 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://raw.githubusercontent.com/NPLinker/nplinker/main/src/nplinker/schemas/genome_status_schema.json", + "title": "Status of genomes", + "description": "A list of genome status objects, each of which contains information about a single genome", + "type": "object", + "required": [ + "genome_status", + "version" + ], + "properties": { + "genome_status": { + "type": "array", + "title": "Genome status", + "description": "A list of genome status objects", + "items": { + "type": "object", + "required": [ + "original_id", + "resolved_refseq_id", + "resolve_attempted", + "bgc_path" + ], + "properties": { + "original_id": { + "type": "string", + "title": "Original ID", + "description": "The original ID of the genome", + "minLength": 1 + }, + "resolved_refseq_id": { + "type": "string", + "title": "Resolved RefSeq ID", + "description": "The RefSeq ID that was resolved for this genome" + }, + "resolve_attempted": { + "type": "boolean", + "title": "Resolve Attempted", + "description": "Whether or not an attempt was made to resolve this genome" + }, + "bgc_path": { + "type": "string", + "title": "BGC Path", + "description": "The path to the downloaded BGC file for this genome" + } + } + }, + "minItems": 1, + "uniqueItems": true + }, + "version": { + "type": "string", + "enum": [ + "1.0" + ] + } + }, + "additionalProperties": false +} diff --git a/src/nplinker/schemas/podp_adapted_schema.json b/src/nplinker/schemas/podp_adapted_schema.json new file mode 100644 index 00000000..ec182ae4 --- /dev/null +++ b/src/nplinker/schemas/podp_adapted_schema.json @@ -0,0 +1,149 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "$id": 
"https://raw.githubusercontent.com/NPLinker/nplinker/main/src/nplinker/schemas/podp_adapted_schema.json", + "title": "Adapted Paired Omics Data Platform Schema for NPLinker", + "description": "This schema is adapted from PODP schema (https://pairedomicsdata.bioinformatics.nl/schema.json) for NPLinker. It's used to validate the input data for NPLinker. Thus, only required fields for NPLinker are kept in this schema, and some fields are modified to fit NPLinker's requirements.", + "type": "object", + "required": [ + "version", + "metabolomics", + "genomes", + "genome_metabolome_links" + ], + "properties": { + "version": { + "type": "string", + "readOnly": true, + "default": "3", + "enum": [ + "3" + ] + }, + "metabolomics": { + "type": "object", + "title": "2. Metabolomics Information", + "description": "Please provide basic information on the publicly available metabolomics project from which paired data is available. Currently, we allow for links to mass spectrometry data deposited in GNPS-MaSSIVE or MetaboLights.", + "properties": { + "project": { + "type": "object", + "required": [ + "molecular_network" + ], + "title": "GNPS-MassIVE", + "properties": { + "GNPSMassIVE_ID": { + "type": "string", + "title": "GNPS-MassIVE identifier", + "description": "Please provide the GNPS-MassIVE identifier of your metabolomics data set, e.g., MSV000078839.", + "pattern": "^MSV[0-9]{9}$" + }, + "MaSSIVE_URL": { + "type": "string", + "title": "Link to MassIVE upload", + "description": "Please provide the link to the MassIVE upload, e.g., https://gnps.ucsd.edu/ProteoSAFe/result.jsp?task=a507232a787243a5afd69a6c6fa1e508&view=advanced_view. Warning, there cannot be spaces in the URI.", + "format": "uri" + }, + "molecular_network": { + "type": "string", + "pattern": "^[0-9a-z]{32}$", + "title": "Molecular Network Task ID", + "description": "If you have run a Molecular Network on GNPS, please provide the task ID of the Molecular Network job. 
It can be found in the URL of the Molecular Networking job, e.g., in https://gnps.ucsd.edu/ProteoSAFe/status.jsp?task=c36f90ba29fe44c18e96db802de0c6b9 the task ID is c36f90ba29fe44c18e96db802de0c6b9." + } + } + } + }, + "required": [ + "project" + ], + "additionalProperties": true + }, + "genomes": { + "type": "array", + "title": "3. (Meta)genomics Information", + "description": "Please add all genomes and/or metagenomes for which paired data is available as separate entries.", + "items": { + "type": "object", + "required": [ + "genome_ID", + "genome_label" + ], + "properties": { + "genome_ID": { + "type": "object", + "title": "Genome accession", + "description": "At least one of the three identifiers is required.", + "anyOf": [ + { + "required": [ + "GenBank_accession" + ] + }, + { + "required": [ + "RefSeq_accession" + ] + }, + { + "required": [ + "JGI_Genome_ID" + ] + } + ], + "properties": { + "GenBank_accession": { + "type": "string", + "title": "GenBank accession number", + "description": "If the publicly available genome got a GenBank accession number assigned, e.g., AL645882, please provide it here. The genome sequence must be submitted to GenBank/ENA/DDBJ (and an accession number must be received) before this form can be filled out. In case of a whole genome sequence, please use master records. At least one identifier must be entered." + }, + "RefSeq_accession": { + "type": "string", + "title": "RefSeq accession number", + "description": "For example: NC_003888.3" + }, + "JGI_Genome_ID": { + "type": "string", + "title": "JGI IMG genome ID", + "description": "For example: 641228474" + } + } + }, + "genome_label": { + "type": "string", + "title": "Genome label", + "description": "Please assign a unique Genome Label for this genome or metagenome to help you recall it during the linking step. For example 'Streptomyces sp. CNB091'", + "minLength": 1 + } + } + }, + "minItems": 1 + }, + "genome_metabolome_links": { + "type": "array", + "title": "6. 
Genome - Proteome - Metabolome Links", + "description": "Create a linked pair by selecting the Genome Label and optional Proteome label as provided earlier. Subsequently links to the metabolomics data file belonging to that genome/proteome with appropriate experimental methods.", + "items": { + "type": "object", + "required": [ + "genome_label", + "metabolomics_file" + ], + "properties": { + "genome_label": { + "type": "string", + "title": "Genome/Metagenome", + "description": "Please select the Genome Label to be linked to a metabolomics data file." + }, + "metabolomics_file": { + "type": "string", + "title": "Location of metabolomics data file", + "description": "Please provide a direct link to the metabolomics data file location, e.g. ftp://massive.ucsd.edu/MSV000078839/spectrum/R5/CNB091_R5_M.mzXML found in the FTP download of a MassIVE dataset or https://www.ebi.ac.uk/metabolights/MTBLS307/files/Urine_44_fullscan1_pos.mzXML found in the Files section of a MetaboLights study. Warning, there cannot be spaces in the URI.", + "format": "uri" + } + }, + "additionalProperties": true + }, + "minItems": 1 + } + }, + "additionalProperties": true +} diff --git a/src/nplinker/schemas/strain_mappings_schema.json b/src/nplinker/schemas/strain_mappings_schema.json new file mode 100644 index 00000000..bc22821b --- /dev/null +++ b/src/nplinker/schemas/strain_mappings_schema.json @@ -0,0 +1,53 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://raw.githubusercontent.com/NPLinker/nplinker/main/src/nplinker/schemas/strain_mappings_schema.json", + "title": "Strain mappings", + "description": "A list of mappings from strain ID to strain aliases", + "type": "object", + "required": [ + "strain_mappings", + "version" + ], + "properties": { + "strain_mappings": { + "type": "array", + "title": "Strain mappings", + "description": "A list of strain mappings", + "items": { + "type": "object", + "required": [ + "strain_id", + "strain_alias" + ], + 
"properties": { + "strain_id": { + "type": "string", + "title": "Strain ID", + "description": "Strain ID, which could be any strain name or accession number", + "minLength": 1 + }, + "strain_alias": { + "type": "array", + "title": "Strain aliases", + "description": "A list of strain aliases, which could be any names that refer to the same strain", + "items": { + "type": "string", + "minLength": 1 + }, + "minItems": 1, + "uniqueItems": true + } + } + }, + "minItems": 1, + "uniqueItems": true + }, + "version": { + "type": "string", + "enum": [ + "1.0" + ] + } + }, + "additionalProperties": false +} diff --git a/src/nplinker/schemas/utils.py b/src/nplinker/schemas/utils.py new file mode 100644 index 00000000..4b1e4d59 --- /dev/null +++ b/src/nplinker/schemas/utils.py @@ -0,0 +1,29 @@ +import json +from pathlib import Path +from jsonschema import Draft7Validator + + +with open(Path(__file__).parent / "podp_adapted_schema.json", 'r') as f: + PODP_ADAPTED_SCHEMA = json.load(f) + + +def validate_podp_json(json_data: dict) -> None: + """ + Validate a dictionary of JSON data against the PODP JSON schema. + + All validation error messages are collected and raised as a single + ValueError. + + Parameters: + json_data (dict): The JSON data to validate. + + Raises: + ValueError: If the JSON data does not match the schema. 
+ """ + validator = Draft7Validator(PODP_ADAPTED_SCHEMA) + errors = sorted(validator.iter_errors(json_data), key=lambda e: e.path) + if errors: + error_messages = [f"{e.json_path}: {e.message}" for e in errors] + raise ValueError( + "Not match PODP adapted schema, here are the detailed error:\n - " + + "\n - ".join(error_messages)) diff --git a/src/nplinker/strain_collection.py b/src/nplinker/strain_collection.py index 3453b0a5..d221ce86 100644 --- a/src/nplinker/strain_collection.py +++ b/src/nplinker/strain_collection.py @@ -3,6 +3,8 @@ from pathlib import Path from typing import Iterator from deprecated import deprecated +from jsonschema import validate +from nplinker.schemas import STRAIN_MAPPINGS_SCHEMA from .logconfig import LogConfig from .strains import Strain from .utils import list_dirs @@ -124,6 +126,9 @@ def read_json(file: str | PathLike) -> 'StrainCollection': with open(file, 'r') as f: json_data = json.load(f) + # validate json data + validate(instance=json_data, schema=STRAIN_MAPPINGS_SCHEMA) + strain_collection = StrainCollection() for data in json_data['strain_mappings']: strain = Strain(data['strain_id']) @@ -149,6 +154,9 @@ def to_json(self, file: str | PathLike | None = None) -> str | None: } for strain in self] json_data = {"strain_mappings": data_list, "version": "1.0"} + # validate json data + validate(instance=json_data, schema=STRAIN_MAPPINGS_SCHEMA) + if file is not None: with open(file, 'w') as f: json.dump(json_data, f) diff --git a/tests/genomics/test_genomics.py b/tests/genomics/test_genomics.py index 1b15cff1..8b566b90 100644 --- a/tests/genomics/test_genomics.py +++ b/tests/genomics/test_genomics.py @@ -34,7 +34,7 @@ def test_generate_mappings_genome_id_bgc_id(tmp_path): assert mappings == mappings_with_outfile # then check the content - assert mappings["count"] == 2 + assert len(mappings["mappings"]) == 2 assert mappings["mappings"][0]["genome_ID"] == "GCF_000514515.1" assert len(mappings["mappings"][0]["BGC_ID"]) == 20 @@ -51,6 
+51,25 @@ def test_generate_mappings_genome_id_bgc_id(tmp_path): (bgc_dir / GENOME_BGC_MAPPINGS_FILENAME).unlink() +def test_generate_mappings_genome_id_bgc_id_empty_dir(tmp_path, caplog): + # prepare dir and file + bgc_dir = tmp_path / "GCF_1" + bgc_file = bgc_dir / "BGC_1.gbk" + bgc_dir.mkdir() + bgc_file.touch() + empty_dir = tmp_path / "empty_dir" + empty_dir.mkdir() + + generate_mappings_genome_id_bgc_id(tmp_path) + assert "No BGC files found" in caplog.text + + with open(tmp_path / GENOME_BGC_MAPPINGS_FILENAME) as f: + mappings = json.load(f) + assert len(mappings["mappings"]) == 1 + assert mappings["mappings"][0]["genome_ID"] == "GCF_1" + assert mappings["mappings"][0]["BGC_ID"] == ["BGC_1"] + + @pytest.fixture def strain_collection() -> StrainCollection: sc = StrainCollection() diff --git a/tests/pairedomics/test_strain_mappings_generator.py b/tests/pairedomics/test_strain_mappings_generator.py index c1f87a26..af3d6ae6 100644 --- a/tests/pairedomics/test_strain_mappings_generator.py +++ b/tests/pairedomics/test_strain_mappings_generator.py @@ -94,7 +94,20 @@ def test_extract_mappings_strain_id_original_genome_id(tmp_path): "RefSeq_accession": "id3" } }, - ] + ], + "metabolomics": { + "project": { + "molecular_network": "01234567890123456789012345678901" + } + }, + "genome_metabolome_links": [ + { + "metabolomics_file": "ftp://example.org/001.mzXML", + "genome_label": "strain1" + }, + ], + "version": + "3" } test_file = tmp_path / "test_data.json" with open(test_file, "w") as f: @@ -129,7 +142,9 @@ def test_extract_mappings_original_genome_id_resolved_genome_id(tmp_path): "resolve_attempted": True, "bgc_path": "" }, - ] + ], + "version": + "1.0" } test_file = tmp_path / "test_data.json" with open(test_file, "w") as f: @@ -143,29 +158,20 @@ def test_extract_mappings_original_genome_id_resolved_genome_id(tmp_path): def test_extract_mappings_resolved_genome_id_bgc_id(tmp_path): test_data = { - "mappings": [ - { - "genome_ID": "id1", - "BGC_ID": ["bgc1", "bgc2"] 
- }, - { - "genome_ID": "id2", - "BGC_ID": ["bgc3"] - }, - { - "genome_ID": "id3", - "BGC_ID": [] - }, - ] + "mappings": [{ + "genome_ID": "id1", + "BGC_ID": ["bgc1", "bgc2"] + }, { + "genome_ID": "id2", + "BGC_ID": ["bgc3"] + }], + "version": + "1.0" } test_file = tmp_path / "test_data.json" with open(test_file, "w") as f: json.dump(test_data, f) - expected_result = { - "id1": {"bgc1", "bgc2"}, - "id2": {"bgc3"}, - "id3": set(), - } + expected_result = {"id1": {"bgc1", "bgc2"}, "id2": {"bgc3"}} assert extract_mappings_resolved_genome_id_bgc_id( test_file) == expected_result @@ -237,7 +243,22 @@ def test_extract_mappings_strain_id_ms_filename(tmp_path): "genome_label": "strain3", "metabolomics_file": "http://example.com/file4.mzXML" }, - ] + ], + "genomes": [ + { + "genome_label": "strain1", + "genome_ID": { + "RefSeq_accession": "id1" + } + }, + ], + "metabolomics": { + "project": { + "molecular_network": "01234567890123456789012345678901" + } + }, + "version": + "3" } test_file = tmp_path / "test_data.json" with open(test_file, "w") as f: diff --git a/tests/schemas/test_genome_bgc_mappings_schema.py b/tests/schemas/test_genome_bgc_mappings_schema.py new file mode 100644 index 00000000..4feb0243 --- /dev/null +++ b/tests/schemas/test_genome_bgc_mappings_schema.py @@ -0,0 +1,137 @@ +from jsonschema import validate +from jsonschema.exceptions import ValidationError +import pytest +from nplinker.schemas import GENOME_BGC_MAPPINGS_SCHEMA + + +# Note: +# The function `validate` will first verify that the provided schema is itself valid (SchemaError exception), +# and then check that the given data is valid against the schema (ValidationError exception). +# It's assumed that the schema is valid, so we only need to test the ValidationError exception. 
+# see https://python-jsonschema.readthedocs.io/en/stable/validate/#the-basics + +# Prepare invalid data +data_no_mappings = {"version": "1.0"} + +data_empty_mappings = {"mappings": [], "version": "1.0"} + +data_no_genome_id = { + "mappings": [{ + "BGC_ID": ["bgc1", "bgc2"] + }], + "version": "1.0" +} + +data_empty_genome_id = { + "mappings": [{ + "genome_ID": "", + "BGC_ID": ["bgc1"] + }], + "version": "1.0" +} + +data_invalid_genome_id = { + "mappings": [{ + "genome_ID": 1, + "BGC_ID": ["bgc1"] + }], + "version": "1.0" +} + +data_no_bgc_id = {"mappings": [{"genome_ID": "genome1"}], "version": "1.0"} + +data_empty_bgc_id_list = { + "mappings": [{ + "genome_ID": "genome1", + "BGC_ID": [] + }], + "version": "1.0" +} + +data_empty_bgc_id = { + "mappings": [{ + "genome_ID": "genome1", + "BGC_ID": [""] + }], + "version": "1.0" +} + +data_invalid_bgc_id = { + "mappings": [{ + "genome_ID": "genome1", + "BGC_ID": [1] + }], + "version": "1.0" +} + +data_duplicate_bgc_id = { + "mappings": [{ + "genome_ID": "genome1", + "BGC_ID": ["bgc1", "bgc1"] + }], + "version": "1.0" +} + +data_no_version = { + "mappings": [{ + "genome_ID": "genome1", + "BGC_ID": ["bgc1", "bgc2"] + }] +} + +data_empty_version = { + "mappings": [{ + "genome_ID": "genome1", + "BGC_ID": ["bgc1", "bgc2"] + }], + "version": "" +} + +data_invalid_version = { + "mappings": [{ + "genome_ID": "genome1", + "BGC_ID": ["bgc1"] + }], + "version": "1.0.0" +} + + +# Test schema against invalid data +@pytest.mark.parametrize( + "data, expected", + [[data_no_mappings, "'mappings' is a required property"], + [data_empty_mappings, "[] is too short"], + [data_no_genome_id, "'genome_ID' is a required property"], + [data_empty_genome_id, "'' is too short"], + [data_invalid_genome_id, "1 is not of type 'string'"], + [data_no_bgc_id, "'BGC_ID' is a required property"], + [data_empty_bgc_id_list, "[] is too short"], + [data_empty_bgc_id, "'' is too short"], + [data_invalid_bgc_id, "1 is not of type 'string'"], +
[data_duplicate_bgc_id, "['bgc1', 'bgc1'] has non-unique elements"], + [data_no_version, "'version' is a required property"], + [data_empty_version, "'' is not one of ['1.0']"], + [data_invalid_version, "'1.0.0' is not one of ['1.0']"]]) +def test_invalid_data(data, expected): + with pytest.raises(ValidationError) as e: + validate(data, GENOME_BGC_MAPPINGS_SCHEMA) + assert e.value.message == expected + + +# Test schema against valid data +def test_valid_data(): + data = { + "mappings": [{ + "genome_ID": "genome1", + "BGC_ID": ["bgc1", "bgc2"] + }, { + "genome_ID": "genome2", + "BGC_ID": ["bgc3"] + }], + "version": + "1.0" + } + try: + validate(data, GENOME_BGC_MAPPINGS_SCHEMA) + except ValidationError: + pytest.fail("Unexpected ValidationError") diff --git a/tests/schemas/test_genome_status_schema.py b/tests/schemas/test_genome_status_schema.py new file mode 100644 index 00000000..ed4b5ea3 --- /dev/null +++ b/tests/schemas/test_genome_status_schema.py @@ -0,0 +1,178 @@ +from jsonschema import validate +from jsonschema.exceptions import ValidationError +import pytest +from nplinker.schemas import GENOME_STATUS_SCHEMA + + +# Prepare invalid data +data_no_genome_status = {"version": "1.0"} + +data_empty_genome_status = {"genome_status": [], "version": "1.0"} + +data_no_original_id = { + "genome_status": [{ + "resolved_refseq_id": "id1_refseq", + "resolve_attempted": True, + "bgc_path": "" + }], + "version": + "1.0" +} + +data_empty_original_id = { + "genome_status": [{ + "original_id": "", + "resolved_refseq_id": "id1_refseq", + "resolve_attempted": True, + "bgc_path": "" + }], + "version": + "1.0" +} + +data_invalid_original_id = { + "genome_status": [{ + "original_id": 1, + "resolved_refseq_id": "id1_refseq", + "resolve_attempted": True, + "bgc_path": "" + }], + "version": + "1.0" +} + +data_no_resolved_refseq_id = { + "genome_status": [{ + "original_id": "id1", + "resolve_attempted": True, + "bgc_path": "" + }], + "version": + "1.0" +} +
+data_invalid_resolved_refseq_id = { + "genome_status": [{ + "original_id": "id1", + "resolved_refseq_id": 1, + "resolve_attempted": True, + "bgc_path": "" + }], + "version": + "1.0" +} + +data_no_resolve_attempted = { + "genome_status": [{ + "original_id": "id1", + "resolved_refseq_id": "id1_refseq", + "bgc_path": "" + }], + "version": + "1.0" +} + +data_invalid_resolve_attempted = { + "genome_status": [{ + "original_id": "id1", + "resolved_refseq_id": "id1_refseq", + "resolve_attempted": 1, + "bgc_path": "" + }], + "version": + "1.0" +} + +data_no_bgc_path = { + "genome_status": [{ + "original_id": "id1", + "resolved_refseq_id": "id1_refseq", + "resolve_attempted": True + }], + "version": + "1.0" +} + +data_invalid_bgc_path = { + "genome_status": [{ + "original_id": "id1", + "resolved_refseq_id": "id1_refseq", + "resolve_attempted": True, + "bgc_path": 1 + }], + "version": + "1.0" +} + +data_no_version = { + "genome_status": [{ + "strain_id": "strain1", + "strain_alias": ["alias1", "alias2"] + }] +} + +data_empty_version = { + "genome_status": [{ + "strain_id": "strain1", + "strain_alias": ["alias1", "alias2"] + }], + "version": + "" + "" +} + +data_invalid_version = { + "genome_status": [{ + "strain_id": "strain1", + "strain_alias": ["alias1"] + }], + "version": "1.0.0" +} + + +# Test schema against invalid data +@pytest.mark.parametrize( + "data, expected", + [[data_no_genome_status, "'genome_status' is a required property"], + [data_empty_genome_status, "[] is too short"], + [data_no_original_id, "'original_id' is a required property"], + [data_empty_original_id, "'' is too short"], + [data_invalid_original_id, "1 is not of type 'string'"], + [ + data_no_resolved_refseq_id, + "'resolved_refseq_id' is a required property" + ], [data_invalid_resolved_refseq_id, "1 is not of type 'string'"], + [data_no_resolve_attempted, "'resolve_attempted' is a required property"], + [data_invalid_resolve_attempted, "1 is not of type 'boolean'"], + [data_no_bgc_path,
"'bgc_path' is a required property"], + [data_invalid_bgc_path, "1 is not of type 'string'"], + [data_no_version, "'version' is a required property"], + [data_empty_version, "'' is not one of ['1.0']"], + [data_invalid_version, "'1.0.0' is not one of ['1.0']"]]) +def test_invalid_data(data, expected): + with pytest.raises(ValidationError) as e: + validate(data, GENOME_STATUS_SCHEMA) + assert e.value.message == expected + + +# Test schema against valid data +def test_valid_data(): + data = { + "genome_status": [{ + "original_id": "id1", + "resolved_refseq_id": "id1_refseq", + "resolve_attempted": True, + "bgc_path": "" + }, { + "original_id": "id2", + "resolved_refseq_id": "id2_refseq", + "resolve_attempted": False, + "bgc_path": "" + }], + "version": + "1.0" + } + try: + validate(data, GENOME_STATUS_SCHEMA) + except ValidationError: + pytest.fail("Unexpected ValidationError") diff --git a/tests/schemas/test_strain_mappings_schema.py b/tests/schemas/test_strain_mappings_schema.py new file mode 100644 index 00000000..897990d8 --- /dev/null +++ b/tests/schemas/test_strain_mappings_schema.py @@ -0,0 +1,140 @@ +from jsonschema import validate +from jsonschema.exceptions import ValidationError +import pytest +from nplinker.schemas import STRAIN_MAPPINGS_SCHEMA + + +# Prepare invalid data +data_no_mappings = {"version": "1.0"} + +data_empty_mappings = {"strain_mappings": [], "version": "1.0"} + +data_no_strain_id = { + "strain_mappings": [{ + "strain_alias": ["alias1", "alias2"] + }], + "version": "1.0" +} + +data_empty_strain_id = { + "strain_mappings": [{ + "strain_id": "", + "strain_alias": ["alias1"] + }], + "version": "1.0" +} + +data_invalid_strain_id = { + "strain_mappings": [{ + "strain_id": 1, + "strain_alias": ["alias1"] + }], + "version": "1.0" +} + +data_no_strain_alias = { + "strain_mappings": [{ + "strain_id": "strain1" + }], + "version": "1.0" +} + +data_empty_strain_alias_list = { + "strain_mappings": [{ + "strain_id": "strain1", + "strain_alias": [] + 
}],
+ "version": "1.0" +} + +data_empty_strain_alias = { + "strain_mappings": [{ + "strain_id": "strain1", + "strain_alias": [""] + }], + "version": "1.0" +} + +data_invalid_strain_alias = { + "strain_mappings": [{ + "strain_id": "strain1", + "strain_alias": [1] + }], + "version": "1.0" +} + +data_duplicate_strain_alias = { + "strain_mappings": [{ + "strain_id": "strain1", + "strain_alias": ["alias1", "alias1"] + }], + "version": + "1.0" +} + +data_no_version = { + "strain_mappings": [{ + "strain_id": "strain1", + "strain_alias": ["alias1", "alias2"] + }] +} + +data_empty_version = { + "strain_mappings": [{ + "strain_id": "strain1", + "strain_alias": ["alias1", "alias2"] + }], + "version": "" + "" +} + +data_invalid_version = { + "strain_mappings": [{ + "strain_id": "strain1", + "strain_alias": ["alias1"] + }], + "version": "1.0.0" +} + + +# Test schema against invalid data +@pytest.mark.parametrize( + "data, expected", + [[data_no_mappings, "'strain_mappings' is a required property"], + [data_empty_mappings, "[] is too short"], + [data_no_strain_id, "'strain_id' is a required property"], + [data_empty_strain_id, "'' is too short"], + [data_invalid_strain_id, "1 is not of type 'string'"], + [data_no_strain_alias, "'strain_alias' is a required property"], + [data_empty_strain_alias_list, "[] is too short"], + [data_empty_strain_alias, "'' is too short"], + [data_invalid_strain_alias, "1 is not of type 'string'"], + [ + data_duplicate_strain_alias, + "['alias1', 'alias1'] has non-unique elements" + ], [data_no_version, "'version' is a required property"], + [data_empty_version, "'' is not one of ['1.0']"], + [data_invalid_version, "'1.0.0' is not one of ['1.0']"]]) +def test_invalid_data(data, expected): + with pytest.raises(ValidationError) as e: + validate(data, STRAIN_MAPPINGS_SCHEMA) + assert e.value.message == expected + + +# Test schema against valid data +def test_valid_data(): + data = { + "strain_mappings": [{ + "strain_id": "strain1", + "strain_alias": 
["alias1", "alias2"] + }, { + "strain_id": "strain2", + "strain_alias": ["alias3"] + }], + "version": + "1.0" + } + try: + validate(data, STRAIN_MAPPINGS_SCHEMA) + except ValidationError: + pytest.fail("Unexpected ValidationError") diff --git a/tests/schemas/test_utils.py b/tests/schemas/test_utils.py new file mode 100644 index 00000000..46b7108c --- /dev/null +++ b/tests/schemas/test_utils.py @@ -0,0 +1,57 @@ +from jsonschema import Draft7Validator +import pytest +from nplinker.schemas import PODP_ADAPTED_SCHEMA +from nplinker.schemas import validate_podp_json + + +def test_podp_adapted_schema_itself(): + validator = Draft7Validator(Draft7Validator.META_SCHEMA) + errors = validator.iter_errors(PODP_ADAPTED_SCHEMA) + assert list(errors) == [] + + +def test_validate_podp_json_minimum_valid_data(): + # minimum valid data, containing only required fields + data = { + "version": + "3", + "metabolomics": { + "project": { + "molecular_network": "01234567890123456789012345678901" + }, + }, + "genomes": [ + { + "genome_label": "strain1", + "genome_ID": { + "RefSeq_accession": "GCF_1" + } + }, + ], + "genome_metabolome_links": [ + { + "metabolomics_file": "ftp://example.org/001.mzXML", + "genome_label": "strain1" + }, + ] + } + try: + validate_podp_json(data) + except ValueError: + pytest.fail('Unexpected ValueError') + + +def test_validate_podp_json_invalid_data(): + data = {} + with pytest.raises(ValueError) as e: + validate_podp_json(data) + + expected = """ +Not match PODP adapted schema, here are the detailed error: + - $: 'version' is a required property + - $: 'metabolomics' is a required property + - $: 'genomes' is a required property + - $: 'genome_metabolome_links' is a required property +""" + + assert e.value.args[0] == expected.strip() diff --git a/tests/test_strain_collection.py b/tests/test_strain_collection.py index 3458c51a..c809759c 100644 --- a/tests/test_strain_collection.py +++ b/tests/test_strain_collection.py @@ -84,7 +84,8 @@ def 
json_file(tmp_path): }, { "strain_id": "strain_2", "strain_alias": ["alias_3", "alias_4"] - }] + }], + "version": "1.0" } file_path = tmp_path / "test.json" with open(file_path, "w") as f: