Skip to content

Commit

Permalink
Add schema validation for json data (#167)
Browse files Browse the repository at this point in the history
* remove useless item "count" from genome-bgc mappings

* add schemas

* include schemas as package data

* add tests for schemas

* add json validation to GenomeStatus methods

* add json validation for genome bgc mappings

* add json validation for strain mappings

* disallow empty list of bgc ids for genome bgc mappings

* add version to test data to match schema

* add validation for PODP json data

* fix podp test data to pass json validation

* fix installation failure caused by deprecated sklearn issue in bigscape

* update test json data to match schema requirement

* fix data frame dtype bug to pass unit test for `ClassMatches`
  • Loading branch information
CunliangGeng committed Aug 9, 2023
1 parent 817dabd commit e1973da
Show file tree
Hide file tree
Showing 20 changed files with 990 additions and 42 deletions.
7 changes: 1 addition & 6 deletions bin/install-nplinker-deps
Original file line number Diff line number Diff line change
Expand Up @@ -121,19 +121,14 @@ pip install -q -U pip setuptools
#------------------------------------------------------------------------------

#--- Install BigScape
## Note: DO NOT pip install bigscape until its modular version is released
echo "🔥 Start installing BigScape ..."
# TODO: use original repo when multiprocessing bug fixed
[[ -d BiG-SCAPE ]] || git clone https://github.com/medema-group/BiG-SCAPE.git
# [[ -d BiG-SCAPE ]] || git clone https://github.com/CunliangGeng/BiG-SCAPE.git
cd BiG-SCAPE
git config --add advice.detachedHead false # disable advice
git config pull.ff only
git checkout master
git pull
git checkout de55e9c0cecae9648320308f98d3897f9aef3a0a # tag v1.1.5
# git checkout dev
# git pull
git checkout 99d07e57882fc5fa6780f8254823fd3d1abf3bc6 # Commits on Jul 21, 2023
pip install -q -U -r requirements.txt
chmod 754 bigscape.py
chmod 664 domains_color_file.tsv
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ namespaces = true # enable data directory to be identified

[tool.setuptools.package-data]
"nplinker.data" = [ "*" ]
"nplinker.schemas" = [ "*" ]

[tool.pytest.ini_options]
minversion = "6.0"
Expand Down
2 changes: 1 addition & 1 deletion src/nplinker/class_info/class_matches.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,7 @@ def _get_scoring_tables(self):
class_matching_tables[chem_key] = {}
class_matching_counts[chem_key] = {}
# add matching tables as DataFrames
counts_df = pd.DataFrame.from_dict(counts, dtype=int)
counts_df = pd.DataFrame.from_dict(counts)
class_matching_tables[bgc_key][chem_key] = (
counts_df / counts_df.sum(axis=0)).fillna(0)
class_matching_counts[bgc_key][chem_key] = counts_df.fillna(0)
Expand Down
16 changes: 10 additions & 6 deletions src/nplinker/genomics/genomics.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,10 @@
from os import PathLike
from pathlib import Path
from deprecated import deprecated
from jsonschema import validate
from nplinker.globals import GENOME_BGC_MAPPINGS_FILENAME
from nplinker.logconfig import LogConfig
from nplinker.schemas import GENOME_BGC_MAPPINGS_SCHEMA
from nplinker.strain_collection import StrainCollection
from nplinker.utils import list_dirs
from nplinker.utils import list_files
Expand Down Expand Up @@ -44,19 +46,21 @@ def generate_mappings_genome_id_bgc_id(
bgc_ids = [
bgc_id for f in bgc_files if (bgc_id := Path(f).stem) != genome_id
]
genome_bgc_mappings[genome_id] = bgc_ids
if bgc_ids:
genome_bgc_mappings[genome_id] = bgc_ids
else:
logger.warning("No BGC files found in %s", subdir)

# sort mappings by genome_id and construct json data
genome_bgc_mappings = dict(sorted(genome_bgc_mappings.items()))
json_data = [{
"genome_ID": k,
"BGC_ID": v
} for k, v in genome_bgc_mappings.items()]
json_data = {
"mappings": json_data,
"count": len(json_data),
"version": "1.0"
}
json_data = {"mappings": json_data, "version": "1.0"}

# validate json data
validate(instance=json_data, schema=GENOME_BGC_MAPPINGS_SCHEMA)

if output_file is None:
output_file = bgc_dir / GENOME_BGC_MAPPINGS_FILENAME
Expand Down
20 changes: 15 additions & 5 deletions src/nplinker/pairedomics/podp_antismash_downloader.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,19 @@
import json
from os import PathLike
from pathlib import Path
import re
import time
from urllib.error import HTTPError
from os import PathLike
from pathlib import Path
import httpx
from bs4 import BeautifulSoup
from bs4 import NavigableString
from bs4 import Tag
import httpx
from jsonschema import validate
from nplinker.genomics.antismash import download_and_extract_antismash_data
from nplinker.globals import GENOME_STATUS_FILENAME
from nplinker.logconfig import LogConfig
from nplinker.schemas import GENOME_STATUS_SCHEMA


logger = LogConfig.getLogger(__name__)

Expand Down Expand Up @@ -64,6 +67,10 @@ def read_json(file: str | PathLike) -> dict[str, 'GenomeStatus']:
if Path(file).exists():
with open(file, "r") as f:
data = json.load(f)

# validate json data before using it
validate(data, schema=GENOME_STATUS_SCHEMA)

genome_status_dict = {
gs["original_id"]: GenomeStatus(**gs)
for gs in data["genome_status"]
Expand All @@ -90,6 +97,10 @@ def to_json(genome_status_dict: dict[str, 'GenomeStatus'],
"""
gs_list = [gs._to_dict() for gs in genome_status_dict.values()]
json_data = {"genome_status": gs_list, "version": "1.0"}

# validate json object before dumping
validate(json_data, schema=GENOME_STATUS_SCHEMA)

if file is not None:
with open(file, "w") as f:
json.dump(json_data, f)
Expand Down Expand Up @@ -206,8 +217,7 @@ def podp_download_and_extract_antismash_data(
logger.warning('Failed to successfully retrieve ANY genome data!')


def get_best_available_genome_id(
genome_id_data: dict[str, str]) -> str | None:
def get_best_available_genome_id(genome_id_data: dict[str, str]) -> str | None:
"""Get the best available ID from genome_id_data dict.
Args:
Expand Down
11 changes: 11 additions & 0 deletions src/nplinker/pairedomics/strain_mappings_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,11 @@
import logging
from os import PathLike
from pathlib import Path
from jsonschema import validate
from nplinker.metabolomics.gnps.gnps_file_mapping_loader import \
GNPSFileMappingLoader
from nplinker.schemas import GENOME_BGC_MAPPINGS_SCHEMA
from nplinker.schemas import validate_podp_json
from nplinker.strain_collection import StrainCollection
from nplinker.strains import Strain
from .podp_antismash_downloader import GenomeStatus
Expand Down Expand Up @@ -135,6 +138,8 @@ def extract_mappings_strain_id_original_genome_id(
with open(podp_project_json_file, 'r') as f:
json_data = json.load(f)

validate_podp_json(json_data)

for record in json_data['genomes']:
strain_id = record['genome_label']
genome_id = get_best_available_genome_id(record['genome_ID'])
Expand Down Expand Up @@ -191,6 +196,10 @@ def extract_mappings_resolved_genome_id_bgc_id(
"""
with open(genome_bgc_mappings_file, 'r') as f:
json_data = json.load(f)

# validate the JSON data
validate(json_data, GENOME_BGC_MAPPINGS_SCHEMA)

return {
mapping["genome_ID"]: set(mapping["BGC_ID"])
for mapping in json_data['mappings']
Expand Down Expand Up @@ -263,6 +272,8 @@ def extract_mappings_strain_id_ms_filename(
with open(podp_project_json_file, 'r') as f:
json_data = json.load(f)

validate_podp_json(json_data)

# Extract mappings strain id <-> metabolomics filename
for record in json_data['genome_metabolome_links']:
strain_id = record['genome_label']
Expand Down
23 changes: 23 additions & 0 deletions src/nplinker/schemas/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import json
import logging
from pathlib import Path
from .utils import PODP_ADAPTED_SCHEMA
from .utils import validate_podp_json


# Library convention: attach a NullHandler so this package emits no log
# output of its own unless the application configures logging.
logging.getLogger(__name__).addHandler(logging.NullHandler())

__all__ = [
    'GENOME_STATUS_SCHEMA', 'GENOME_BGC_MAPPINGS_SCHEMA',
    'STRAIN_MAPPINGS_SCHEMA', 'PODP_ADAPTED_SCHEMA', 'validate_podp_json'
]

# Directory that contains this module; the schema files are opened from here.
SCHEMA_DIR = Path(__file__).parent


def _load_schema(filename):
    """Load and parse a JSON schema file located in ``SCHEMA_DIR``.

    Args:
        filename: Name of the schema file, relative to ``SCHEMA_DIR``.

    Returns:
        The parsed JSON document.

    Raises:
        FileNotFoundError: If the schema file does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON text is UTF-8 by specification; pass the encoding explicitly
    # instead of relying on the platform's locale-dependent default.
    with open(SCHEMA_DIR / filename, 'r', encoding='utf-8') as f:
        return json.load(f)


# Schemas are loaded once at import time and exposed as module constants.
GENOME_STATUS_SCHEMA = _load_schema("genome_status_schema.json")
GENOME_BGC_MAPPINGS_SCHEMA = _load_schema("genome_bgc_mappings_schema.json")
STRAIN_MAPPINGS_SCHEMA = _load_schema("strain_mappings_schema.json")
53 changes: 53 additions & 0 deletions src/nplinker/schemas/genome_bgc_mappings_schema.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
{
"$schema": "https://json-schema.org/draft/2020-12/schema",
"$id": "https://raw.githubusercontent.com/NPLinker/nplinker/main/src/nplinker/schemas/genome_bgc_mappings_schema.json",
"title": "Mappings from genome ID to BGC IDs",
"description": "A list of mappings from genome ID to BGC (biosynthetic gene cluster) IDs",
"type": "object",
"required": [
"mappings",
"version"
],
"properties": {
"mappings": {
"type": "array",
"title": "Mappings from genome ID to BGC IDs",
"description": "A list of mappings from genome ID to BGC IDs",
"items": {
"type": "object",
"required": [
"genome_ID",
"BGC_ID"
],
"properties": {
"genome_ID": {
"type": "string",
"title": "Genome ID",
"description": "The genome ID used in BGC database such as antiSMASH",
"minLength": 1
},
"BGC_ID": {
"type": "array",
"title": "BGC ID",
"description": "A list of BGC IDs",
"items": {
"type": "string",
"minLength": 1
},
"minItems": 1,
"uniqueItems": true
}
}
},
"minItems": 1,
"uniqueItems": true
},
"version": {
"type": "string",
"enum": [
"1.0"
]
}
},
"additionalProperties": false
}
59 changes: 59 additions & 0 deletions src/nplinker/schemas/genome_status_schema.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
{
"$schema": "https://json-schema.org/draft/2020-12/schema",
"$id": "https://raw.githubusercontent.com/NPLinker/nplinker/main/src/nplinker/schemas/genome_status_schema.json",
"title": "Status of genomes",
"description": "A list of genome status objects, each of which contains information about a single genome",
"type": "object",
"required": [
"genome_status",
"version"
],
"properties": {
"genome_status": {
"type": "array",
"title": "Genome status",
"description": "A list of genome status objects",
"items": {
"type": "object",
"required": [
"original_id",
"resolved_refseq_id",
"resolve_attempted",
"bgc_path"
],
"properties": {
"original_id": {
"type": "string",
"title": "Original ID",
"description": "The original ID of the genome",
"minLength": 1
},
"resolved_refseq_id": {
"type": "string",
"title": "Resolved RefSeq ID",
"description": "The RefSeq ID that was resolved for this genome"
},
"resolve_attempted": {
"type": "boolean",
"title": "Resolve Attempted",
"description": "Whether or not an attempt was made to resolve this genome"
},
"bgc_path": {
"type": "string",
"title": "BGC Path",
"description": "The path to the downloaded BGC file for this genome"
}
}
},
"minItems": 1,
"uniqueItems": true
},
"version": {
"type": "string",
"enum": [
"1.0"
]
}
},
"additionalProperties": false
}
Loading

0 comments on commit e1973da

Please sign in to comment.