Add schema validation for json data (#167)

* remove useless item "count" from genome-bgc mappings
* add schemas
* include schemas as package data
* add tests for schemas
* add json validation to GenomeStatus methods
* add json validation for genome bgc mappings
* add json validation for strain mappings
* disallow empty list of bgc ids for genome bgc mappings
* add version to test data to match schema
* add validation for PODP json data
* fix podp test data to pass json validation
* fix installation failure caused by deprecated sklearn issue in bigscape
* update test json data to match schema requirement
* fix data frame dtype bug to pass unit test for `ClassMatches`
NPLinker · Aug 9, 2023 · e1973da · e1973da
1 parent 817dabd
commit e1973da
Show file tree

Hide file tree

Showing 20 changed files with 990 additions and 42 deletions.
diff --git a/bin/install-nplinker-deps b/bin/install-nplinker-deps
@@ -121,19 +121,14 @@ pip install -q -U pip setuptools
 #------------------------------------------------------------------------------
 
 #--- Install BigScape
-## Note: DO NOT pip install bigscape until its modular version
 echo "🔥 Start installing BigScape ..."
-    # TODO: use original repo when multiprocessing bug fixed
     [[ -d BiG-SCAPE ]] || git clone https://github.com/medema-group/BiG-SCAPE.git
-    # [[ -d BiG-SCAPE ]] || git clone https://github.com/CunliangGeng/BiG-SCAPE.git
     cd BiG-SCAPE
     git config --add advice.detachedHead false  # disable advice
     git config pull.ff only
     git checkout master
     git pull
-    git checkout de55e9c0cecae9648320308f98d3897f9aef3a0a # tag v1.1.5
-    # git checkout dev
-    # git pull
+    git checkout 99d07e57882fc5fa6780f8254823fd3d1abf3bc6 # Commits on Jul 21, 2023
     pip install -q -U -r requirements.txt
     chmod 754 bigscape.py
     chmod 664 domains_color_file.tsv

diff --git a/pyproject.toml b/pyproject.toml
@@ -43,6 +43,7 @@ namespaces = true  # enable data directory to be identified
 
 [tool.setuptools.package-data]
 "nplinker.data" = [ "*" ]
+"nplinker.schemas" = [ "*" ]
 
 [tool.pytest.ini_options]
 minversion = "6.0"

diff --git a/src/nplinker/class_info/class_matches.py b/src/nplinker/class_info/class_matches.py
@@ -245,7 +245,7 @@ def _get_scoring_tables(self):
                     class_matching_tables[chem_key] = {}
                     class_matching_counts[chem_key] = {}
                 # add matching tables as DataFrames
-                counts_df = pd.DataFrame.from_dict(counts, dtype=int)
+                counts_df = pd.DataFrame.from_dict(counts)
                 class_matching_tables[bgc_key][chem_key] = (
                     counts_df / counts_df.sum(axis=0)).fillna(0)
                 class_matching_counts[bgc_key][chem_key] = counts_df.fillna(0)

diff --git a/src/nplinker/genomics/genomics.py b/src/nplinker/genomics/genomics.py
@@ -4,8 +4,10 @@
 from os import PathLike
 from pathlib import Path
 from deprecated import deprecated
+from jsonschema import validate
 from nplinker.globals import GENOME_BGC_MAPPINGS_FILENAME
 from nplinker.logconfig import LogConfig
+from nplinker.schemas import GENOME_BGC_MAPPINGS_SCHEMA
 from nplinker.strain_collection import StrainCollection
 from nplinker.utils import list_dirs
 from nplinker.utils import list_files
@@ -44,19 +46,21 @@ def generate_mappings_genome_id_bgc_id(
         bgc_ids = [
             bgc_id for f in bgc_files if (bgc_id := Path(f).stem) != genome_id
         ]
-        genome_bgc_mappings[genome_id] = bgc_ids
+        if bgc_ids:
+            genome_bgc_mappings[genome_id] = bgc_ids
+        else:
+            logger.warning("No BGC files found in %s", subdir)
 
     # sort mappings by genome_id and construct json data
     genome_bgc_mappings = dict(sorted(genome_bgc_mappings.items()))
     json_data = [{
         "genome_ID": k,
         "BGC_ID": v
     } for k, v in genome_bgc_mappings.items()]
-    json_data = {
-        "mappings": json_data,
-        "count": len(json_data),
-        "version": "1.0"
-    }
+    json_data = {"mappings": json_data, "version": "1.0"}
+
+    # validate json data
+    validate(instance=json_data, schema=GENOME_BGC_MAPPINGS_SCHEMA)
 
     if output_file is None:
         output_file = bgc_dir / GENOME_BGC_MAPPINGS_FILENAME

diff --git a/src/nplinker/pairedomics/podp_antismash_downloader.py b/src/nplinker/pairedomics/podp_antismash_downloader.py
@@ -1,16 +1,19 @@
 import json
+from os import PathLike
+from pathlib import Path
 import re
 import time
 from urllib.error import HTTPError
-from os import PathLike
-from pathlib import Path
-import httpx
 from bs4 import BeautifulSoup
 from bs4 import NavigableString
 from bs4 import Tag
+import httpx
+from jsonschema import validate
 from nplinker.genomics.antismash import download_and_extract_antismash_data
 from nplinker.globals import GENOME_STATUS_FILENAME
 from nplinker.logconfig import LogConfig
+from nplinker.schemas import GENOME_STATUS_SCHEMA
+
 
 logger = LogConfig.getLogger(__name__)
 
@@ -64,6 +67,10 @@ def read_json(file: str | PathLike) -> dict[str, 'GenomeStatus']:
         if Path(file).exists():
             with open(file, "r") as f:
                 data = json.load(f)
+
+            # validate json data before using it
+            validate(data, schema=GENOME_STATUS_SCHEMA)
+
             genome_status_dict = {
                 gs["original_id"]: GenomeStatus(**gs)
                 for gs in data["genome_status"]
@@ -90,6 +97,10 @@ def to_json(genome_status_dict: dict[str, 'GenomeStatus'],
         """
         gs_list = [gs._to_dict() for gs in genome_status_dict.values()]
         json_data = {"genome_status": gs_list, "version": "1.0"}
+
+        # validate json object before dumping
+        validate(json_data, schema=GENOME_STATUS_SCHEMA)
+
         if file is not None:
             with open(file, "w") as f:
                 json.dump(json_data, f)
@@ -206,8 +217,7 @@ def podp_download_and_extract_antismash_data(
         logger.warning('Failed to successfully retrieve ANY genome data!')
 
 
-def get_best_available_genome_id(
-        genome_id_data: dict[str, str]) -> str | None:
+def get_best_available_genome_id(genome_id_data: dict[str, str]) -> str | None:
     """Get the best available ID from genome_id_data dict.
 
     Args:

diff --git a/src/nplinker/pairedomics/strain_mappings_generator.py b/src/nplinker/pairedomics/strain_mappings_generator.py
@@ -2,8 +2,11 @@
 import logging
 from os import PathLike
 from pathlib import Path
+from jsonschema import validate
 from nplinker.metabolomics.gnps.gnps_file_mapping_loader import \
     GNPSFileMappingLoader
+from nplinker.schemas import GENOME_BGC_MAPPINGS_SCHEMA
+from nplinker.schemas import validate_podp_json
 from nplinker.strain_collection import StrainCollection
 from nplinker.strains import Strain
 from .podp_antismash_downloader import GenomeStatus
@@ -135,6 +138,8 @@ def extract_mappings_strain_id_original_genome_id(
     with open(podp_project_json_file, 'r') as f:
         json_data = json.load(f)
 
+    validate_podp_json(json_data)
+
     for record in json_data['genomes']:
         strain_id = record['genome_label']
         genome_id = get_best_available_genome_id(record['genome_ID'])
@@ -191,6 +196,10 @@ def extract_mappings_resolved_genome_id_bgc_id(
     """
     with open(genome_bgc_mappings_file, 'r') as f:
         json_data = json.load(f)
+
+    # validate the JSON data
+    validate(json_data, GENOME_BGC_MAPPINGS_SCHEMA)
+
     return {
         mapping["genome_ID"]: set(mapping["BGC_ID"])
         for mapping in json_data['mappings']
@@ -263,6 +272,8 @@ def extract_mappings_strain_id_ms_filename(
     with open(podp_project_json_file, 'r') as f:
         json_data = json.load(f)
 
+    validate_podp_json(json_data)
+
     # Extract mappings strain id <-> metabolomics filename
     for record in json_data['genome_metabolome_links']:
         strain_id = record['genome_label']

diff --git a/src/nplinker/schemas/__init__.py b/src/nplinker/schemas/__init__.py
@@ -0,0 +1,23 @@
+import json
+import logging
+from pathlib import Path
+from .utils import PODP_ADAPTED_SCHEMA
+from .utils import validate_podp_json
+
+
+logging.getLogger(__name__).addHandler(logging.NullHandler())
+
+__all__ = [
+    'GENOME_STATUS_SCHEMA', 'GENOME_BGC_MAPPINGS_SCHEMA',
+    'STRAIN_MAPPINGS_SCHEMA', 'PODP_ADAPTED_SCHEMA', 'validate_podp_json'
+]
+
+SCHEMA_DIR = Path(__file__).parent
+with open(SCHEMA_DIR / "genome_status_schema.json", 'r') as f:
+    GENOME_STATUS_SCHEMA = json.load(f)
+
+with open(SCHEMA_DIR / "genome_bgc_mappings_schema.json", 'r') as f:
+    GENOME_BGC_MAPPINGS_SCHEMA = json.load(f)
+
+with open(SCHEMA_DIR / "strain_mappings_schema.json", 'r') as f:
+    STRAIN_MAPPINGS_SCHEMA = json.load(f)
diff --git a/src/nplinker/schemas/genome_bgc_mappings_schema.json b/src/nplinker/schemas/genome_bgc_mappings_schema.json
@@ -0,0 +1,53 @@
+{
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "$id": "https://raw.githubusercontent.com/NPLinker/nplinker/main/src/nplinker/schemas/genome_bgc_mappings_schema.json",
+  "title": "Mappings from genome ID to BGC IDs",
+  "description": "A list of mappings from genome ID to BGC (biosynthetic gene cluster) IDs",
+  "type": "object",
+  "required": [
+    "mappings",
+    "version"
+  ],
+  "properties": {
+    "mappings": {
+      "type": "array",
+      "title": "Mappings from genome ID to BGC IDs",
+      "description": "A list of mappings from genome ID to BGC IDs",
+      "items": {
+        "type": "object",
+        "required": [
+          "genome_ID",
+          "BGC_ID"
+        ],
+        "properties": {
+          "genome_ID": {
+            "type": "string",
+            "title": "Genome ID",
+            "description": "The genome ID used in BGC database such as antiSMASH",
+            "minLength": 1
+          },
+          "BGC_ID": {
+            "type": "array",
+            "title": "BGC ID",
+            "description": "A list of BGC IDs",
+            "items": {
+              "type": "string",
+              "minLength": 1
+            },
+            "minItems": 1,
+            "uniqueItems": true
+          }
+        }
+      },
+      "minItems": 1,
+      "uniqueItems": true
+    },
+    "version": {
+      "type": "string",
+      "enum": [
+        "1.0"
+      ]
+    }
+  },
+  "additionalProperties": false
+}
diff --git a/src/nplinker/schemas/genome_status_schema.json b/src/nplinker/schemas/genome_status_schema.json
@@ -0,0 +1,59 @@
+{
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "$id": "https://raw.githubusercontent.com/NPLinker/nplinker/main/src/nplinker/schemas/genome_status_schema.json",
+  "title": "Status of genomes",
+  "description": "A list of genome status objects, each of which contains information about a single genome",
+  "type": "object",
+  "required": [
+    "genome_status",
+    "version"
+  ],
+  "properties": {
+    "genome_status": {
+      "type": "array",
+      "title": "Genome status",
+      "description": "A list of genome status objects",
+      "items": {
+        "type": "object",
+        "required": [
+          "original_id",
+          "resolved_refseq_id",
+          "resolve_attempted",
+          "bgc_path"
+        ],
+        "properties": {
+          "original_id": {
+            "type": "string",
+            "title": "Original ID",
+            "description": "The original ID of the genome",
+            "minLength": 1
+          },
+          "resolved_refseq_id": {
+            "type": "string",
+            "title": "Resolved RefSeq ID",
+            "description": "The RefSeq ID that was resolved for this genome"
+          },
+          "resolve_attempted": {
+            "type": "boolean",
+            "title": "Resolve Attempted",
+            "description": "Whether or not an attempt was made to resolve this genome"
+          },
+          "bgc_path": {
+            "type": "string",
+            "title": "BGC Path",
+            "description": "The path to the downloaded BGC file for this genome"
+          }
+        }
+      },
+      "minItems": 1,
+      "uniqueItems": true
+    },
+    "version": {
+      "type": "string",
+      "enum": [
+        "1.0"
+      ]
+    }
+  },
+  "additionalProperties": false
+}