Issue 152: replace csv with json for strain mappings file (#158)
* remove methods `add_from_file` and `save_to_file`

* add method `to_json`

* add method `read_json`

* add unit tests for new methods

* add global variable `STRAIN_MAPPINGS_FILENAME`

* remove obsolete method `_init_global_strain_id_mapping`

* fix reading strain mapping file

* change test data from csv to json
CunliangGeng committed Jul 6, 2023
1 parent 0a74dcd commit bea0af6
Showing 11 changed files with 569 additions and 113 deletions.
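For reference, here is a minimal strain_mappings.json in the new format, reconstructed from the `to_json` implementation further down in this diff. The strain id and aliases are made-up placeholders:

    {
        "strain_mappings": [
            {
                "strain_id": "STRAIN_01",
                "strain_alias": ["strain-01", "strain.01"]
            }
        ],
        "version": "1.0"
    }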
2 changes: 1 addition & 1 deletion src/nplinker/genomics/mibig/mibig_loader.py
@@ -31,7 +31,7 @@ def get_bgc_genome_mapping(self) -> dict[str, str]:
         Note that for MIBiG BGC, same value is used for BGC id and genome id.
         Users don't have to provide genome id for MIBiG BGCs in the
-        `strain_mappings.csv` file.
+        `strain_mappings.json` file.
 
         Returns:
             dict[str, str]: key is BGC id/accession, value is
42 changes: 13 additions & 29 deletions src/nplinker/loader.py
@@ -1,18 +1,19 @@
 import glob
 import os
-import sys
 from pathlib import Path
+import sys
 from nplinker.annotations import load_annotations
 from nplinker.class_info.chem_classes import ChemClassPredictions
 from nplinker.class_info.class_matches import ClassMatches
 from nplinker.class_info.runcanopus import run_canopus
 from nplinker.genomics import load_gcfs
 from nplinker.genomics.antismash import AntismashBGCLoader
-from nplinker.genomics.mibig import MibigBGCLoader
 from nplinker.genomics.mibig import download_and_extract_mibig_metadata
+from nplinker.genomics.mibig import MibigBGCLoader
 from nplinker.logconfig import LogConfig
 from nplinker.metabolomics.metabolomics import load_dataset
 from nplinker.pairedomics.downloader import PODPDownloader
+from nplinker.pairedomics.downloader import STRAIN_MAPPINGS_FILENAME
 from nplinker.pairedomics.runbigscape import run_bigscape
 from nplinker.strain_collection import StrainCollection

@@ -109,6 +110,7 @@ def __init__(self, config_data):
         self.bgcs, self.gcfs, self.spectra, self.molfams = [], [], [], []
         self.mibig_bgc_dict = {}
         self.product_types = []
+        self.strains = StrainCollection()
         self.webapp_scoring_cutoff = self._config_webapp.get(
             'tables_metcalf_threshold', self.TABLES_CUTOFF_DEFAULT)

@@ -124,7 +126,7 @@ def validate(self):
"""Download data and build paths for local data"""

# if remote loading mode, need to download the data here
# CG: for PODP workflow, strain_mappings.csv is generated in the download step
# CG: for PODP workflow, strain_mappings.json is generated in the download step
if self._remote_loading:
self._start_downloads()

@@ -134,7 +136,7 @@
         # 1. after downloading (manual preparation), some files already exist, some not
         # 2. get the default, constructed or real path for each file/dir (need refactoring)
         #    - self._config_overrides.get()
-        #    - os.path.join(self._root, 'strain_mappings.csv')
+        #    - os.path.join(self._root, 'strain_mappings.json')
         #    - find_via_glob() --> actually check if the file/dir exists
         # 3. check if (some) file/dir exists
         self._init_paths()
@@ -169,7 +171,7 @@ def load(self):
         # or a complete failure to parse things, so bail out
         if len(self.strains) == 0:
             raise Exception(
-                'Failed to find *ANY* strains, missing strain_mappings.csv?')
+                f'Failed to find *ANY* strains, missing {STRAIN_MAPPINGS_FILENAME}?')
 
         return True

@@ -178,7 +180,7 @@ def _start_downloads(self):
         self._root = downloader.project_file_cache
         logger.debug('remote loading mode, configuring root=%s', self._root)
         # CG: to download both MET and GEN data
-        # CG: Continue to understand how strain_mappings.csv is generated
+        # CG: Continue to understand how strain_mappings.json is generated
         downloader.get(
             self._config_docker.get('run_bigscape', self.RUN_BIGSCAPE_DEFAULT),
             self._config_docker.get('extra_bigscape_parameters',
@@ -188,7 +190,7 @@
     def _init_paths(self):
         # 1. strain mapping are used for everything else so
         self.strain_mappings_file = self._config_overrides.get(
-            self.OR_STRAINS) or os.path.join(self._root, 'strain_mappings.csv')
+            self.OR_STRAINS) or os.path.join(self._root, STRAIN_MAPPINGS_FILENAME)
 
         self._init_metabolomics_paths()

@@ -325,13 +327,6 @@ def _validate_paths(self):

     # TODO: this function should be refactored to Loader class
     def _load_strain_mappings(self):
-        # this file should be a csv file, one line per strain, containing a list
-        # of possible alternative IDs (the first one being the preferred ID).
-        #
-        # this is a per-dataset mapping, and is then merged with the global mapping file
-        # packaged with nplinker itself
-        self._init_global_strain_id_mapping()
-
         # now load the dataset mapping in the same way
         # TODO: what happens in case of clashes (differing primary IDs?)
         # CG: the `if` never happens for PODP pipeline; for non-PODP pipeline,
@@ -340,29 +335,18 @@
         if not os.path.exists(self.strain_mappings_file):
             # create an empty placeholder file and show a warning
             logger.warn(
-                'No strain_mappings.csv file found! Attempting to create one')
+                'No strain_mappings.json file found! Attempting to create one')
             self.strains.generate_strain_mappings(self.strain_mappings_file,
                                                   self.antismash_dir)
             # raise Exception('Unable to load strain_mappings file: {}'.format(self.strain_mappings_file))
         else:
-            self.strains.add_from_file(self.strain_mappings_file)
+            sc = StrainCollection().read_json(self.strain_mappings_file)
+            for strain in sc:
+                self.strains.add(strain)
         logger.info('Loaded dataset strain IDs ({} total)'.format(
             len(self.strains)))
 
         return True
 
-    def _init_global_strain_id_mapping(self):
-        """The global strain mapping is predefined by the NPLinker package.
-
-        See `src/nplinker/strain_id_mapping.csv`
-        """
-        self.strains = StrainCollection()
-        global_strain_id_file = NPLINKER_APP_DATA_DIR.joinpath(
-            'strain_id_mapping.csv')
-        self.strains.add_from_file(global_strain_id_file)
-        logger.info('Loaded global strain IDs ({} total)'.format(
-            len(self.strains)))
-
     # TODO CG: replace deprecated load_dataset with GPNSLoader
     def _load_metabolomics(self):
         spec_dict, self.spectra, self.molfams, unknown_strains = load_dataset(
6 changes: 4 additions & 2 deletions src/nplinker/pairedomics/downloader.py
@@ -24,6 +24,8 @@
 MIBIG_METADATA_URL = 'https://dl.secondarymetabolites.org/mibig/mibig_json_{}.tar.gz'
 MIBIG_BGC_METADATA_URL = 'https://mibig.secondarymetabolites.org/repository/{}/annotations.json'
 
+STRAIN_MAPPINGS_FILENAME = 'strain_mappings.json'
+
 
 class PODPDownloader():
     # TODO: move to independent config file ---C.Geng
@@ -120,7 +122,7 @@ def _init_folder_structure(self, local_cache):

         # init strain mapping filepath
         self.strain_mappings_file = os.path.join(self.project_file_cache,
-                                                 'strain_mappings.csv')
+                                                 STRAIN_MAPPINGS_FILENAME)
 
         # init project paths
         self.all_project_json_file = os.path.join(self.local_cache,
@@ -145,7 +147,7 @@ def get(self, do_bigscape, extra_bigscape_parameters, use_mibig,
         self._parse_genome_labels(self.project_json['genome_metabolome_links'],
                                   self.project_json['genomes'])
 
-        # CG: it generates the strain_mappings.csv file
+        # CG: it generates the strain_mappings.json file
         self.strains.generate_strain_mappings(
             self.strain_mappings_file,
             os.path.join(self.project_file_cache, 'antismash'))
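Downstream code can now build the mappings path from the shared constant instead of a hard-coded name. A minimal sketch, where `project_dir` is a made-up placeholder path standing in for a dataset directory:

    import os
    from nplinker.pairedomics.downloader import STRAIN_MAPPINGS_FILENAME

    project_dir = '/path/to/project'  # placeholder, not a real path
    # same construction as in _init_folder_structure above
    strain_mappings_file = os.path.join(project_dir, STRAIN_MAPPINGS_FILENAME)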
63 changes: 38 additions & 25 deletions src/nplinker/strain_collection.py
@@ -1,4 +1,4 @@
-import csv
+import json
 from os import PathLike
 from pathlib import Path
 from typing import Iterator
@@ -110,36 +110,49 @@ def lookup(self, name: str) -> Strain:
             return self._strain_dict_name[name]
         raise KeyError(f"Strain {name} not found in strain collection.")
 
-    def add_from_file(self, file: str | PathLike) -> None:
-        """Add strains from a strain mapping file.
-
-        A strain mapping file is a csv file with the first column being the
-        id of the strain, and the remaining columns being aliases for the
-        strain.
-
-        Args:
-            file(str | PathLike): Path to strain mapping file (.csv).
-        """
-        with open(file) as f:
-            reader = csv.reader(f)
-            for names in reader:
-                if len(names) == 0:
-                    continue
-                strain = Strain(names[0])
-                for alias in names[1:]:
-                    strain.add_alias(alias)
-                self.add(strain)
-
-    def save_to_file(self, file: str | PathLike) -> None:
-        """Save strains to a strain mapping file (.csv).
-
-        Args:
-            file(str | PathLike): Path to strain mapping file (.csv).
-        """
-        with open(file, 'w') as f:
-            for strain in self:
-                ids = [strain.id] + list(strain.aliases)
-                f.write(','.join(ids) + '\n')
+    @classmethod
+    def read_json(cls, file: str | PathLike) -> 'StrainCollection':
+        """Read a strain mappings JSON file and return a StrainCollection object.
+
+        Args:
+            file(str | PathLike): Path to the strain mappings JSON file.
+
+        Returns:
+            StrainCollection: StrainCollection object.
+        """
+        with open(file, 'r') as f:
+            json_data = json.load(f)
+
+        strain_collection = cls()
+        for data in json_data['strain_mappings']:
+            strain = Strain(data['strain_id'])
+            for alias in data['strain_alias']:
+                strain.add_alias(alias)
+            strain_collection.add(strain)
+        return strain_collection
+
+    def to_json(self, file: str | PathLike | None = None) -> str | None:
+        """Convert the StrainCollection object to a JSON string.
+
+        Args:
+            file(str | PathLike | None): Path to output JSON file. If None,
+                return the JSON string instead.
+
+        Returns:
+            str | None: If `file` is None, return the JSON string. Otherwise,
+                write the JSON string to the given file.
+        """
+        data_list = [{
+            "strain_id": strain.id,
+            "strain_alias": list(strain.aliases)
+        } for strain in self]
+        json_data = {"strain_mappings": data_list, "version": "1.0"}
+
+        if file is not None:
+            with open(file, 'w') as f:
+                json.dump(json_data, f)
+            return None
+        return json.dumps(json_data)
 
     # TODO to move this method to a separate class
     @deprecated(version="1.3.3", reason="This method will be removed")
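A minimal round-trip sketch of the new API. The strain id and alias are hypothetical placeholders; `read_json` is a classmethod, so it can also be called on the class itself rather than an instance:

    from nplinker.strain_collection import StrainCollection
    from nplinker.strains import Strain

    strain = Strain('STRAIN_01')   # hypothetical strain id
    strain.add_alias('strain-01')  # hypothetical alias

    sc = StrainCollection()
    sc.add(strain)

    sc.to_json('strain_mappings.json')  # write JSON to file, returns None
    json_str = sc.to_json()             # no file given: returns the JSON string

    sc2 = StrainCollection.read_json('strain_mappings.json')
    assert len(sc2) == len(sc)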
6 changes: 3 additions & 3 deletions tests/conftest.py
@@ -4,6 +4,7 @@
 from nplinker.metabolomics.metabolomics import load_spectra
 from nplinker.metabolomics.metabolomics import make_families
 from nplinker.metabolomics.spectrum import Spectrum
+from nplinker.pairedomics.downloader import STRAIN_MAPPINGS_FILENAME
 from nplinker.strain_collection import StrainCollection
 from nplinker.strains import Strain
 from nplinker.utils import extract_archive
@@ -44,9 +45,8 @@ def spec_dict() -> dict[str, Spectrum]:

 @pytest.fixture
 def collection_from_file() -> StrainCollection:
-    filename = DATA_DIR / "strain_mappings.csv"
-    sut = StrainCollection()
-    sut.add_from_file(filename)
+    filename = DATA_DIR / STRAIN_MAPPINGS_FILENAME
+    sut = StrainCollection().read_json(filename)
     return sut


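And a sketch of how the updated fixture might be consumed in a test. The strain id below is a made-up placeholder; the real ids live in the new tests/data/strain_mappings.json:

    def test_lookup(collection_from_file):
        # hypothetical id; assumes such a strain exists in the test data
        strain = collection_from_file.lookup('strain_01')
        assert strain.id == 'strain_01'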
27 changes: 0 additions & 27 deletions tests/data/strain_mappings.csv

This file was deleted.

