diff --git a/src/nplinker/data/nplinker.toml b/src/nplinker/data/nplinker.toml
index 9bd0f778..c2cee5e1 100644
--- a/src/nplinker/data/nplinker.toml
+++ b/src/nplinker/data/nplinker.toml
@@ -22,48 +22,46 @@ repro_file = ""
 # Dataset configuration
 # ---------------------
 # NPLinker supports two basic methods for loading datasets:
-# 
+#
 # 1. All files stored locally
 # 2. Some/all files retrieved from the paired omics platform (http://pairedomicsdata.bioinformatics.nl/)
 #
-# The method you want to use determines the values that should be populated in 
+# The method you want to use determines the values that should be populated in
 # the [dataset] section below. If working with a purely local dataset, NPLinker
 # defaults to looking for all the necessary files in a single directory, given
-# by the "root" parameter. 
+# by the "root" parameter.
 #
 # Alternatively, to load metabolomics data from the paired platform, set the "root"
-# parameter to "platform:<ID>", where "<ID>" is taken from the platform 
+# parameter to "platform:<ID>", where "<ID>" is taken from the platform
 # project list. For example, "platform:MSV000079284" would select the dataset with
 # the ID MSV000079284.
 #
-# For more details see below. 
+# For more details see below.
 #
 # 1. Loading local datasets
 # -------------------------
 # Generally speaking the dataset layout the application expects matches the structure
 # of the output from a GNPS job. Workflows that are known to work so far are:
-# 
-# - METABOLOMICS-SNETS (version 1.2.3) 
-# - METABOLOMICS-SNETS-V2 (version release_14) 
+#
+# - METABOLOMICS-SNETS (version 1.2.3)
+# - METABOLOMICS-SNETS-V2 (version release_14)
 # - FEATURE-BASED-MOLECULAR-NETWORKING (version 1.2.3)
 #
-# The simplest starting point is to download the "Clustered Spectra as MGF" zip 
+# The simplest starting point is to download the "Clustered Spectra as MGF" zip
 # file from GNPS and extract that into a folder to serve as the root directory for the dataset.
-# antiSMASH and BiG-SCAPE output are then added as additional subfolders. 
+# antiSMASH and BiG-SCAPE output are then added as additional subfolders.
 #
 # Typically all you will need to do is tell NPLinker where the root directory
 # is located. Otherwise you can customise the locations of the individual elements
-# using the various override settings below. 
+# using the various override settings below.
 #
 # The layout is as follows (see the documentation for more details):
 #
 # |- strain_mappings.csv (strain ID mappings)
 # | (METABOLOMICS DATA)
 # |- clusterinfo*/<filename>.tsv (spectrum metadata, NOTE: NPLinker will search any folder beginning with "clusterinfo")
-# |- metadata_table/metadata_table-00000.txt (only present in FBMN GNPS jobs?)
 # |- networkedges_selfloop/<filename>.selfloop (the "edges" file for spectra network)
 # |- spectra/*.mgf (mass spec data)
-# |- quantification_table_reformatted/<filename>.csv ("extra" spectrum metadata, only present in FBMN jobs? TODO proper name for this?)
 # |- DB_result/*.tsv (GNPS and other spectral annotation files, optional)
 # |- result_specnets_DB/*.tsv (GNPS and other spectral annotation files, optional)
 # |- DB_result/annotations.tsv (annotation data to extract from each file, see docs for details)
@@ -77,16 +75,16 @@ repro_file = ""
 #
 # 2. Loading platform datasets
 # ----------------------------
-# Given a platform ID, NPLinker will retrieve the associated metabolomics data using the 
-# GNPS task ID. By default, it will also attempt to retrieve any available 
-# genomics data from the antiSMASH database using the RefSeq accession labels 
-# in the platform project data. 
-# 
+# Given a platform ID, NPLinker will retrieve the associated metabolomics data using the
+# GNPS task ID. By default, it will also attempt to retrieve any available
+# genomics data from the antiSMASH database using the RefSeq accession labels
+# in the platform project data.
+#
 # However, if you have local antismash results instead, you should also set the
-# location of those files using the "antismash_dir" parameter in the 
-# [dataset.overrides] section. NPLinker can optionally also run bigscape on 
+# location of those files using the "antismash_dir" parameter in the
+# [dataset.overrides] section. NPLinker can optionally also run bigscape on
 # these results during the loading process. If you already have bigscape results
-# you can additionally set the "bigscape_dir" parameter to the appropriate 
+# you can additionally set the "bigscape_dir" parameter to the appropriate
 # location to skip this step (or simply place them inside the <root>/bigscape folder)
 #
 # All files are downloaded and extracted to locations inside ~/nplinker_data. On Windows
@@ -97,10 +95,10 @@ repro_file = ""
 # If the dataset has the expected directory structure, this value is all that's required.
 # It should be set to the path of the local directory containing the various data files
 # described above.
-# 
+#
 # If you want to load a dataset from the paired platform, the value should be a string
-# of the form: "platform:datasetID". For example, "platform:MSV000079284" would 
-# load the dataset with ID MSV000079284. 
+# of the form: "platform:datasetID". For example, "platform:MSV000079284" would
+# load the dataset with ID MSV000079284.
 root = ""
 
 # you can optionally set the BIGSCAPE clustering cutoff value here. the default value
@@ -110,47 +108,36 @@ root = ""
 # data to be reloaded. Possibly only useful for the webapp
 #bigscape_cutoff = 30
 
-# For datasets using the GNPS FBMN workflow, NPLinker can optionally try to parse
-# more data out of the "metadata table" file included in the output. This can both
-# simplify the process of creating a strain mappings file and allow NPLinker to
-# extract growth media labels for each strain, which are then displayed in the
-# web application. This option defaults to being disabled to provide a consistent
-# experience with other types of dataset, but you can choose to enable it if you
-# have a compatible metadata table file.
-#
-# For more information: https://github.com/sdrogers/nplinker/wiki/Strain-mappings
-#extended_metadata_table_parsing = false
-
 # NPLinker can automatically download a copy of the MiBIG database in the JSON
 # format it expects to find, and place it in the same folder as the current dataset.
 # This behaviour is enabled by default. If you wish to disable it for any reason
-# change the value of this setting to "false". 
+# change the value of this setting to "false".
 #use_mibig = true
 
 # When the "use_mibig" setting described above is set to "true", NPLinker will use
 # the value of this setting to determine which version of the MiBIG database should
-# be downloaded. The default is "1.4". If you need a newer version, define the 
+# be downloaded. The default is "1.4". If you need a newer version, define the
 # value to be the appropriate version, e.g. "2.0". Currently tested versions are
-# "1.4" and "2.0". 
+# "1.4" and "2.0".
 #
 # NOTE: the value MUST be a string (i.e. enclosed by " characters: "2.0" not 2.0)
 #mibig_version = "1.4"
 
 [antismash]
-# antismash file structure. Should be either 'default' or 'flat'. 
+# antismash file structure. Should be either 'default' or 'flat'.
 # default = the standard structure with nested subdirectories
 # flat = all .gbk files placed in a single flat directory
-# TODO: is "flat" required any more? 
+# TODO: is "flat" required any more?
 antismash_format = "default"
 
 # NPLinker needs to know how to parse antiSMASH filenames from BiG-SCAPE output
-# to identify strain labels as part of the process of loading a dataset. 
+# to identify strain labels as part of the process of loading a dataset.
 # Since the format of the filenames can vary from dataset to dataset, there isn't
 # a single rule that can be applied here. In most cases, the required action is
-# "take all text up to the first occurrence of <delimiter>", e.g. the first 
+# "take all text up to the first occurrence of <delimiter>", e.g. the first
 # underscore or period character. By default NPLinker will try the set of characters
 # defined below in sequence until a match is found. If you need to override this
-# then define a new list of at least one character to search for (note that it 
+# then define a new list of at least one character to search for (note that it
 # doesn't need to be a single character, e.g. ".abc" would also work)
 #
 antismash_delimiters = [".", "_", "-"]
@@ -158,13 +145,13 @@ antismash_format = "default"
 # automatically, you can choose to have the application automatically rename any
 # files under the <root>/antismash folder which contain spaces in their filenames
 # (both subfolders and .gbk files). This is to avoid BiG-SCAPE throwing up errors
-# because it can't handle filenames with spaces in them. If you already have 
+# because it can't handle filenames with spaces in them. If you already have
 # BiG-SCAPE results or don't want NPLinker to rename anything automatically, set
 # this to true instead
 ignore_spaces = false
 
 [docker]
-# this optional section contains various settings that only apply to the dockerised 
+# this optional section contains various settings that only apply to the dockerised
 # version of NPLinker. The Docker image contains BiG-SCAPE, and this tool can be run
 # as part of the dataset loading process. If you want to enable/disable this, and
 # configure the parameters used, this can be done by modifying the settings below.
@@ -211,7 +198,7 @@ run_bigscape = true
 #
 # The default simply enables MiBIG, disables the extra GCF clustering step. If you
 # need to select a particular version of the MiBIG database, see the "mibig_version"
-# setting. 
+# setting.
 # Note that BiG-SCAPE defaults to using 0.3 as a clustering cutoff if "--cutoffs"
 # is not set.
 #extra_bigscape_parameters = "--mibig --clans-off"
@@ -267,14 +254,14 @@ run_canopus = false
 extra_canopus_parameters = "--maxmz 600 formula zodiac structure canopus"
 
 # this optional section contains various settings that only affect the webapp interface
-# for NPLinker. If you're not using it, you can leave this out of your configuration. 
+# for NPLinker. If you're not using it, you can leave this out of your configuration.
 [webapp]
 # A fundamental part of the webapp is a set of tables which display objects in the dataset
-# that have been found to have links, based on the output of the Metcalf scoring method. 
+# that have been found to have links, based on the output of the Metcalf scoring method.
 # The value of this parameter is the minimum Metcalf score that a link must have for its
 # associated objects to appear in the table. In other words, the higher this is set, the
 # smaller the number of objects that will remain to be displayed in the tables. This may
-# be useful for improving performance on larger datasets. 
+# be useful for improving performance on larger datasets.
 # The default value is 2.0
 tables_metcalf_threshold = 2.0
@@ -283,11 +270,11 @@ tables_metcalf_threshold = 2.0
 
 # the "Rosetta" scoring method involves some preprocessing steps that can take
 # significant time. NPLinker will automatically run these steps as it loads the
-# dataset and cache the results. if you would like to adjust the parameters used 
+# dataset and cache the results. if you would like to adjust the parameters used
 # by the Rosetta method you can do by setting them below (note that changing
 # these values will invalidate any cached data and force the preprocessing steps
 # to be run again)
-# 
+#
 # TODO document what these do
 [scoring.rosetta]
 # ms1_tol = 100
@@ -302,29 +289,16 @@ tables_metcalf_threshold = 2.0
 #strain_mappings_file = ""
 
 # MGF filename. This path is passed to glob.glob, default is <root>/spectra/*.mgf
-#mgf_file = ""
+#gnps_mgf_file = ""
 
 # nodes filename. This path is passed to glob.glob, default is <root>/clusterinfo_summary/*.tsv
-#nodes_file = ""
-
-# don't know what to call this yet? TODO
-# "extra" spectrum metadata file, default is <root>/*_quant.csv
-#extra_nodes_file = ""
+#gnps_nodes_file = ""
 
 # edges filename. This path is passed to glob.glob, default is <root>/networkedges_selfloop/*.selfloop
-#edges_file = ""
-
-# metadata table filename. This path is passed to glob.glob, default is <root>/metadata_table/metadata_table-*.txt
-#metadata_table_file = ""
-
-# quantification table filename. This path is passed to glob.glob, default is <root>/quantification_table/quantification_table-*.csv
-#quantification_table_file = ""
-
-# GNPS spectral annotations directory, default is <root>/DB_result
-#annotations_dir = ""
+#gnps_edges_file = ""
 
 # annotation configuration file, default is <root>/annotations.tsv
-#annotations_config_file = ""
+#gnps_annotations_file = ""
 
 # Antismash data directory, default is <root>/antismash
 #antismash_dir = ""
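The renamed gnps_* override keys above map onto the standardised GNPS filenames that the loader changes below search for first. As an illustration only (a sketch: the paths are hypothetical placeholders, and as noted above every value is passed to glob.glob), a config overriding all four GNPS inputs might look like:

    [dataset]
    root = "/data/my_dataset"

    [dataset.overrides]
    # Hypothetical example paths; glob patterns also work here.
    gnps_mgf_file = "/data/my_dataset/spectra.mgf"
    gnps_nodes_file = "/data/my_dataset/file_mappings.tsv"
    gnps_edges_file = "/data/my_dataset/molecular_families.tsv"
    gnps_annotations_file = "/data/my_dataset/annotations.tsv"

Any key left unset falls back to the glob defaults listed in the comments above.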
diff --git a/src/nplinker/loader.py b/src/nplinker/loader.py
index ccdedb39..d6e4f94a 100644
--- a/src/nplinker/loader.py
+++ b/src/nplinker/loader.py
@@ -48,7 +48,6 @@ class DatasetLoader:
     TABLES_CUTOFF_DEFAULT = 2.0
     BIGSCAPE_CUTOFF_DEFAULT = 30
-    EXTENDED_METADATA_TABLE_PARSING_DEFAULT = False
     USE_MIBIG_DEFAULT = True
     MIBIG_VERSION_DEFAULT = "3.1"
@@ -62,14 +61,10 @@
     EXTRA_CANOPUS_PARAMS_DEFAULT = "--maxmz 600 formula zodiac structure canopus"
 
     # keys for overriding metabolomics data elements
-    OR_NODES = "nodes_file"
-    OR_EDGES = "edges_file"
-    OR_EXTRA_NODES = "extra_nodes_file"
-    OR_MGF = "mgf_file"
-    OR_METADATA = "metadata_table_file"
-    OR_QUANT = "quantification_table_file"
-    OR_ANNO = "annotations_dir"
-    OR_ANNO_CONFIG = "annotations_config_file"
+    OR_GNPS_NODES = "gnps_nodes_file"
+    OR_GNPS_EDGES = "gnps_edges_file"
+    OR_GNPS_MGF = "gnps_mgf_file"
+    OR_GNPS_ANNOTATIONS = "gnps_annotations_file"
     # and the same for genomics data
     OR_ANTISMASH = "antismash_dir"
     OR_BIGSCAPE = "bigscape_dir"
@@ -109,9 +104,6 @@ def __init__(self, config_data):
         self._bigscape_cutoff = self._config_dataset.get(
             "bigscape_cutoff", self.BIGSCAPE_CUTOFF_DEFAULT
         )
-        self._extended_metadata_table_parsing = self._config_dataset.get(
-            "extended_metadata_table_parsing", self.EXTENDED_METADATA_TABLE_PARSING_DEFAULT
-        )
         self._use_mibig = self._config_dataset.get("use_mibig", self.USE_MIBIG_DEFAULT)
         self._mibig_version = self._config_dataset.get("mibig_version", self.MIBIG_VERSION_DEFAULT)
         self._root = Path(self._config_dataset["root"])
@@ -140,9 +132,9 @@ def __init__(self, config_data):
     def __repr__(self):
         return "Root={}\n MGF={}\n EDGES={}\n NODES={}\n BIGSCAPE={}\n ANTISMASH={}\n".format(
             self._root,
-            self.mgf_file,
-            self.edges_file,
-            self.nodes_file,
+            self.gnps_mgf_file,
+            self.gnps_edges_file,
+            self.gnps_nodes_file,
             self.bigscape_dir,
             self.antismash_dir,
         )
@@ -246,76 +238,51 @@ def _init_paths(self):
         ) or os.path.join(self._root, "molnetenhancer")
 
     def _init_metabolomics_paths(self):
-        # 2. MET: <root>/clusterinfo_summary/<filename>.tsv (or .clustersummary apparently...) / nodes_file=<override>
-        self.nodes_file = self._config_overrides.get(self.OR_NODES) or find_via_glob_alts(
+        """Initializes the paths for metabolomics data."""
+        # GNPS nodes_file is the `file_mappings.tsv/csv` file
+        self.gnps_nodes_file = self._config_overrides.get(self.OR_GNPS_NODES) or find_via_glob_alts(
             [
+                os.path.join(self._root, "file_mappings.csv"),
                 os.path.join(self._root, "file_mappings.tsv"),
                 os.path.join(self._root, "clusterinfo*", "*.tsv"),
                 os.path.join(self._root, "clusterinfo*", "*.clustersummary"),
             ],
-            self.OR_NODES,
+            self.OR_GNPS_NODES,
         )
 
-        # 3. MET: <root>/networkedges_selfloop/<filename>.selfloop (new) or .pairsinfo (old) / edges_file=<override>
-        self.edges_file = self._config_overrides.get(self.OR_EDGES) or find_via_glob_alts(
+        # GNPS edges_file is the `molecular_families.tsv` file
+        self.gnps_edges_file = self._config_overrides.get(self.OR_GNPS_EDGES) or find_via_glob_alts(
             [
+                os.path.join(self._root, "molecular_families.tsv"),
                 os.path.join(self._root, "*.pairsinfo"),
                 os.path.join(self._root, "networkedges_selfloop", "*.pairsinfo"),
                 os.path.join(self._root, "networkedges_selfloop", "*.selfloop"),
             ],
-            self.OR_EDGES,
-        )
-
-        # 4. MET: <root>/*.csv / extra_nodes_file=<override>
-        # TODO is the glob input OK?
-        # => wait for updated dataset with latest output format
-        # NOTE: only optional for Crusemann or Crusemann-like dataset format!
-        self.extra_nodes_file = self._config_overrides.get(self.OR_EXTRA_NODES) or find_via_glob(
-            os.path.join(self._root, "quantification_table_reformatted", "*.csv"),
-            self.OR_EXTRA_NODES,
-            optional=True,
-        )
-
-        # 5. MET: <root>/spectra/*.mgf (or <root>/*.mgf)/ mgf_file=<override>
-        self.mgf_file = self._config_overrides.get(self.OR_MGF) or find_via_glob_alts(
-            [os.path.join(self._root, "*.mgf"), os.path.join(self._root, "spectra", "*.mgf")],
-            self.OR_MGF,
+            self.OR_GNPS_EDGES,
         )
 
-        # 6. MET: <root>/metadata_table/metadata_table-<number>.txt / metadata_table_file=<override>
-        self.metadata_table_file = self._config_overrides.get(self.OR_METADATA) or find_via_glob(
-            os.path.join(self._root, "metadata_table", "metadata_table*.txt"),
-            self.OR_METADATA,
-            optional=True,
+        # GNPS mgf_file is the `spectra.mgf` file
+        self.gnps_mgf_file = self._config_overrides.get(self.OR_GNPS_MGF) or find_via_glob_alts(
+            [
+                os.path.join(self._root, "spectra.mgf"),
+                os.path.join(self._root, "*.mgf"),
+                os.path.join(self._root, "spectra", "*.mgf"),
+            ],
+            self.OR_GNPS_MGF,
         )
 
-        # 7. MET: <root>/quantification_table/quantification_table-<number>.csv / quantification_table_file=<override>
-        self.quantification_table_file = self._config_overrides.get(self.OR_QUANT) or find_via_glob(
-            os.path.join(self._root, "quantification_table", "quantification_table*.csv"),
-            self.OR_QUANT,
-            optional=True,
+        # GNPS annotations_file is the `annotations.tsv` file
+        self.gnps_annotations_file = self._config_overrides.get(
+            self.OR_GNPS_ANNOTATIONS
+        ) or find_via_glob_alts(
+            [
+                os.path.join(self._root, "annotations.tsv"),
+                os.path.join(self._root, "DB_result", "*.tsv"),
+                os.path.join(self._root, "result_specnets_DB", "*.tsv"),
+            ],
+            self.OR_GNPS_ANNOTATIONS,
         )
 
-        # 8. MET: <root>/DB_result/*.tsv (new) or <root>/result_specnets_DB/*.tsv (old) / annotations_dir=<override>
-        if Path.is_file(Path(self._root) / "annotations.tsv"):
-            self.annotations_dir = str(self._root)
-            self.annotations_config_file = os.path.join(self._root, "annotations.tsv")
-        else:
-            self.annotations_dir = self._config_overrides.get(
-                self.OR_ANNO
-            ) or find_via_glob_alts_dir(
-                [
-                    os.path.join(self._root, "DB_result"),
-                    os.path.join(self._root, "result_specnets_DB"),
-                ],
-                self.OR_ANNO,
-                optional=False,
-            )
-            if self.annotations_dir is not None:
-                self.annotations_config_file = self._config_overrides.get(
-                    self.OR_ANNO_CONFIG
-                ) or os.path.join(self.annotations_dir, "annotations.tsv")
-
     def _init_genomics_paths(self):
         # 9. GEN: <root>/antismash / antismash_dir=<override>
         self.antismash_dir = self._config_overrides.get(self.OR_ANTISMASH) or os.path.join(
@@ -344,17 +311,17 @@ def _init_genomics_paths(self):
     def _validate_paths(self):
         """Validates that the required files and directories exist before loading starts."""
-        required_paths = [self.nodes_file, self.edges_file, self.mgf_file, self.antismash_dir]
-        optional_paths = [self.annotations_dir]
+        required_paths = [
+            self.gnps_nodes_file,
+            self.gnps_edges_file,
+            self.gnps_mgf_file,
+            self.antismash_dir,
+        ]
 
         for f in required_paths:
             if not os.path.exists(str(f)):
                 raise FileNotFoundError(f'File/directory "{f}" does not exist.')
 
-        for f in optional_paths:
-            if not os.path.exists(str(f)):
-                logger.warning('Optional file/directory "%s" does not exist', f)
-
     def _load_strain_mappings(self):
         # 1. load strain mappings
         sc = StrainCollection.read_json(self.strain_mappings_file)
@@ -387,11 +354,11 @@ def _load_metabolomics(self):
         logger.debug("\nLoading metabolomics data starts...")
 
         # Step 1: load all Spectrum objects
-        raw_spectra = GNPSSpectrumLoader(self.mgf_file).spectra
+        raw_spectra = GNPSSpectrumLoader(self.gnps_mgf_file).spectra
         # Step 2: load all GNPS annotations
-        raw_annotations = GNPSAnnotationLoader(self.annotations_config_file).annotations
+        raw_annotations = GNPSAnnotationLoader(self.gnps_annotations_file).annotations
         # Step 3: load all MolecularFamily objects
-        raw_molfams = GNPSMolecularFamilyLoader(self.edges_file).get_mfs(keep_singleton=False)
+        raw_molfams = GNPSMolecularFamilyLoader(self.gnps_edges_file).get_mfs(keep_singleton=False)
 
         # Step 4: add GNPS annotations to Spectrum.gnps_annotations
         add_annotation_to_spectrum(raw_annotations, raw_spectra)
@@ -530,14 +497,14 @@ def _load_class_info(self):
             )
         )
         try:
-            run_canopus(self.mgf_file, self.canopus_dir, extra_canopus_parameters)
+            run_canopus(self.gnps_mgf_file, self.canopus_dir, extra_canopus_parameters)
         except Exception as e:
             logger.warning(
                 'Failed to run CANOPUS on mgf file with docker, error was "{}"'.format(e)
             )
             logger.info("Trying to run CANOPUS again using SIRIUS from path")
             try:
-                run_canopus(self.mgf_file, self.canopus_dir, extra_canopus_parameters)
+                run_canopus(self.gnps_mgf_file, self.canopus_dir, extra_canopus_parameters)
             except Exception as e:
                 logger.warning(
                     'Again failed to run CANOPUS on mgf file using sirius from path, error was "{}"'.format(
                         e
                     )
                 )
@@ -558,40 +525,6 @@
         return True
 
 
-def find_via_glob(path, file_type, optional=False):
-    try:
-        filename = glob.glob(path)[0]
-        return filename
-    except (OSError, IndexError):
-        if not optional:
-            # "from None" suppresses the traceback for the original exception, which isn't really needed
-            raise Exception(
-                'ERROR: unable to find {} in path "{}"'.format(file_type, path)
-            ) from None
-
-        logger.warn('WARNING: unable to find {} in path "{}"'.format(file_type, path))
-        return None
-
-
-def find_via_glob_alts_dir(paths, file_type, optional=False):
-    path = None
-    for p in paths:
-        if os.path.exists(p):
-            path = p
-            break
-
-    if path is None and not optional:
-        raise Exception(
-            "ERROR: unable to find {} in {} paths: ({})".format(file_type, len(paths), paths)
-        )
-    elif path is None:
-        logger.warning(
-            "WARNING: unable to find {} in {} paths: ({})".format(file_type, len(paths), paths)
-        )
-
-    return path
-
-
 def find_via_glob_alts(paths, file_type, optional=False):
     filename = None
     for path in paths:
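With the two legacy helpers removed, every metabolomics path above is resolved by the single surviving find_via_glob_alts, whose body is truncated in the context lines. A minimal sketch of the first-match-wins logic it implements, modelled on the removed find_via_glob/find_via_glob_alts_dir helpers (so the real function may differ in detail):

    import glob
    import logging

    logger = logging.getLogger(__name__)


    def find_via_glob_alts(paths, file_type, optional=False):
        # Try each glob pattern in turn; the first pattern with any match wins.
        filename = None
        for path in paths:
            matches = glob.glob(path)
            if matches:
                filename = matches[0]
                break

        if filename is None and not optional:
            raise Exception(
                "ERROR: unable to find {} in {} paths: ({})".format(file_type, len(paths), paths)
            )
        elif filename is None:
            logger.warning(
                "WARNING: unable to find {} in {} paths: ({})".format(file_type, len(paths), paths)
            )

        return filename

This ordering is why the standardised names (file_mappings.*, molecular_families.tsv, spectra.mgf, annotations.tsv) come first in each alternatives list: the older GNPS-style locations are only consulted as fallbacks when the new fixed filenames are absent.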
diff --git a/tests/test_nplinker_local.py b/tests/test_nplinker_local.py
index c7833240..fd92d139 100644
--- a/tests/test_nplinker_local.py
+++ b/tests/test_nplinker_local.py
@@ -42,9 +42,34 @@ def npl() -> NPLinker:
     return npl
 
 
+# ---------------------------------------------------------------------------------------------------
+# After manually checking the data files for PODP MSV000079284, we have the following numbers:
+# 370 BGCs from antiSMASH files
+# 114 GCFs, including:
+# - 49 singleton GCFs
+# - 1 MiBIG-only GCF (not singleton)
+# - 12 GCFs (neither singleton nor MiBIG-only) have MiBIG BGCs, and in total 20 MiBIG BGCs are used
+# 25935 spectra, including:
+# - 24652 spectra that have strain info (from the strain mapping file)
+# - 1283 spectra that do not have strain info
+# 25769 molecular families, including:
+# - 25740 singleton families
+# - 29 non-singleton families
+# 26 strains from the strain mapping file
+# ---------------------------------------------------------------------------------------------------
+# So, after data loading, we should get the following numbers in the tests:
+# 390 BGCs = 370 antiSMASH BGCs + 20 MiBIG BGCs
+# 64 GCFs (neither singleton nor MiBIG-only) = 114 GCFs - 49 singleton GCFs - 1 MiBIG-only GCF
+# 24652 spectra (having strain info)
+# 29 molecular families (non-singleton)
+# 46 strains = 26 strains from the strain mapping file + 20 strains from MiBIG
+# ---------------------------------------------------------------------------------------------------
+
+
 @pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip when running on CI")
 def test_load_data(npl: NPLinker):
     assert len(npl.bgcs) == 390
-    assert len(npl.gcfs) == 113
-    assert len(npl.spectra) == 25935
-    assert len(npl.molfams) == 25769
+    assert len(npl.gcfs) == 64
+    assert len(npl.spectra) == 24652
+    assert len(npl.molfams) == 29
+    assert len(npl.strains) == 46
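The expected values in test_load_data follow directly from the arithmetic in the comment block. As a self-contained sanity check of that arithmetic (the totals are copied from the manual file inspection above, not recomputed from the data):

    # Totals from manually inspecting the PODP MSV000079284 files.
    total_gcfs, singleton_gcfs, mibig_only_gcfs = 114, 49, 1
    antismash_bgcs, mibig_bgcs = 370, 20
    total_spectra, spectra_without_strain = 25935, 1283
    total_molfams, singleton_molfams = 25769, 25740
    mapping_strains, mibig_strains = 26, 20

    assert antismash_bgcs + mibig_bgcs == 390  # npl.bgcs
    assert total_gcfs - singleton_gcfs - mibig_only_gcfs == 64  # npl.gcfs
    assert total_spectra - spectra_without_strain == 24652  # npl.spectra
    assert total_molfams - singleton_molfams == 29  # npl.molfams
    assert mapping_strains + mibig_strains == 46  # npl.strains

Note that the old expectation of 113 GCFs counted a different population than the new 64, which deliberately excludes singleton and MiBIG-only GCFs, just as the new molfams count excludes singleton families (keep_singleton=False in the loader).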