diff --git a/src/nplinker/data/nplinker.toml b/src/nplinker/data/nplinker.toml
index 9bd0f778..c2cee5e1 100644
--- a/src/nplinker/data/nplinker.toml
+++ b/src/nplinker/data/nplinker.toml
@@ -22,48 +22,46 @@ repro_file = ""
 # Dataset configuration
 # ---------------------
 # NPLinker supports two basic methods for loading datasets:
-# 
+#
 # 1. All files stored locally
 # 2. Some/all files retrieved from the paired omics platform (http://pairedomicsdata.bioinformatics.nl/)
 #
-# The method you want to use determines the values that should be populated in 
+# The method you want to use determines the values that should be populated in
 # the [dataset] section below. If working with a purely local dataset, NPLinker
 # defaults to looking for all the necessary files in a single directory, given
-# by the "root" parameter. 
+# by the "root" parameter.
 #
 # Alternatively, to load metabolomics data from the paired platform, set the "root"
-# parameter to "platform:<ID>", where "<ID>" is taken from the platform 
+# parameter to "platform:<ID>", where "<ID>" is taken from the platform
 # project list. For example, "platform:MSV000079284" would select the dataset with
 # the ID MSV000079284.
 #
-# For more details see below. 
+# For more details see below.
 #
 # 1. Loading local datasets
 # -------------------------
 # Generally speaking the dataset layout the application expects matches the structure
 # of the output from a GNPS job. Workflows that are known to work so far are:
-# 
-# - METABOLOMICS-SNETS (version 1.2.3) 
-# - METABOLOMICS-SNETS-V2 (version release_14) 
+#
+# - METABOLOMICS-SNETS (version 1.2.3)
+# - METABOLOMICS-SNETS-V2 (version release_14)
 # - FEATURE-BASED-MOLECULAR-NETWORKING (version 1.2.3)
 #
-# The simplest starting point is to download the "Clustered Spectra as MGF" zip 
+# The simplest starting point is to download the "Clustered Spectra as MGF" zip
 # file from GNPS and extract that into a folder to serve as the root directory for the dataset.
-# antiSMASH and BiG-SCAPE output are then added as additional subfolders. 
+# antiSMASH and BiG-SCAPE output are then added as additional subfolders.
 #
 # Typically all you will need to do is tell NPLinker where the root directory
 # is located. Otherwise you can customise the locations of the individual elements
-# using the various override settings below. 
+# using the various override settings below.
 #
 # The layout is as follows (see the documentation for more details):
 #
 # |- strain_mappings.csv (strain ID mappings)
 # | (METABOLOMICS DATA)
 # |- clusterinfo*/<filename>.tsv (spectrum metadata, NOTE: NPLinker will search any folder beginning with "clusterinfo")
-# |- metadata_table/metadata_table-00000.txt (only present in FBMN GNPS jobs?)
 # |- networkedges_selfloop/<filename>.selfloop (the "edges" file for spectra network)
 # |- spectra/*.mgf (mass spec data)
-# |- quantification_table_reformatted/<filename>.csv ("extra" spectrum metadata, only present in FBMN jobs? TODO proper name for this?)
 # |- DB_result/*.tsv (GNPS and other spectral annotation files, optional)
 # |- result_specnets_DB/*.tsv (GNPS and other spectral annotation files, optional)
 # |- DB_result/annotations.tsv (annotation data to extract from each file, see docs for details)
@@ -77,16 +75,16 @@ repro_file = ""
 #
 # 2. Loading platform datasets
 # ----------------------------
-# Given a platform ID, NPLinker will retrieve the associated metabolomics data using the 
-# GNPS task ID. By default, it will also attempt to retrieve any available 
-# genomics data from the antiSMASH database using the RefSeq accession labels 
-# in the platform project data. 
-# 
+# Given a platform ID, NPLinker will retrieve the associated metabolomics data using the
+# GNPS task ID. By default, it will also attempt to retrieve any available
+# genomics data from the antiSMASH database using the RefSeq accession labels
+# in the platform project data.
+#
 # However, if you have local antismash results instead, you should also set the
-# location of those files using the "antismash_dir" parameter in the 
-# [dataset.overrides] section. NPLinker can optionally also run bigscape on 
+# location of those files using the "antismash_dir" parameter in the
+# [dataset.overrides] section. NPLinker can optionally also run bigscape on
 # these results during the loading process. If you already have bigscape results
-# you can additionally set the "bigscape_dir" parameter to the appropriate 
+# you can additionally set the "bigscape_dir" parameter to the appropriate
 # location to skip this step (or simply place them inside the <root>/bigscape folder)
 #
 # All files are downloaded and extracted to locations inside ~/nplinker_data. On Windows
@@ -97,10 +95,10 @@ repro_file = ""
 # If the dataset has the expected directory structure, this value is all that's required.
 # It should be set to the path of the local directory containing the various data files
 # described above.
-# 
+#
 # If you want to load a dataset from the paired platform, the value should be a string
-# of the form: "platform:datasetID". For example, "platform:MSV000079284" would 
-# load the dataset with ID MSV000079284. 
+# of the form: "platform:datasetID". For example, "platform:MSV000079284" would
+# load the dataset with ID MSV000079284.
 root = ""
 
 # you can optionally set the BIGSCAPE clustering cutoff value here. the default value
@@ -110,47 +108,36 @@ root = ""
 # data to be reloaded. Possibly only useful for the webapp
 #bigscape_cutoff = 30
 
-# For datasets using the GNPS FBMN workflow, NPLinker can optionally try to parse
-# more data out of the "metadata table" file included in the output. This can both
-# simplify the process of creating a strain mappings file and allow NPLinker to
-# extract growth media labels for each strain, which are then displayed in the
-# web application. This option defaults to being disabled to provide a consistent
-# experience with other types of dataset, but you can choose to enable it if you
-# have a compatible metadata table file.
-#
-# For more information: https://github.com/sdrogers/nplinker/wiki/Strain-mappings
-#extended_metadata_table_parsing = false
-
 # NPLinker can automatically download a copy of the MiBIG database in the JSON
 # format it expects to find, and place it in the same folder as the current dataset.
 # This behaviour is enabled by default. If you wish to disable it for any reason
-# change the value of this setting to "false". 
+# change the value of this setting to "false".
 #use_mibig = true
 
 # When the "use_mibig" setting described above is set to "true", NPLinker will use
 # the value of this setting to determine which version of the MiBIG database should
-# be downloaded. The default is "1.4". If you need a newer version, define the 
+# be downloaded. The default is "1.4". If you need a newer version, define the
 # value to be the appropriate version, e.g. "2.0". Currently tested versions are
-# "1.4" and "2.0". 
+# "1.4" and "2.0".
 #
 # NOTE: the value MUST be a string (i.e. enclosed by " characters: "2.0" not 2.0)
 #mibig_version = "1.4"
 
 [antismash]
-# antismash file structure. Should be either 'default' or 'flat'. 
+# antismash file structure. Should be either 'default' or 'flat'.
 # default = the standard structure with nested subdirectories
 # flat = all .gbk files placed in a single flat directory
-# TODO: is "flat" required any more? 
+# TODO: is "flat" required any more?
 antismash_format = "default"
 
 # NPLinker needs to know how to parse antiSMASH filenames from BiG-SCAPE output
-# to identify strain labels as part of the process of loading a dataset. 
+# to identify strain labels as part of the process of loading a dataset.
 # Since the format of the filenames can vary from dataset to dataset, there isn't
 # a single rule that can be applied here. In most cases, the required action is
-# "take all text up to the first occurrence of <delimiter>", e.g. the first 
+# "take all text up to the first occurrence of <delimiter>", e.g. the first
 # underscore or period character. By default NPLinker will try the set of characters
 # defined below in sequence until a match is found. If you need to override this
-# then define a new list of at least one character to search for (note that it 
+# then define a new list of at least one character to search for (note that it
 # doesn't need to be a single character, e.g. ".abc" would also work)
 #
 antismash_delimiters = [".", "_", "-"]
@@ -158,13 +145,13 @@ antismash_format = "default"
 # automatically, you can choose to have the application automatically rename any
 # files under the <root>/antismash folder which contain spaces in their filenames
 # (both subfolders and .gbk files). This is to avoid BiG-SCAPE throwing up errors
-# because it can't handle filenames with spaces in them. If you already have 
+# because it can't handle filenames with spaces in them. If you already have
 # BiG-SCAPE results or don't want NPLinker to rename anything automatically, set
 # this to true instead
 ignore_spaces = false
 
 [docker]
-# this optional section contains various settings that only apply to the dockerised 
+# this optional section contains various settings that only apply to the dockerised
 # version of NPLinker. The Docker image contains BiG-SCAPE, and this tool can be run
 # as part of the dataset loading process. If you want to enable/disable this, and
 # configure the parameters used, this can be done by modifying the settings below.
@@ -211,7 +198,7 @@ run_bigscape = true
 #
 # The default simply enables MiBIG, disables the extra GCF clustering step. If you
 # need to select a particular version of the MiBIG database, see the "mibig_version"
-# setting. 
+# setting.
 # Note that BiG-SCAPE defaults to using 0.3 as a clustering cutoff if "--cutoffs"
 # is not set.
 #extra_bigscape_parameters = "--mibig --clans-off"
@@ -267,14 +254,14 @@ run_canopus = false
 extra_canopus_parameters = "--maxmz 600 formula zodiac structure canopus"
 
 # this optional section contains various settings that only affect the webapp interface
-# for NPLinker. If you're not using it, you can leave this out of your configuration. 
+# for NPLinker. If you're not using it, you can leave this out of your configuration.
 [webapp]
 # A fundamental part of the webapp is a set of tables which display objects in the dataset
-# that have been found to have links, based on the output of the Metcalf scoring method. 
+# that have been found to have links, based on the output of the Metcalf scoring method.
 # The value of this parameter is the minimum Metcalf score that a link must have for its
 # associated objects to appear in the table. In other words, the higher this is set, the
 # smaller the number of objects that will remain to be displayed in the tables. This may
-# be useful for improving performance on larger datasets. 
+# be useful for improving performance on larger datasets.
 # The default value is 2.0
 tables_metcalf_threshold = 2.0
@@ -283,11 +270,11 @@ tables_metcalf_threshold = 2.0
 
 # the "Rosetta" scoring method involves some preprocessing steps that can take
 # significant time. NPLinker will automatically run these steps as it loads the
-# dataset and cache the results. if you would like to adjust the parameters used 
+# dataset and cache the results. if you would like to adjust the parameters used
 # by the Rosetta method you can do by setting them below (note that changing
 # these values will invalidate any cached data and force the preprocessing steps
 # to be run again)
-# 
+#
 # TODO document what these do
 [scoring.rosetta]
 # ms1_tol = 100
@@ -302,29 +289,16 @@ tables_metcalf_threshold = 2.0
 #strain_mappings_file = ""
 
 # MGF filename. This path is passed to glob.glob, default is <root>/spectra/*.mgf
-#mgf_file = ""
+#gnps_mgf_file = ""
 
 # nodes filename. This path is passed to glob.glob, default is <root>/clusterinfo_summary/*.tsv
-#nodes_file = ""
-
-# don't know what to call this yet? TODO
-# "extra" spectrum metadata file, default is <root>/*_quant.csv
-#extra_nodes_file = ""
+#gnps_nodes_file = ""
 
 # edges filename. This path is passed to glob.glob, default is <root>/networkedges_selfloop/*.selfloop
-#edges_file = ""
-
-# metadata table filename. This path is passed to glob.glob, default is <root>/metadata_table/metadata_table-*.txt
-#metadata_table_file = ""
-
-# quantification table filename. This path is passed to glob.glob, default is <root>/quantification_table/quantification_table-*.csv
-#quantification_table_file = ""
-
-# GNPS spectral annotations directory, default is <root>/DB_result
-#annotations_dir = ""
+#gnps_edges_file = ""
 
 # annotation configuration file, default is <root>/annotations.tsv
-#annotations_config_file = ""
+#gnps_annotations_file = ""
 
 # Antismash data directory, default is <root>/antismash
 #antismash_dir = ""
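The renamed gnps_* override keys above map onto the standardised GNPS filenames that the loader changes below search for first. As an illustration only (a sketch: the paths are hypothetical placeholders, and as noted above every value is passed to glob.glob), a config overriding all four GNPS inputs might look like:

    [dataset]
    root = "/data/my_dataset"

    [dataset.overrides]
    # Hypothetical example paths; glob patterns also work here.
    gnps_mgf_file = "/data/my_dataset/spectra.mgf"
    gnps_nodes_file = "/data/my_dataset/file_mappings.tsv"
    gnps_edges_file = "/data/my_dataset/molecular_families.tsv"
    gnps_annotations_file = "/data/my_dataset/annotations.tsv"

Any key left unset falls back to the glob defaults listed in the comments above.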
diff --git a/src/nplinker/loader.py b/src/nplinker/loader.py
index ccdedb39..d6e4f94a 100644
--- a/src/nplinker/loader.py
+++ b/src/nplinker/loader.py
@@ -48,7 +48,6 @@ class DatasetLoader:
     TABLES_CUTOFF_DEFAULT = 2.0
     BIGSCAPE_CUTOFF_DEFAULT = 30
-    EXTENDED_METADATA_TABLE_PARSING_DEFAULT = False
     USE_MIBIG_DEFAULT = True
     MIBIG_VERSION_DEFAULT = "3.1"
@@ -62,14 +61,10 @@
     EXTRA_CANOPUS_PARAMS_DEFAULT = "--maxmz 600 formula zodiac structure canopus"
 
     # keys for overriding metabolomics data elements
-    OR_NODES = "nodes_file"
-    OR_EDGES = "edges_file"
-    OR_EXTRA_NODES = "extra_nodes_file"
-    OR_MGF = "mgf_file"
-    OR_METADATA = "metadata_table_file"
-    OR_QUANT = "quantification_table_file"
-    OR_ANNO = "annotations_dir"
-    OR_ANNO_CONFIG = "annotations_config_file"
+    OR_GNPS_NODES = "gnps_nodes_file"
+    OR_GNPS_EDGES = "gnps_edges_file"
+    OR_GNPS_MGF = "gnps_mgf_file"
+    OR_GNPS_ANNOTATIONS = "gnps_annotations_file"
     # and the same for genomics data
     OR_ANTISMASH = "antismash_dir"
     OR_BIGSCAPE = "bigscape_dir"
@@ -109,9 +104,6 @@ def __init__(self, config_data):
         self._bigscape_cutoff = self._config_dataset.get(
             "bigscape_cutoff", self.BIGSCAPE_CUTOFF_DEFAULT
         )
-        self._extended_metadata_table_parsing = self._config_dataset.get(
-            "extended_metadata_table_parsing", self.EXTENDED_METADATA_TABLE_PARSING_DEFAULT
-        )
         self._use_mibig = self._config_dataset.get("use_mibig", self.USE_MIBIG_DEFAULT)
         self._mibig_version = self._config_dataset.get("mibig_version", self.MIBIG_VERSION_DEFAULT)
         self._root = Path(self._config_dataset["root"])
@@ -140,9 +132,9 @@ def __init__(self, config_data):
     def __repr__(self):
         return "Root={}\n MGF={}\n EDGES={}\n NODES={}\n BIGSCAPE={}\n ANTISMASH={}\n".format(
             self._root,
-            self.mgf_file,
-            self.edges_file,
-            self.nodes_file,
+            self.gnps_mgf_file,
+            self.gnps_edges_file,
+            self.gnps_nodes_file,
             self.bigscape_dir,
             self.antismash_dir,
         )
@@ -246,76 +238,51 @@ def _init_paths(self):
         ) or os.path.join(self._root, "molnetenhancer")
 
     def _init_metabolomics_paths(self):
-        # 2. MET: <root>/clusterinfo_summary/<filename>.tsv (or .clustersummary apparently...) / nodes_file=<override>
-        self.nodes_file = self._config_overrides.get(self.OR_NODES) or find_via_glob_alts(
+        """Initializes the paths for metabolomics data."""
+        # GNPS nodes_file is the `file_mappings.tsv/csv` file
+        self.gnps_nodes_file = self._config_overrides.get(self.OR_GNPS_NODES) or find_via_glob_alts(
             [
+                os.path.join(self._root, "file_mappings.csv"),
                 os.path.join(self._root, "file_mappings.tsv"),
                 os.path.join(self._root, "clusterinfo*", "*.tsv"),
                 os.path.join(self._root, "clusterinfo*", "*.clustersummary"),
             ],
-            self.OR_NODES,
+            self.OR_GNPS_NODES,
         )
 
-        # 3. MET: <root>/networkedges_selfloop/<filename>.selfloop (new) or .pairsinfo (old) / edges_file=<override>
-        self.edges_file = self._config_overrides.get(self.OR_EDGES) or find_via_glob_alts(
+        # GNPS edges_file is the `molecular_families.tsv` file
+        self.gnps_edges_file = self._config_overrides.get(self.OR_GNPS_EDGES) or find_via_glob_alts(
             [
+                os.path.join(self._root, "molecular_families.tsv"),
                 os.path.join(self._root, "*.pairsinfo"),
                 os.path.join(self._root, "networkedges_selfloop", "*.pairsinfo"),
                 os.path.join(self._root, "networkedges_selfloop", "*.selfloop"),
             ],
-            self.OR_EDGES,
-        )
-
-        # 4. MET: <root>/*.csv / extra_nodes_file=<override>
-        # TODO is the glob input OK?
-        # => wait for updated dataset with latest output format
-        # NOTE: only optional for Crusemann or Crusemann-like dataset format!
-        self.extra_nodes_file = self._config_overrides.get(self.OR_EXTRA_NODES) or find_via_glob(
-            os.path.join(self._root, "quantification_table_reformatted", "*.csv"),
-            self.OR_EXTRA_NODES,
-            optional=True,
-        )
-
-        # 5. MET: <root>/spectra/*.mgf (or <root>/*.mgf)/ mgf_file=<override>
-        self.mgf_file = self._config_overrides.get(self.OR_MGF) or find_via_glob_alts(
-            [os.path.join(self._root, "*.mgf"), os.path.join(self._root, "spectra", "*.mgf")],
-            self.OR_MGF,
+            self.OR_GNPS_EDGES,
         )
 
-        # 6. MET: <root>/metadata_table/metadata_table-<number>.txt / metadata_table_file=<override>
-        self.metadata_table_file = self._config_overrides.get(self.OR_METADATA) or find_via_glob(
-            os.path.join(self._root, "metadata_table", "metadata_table*.txt"),
-            self.OR_METADATA,
-            optional=True,
+        # GNPS mgf_file is the `spectra.mgf` file
+        self.gnps_mgf_file = self._config_overrides.get(self.OR_GNPS_MGF) or find_via_glob_alts(
+            [
+                os.path.join(self._root, "spectra.mgf"),
+                os.path.join(self._root, "*.mgf"),
+                os.path.join(self._root, "spectra", "*.mgf"),
+            ],
+            self.OR_GNPS_MGF,
         )
 
-        # 7. MET: <root>/quantification_table/quantification_table-<number>.csv / quantification_table_file=<override>
-        self.quantification_table_file = self._config_overrides.get(self.OR_QUANT) or find_via_glob(
-            os.path.join(self._root, "quantification_table", "quantification_table*.csv"),
-            self.OR_QUANT,
-            optional=True,
+        # GNPS annotations_file is the `annotations.tsv` file
+        self.gnps_annotations_file = self._config_overrides.get(
+            self.OR_GNPS_ANNOTATIONS
+        ) or find_via_glob_alts(
+            [
+                os.path.join(self._root, "annotations.tsv"),
+                os.path.join(self._root, "DB_result", "*.tsv"),
+                os.path.join(self._root, "result_specnets_DB", "*.tsv"),
+            ],
+            self.OR_GNPS_ANNOTATIONS,
         )
 
-        # 8. MET: <root>/DB_result/*.tsv (new) or <root>/result_specnets_DB/*.tsv (old) / annotations_dir=<override>
-        if Path.is_file(Path(self._root) / "annotations.tsv"):
-            self.annotations_dir = str(self._root)
-            self.annotations_config_file = os.path.join(self._root, "annotations.tsv")
-        else:
-            self.annotations_dir = self._config_overrides.get(
-                self.OR_ANNO
-            ) or find_via_glob_alts_dir(
-                [
-                    os.path.join(self._root, "DB_result"),
-                    os.path.join(self._root, "result_specnets_DB"),
-                ],
-                self.OR_ANNO,
-                optional=False,
-            )
-            if self.annotations_dir is not None:
-                self.annotations_config_file = self._config_overrides.get(
-                    self.OR_ANNO_CONFIG
-                ) or os.path.join(self.annotations_dir, "annotations.tsv")
-
     def _init_genomics_paths(self):
         # 9. GEN: <root>/antismash / antismash_dir=<override>
         self.antismash_dir = self._config_overrides.get(self.OR_ANTISMASH) or os.path.join(
@@ -344,17 +311,17 @@ def _init_genomics_paths(self):
     def _validate_paths(self):
         """Validates that the required files and directories exist before loading starts."""
-        required_paths = [self.nodes_file, self.edges_file, self.mgf_file, self.antismash_dir]
-        optional_paths = [self.annotations_dir]
+        required_paths = [
+            self.gnps_nodes_file,
+            self.gnps_edges_file,
+            self.gnps_mgf_file,
+            self.antismash_dir,
+        ]
 
         for f in required_paths:
             if not os.path.exists(str(f)):
                 raise FileNotFoundError(f'File/directory "{f}" does not exist.')
 
-        for f in optional_paths:
-            if not os.path.exists(str(f)):
-                logger.warning('Optional file/directory "%s" does not exist', f)
-
     def _load_strain_mappings(self):
         # 1. load strain mappings
         sc = StrainCollection.read_json(self.strain_mappings_file)
@@ -387,11 +354,11 @@ def _load_metabolomics(self):
         logger.debug("\nLoading metabolomics data starts...")
 
         # Step 1: load all Spectrum objects
-        raw_spectra = GNPSSpectrumLoader(self.mgf_file).spectra
+        raw_spectra = GNPSSpectrumLoader(self.gnps_mgf_file).spectra
         # Step 2: load all GNPS annotations
-        raw_annotations = GNPSAnnotationLoader(self.annotations_config_file).annotations
+        raw_annotations = GNPSAnnotationLoader(self.gnps_annotations_file).annotations
         # Step 3: load all MolecularFamily objects
-        raw_molfams = GNPSMolecularFamilyLoader(self.edges_file).get_mfs(keep_singleton=False)
+        raw_molfams = GNPSMolecularFamilyLoader(self.gnps_edges_file).get_mfs(keep_singleton=False)
 
         # Step 4: add GNPS annotations to Spectrum.gnps_annotations
         add_annotation_to_spectrum(raw_annotations, raw_spectra)
@@ -530,14 +497,14 @@ def _load_class_info(self):
             )
         )
         try:
-            run_canopus(self.mgf_file, self.canopus_dir, extra_canopus_parameters)
+            run_canopus(self.gnps_mgf_file, self.canopus_dir, extra_canopus_parameters)
         except Exception as e:
             logger.warning(
                 'Failed to run CANOPUS on mgf file with docker, error was "{}"'.format(e)
             )
             logger.info("Trying to run CANOPUS again using SIRIUS from path")
             try:
-                run_canopus(self.mgf_file, self.canopus_dir, extra_canopus_parameters)
+                run_canopus(self.gnps_mgf_file, self.canopus_dir, extra_canopus_parameters)
             except Exception as e:
                 logger.warning(
                     'Again failed to run CANOPUS on mgf file using sirius from path, error was "{}"'.format(
                         e
                     )
                 )
@@ -558,40 +525,6 @@
         return True
 
 
-def find_via_glob(path, file_type, optional=False):
-    try:
-        filename = glob.glob(path)[0]
-        return filename
-    except (OSError, IndexError):
-        if not optional:
-            # "from None" suppresses the traceback for the original exception, which isn't really needed
-            raise Exception(
-                'ERROR: unable to find {} in path "{}"'.format(file_type, path)
-            ) from None
-
-        logger.warn('WARNING: unable to find {} in path "{}"'.format(file_type, path))
-        return None
-
-
-def find_via_glob_alts_dir(paths, file_type, optional=False):
-    path = None
-    for p in paths:
-        if os.path.exists(p):
-            path = p
-            break
-
-    if path is None and not optional:
-        raise Exception(
-            "ERROR: unable to find {} in {} paths: ({})".format(file_type, len(paths), paths)
-        )
-    elif path is None:
-        logger.warning(
-            "WARNING: unable to find {} in {} paths: ({})".format(file_type, len(paths), paths)
-        )
-
-    return path
-
-
 def find_via_glob_alts(paths, file_type, optional=False):
     filename = None
     for path in paths:
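With the two legacy helpers removed, every metabolomics path above is resolved by the single surviving find_via_glob_alts, whose body is truncated in the context lines. A minimal sketch of the first-match-wins logic it implements, modelled on the removed find_via_glob/find_via_glob_alts_dir helpers (so the real function may differ in detail):

    import glob
    import logging

    logger = logging.getLogger(__name__)


    def find_via_glob_alts(paths, file_type, optional=False):
        # Try each glob pattern in turn; the first pattern with any match wins.
        filename = None
        for path in paths:
            matches = glob.glob(path)
            if matches:
                filename = matches[0]
                break

        if filename is None and not optional:
            raise Exception(
                "ERROR: unable to find {} in {} paths: ({})".format(file_type, len(paths), paths)
            )
        elif filename is None:
            logger.warning(
                "WARNING: unable to find {} in {} paths: ({})".format(file_type, len(paths), paths)
            )

        return filename

This ordering is why the standardised names (file_mappings.*, molecular_families.tsv, spectra.mgf, annotations.tsv) come first in each alternatives list: the older GNPS-style locations are only consulted as fallbacks when the new fixed filenames are absent.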
diff --git a/tests/test_nplinker_local.py b/tests/test_nplinker_local.py
index c7833240..fd92d139 100644
--- a/tests/test_nplinker_local.py
+++ b/tests/test_nplinker_local.py
@@ -42,9 +42,34 @@ def npl() -> NPLinker:
     return npl
 
 
+# ---------------------------------------------------------------------------------------------------
+# After manually checking the data files for PODP MSV000079284, we have the following numbers:
+# 370 BGCs from antiSMASH files
+# 114 GCFs, including:
+# - 49 singleton GCFs
+# - 1 MiBIG-only GCF (not singleton)
+# - 12 GCFs (neither singleton nor MiBIG-only) have MiBIG BGCs, and in total 20 MiBIG BGCs are used
+# 25935 spectra, including:
+# - 24652 spectra that have strain info (from the strain mapping file)
+# - 1283 spectra that do not have strain info
+# 25769 molecular families, including:
+# - 25740 singleton families
+# - 29 non-singleton families
+# 26 strains from the strain mapping file
+# ---------------------------------------------------------------------------------------------------
+# So, after data loading, we should get the following numbers in the tests:
+# 390 BGCs = 370 antiSMASH BGCs + 20 MiBIG BGCs
+# 64 GCFs (neither singleton nor MiBIG-only) = 114 GCFs - 49 singleton GCFs - 1 MiBIG-only GCF
+# 24652 spectra (having strain info)
+# 29 molecular families (non-singleton)
+# 46 strains = 26 strains from the strain mapping file + 20 strains from MiBIG
+# ---------------------------------------------------------------------------------------------------
+
+
 @pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip when running on CI")
 def test_load_data(npl: NPLinker):
     assert len(npl.bgcs) == 390
-    assert len(npl.gcfs) == 113
-    assert len(npl.spectra) == 25935
-    assert len(npl.molfams) == 25769
+    assert len(npl.gcfs) == 64
+    assert len(npl.spectra) == 24652
+    assert len(npl.molfams) == 29
+    assert len(npl.strains) == 46
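The expected values in test_load_data follow directly from the arithmetic in the comment block. As a self-contained sanity check of that arithmetic (the totals are copied from the manual file inspection above, not recomputed from the data):

    # Totals from manually inspecting the PODP MSV000079284 files.
    total_gcfs, singleton_gcfs, mibig_only_gcfs = 114, 49, 1
    antismash_bgcs, mibig_bgcs = 370, 20
    total_spectra, spectra_without_strain = 25935, 1283
    total_molfams, singleton_molfams = 25769, 25740
    mapping_strains, mibig_strains = 26, 20

    assert antismash_bgcs + mibig_bgcs == 390  # npl.bgcs
    assert total_gcfs - singleton_gcfs - mibig_only_gcfs == 64  # npl.gcfs
    assert total_spectra - spectra_without_strain == 24652  # npl.spectra
    assert total_molfams - singleton_molfams == 29  # npl.molfams
    assert mapping_strains + mibig_strains == 46  # npl.strains

Note that the old expectation of 113 GCFs counted a different population than the new 64, which deliberately excludes singleton and MiBIG-only GCFs, just as the new molfams count excludes singleton families (keep_singleton=False in the loader).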