rename molfam to mf to uniform the abbreviation for MolecularFamily (#255)
NPLinker · Jun 14, 2024 · 06fe1f9 · 06fe1f9
1 parent 10e9d87
commit 06fe1f9
Show file tree

Hide file tree

Showing 9 changed files with 92 additions and 96 deletions.
diff --git a/src/nplinker/class_info/chem_classes.py b/src/nplinker/class_info/chem_classes.py
@@ -45,7 +45,7 @@ def __init__(self, canopus_dir, mne_dir, gnps_dir):
         class_predict_options = []
         if self._canopus.spectra_classes:
             class_predict_options.append("canopus")
-        if self._molnetenhancer.spectra2molfam:
+        if self._molnetenhancer.spectra2mf:
             class_predict_options.append("molnetenhancer")
         if class_predict_options:
             class_predict_options = ["mix", "main"] + class_predict_options
@@ -70,7 +70,7 @@ class CanopusResults:
 
     The results from the canopus dir are read and combined with the MN from GNPS
     using canopus_treemap: github.com/louwenjjr/canopus_treemap/tree/master/canopus
-    This creates the two files that are read for the spectra and molfams:
+    This creates the two files that are read for the spectra and mfs:
         -cluster_index_classifications.txt
         -component_index_classifications.txt
 
@@ -91,7 +91,7 @@ def __init__(self, canopus_dir, gnps_dir):
         """
         self._canopus_dir = canopus_dir
         self._gnps_dir = gnps_dir
-        self._molfam_classes, self._molfam_classes_names, self._molfam_classes_names_inds = (
+        self._mf_classes, self._mf_classes_names, self._mf_classes_names_inds = (
             None,
             None,
             None,
@@ -163,21 +163,19 @@ def _read_all_classes(self):
             spectra_classes_names, spectra_classes = self._read_spectra_classes(ci_file)
 
             if os.path.isfile(compi_file):
-                molfam_classes_names, molfam_classes = self._read_molfam_classes(compi_file)
-                self._molfam_classes = molfam_classes
-                self._molfam_classes_names = molfam_classes_names
-                self._molfam_classes_names_inds = {
-                    elem: i for i, elem in enumerate(molfam_classes_names)
-                }
+                mf_classes_names, mf_classes = self._read_mf_classes(compi_file)
+                self._mf_classes = mf_classes
+                self._mf_classes_names = mf_classes_names
+                self._mf_classes_names_inds = {elem: i for i, elem in enumerate(mf_classes_names)}
         else:
             # use canopus output correctly (only for spectra)
             logger.info(
                 "Attempting to read spectra classes directly from "
                 "canopus_dir (canopus_summary.tsv)"
             )
             spectra_classes_names, spectra_classes = self._read_spectra_classes_directly()
-            # molfams have to be added later with info about molfam <- spectra
-            # this happens with transfer_spec_classes_to_molfams() in loader.py
+            # mfs have to be added later with info about mf <- spectra
+            # this happens with transfer_spec_classes_to_mfs() in loader.py
 
         self._spectra_classes = spectra_classes
         self._spectra_classes_names = spectra_classes_names
@@ -331,15 +329,15 @@ class prediction for a level. When no class is present, instead of Tuple it will
                     outf.write("\t".join(output_l) + "\n")
         return can_classes_names, can_classes
 
-    def _read_molfam_classes(self, input_file):
-        """Read canopus classes for molfams, return classes_names, classes.
+    def _read_mf_classes(self, input_file):
+        """Read canopus classes for mfs, return classes_names, classes.
 
         Args:
             input_file: str, component_index_classifications.txt
         Returns:
             Tuple of:
             - compi_classes_names: list of str - the names of each different level
-            - compi_classes: dict of {str: lists of tuple(str, float)} - per molfam index (key) the classes for each level
+            - compi_classes: dict of {str: lists of tuple(str, float)} - per mf index (key) the classes for each level
                 where each level is a list of (class_name, fraction) sorted on best choice so index 0 is the best
                 class prediction for a level. When no class is present, instead of Tuple it will be None for that level.
         """
@@ -376,33 +374,33 @@ class prediction for a level. When no class is present, instead of Tuple it will
             ]
         return compi_classes_names, compi_classes
 
-    def transfer_spec_classes_to_molfams(self, molfams, fraction_cutoff=0.0):
-        """Set _molfam_classes(_names) from spectra_classes and return classes.
+    def transfer_spec_classes_to_mfs(self, mfs, fraction_cutoff=0.0):
+        """Set _mf_classes(_names) from spectra_classes and return classes.
 
-        This can be used in the _loader to get molfam classes when the GNPS MN
+        This can be used in the _loader to get mf classes when the GNPS MN
         version is too old and canopus_treemap fails to work directly.
 
         Args:
-            molfams: list of MolecularFamily from the NPLinker space
+            mfs: list of MolecularFamily from the NPLinker space
             fraction_cutoff: float, cut-off for the fraction of class terms
-                needed to be included in the molfam
+                needed to be included in the mf
         Returns:
-            dict of {str: lists of tuple(str, float)} - per molfam (key) the classes for each level
+            dict of {str: lists of tuple(str, float)} - per mf (key) the classes for each level
                 where each level is a list of (class_name, fraction) sorted on best choice so index 0 is the best
                 class prediction for a level. When no class is present, instead of Tuple it will be None for that level.
         """
-        self._molfam_classes_names = self._spectra_classes_names
-        self._molfam_classes_names_inds = self._spectra_classes_names_inds
-        molfam_classes = {}
+        self._mf_classes_names = self._spectra_classes_names
+        self._mf_classes_names_inds = self._spectra_classes_names_inds
+        mf_classes = {}
 
-        for molfam in molfams:
-            fid = molfam.id  # the key
-            spectra = molfam.spectra
+        for mf in mfs:
+            fid = mf.id  # the key
+            spectra = mf.spectra
             # if singleton family, format like 'fid_spectrum-id'
             if fid.startswith("singleton-"):
                 spec_id = spectra[0].id
                 fid += f"_{spec_id}"
-            len_molfam = len(spectra)
+            len_mf = len(spectra)
 
             classes_per_spectra = []
             for spec in spectra:
@@ -411,10 +409,10 @@ class prediction for a level. When no class is present, instead of Tuple it will
                     classes_per_spectra.append(spec_classes)
 
             if not classes_per_spectra:
-                continue  # no spectra with classes for this molfam
+                continue  # no spectra with classes for this mf
 
             sorted_classes = []
-            for i, class_level in enumerate(self._molfam_classes_names):
+            for i, class_level in enumerate(self._mf_classes_names):
                 # 1. aggregate classes from all spectra for this class level
                 classes_cur_level = []
                 for spec_classes in classes_per_spectra:
@@ -423,7 +421,7 @@ class prediction for a level. When no class is present, instead of Tuple it will
                             if class_tup:
                                 classes_cur_level.append(class_tup[0])
                     except IndexError:
-                        print(self._molfam_classes_names)
+                        print(self._mf_classes_names)
                         print(i, class_level)
                         print(classes_per_spectra)
                         print(spec_classes)
@@ -433,20 +431,20 @@ class prediction for a level. When no class is present, instead of Tuple it will
                 # 3. calculate fraction and sort high to low, filter out Nones
                 fraction_tups = sorted(
                     (
-                        (cls, count / len_molfam)
+                        (cls, count / len_mf)
                         for cls, count in counts_cur_level.most_common()
-                        if count / len_molfam >= fraction_cutoff
+                        if count / len_mf >= fraction_cutoff
                     ),
                     key=lambda x: x[1],
                     reverse=True,
                 )
                 if not fraction_tups:
                     fraction_tups = [None]
                 sorted_classes.append(fraction_tups)
-            molfam_classes[fid] = sorted_classes
+            mf_classes[fid] = sorted_classes
 
-        self._molfam_classes = molfam_classes
-        return molfam_classes
+        self._mf_classes = mf_classes
+        return mf_classes
 
     def show(self, objects):
         """Show a table of predicted chemical compound classes for spectrum/MF.
@@ -471,16 +469,16 @@ def spectra_classes_names_inds(self):
         return self._spectra_classes_names_inds
 
     @property
-    def molfam_classes(self):
-        return self._molfam_classes
+    def mf_classes(self):
+        return self._mf_classes
 
     @property
-    def molfam_classes_names(self):
-        return self._molfam_classes_names
+    def mf_classes_names(self):
+        return self._mf_classes_names
 
     @property
-    def molfam_classes_names_inds(self):
-        return self._molfam_classes_names_inds
+    def mf_classes_names_inds(self):
+        return self._mf_classes_names_inds
 
 
 class MolNetEnhancerResults:
@@ -496,9 +494,9 @@ def __init__(self, mne_dir):
         Args:
             mne_dir: str, mne_dir found in root_dir of nplinker project
         """
-        cf_classes_names, molfam_classes, spectra2molfam = self._read_cf_classes(mne_dir)
-        self._spectra2molfam = spectra2molfam
-        self._molfam_classes = molfam_classes
+        cf_classes_names, mf_classes, spectra2mf = self._read_cf_classes(mne_dir)
+        self._spectra2mf = spectra2mf
+        self._mf_classes = mf_classes
         self._spectra_classes_names = cf_classes_names  # if NPC gets implemented, add here
         self._spectra_classes_names_inds = {elem: i for i, elem in enumerate(cf_classes_names)}
 
@@ -510,9 +508,9 @@ def _read_cf_classes(self, mne_dir):
         Returns:
             tuple of:
             -list of str - names of the classes in order
-            -dict of {str: [(str, float)]} - linking molfams to (classes, scores) in order of names,
+            -dict of {str: [(str, float)]} - linking mfs to (classes, scores) in order of names,
                 singleton families are denoted with S[\d]+
-            -dict of {str:str} - linking spectra to molfams
+            -dict of {str:str} - linking spectra to mfs
         """
         columns = []
         mne_component_dict = {}
@@ -579,22 +577,22 @@ def _read_cf_classes(self, mne_dir):
         return columns, mne_component_dict, mne_cluster2component
 
     def spectra_classes(self, spectrum_id):
-        """Return classes by relating spectrum_id in the molfam_classes.
+        """Return classes by relating spectrum_id in the mf_classes.
 
         Args:
             spectrum_id: int/str, spectrum_id - ints will be converted to str
         """
         classes = []
         if isinstance(spectrum_id, int):
             spectrum_id = str(spectrum_id)
-        molfam_id = self.spectra2molfam.get(spectrum_id)
-        if molfam_id:
-            classes = self.molfam_classes.get(molfam_id)
+        mf_id = self.spectra2mf.get(spectrum_id)
+        if mf_id:
+            classes = self.mf_classes.get(mf_id)
         return classes
 
     @property
-    def spectra2molfam(self):
-        return self._spectra2molfam
+    def spectra2mf(self):
+        return self._spectra2mf
 
     @property
     def spectra_classes_names(self):
@@ -605,5 +603,5 @@ def spectra_classes_names_inds(self):
         return self._spectra_classes_names_inds
 
     @property
-    def molfam_classes(self):
-        return self._molfam_classes
+    def mf_classes(self):
+        return self._mf_classes
diff --git a/src/nplinker/loader.py b/src/nplinker/loader.py
@@ -35,7 +35,7 @@ class DatasetLoader:
         bgcs: A list of BGC objects.
         gcfs: A list of GCF objects.
         spectra: A list of Spectrum objects.
-        molfams: A list of MolecularFamily objects.
+        mfs: A list of MolecularFamily objects.
         mibig_bgcs: A list of MIBiG BGC objects.
         mibig_strains_in_use: A StrainCollection object that contains the strains in use from MIBiG.
         product_types: A list of product types.
@@ -60,7 +60,7 @@ def __init__(self, config: Dynaconf):
         """
         self.config = config
 
-        self.bgcs, self.gcfs, self.spectra, self.molfams = [], [], [], []
+        self.bgcs, self.gcfs, self.spectra, self.mfs = [], [], [], []
         self.mibig_bgcs = []
         self.mibig_strains_in_use = StrainCollection()
         self.product_types = []
@@ -114,7 +114,7 @@ def _load_metabolomics(self):
         objects added (i.e. `Spectrum.strains` updated). If a Spectrum object does not have Strain
         objects, it is not added to `self.spectra`.
 
-        The attribute of `self.molfams` is set to the loaded MolecularFamily objects that have
+        The attribute of `self.mfs` is set to the loaded MolecularFamily objects that have
         Strain objects added (i.e. `MolecularFamily._strains` updated). This means only Spectra
         objects with updated strains (i.e. `self.spectra`) can be added to MolecularFamily objects.
         """
@@ -129,7 +129,7 @@ def _load_metabolomics(self):
             gnps_dir / defaults.GNPS_ANNOTATIONS_FILENAME
         ).annotations
         # Step 3: load all MolecularFamily objects
-        raw_molfams = GNPSMolecularFamilyLoader(
+        raw_mfs = GNPSMolecularFamilyLoader(
             gnps_dir / defaults.GNPS_MOLECULAR_FAMILY_FILENAME
         ).get_mfs(keep_singleton=False)
 
@@ -139,11 +139,11 @@ def _load_metabolomics(self):
         spectra_with_strains, _ = add_strains_to_spectrum(self.strains, raw_spectra)
 
         # Step 6: add Spectrum objects to MolecularFamily
-        mf_with_spec, _, _ = add_spectrum_to_mf(spectra_with_strains, raw_molfams)
+        mf_with_spec, _, _ = add_spectrum_to_mf(spectra_with_strains, raw_mfs)
 
-        # Step 7: set attributes of self.spectra and self.molfams with valid objects
+        # Step 7: set attributes of self.spectra and self.mfs with valid objects
         self.spectra = spectra_with_strains
-        self.molfams = mf_with_spec
+        self.mfs = mf_with_spec
 
         logger.info("Loading metabolomics data completed\n")
         return True
@@ -266,10 +266,10 @@ def _load_class_info(self):
 
         # load Chem_class_predictions (canopus, molnetenhancer are loaded)
         chem_classes = ChemClassPredictions(self.canopus_dir, self.molnetenhancer_dir, self._root)  # noqa
-        # if no molfam classes transfer them from spectra (due to old style MN)
-        if not chem_classes.canopus.molfam_classes and chem_classes.canopus.spectra_classes:
+        # if no mf classes transfer them from spectra (due to old style MN)
+        if not chem_classes.canopus.mf_classes and chem_classes.canopus.spectra_classes:
             logger.info("Added chemical compound classes for MFs")
-            chem_classes.canopus.transfer_spec_classes_to_molfams(self.molfams)
+            chem_classes.canopus.transfer_spec_classes_to_mfs(self.mfs)
         # include them in loader
         self.chem_classes = chem_classes
         return True
diff --git a/src/nplinker/nplinker.py b/src/nplinker/nplinker.py
@@ -61,7 +61,7 @@ def __init__(self, config_file: str | PathLike):
         self._gcfs = []
         self._strains = None
         self._metadata = {}
-        self._molfams = []
+        self._mfs = []
         self._mibig_bgcs = []
         self._chem_classes = None
         self._class_matches = None
@@ -147,7 +147,7 @@ def load_data(self):
         loader.load()
 
         self._spectra = loader.spectra
-        self._molfams = loader.molfams
+        self._mfs = loader.mfs
         self._bgcs = loader.bgcs
         self._gcfs = loader.gcfs
         self._mibig_bgcs = loader.mibig_bgcs
@@ -160,7 +160,7 @@ def load_data(self):
     def get_links(
         self, input_objects: list, scoring_methods: list, and_mode: bool = True
     ) -> LinkCollection:
-        """Find links for a set of input objects (BGCs/GCFs/Spectra/MolFams).
+        """Find links for a set of input objects (BGCs/GCFs/Spectra/mfs).
 
         The input objects can be any mix of the following NPLinker types:
 
@@ -303,9 +303,9 @@ def spectra(self):
         return self._spectra
 
     @property
-    def molfams(self):
+    def mfs(self):
         """Returns a list of all the MolecularFamilies in the dataset."""
-        return self._molfams
+        return self._mfs
 
     @property
     def metadata(self):

diff --git a/src/nplinker/pickler.py b/src/nplinker/pickler.py
@@ -75,7 +75,7 @@ def persistent_load(self, pid):
         elif obj_type == "Spectrum":
             return self.nplinker.spectra[obj_id]
         elif obj_type == "MolecularFamily":
-            return self.nplinker.molfams[obj_id]
+            return self.nplinker.mfs[obj_id]
         elif obj_type == "ScoringMethod":
             return self.nplinker.scoring_method(obj_id)
 

diff --git a/src/nplinker/scoring/metcalf_scoring.py b/src/nplinker/scoring/metcalf_scoring.py
@@ -102,14 +102,14 @@ def setup(cls, npl: NPLinker):
 
         logger.info(
             f"MetcalfScoring.setup starts: #bgcs={len(npl.bgcs)}, #gcfs={len(npl.gcfs)}, "
-            f"#spectra={len(npl.spectra)}, #molfams={len(npl.molfams)}, #strains={npl.strains}"
+            f"#spectra={len(npl.spectra)}, #mfs={len(npl.mfs)}, #strains={npl.strains}"
         )
         cls.npl = npl
 
         # calculate presence of gcfs/spectra/mfs with respect to strains
         cls.presence_gcf_strain = get_presence_gcf_strain(npl.gcfs, npl.strains)
         cls.presence_spec_strain = get_presence_spec_strain(npl.spectra, npl.strains)
-        cls.presence_mf_strain = get_presence_mf_strain(npl.molfams, npl.strains)
+        cls.presence_mf_strain = get_presence_mf_strain(npl.mfs, npl.strains)
 
         # calculate raw Metcalf scores for spec-gcf links
         raw_score_spec_gcf = cls._calc_raw_score(