Skip to content

Commit

Permalink
Update logic for loading metabolomics data
Browse files Browse the repository at this point in the history
Major changes:
- update loading process in `DatasetLoader._load_metabolomics`.
- add utility functions for metabolomics loading
  • Loading branch information
CunliangGeng authored Jan 24, 2024
1 parent 5e97659 commit 6d012f5
Show file tree
Hide file tree
Showing 4 changed files with 247 additions and 2 deletions.
39 changes: 38 additions & 1 deletion src/nplinker/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,12 @@
from nplinker.globals import PFAM_PATH
from nplinker.globals import STRAIN_MAPPINGS_FILENAME
from nplinker.logconfig import LogConfig
from nplinker.metabolomics import add_annotation_to_spectrum
from nplinker.metabolomics import add_spectrum_to_mf
from nplinker.metabolomics import add_strains_to_spectrum
from nplinker.metabolomics.gnps import GNPSAnnotationLoader
from nplinker.metabolomics.gnps import GNPSMolecularFamilyLoader
from nplinker.metabolomics.gnps import GNPSSpectrumLoader
from nplinker.pairedomics.downloader import PODPDownloader
from nplinker.pairedomics.runbigscape import run_bigscape
from nplinker.pairedomics.strain_mappings_generator import podp_generate_strain_mappings
Expand Down Expand Up @@ -399,8 +405,39 @@ def _load_strain_mappings(self):

return True

# TODO CG: rewrite the loading process using GPNSLoader
def _load_metabolomics(self):
    """Load GNPS metabolomics data into `self.spectra` and `self.molfams`.

    Spectra, annotations and molecular families are read with the GNPS
    loaders from `self.mgf_file`, `self.annotations_config_file` and
    `self.edges_file` respectively. Annotations are then attached to the
    spectra, strains from `self.strains` are attached to the spectra, and
    the spectra that received strains are added to the molecular families.

    After the call:

    - `self.spectra` holds only the Spectrum objects that were updated with
      Strain objects; spectra without strains are dropped.
    - `self.molfams` holds only the MolecularFamily objects that received at
      least one of those spectra, so families can only contain spectra that
      are also in `self.spectra`.

    Returns:
        bool: Always True.
    """
    logger.debug("\nLoading metabolomics data starts...")

    # Read the raw GNPS inputs: spectra, annotations and molecular families.
    all_spectra = GNPSSpectrumLoader(self.mgf_file).spectra
    gnps_annotations = GNPSAnnotationLoader(self.annotations_config_file).annotations
    family_loader = GNPSMolecularFamilyLoader(self.edges_file)
    # keep_singleton=False presumably excludes single-spectrum families -- confirm in loader.
    all_molfams = family_loader.get_mfs(keep_singleton=False)

    # Enrich the spectra in place; annotations go first, then strains.
    add_annotation_to_spectrum(gnps_annotations, all_spectra)
    strained_spectra, _ = add_strains_to_spectrum(self.strains, all_spectra)

    # Only spectra that received strains are eligible for molecular families.
    populated_molfams, _, _ = add_spectrum_to_mf(strained_spectra, all_molfams)

    # Publish the filtered objects only after every step has succeeded.
    self.spectra = strained_spectra
    self.molfams = populated_molfams

    logger.debug("Loading metabolomics data completed\n")
    return True

def _load_genomics(self):
Expand Down
12 changes: 11 additions & 1 deletion src/nplinker/metabolomics/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,18 @@
import logging
from .molecular_family import MolecularFamily
from .spectrum import Spectrum
from .utils import add_annotation_to_spectrum
from .utils import add_spectrum_to_mf
from .utils import add_strains_to_spectrum


# Library convention: install a NullHandler so that records logged by this
# package are discarded unless the application configures logging itself.
logging.getLogger(__name__).addHandler(logging.NullHandler())


# Public API of the package: the data classes plus the helper functions
# imported from `.utils`. (A former two-item `__all__` assignment that was
# immediately overwritten by this one has been removed.)
__all__ = [
    "MolecularFamily",
    "Spectrum",
    "add_annotation_to_spectrum",
    "add_spectrum_to_mf",
    "add_strains_to_spectrum",
]
115 changes: 115 additions & 0 deletions src/nplinker/metabolomics/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
from nplinker.logconfig import LogConfig
from nplinker.strain_collection import StrainCollection
from .molecular_family import MolecularFamily
from .spectrum import Spectrum


# Module-level logger used by the helper functions in this module to report summary counts.
logger = LogConfig.getLogger(__name__)


def add_annotation_to_spectrum(annotations: dict[str, dict], spectra: list[Spectrum]) -> None:
    """Attach GNPS annotations to the matching Spectrum objects.

    Every spectrum whose `spectrum_id` is a key of `annotations` gets its
    `gnps_annotations` attribute replaced by the corresponding value. Spectra
    without an entry in `annotations` are left untouched, and annotations
    without a matching spectrum are ignored.

    Note that the Spectrum objects in `spectra` are modified in place.

    Args:
        annotations(dict[str, dict]): A dictionary of GNPS annotations, where the keys are
            spectrum ids and the values are GNPS annotations.
        spectra(list[Spectrum]): A list of Spectrum objects.
    """
    for spectrum in spectra:
        spectrum_id = spectrum.spectrum_id
        if spectrum_id not in annotations:
            # No GNPS annotation available for this spectrum.
            continue
        spectrum.gnps_annotations = annotations[spectrum_id]


def add_strains_to_spectrum(
    strains: StrainCollection, spectra: list[Spectrum]
) -> tuple[list[Spectrum], list[Spectrum]]:
    """Attach Strain objects to the `Spectrum.strains` attribute of each spectrum.

    For every spectrum, its `spectrum_id` is looked up in `strains`. All Strain
    objects returned by the lookup are added to `Spectrum.strains`. A lookup
    that raises `ValueError` means no strain is known for that spectrum.

    Note that the Spectrum objects in `spectra` are modified in place.

    Args:
        strains(StrainCollection): A collection of strain objects.
        spectra(list[Spectrum]): A list of Spectrum objects.

    Returns:
        tuple(list[Spectrum], list[Spectrum]): A tuple of two lists of Spectrum
            objects, both in input order. The first list contains the Spectrum
            objects that were updated with Strain objects; the second list
            contains the Spectrum objects that were not updated because no
            Strain objects were found for them.
    """
    matched = []
    unmatched = []
    for spectrum in spectra:
        try:
            found_strains = strains.lookup(spectrum.spectrum_id)
        except ValueError:
            # The collection has no strain for this spectrum id.
            unmatched.append(spectrum)
        else:
            for strain in found_strains:
                spectrum.strains.add(strain)
            matched.append(spectrum)

    logger.info(
        f"{len(matched)} Spectrum objects updated with Strain objects.\n"
        f"{len(unmatched)} Spectrum objects not updated with Strain objects."
    )

    return matched, unmatched


def add_spectrum_to_mf(
    spectra: list[Spectrum], mfs: list[MolecularFamily]
) -> tuple[list[MolecularFamily], list[MolecularFamily], dict[MolecularFamily, set[str]]]:
    """Populate MolecularFamily objects with their Spectrum objects.

    Each MolecularFamily lists the ids of its members in `spectra_ids`. Every id
    is resolved against the input `spectra` (matched on `Spectrum.spectrum_id`)
    and the Spectrum found is passed to `MolecularFamily.add_spectrum`. Ids
    with no match in `spectra` are recorded as missing, so a family may end up
    with only some, or none, of its spectra.

    Note that the MolecularFamily objects in `mfs` are modified in place.

    Args:
        spectra(list[Spectrum]): A list of Spectrum objects.
        mfs(list[MolecularFamily]): A list of MolecularFamily objects.

    Returns:
        tuple(list[MolecularFamily], list[MolecularFamily], dict[MolecularFamily, set[str]]):
            The first list contains the MolecularFamily objects whose `spectra`
            attribute is non-empty after the update.
            The second list contains the MolecularFamily objects whose `spectra`
            attribute is empty after the update.
            The dictionary maps each MolecularFamily with at least one
            unresolved id to the set of ids of its missing Spectrum objects.
    """
    spectrum_by_id = {spectrum.spectrum_id: spectrum for spectrum in spectra}
    populated = []
    unpopulated = []
    missing_ids: dict[MolecularFamily, set[str]] = {}
    for family in mfs:
        for spectrum_id in family.spectra_ids:
            if spectrum_id in spectrum_by_id:
                family.add_spectrum(spectrum_by_id[spectrum_id])
            else:
                # Remember every id that could not be resolved for this family.
                missing_ids.setdefault(family, set()).add(spectrum_id)

        # Classify on the family's resulting content, not on what was added here.
        bucket = populated if family.spectra else unpopulated
        bucket.append(family)

    logger.info(
        f"{len(populated)} MolecularFamily objects updated with Spectrum objects.\n"
        f"{len(unpopulated)} MolecularFamily objects not updated with Spectrum objects.\n"
        f"{len(missing_ids)} MolecularFamily objects have missing Spectrum objects."
    )
    return populated, unpopulated, missing_ids
83 changes: 83 additions & 0 deletions tests/metabolomics/test_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
import pytest
from nplinker.metabolomics import MolecularFamily
from nplinker.metabolomics import Spectrum
from nplinker.metabolomics import add_annotation_to_spectrum
from nplinker.metabolomics import add_spectrum_to_mf
from nplinker.metabolomics import add_strains_to_spectrum
from nplinker.strain import Strain
from nplinker.strain_collection import StrainCollection


@pytest.fixture
def spectra():
    """Provide three Spectrum objects with ids "spec0", "spec1" and "spec2"."""
    # The tests index into this list, so the spectra must stay in id order.
    return [Spectrum(f"spec{idx}", [100, 200], [0.1, 0.2], 150) for idx in range(3)]


def test_add_annotation_to_spectrum(spectra):
    """Annotations are set on matching spectra; the others keep an empty dict."""
    # "spec3" has no matching spectrum and "spec2" has no annotation.
    annotations = {f"spec{idx}": {"annotation": f"annotation_{idx}"} for idx in (0, 1, 3)}

    add_annotation_to_spectrum(annotations, spectra)

    expected = [
        {"annotation": "annotation_0"},
        {"annotation": "annotation_1"},
        {},
    ]
    assert [spec.gnps_annotations for spec in spectra] == expected


def test_add_strains_to_spectrum(spectra):
    """Spectra are matched to strains by strain id or by strain alias."""
    strain_by_id = Strain("spec0")  # spectrum id used as the strain id
    strain_by_alias = Strain("strain1")
    strain_by_alias.add_alias("spec1")  # spectrum id used as a strain alias
    collection = StrainCollection()
    for strain in (strain_by_id, strain_by_alias):
        collection.add(strain)

    with_strains, without_strains = add_strains_to_spectrum(collection, spectra)

    # "spec2" matches neither a strain id nor an alias.
    assert len(with_strains) == 2
    assert len(without_strains) == 1
    assert with_strains == spectra[:2]
    assert without_strains == spectra[2:]
    assert strain_by_id in with_strains[0].strains
    assert strain_by_alias in with_strains[1].strains
    assert without_strains[0].strains == StrainCollection()


def test_add_spectrum_to_mf(spectra):
    """Families are split by whether any spectrum was found; missing ids are reported."""
    # mf0: all ids resolvable; mf1: one resolvable, one missing; mf2: all missing.
    ids_per_family = {
        "mf0": {"spec0", "spec1"},
        "mf1": {"spec2", "spec-missing-1"},
        "mf2": {"spec-missing-2", "spec-missing-3"},
    }
    mfs = []
    for family_id, spectra_ids in ids_per_family.items():
        family = MolecularFamily(family_id)
        family.spectra_ids = spectra_ids
        mfs.append(family)
    mf0, mf1, mf2 = mfs

    mf_with_spec, mf_without_spec, mf_missing_spec = add_spectrum_to_mf(spectra, mfs)

    assert len(mf_with_spec) == 2
    assert len(mf_without_spec) == 1
    assert len(mf_missing_spec) == 2
    assert mf_with_spec == [mf0, mf1]
    assert mf_without_spec == [mf2]
    assert mf_missing_spec == {mf1: {"spec-missing-1"}, mf2: {"spec-missing-2", "spec-missing-3"}}
    assert mf0.spectra == set(spectra[:2])
    assert mf1.spectra == {spectra[2]}
    assert mf2.spectra == set()

0 comments on commit 6d012f5

Please sign in to comment.