diff --git a/notebooks/npclassscore_linking/prospecting/class_linking.py b/notebooks/npclassscore_linking/prospecting/class_linking.py index c793be56..355a7be5 100644 --- a/notebooks/npclassscore_linking/prospecting/class_linking.py +++ b/notebooks/npclassscore_linking/prospecting/class_linking.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 """Initial code for NPClassScore.""" + import glob import os import sys @@ -541,7 +542,7 @@ def class_linking_score(self, obj, target): if is_spectrum: # list of list of tuples/None - todo: add to spectrum object - spec_like_classes = self.canopus.spectra_classes.get(str(spec_like.spectrum_id)) + spec_like_classes = self.canopus.spectra_classes.get(str(spec_like.id)) spec_like_classes_names_inds = self.canopus.spectra_classes_names_inds else: # molfam spec_like_classes = self.canopus.molfam_classes.get(str(spec_like.family_id)) @@ -654,7 +655,7 @@ def npclass_score(self, obj, target, method="main"): if is_spectrum: # list of list of tuples/None - todo: add to spectrum object # take only 'best' (first) classification per ontology level - all_classes = self.canopus.spectra_classes.get(str(spec_like.spectrum_id)) + all_classes = self.canopus.spectra_classes.get(str(spec_like.id)) if all_classes: spec_like_classes = [ cls_per_lvl @@ -675,7 +676,7 @@ def npclass_score(self, obj, target, method="main"): spec_like_classes_names_inds = self.canopus.molfam_classes_names_inds if use_mne and not spec_like_classes: # if mne or when main/canopus does not get classes if is_spectrum: - spec_like_classes = self.molnetenhancer.spectra_classes(spec_like.spectrum_id) + spec_like_classes = self.molnetenhancer.spectra_classes(spec_like.id) else: # molfam spec_like_classes = self.molnetenhancer.molfam_classes.get(str(spec_like.family_id)) # classes are same for molfam and spectrum so names are irrespective of is_spectrum @@ -777,4 +778,4 @@ def _get_bgc_like_classes(self, bgc_like, is_bgc): return bgc_like_classes_dict def _get_bgc_like_gcf(self, 
bgc_like): - return [gcf for gcf in self.gcfs if bgc_like.bgc_id in [b.bgc_id for b in gcf.bgcs]][0] + return [gcf for gcf in self.gcfs if bgc_like.id in [b.id for b in gcf.bgcs]][0] diff --git a/notebooks/npclassscore_linking/prospecting/class_linking_test.py b/notebooks/npclassscore_linking/prospecting/class_linking_test.py index 6b7af47a..fec525a7 100644 --- a/notebooks/npclassscore_linking/prospecting/class_linking_test.py +++ b/notebooks/npclassscore_linking/prospecting/class_linking_test.py @@ -35,8 +35,8 @@ # 2. check chemical compound predictions from canopus and molnetenhancer test_spec = list(npl.spectra)[500] - print(npl.canopus.spectra_classes.get(str(test_spec.spectrum_id))) - print(npl.molnetenhancer.spectra_classes(str(test_spec.spectrum_id))) + print(npl.canopus.spectra_classes.get(str(test_spec.id))) + print(npl.molnetenhancer.spectra_classes(str(test_spec.id))) # 3. example of a good score, (predicted) NRP linking to a (predicted) peptide like spectrum print(npl.class_linking_score(list(npl.gcfs)[0], test_spec)) diff --git a/src/nplinker/class_info/chem_classes.py b/src/nplinker/class_info/chem_classes.py index b99f9f47..5bf1327d 100644 --- a/src/nplinker/class_info/chem_classes.py +++ b/src/nplinker/class_info/chem_classes.py @@ -396,17 +396,17 @@ class prediction for a level. 
When no class is present, instead of Tuple it will molfam_classes = {} for molfam in molfams: - fid = molfam.family_id # the key + fid = molfam.id # the key spectra = molfam.spectra # if singleton family, format like 'fid_spectrum-id' if fid.startswith("singleton-"): - spec_id = spectra[0].spectrum_id + spec_id = spectra[0].id fid += f"_{spec_id}" len_molfam = len(spectra) classes_per_spectra = [] for spec in spectra: - spec_classes = self.spectra_classes.get(spec.spectrum_id) + spec_classes = self.spectra_classes.get(spec.id) if spec_classes: # account for spectra without prediction classes_per_spectra.append(spec_classes) diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py index c63c94cb..e291cac0 100644 --- a/src/nplinker/genomics/bgc.py +++ b/src/nplinker/genomics/bgc.py @@ -28,7 +28,7 @@ class BGC: and used by MIBiG. Attributes: - bgc_id: BGC identifier, e.g. MIBiG accession, GenBank accession. + id: BGC identifier, e.g. MIBiG accession, GenBank accession. product_prediction: A tuple of (predicted) natural products or product classes of the BGC. For antiSMASH's GenBank data, the feature `region /product` @@ -59,15 +59,15 @@ class BGC: strain: The strain of the BGC. """ - def __init__(self, bgc_id: str, /, *product_prediction: str): + def __init__(self, id: str, /, *product_prediction: str): """Initialize the BGC object. Args: - bgc_id: BGC identifier, e.g. MIBiG accession, GenBank accession. + id: BGC identifier, e.g. MIBiG accession, GenBank accession. product_prediction: BGC's (predicted) natural products or product classes. 
""" # BGC metadata - self.bgc_id = bgc_id + self.id = id self.product_prediction = product_prediction self.mibig_bgc_class: tuple[str] | None = None @@ -87,9 +87,9 @@ def __repr__(self): return str(self) def __str__(self): - return "{}(bgc_id={}, strain={}, asid={}, region={})".format( + return "{}(id={}, strain={}, asid={}, region={})".format( self.__class__.__name__, - self.bgc_id, + self.id, self.strain, self.antismash_id, self.antismash_region, @@ -97,13 +97,11 @@ def __str__(self): def __eq__(self, other) -> bool: if isinstance(other, BGC): - return ( - self.bgc_id == other.bgc_id and self.product_prediction == other.product_prediction - ) + return self.id == other.id and self.product_prediction == other.product_prediction return NotImplemented def __hash__(self) -> int: - return hash((self.bgc_id, self.product_prediction)) + return hash((self.id, self.product_prediction)) def add_parent(self, gcf: GCF) -> None: """Add a parent GCF to the BGC. @@ -146,7 +144,7 @@ def is_mibig(self) -> bool: Returns: True if it's MIBiG reference BGC """ - return self.bgc_id.startswith("BGC") + return self.id.startswith("BGC") # CG: why not providing whole product but only amino acid as product monomer? # this property is not used in NPLinker core business. diff --git a/src/nplinker/genomics/gcf.py b/src/nplinker/genomics/gcf.py index eb5963c0..ecddd7a7 100644 --- a/src/nplinker/genomics/gcf.py +++ b/src/nplinker/genomics/gcf.py @@ -18,7 +18,7 @@ class GCF: tools such as BiG-SCAPE and BiG-SLICE. Attributes: - gcf_id: id of the GCF object. + id: id of the GCF object. bgc_ids: a set of BGC ids that belongs to the GCF. bigscape_class: BiG-SCAPE's BGC class. BiG-SCAPE's BGC classes are similar to those defined in MiBIG @@ -26,13 +26,13 @@ class GCF: https://doi.org/10.1038%2Fs41589-019-0400-9. """ - def __init__(self, gcf_id: str, /) -> None: + def __init__(self, id: str, /) -> None: """Initialize the GCF object. Args: - gcf_id: id of the GCF object. + id: id of the GCF object. 
""" - self.gcf_id = gcf_id + self.id = id self.bgc_ids: set[str] = set() self.bigscape_class: str | None = None self._bgcs: set[BGC] = set() @@ -40,7 +40,7 @@ def __init__(self, gcf_id: str, /) -> None: def __str__(self) -> str: return ( - f"GCF(id={self.gcf_id}, #BGC_objects={len(self.bgcs)}, #bgc_ids={len(self.bgc_ids)}," + f"GCF(id={self.id}, #BGC_objects={len(self.bgcs)}, #bgc_ids={len(self.bgc_ids)}," f"#strains={len(self._strains)})." ) @@ -49,7 +49,7 @@ def __repr__(self) -> str: def __eq__(self, other) -> bool: if isinstance(other, GCF): - return self.gcf_id == other.gcf_id and self.bgcs == other.bgcs + return self.id == other.id and self.bgcs == other.bgcs return NotImplemented def __hash__(self) -> int: @@ -58,7 +58,7 @@ def __hash__(self) -> int: Note that GCF class is a mutable container. We only hash the GCF id to avoid the hash value changes when `self._bgcs` is updated. """ - return hash(self.gcf_id) + return hash(self.id) @property def bgcs(self) -> set[BGC]: @@ -74,17 +74,17 @@ def add_bgc(self, bgc: BGC) -> None: """Add a BGC object to the GCF.""" bgc.parents.add(self) self._bgcs.add(bgc) - self.bgc_ids.add(bgc.bgc_id) + self.bgc_ids.add(bgc.id) if bgc.strain is not None: self._strains.add(bgc.strain) else: - logger.warning("No strain specified for the BGC %s", bgc.bgc_id) + logger.warning("No strain specified for the BGC %s", bgc.id) def detach_bgc(self, bgc: BGC) -> None: """Remove a child BGC object.""" bgc.parents.remove(self) self._bgcs.remove(bgc) - self.bgc_ids.remove(bgc.bgc_id) + self.bgc_ids.remove(bgc.id) if bgc.strain is not None: for other_bgc in self._bgcs: if other_bgc.strain == bgc.strain: diff --git a/src/nplinker/genomics/utils.py b/src/nplinker/genomics/utils.py index aa984e20..ba4c227b 100644 --- a/src/nplinker/genomics/utils.py +++ b/src/nplinker/genomics/utils.py @@ -91,13 +91,13 @@ def add_strain_to_bgc(strains: StrainCollection, bgcs: list[BGC]) -> tuple[list[ bgc_without_strain = [] for bgc in bgcs: try: - strain_list = 
strains.lookup(bgc.bgc_id) + strain_list = strains.lookup(bgc.id) except ValueError: bgc_without_strain.append(bgc) continue if len(strain_list) > 1: raise ValueError( - f"Multiple strain objects found for BGC id '{bgc.bgc_id}'." + f"Multiple strain objects found for BGC id '{bgc.id}'." f"BGC object accept only one strain." ) bgc.strain = strain_list[0] @@ -136,7 +136,7 @@ def add_bgc_to_gcf( - The dictionary contains GCF objects as keys and a set of ids of missing BGC objects as values. """ - bgc_dict = {bgc.bgc_id: bgc for bgc in bgcs} + bgc_dict = {bgc.id: bgc for bgc in bgcs} gcf_with_bgc = [] gcf_without_bgc = [] gcf_missing_bgc: dict[GCF, set[str]] = {} diff --git a/src/nplinker/metabolomics/gnps/gnps_molecular_family_loader.py b/src/nplinker/metabolomics/gnps/gnps_molecular_family_loader.py index ac048b03..93589643 100644 --- a/src/nplinker/metabolomics/gnps/gnps_molecular_family_loader.py +++ b/src/nplinker/metabolomics/gnps/gnps_molecular_family_loader.py @@ -98,15 +98,15 @@ def _load(self) -> None: for row in reader: spec1_id = row["CLUSTERID1"] spec2_id = row["CLUSTERID2"] - family_id = row["ComponentIndex"] - if family_id not in family_dict: - family_dict[family_id] = set([spec1_id, spec2_id]) + mf_id = row["ComponentIndex"] + if mf_id not in family_dict: + family_dict[mf_id] = set([spec1_id, spec2_id]) else: - family_dict[family_id].add(spec1_id) - family_dict[family_id].add(spec2_id) + family_dict[mf_id].add(spec1_id) + family_dict[mf_id].add(spec2_id) # convert dict to list of MolecularFamily objects - for family_id, spectra_ids in family_dict.items(): - if family_id == "-1": # "-1" is from GNPS, it means the singleton molecular family + for mf_id, spectra_ids in family_dict.items(): + if mf_id == "-1": # "-1" is from GNPS, it means the singleton molecular family for spectrum_id in spectra_ids: # family id must be unique, so using "singleton-" + spectrum id as family id family = MolecularFamily("singleton-" + str(spectrum_id)) @@ -114,6 +114,6 @@ def 
_load(self) -> None: self._mfs.append(family) else: # for regular molecular families, use the value of "ComponentIndex" as family id - family = MolecularFamily(family_id) + family = MolecularFamily(mf_id) family.spectra_ids = spectra_ids self._mfs.append(family) diff --git a/src/nplinker/metabolomics/gnps/gnps_spectrum_loader.py b/src/nplinker/metabolomics/gnps/gnps_spectrum_loader.py index e4d25939..d9b34156 100644 --- a/src/nplinker/metabolomics/gnps/gnps_spectrum_loader.py +++ b/src/nplinker/metabolomics/gnps/gnps_spectrum_loader.py @@ -89,7 +89,7 @@ def _load(self): rt = spec["params"].get("rtinseconds", 0) spectrum = Spectrum( - spectrum_id=spectrum_id, + id=spectrum_id, mz=list(spec["m/z array"]), intensity=list(spec["intensity array"]), precursor_mz=precursor_mz, diff --git a/src/nplinker/metabolomics/molecular_family.py b/src/nplinker/metabolomics/molecular_family.py index aaf64b7f..16d9bd6a 100644 --- a/src/nplinker/metabolomics/molecular_family.py +++ b/src/nplinker/metabolomics/molecular_family.py @@ -12,24 +12,24 @@ class MolecularFamily: """Class to model molecular family. Attributes: - family_id: Unique id for the molecular family. + id: Unique id for the molecular family. spectra_ids: Set of spectrum ids in the molecular family. """ - def __init__(self, family_id: str): + def __init__(self, id: str): """Initialize the MolecularFamily. Args: - family_id: Unique id for the molecular family. + id: Unique id for the molecular family. 
""" - self.family_id: str = family_id + self.id: str = id self.spectra_ids: set[str] = set() self._spectra: set[Spectrum] = set() self._strains: StrainCollection = StrainCollection() def __str__(self) -> str: return ( - f"MolecularFamily(family_id={self.family_id}, #Spectrum_objects={len(self._spectra)}, " + f"MolecularFamily(id={self.id}, #Spectrum_objects={len(self._spectra)}, " f"#spectrum_ids={len(self.spectra_ids)}, #strains={len(self._strains)})" ) @@ -38,11 +38,11 @@ def __repr__(self) -> str: def __eq__(self, other) -> bool: if isinstance(other, MolecularFamily): - return self.family_id == other.family_id + return self.id == other.id return NotImplemented def __hash__(self) -> int: - return hash(self.family_id) + return hash(self.id) @property def spectra(self) -> set[Spectrum]: @@ -61,7 +61,7 @@ def add_spectrum(self, spectrum: Spectrum) -> None: spectrum: `Spectrum` object to add to the molecular family. """ self._spectra.add(spectrum) - self.spectra_ids.add(spectrum.spectrum_id) + self.spectra_ids.add(spectrum.id) self._strains = self._strains + spectrum.strains # add the molecular family to the spectrum spectrum.family = self @@ -73,7 +73,7 @@ def detach_spectrum(self, spectrum: Spectrum) -> None: spectrum: `Spectrum` object to remove from the molecular family. """ self._spectra.remove(spectrum) - self.spectra_ids.remove(spectrum.spectrum_id) + self.spectra_ids.remove(spectrum.id) self._strains = self._update_strains() # remove the molecular family from the spectrum spectrum.family = None diff --git a/src/nplinker/metabolomics/spectrum.py b/src/nplinker/metabolomics/spectrum.py index 53c82b72..5ec7ccb2 100644 --- a/src/nplinker/metabolomics/spectrum.py +++ b/src/nplinker/metabolomics/spectrum.py @@ -14,7 +14,7 @@ class Spectrum: """Class to model MS/MS Spectrum. Attributes: - spectrum_id: the spectrum ID. + id: the spectrum ID. mz: the list of m/z values. intensity: the list of intensity values. precursor_mz: the m/z value of the precursor. 
@@ -30,7 +30,7 @@ class Spectrum: def __init__( self, - spectrum_id: str, + id: str, mz: list[float], intensity: list[float], precursor_mz: float, @@ -40,7 +40,7 @@ def __init__( """Initialize the Spectrum. Args: - spectrum_id: the spectrum ID. + id: the spectrum ID. mz: the list of m/z values. intensity: the list of intensity values. precursor_mz: the precursor m/z. @@ -48,7 +48,7 @@ def __init__( metadata: the metadata of the spectrum, i.e. the header infomation in the MGF file. """ - self.spectrum_id = spectrum_id + self.id = id self.mz = mz self.intensity = intensity self.precursor_mz = precursor_mz @@ -61,18 +61,18 @@ def __init__( self.family: MolecularFamily | None = None def __str__(self) -> str: - return f"Spectrum(spectrum_id={self.spectrum_id}, #strains={len(self.strains)})" + return f"Spectrum(id={self.id}, #strains={len(self.strains)})" def __repr__(self) -> str: return str(self) def __eq__(self, other) -> bool: if isinstance(other, Spectrum): - return self.spectrum_id == other.spectrum_id and self.precursor_mz == other.precursor_mz + return self.id == other.id and self.precursor_mz == other.precursor_mz return NotImplemented def __hash__(self) -> int: - return hash((self.spectrum_id, self.precursor_mz)) + return hash((self.id, self.precursor_mz)) @cached_property def peaks(self) -> np.ndarray: diff --git a/src/nplinker/metabolomics/utils.py b/src/nplinker/metabolomics/utils.py index 1f1ce8a0..a8a53aef 100644 --- a/src/nplinker/metabolomics/utils.py +++ b/src/nplinker/metabolomics/utils.py @@ -25,8 +25,8 @@ def add_annotation_to_spectrum(annotations: dict[str, dict], spectra: list[Spect spectra: A list of Spectrum objects. 
""" for spec in spectra: - if spec.spectrum_id in annotations: - spec.gnps_annotations = annotations[spec.spectrum_id] + if spec.id in annotations: + spec.gnps_annotations = annotations[spec.id] def add_strains_to_spectrum( @@ -51,7 +51,7 @@ def add_strains_to_spectrum( spectra_without_strains = [] for spec in spectra: try: - strain_list = strains.lookup(spec.spectrum_id) + strain_list = strains.lookup(spec.id) except ValueError: spectra_without_strains.append(spec) continue @@ -94,7 +94,7 @@ def add_spectrum_to_mf( - the third is a dictionary containing MolecularFamily objects as keys and a set of ids of missing Spectrum objects as values. """ - spec_dict = {spec.spectrum_id: spec for spec in spectra} + spec_dict = {spec.id: spec for spec in spectra} mf_with_spec = [] mf_without_spec = [] mf_missing_spec: dict[MolecularFamily, set[str]] = {} diff --git a/src/nplinker/nplinker.py b/src/nplinker/nplinker.py index dd3f8984..0dff285e 100644 --- a/src/nplinker/nplinker.py +++ b/src/nplinker/nplinker.py @@ -262,25 +262,25 @@ def get_links( logger.info("Final size of link collection is {}".format(len(link_collection))) return link_collection - def has_bgc(self, bgc_id): - """Returns True if BGC ``bgc_id`` exists in the dataset.""" - return bgc_id in self._bgc_lookup + def has_bgc(self, id): + """Returns True if BGC ``id`` exists in the dataset.""" + return id in self._bgc_lookup - def lookup_bgc(self, bgc_id): - """If BGC ``bgc_id`` exists, return it. Otherwise return None.""" - return self._bgc_lookup.get(bgc_id, None) + def lookup_bgc(self, id): + """If BGC ``id`` exists, return it. Otherwise return None.""" + return self._bgc_lookup.get(id, None) - def lookup_gcf(self, gcf_id): - """If GCF ``gcf_id`` exists, return it. Otherwise return None.""" - return self._gcf_lookup.get(gcf_id, None) + def lookup_gcf(self, id): + """If GCF ``id`` exists, return it. 
Otherwise return None.""" + return self._gcf_lookup.get(id, None) - def lookup_spectrum(self, spectrum_id): + def lookup_spectrum(self, id): """If Spectrum ``name`` exists, return it. Otherwise return None.""" - return self._spec_lookup.get(spectrum_id, None) + return self._spec_lookup.get(id, None) - def lookup_mf(self, mf_id): - """If MolecularFamily `family_id` exists, return it. Otherwise return None.""" - return self._mf_lookup.get(mf_id, None) + def lookup_mf(self, id): + """If MolecularFamily `id` exists, return it. Otherwise return None.""" + return self._mf_lookup.get(id, None) @property def strains(self): diff --git a/src/nplinker/pickler.py b/src/nplinker/pickler.py index 133ab1fd..c80866a5 100644 --- a/src/nplinker/pickler.py +++ b/src/nplinker/pickler.py @@ -39,13 +39,13 @@ class NPLinkerPickler(pickle.Pickler): def persistent_id(self, obj): if isinstance(obj, BGC): - return ("BGC", obj.bgc_id) + return ("BGC", obj.id) elif isinstance(obj, GCF): - return ("GCF", obj.gcf_id) + return ("GCF", obj.id) elif isinstance(obj, Spectrum): - return ("Spectrum", obj.spectrum_id) + return ("Spectrum", obj.id) elif isinstance(obj, MolecularFamily): - return ("MolecularFamily", obj.family_id) + return ("MolecularFamily", obj.id) else: # TODO: ideally should use isinstance(obj, ScoringMethod) here # but it's currently a problem because it creates a circular diff --git a/src/nplinker/scoring/iokr/nplinker_iokr.py b/src/nplinker/scoring/iokr/nplinker_iokr.py index 405e19a0..07219e0c 100644 --- a/src/nplinker/scoring/iokr/nplinker_iokr.py +++ b/src/nplinker/scoring/iokr/nplinker_iokr.py @@ -136,7 +136,7 @@ def score_smiles(self, ms_list, candidate_smiles): candidates = iokr_opt.preprocess_candidates(candidate_fps, latent, latent_basis, gamma) for ms_index, ms in enumerate(ms_list): - logger.debug("Rank spectrum {} ({}/{})".format(ms.spectrum_id, ms_index, len(ms_list))) + logger.debug("Rank spectrum {} ({}/{})".format(ms.id, ms_index, len(ms_list))) ms.filter = 
spectrum_filters.filter_by_frozen_dag logger.debug("kernel vector") t0 = time.time() diff --git a/src/nplinker/scoring/metcalf_scoring.py b/src/nplinker/scoring/metcalf_scoring.py index 54b81c18..e868e1f7 100644 --- a/src/nplinker/scoring/metcalf_scoring.py +++ b/src/nplinker/scoring/metcalf_scoring.py @@ -387,9 +387,7 @@ def _get_links( links = [] if obj_type == "gcf": - # TODO CG: the hint and mypy warnings will be gone after renaming all - # string ids to `.id` - obj_ids = [gcf.gcf_id for gcf in objects] + obj_ids = [gcf.id for gcf in objects] # spec-gcf scores = self.raw_score_spec_gcf.loc[:, obj_ids] df = self._get_scores_source_gcf(scores, score_cutoff) @@ -402,14 +400,14 @@ def _get_links( links.append(df) if obj_type == "spec": - obj_ids = [spec.spectrum_id for spec in objects] + obj_ids = [spec.id for spec in objects] scores = self.raw_score_spec_gcf.loc[obj_ids, :] df = self._get_scores_source_met(scores, score_cutoff) df.name = LINK_TYPES[0] links.append(df) if obj_type == "mf": - obj_ids = [mf.family_id for mf in objects] + obj_ids = [mf.id for mf in objects] scores = self.raw_score_mf_gcf.loc[obj_ids, :] df = self._get_scores_source_met(scores, score_cutoff) df.name = LINK_TYPES[1] diff --git a/src/nplinker/scoring/np_class_scoring.py b/src/nplinker/scoring/np_class_scoring.py index e351f872..25479512 100644 --- a/src/nplinker/scoring/np_class_scoring.py +++ b/src/nplinker/scoring/np_class_scoring.py @@ -213,7 +213,7 @@ def _get_gen_classes(self, bgc_like, gcf_as_cutoff=0.5): if is_bgc: # get parent gcf for bgc bgc_like_gcf = [ - gcf for gcf in self.npl.gcfs if bgc_like.bgc_id in [b.bgc_id for b in gcf.bgcs] + gcf for gcf in self.npl.gcfs if bgc_like.id in [b.id for b in gcf.bgcs] ][0] # gather AS classes and convert to names in scoring dict as_classes = self.npl.class_matches.convert_as_classes( @@ -269,9 +269,7 @@ def _get_met_classes(self, spec_like, method="mix"): if is_spectrum: # list of list of tuples/None - todo: add to spectrum object? 
# take only 'best' (first) classification per ontology level - all_classes = self.npl.chem_classes.canopus.spectra_classes.get( - spec_like.spectrum_id - ) + all_classes = self.npl.chem_classes.canopus.spectra_classes.get(spec_like.id) if all_classes: spec_like_classes = [ cls_per_lvl @@ -283,9 +281,9 @@ def _get_met_classes(self, spec_like, method="mix"): self.npl.chem_classes.canopus.spectra_classes_names_inds ) else: # molfam - fam_id = spec_like.family_id + fam_id = spec_like.id if fam_id.startswith("singleton-"): # account for singleton families - fam_id += f"_{spec_like.spectra[0].spectrum_id}" + fam_id += f"_{spec_like.spectra[0].id}" all_classes = self.npl.chem_classes.canopus.molfam_classes.get(fam_id) if all_classes: spec_like_classes = [ @@ -301,12 +299,12 @@ def _get_met_classes(self, spec_like, method="mix"): # if mne or when main/canopus does not get classes if is_spectrum: spec_like_classes = self.npl.chem_classes.molnetenhancer.spectra_classes( - spec_like.spectrum_id + spec_like.id ) else: # molfam - fam_id = spec_like.family_id + fam_id = spec_like.id if fam_id.startswith("singleton"): # account for singleton families - fam_id += f"_{spec_like.spectra[0].spectrum_id}" + fam_id += f"_{spec_like.spectra[0].id}" spec_like_classes = self.npl.chem_classes.molnetenhancer.molfam_classes.get(fam_id) # classes are same for molfam and spectrum so names are irrespective of is_spectrum spec_like_classes_names_inds = ( diff --git a/src/nplinker/scoring/rosetta/rosetta.py b/src/nplinker/scoring/rosetta/rosetta.py index 3ab7a287..c24f962a 100644 --- a/src/nplinker/scoring/rosetta/rosetta.py +++ b/src/nplinker/scoring/rosetta/rosetta.py @@ -547,10 +547,10 @@ def export_to_csv(self, filename): for hit in self._rosetta_hits: csvwriter.writerow( [ - hit.spec.spectrum_id, + hit.spec.id, hit.gnps_id, hit.spec_match_score, - hit.bgc.bgc_id, + hit.bgc.id, hit.mibig_id, hit.bgc_match_score, ] diff --git a/src/nplinker/scoring/rosetta/rosetta_hit.py 
b/src/nplinker/scoring/rosetta/rosetta_hit.py index dda4a1ef..2ca23872 100644 --- a/src/nplinker/scoring/rosetta/rosetta_hit.py +++ b/src/nplinker/scoring/rosetta/rosetta_hit.py @@ -9,8 +9,8 @@ def __init__(self, spec, gnps_id, mibig_id, bgc, spec_match_score, bgc_match_sco def __str__(self): return "RosettaHit: {}<-->{} via ({} ({:.3f}), {} ({:.3f}))".format( - self.spec.spectrum_id, - self.bgc.bgc_id, + self.spec.id, + self.bgc.id, self.gnps_id, self.spec_match_score, self.mibig_id, diff --git a/src/nplinker/scoring/rosetta/spec_lib.py b/src/nplinker/scoring/rosetta/spec_lib.py index 6896a3a4..d99a413d 100644 --- a/src/nplinker/scoring/rosetta/spec_lib.py +++ b/src/nplinker/scoring/rosetta/spec_lib.py @@ -39,7 +39,7 @@ def get_n_spec(self): return len(self.spectra) def get_ids(self): - return list(s.spectrum_id for s in self.spectra) + return list(s.id for s in self.spectra) def get_n_peaks(self): return [len(s.peaks) for s in self.spectra] diff --git a/src/nplinker/scoring/rosetta_scoring.py b/src/nplinker/scoring/rosetta_scoring.py index c8a6ce62..057d14d7 100644 --- a/src/nplinker/scoring/rosetta_scoring.py +++ b/src/nplinker/scoring/rosetta_scoring.py @@ -126,7 +126,7 @@ def get_links(self, *objects, **parameters): def _collect_results_spectra(self, objects, ro_hits, results): for spec in objects: for hit in ro_hits: - if spec.spectrum_id == hit.spec.spectrum_id: + if spec.id == hit.spec.id: if not self.bgc_to_gcf: # can use the BGC directly results = self._insert_result_met(results, spec, hit.bgc, hit) @@ -141,7 +141,7 @@ def _collect_results_spectra(self, objects, ro_hits, results): def _collect_results_bgc(self, objects, ro_hits, results): for bgc in objects: for hit in ro_hits: - if bgc.bgc_id == hit.bgc.bgc_id: + if bgc.id == hit.bgc.id: if not self.bgc_to_gcf: # can use the BGC directly results = self._insert_result_gen(results, bgc, hit) diff --git a/src/nplinker/scoring/utils.py b/src/nplinker/scoring/utils.py index b3914ae7..a93ed24f 100644 --- 
a/src/nplinker/scoring/utils.py +++ b/src/nplinker/scoring/utils.py @@ -21,19 +21,19 @@ def get_presence_gcf_strain(gcfs: Sequence[GCF], strains: StrainCollection) -> p """Get the occurence of strains in gcfs. The occurence is a DataFrame with gcfs as rows and strains as columns, - where index is `gcf.gcf_id` and column name is `strain.id`. The values + where index is `gcf.id` and column name is `strain.id`. The values are 1 if the gcf contains the strain and 0 otherwise. """ df_gcf_strain = pd.DataFrame( np.zeros((len(gcfs), len(strains))), - index=[gcf.gcf_id for gcf in gcfs], + index=[gcf.id for gcf in gcfs], columns=[strain.id for strain in strains], dtype=int, ) for gcf in gcfs: for strain in strains: if gcf.has_strain(strain): - df_gcf_strain.loc[gcf.gcf_id, strain.id] = 1 + df_gcf_strain.loc[gcf.id, strain.id] = 1 return df_gcf_strain @@ -43,19 +43,19 @@ def get_presence_spec_strain( """Get the occurence of strains in spectra. The occurence is a DataFrame with spectra as rows and strains as columns, - where index is `spectrum.spectrum_id` and column name is `strain.id`. + where index is `spectrum.id` and column name is `strain.id`. The values are 1 if the spectrum contains the strain and 0 otherwise. """ df_spec_strain = pd.DataFrame( np.zeros((len(spectra), len(strains))), - index=[spectrum.spectrum_id for spectrum in spectra], + index=[spectrum.id for spectrum in spectra], columns=[strain.id for strain in strains], dtype=int, ) for spectrum in spectra: for strain in strains: if spectrum.has_strain(strain): - df_spec_strain.loc[spectrum.spectrum_id, strain.id] = 1 + df_spec_strain.loc[spectrum.id, strain.id] = 1 return df_spec_strain @@ -65,18 +65,18 @@ def get_presence_mf_strain( """Get the occurence of strains in molecular families. The occurence is a DataFrame with molecular families as rows and - strains as columns, where index is `mf.family_id` and column name is + strains as columns, where index is `mf.id` and column name is `strain.id`. 
The values are 1 if the molecular family contains the strain and 0 otherwise. """ df_mf_strain = pd.DataFrame( np.zeros((len(mfs), len(strains))), - index=[mf.family_id for mf in mfs], + index=[mf.id for mf in mfs], columns=[strain.id for strain in strains], dtype=int, ) for mf in mfs: for strain in strains: if mf.has_strain(strain): - df_mf_strain.loc[mf.family_id, strain.id] = 1 + df_mf_strain.loc[mf.id, strain.id] = 1 return df_mf_strain diff --git a/src/nplinker/strain/strain.py b/src/nplinker/strain/strain.py index 358ed371..20818f29 100644 --- a/src/nplinker/strain/strain.py +++ b/src/nplinker/strain/strain.py @@ -12,13 +12,13 @@ class Strain: id. """ - def __init__(self, primary_id: str) -> None: + def __init__(self, id: str) -> None: """To model the mapping between strain id and its aliases. Args: - primary_id: the representative id of the strain. + id: the representative id of the strain. """ - self.id: str = primary_id + self.id: str = id self._aliases: set[str] = set() def __repr__(self) -> str: diff --git a/tests/unit/genomics/test_antismash_loader.py b/tests/unit/genomics/test_antismash_loader.py index 2cee2b13..64f205aa 100644 --- a/tests/unit/genomics/test_antismash_loader.py +++ b/tests/unit/genomics/test_antismash_loader.py @@ -62,7 +62,7 @@ def test_parse_bgc_genbank(): gbk_file = str(DATA_DIR / "antismash" / "GCF_000514515.1" / "NZ_AZWB01000005.region001.gbk") bgc = parse_bgc_genbank(gbk_file) assert isinstance(bgc, BGC) - assert bgc.bgc_id == "NZ_AZWB01000005.region001" + assert bgc.id == "NZ_AZWB01000005.region001" assert bgc.product_prediction == ("NRPS", "lanthipeptide") assert "Salinispora pacifica CNT029 B170DRAFT_scaffold" in bgc.description assert bgc.antismash_id == "NZ_AZWB01000005" diff --git a/tests/unit/genomics/test_bgc.py b/tests/unit/genomics/test_bgc.py index 9c0e6932..1cf3f401 100644 --- a/tests/unit/genomics/test_bgc.py +++ b/tests/unit/genomics/test_bgc.py @@ -5,7 +5,7 @@ def test_default(): bgc = BGC("BGC0000001", 
"Polyketide") - assert bgc.bgc_id == "BGC0000001" + assert bgc.id == "BGC0000001" assert bgc.product_prediction == ("Polyketide",) assert bgc.is_mibig() is True assert bgc.parents == set() diff --git a/tests/unit/genomics/test_gcf.py b/tests/unit/genomics/test_gcf.py index ded23e27..cd694472 100644 --- a/tests/unit/genomics/test_gcf.py +++ b/tests/unit/genomics/test_gcf.py @@ -23,7 +23,7 @@ def bgc_without_strain(): def test_init(): """Test the initialization of GCF.""" gcf = GCF("1") - assert gcf.gcf_id == "1" + assert gcf.id == "1" assert gcf.bgcs == set() assert isinstance(gcf.strains, StrainCollection) assert len(gcf.strains) == 0 @@ -77,7 +77,7 @@ def test_add_bgc_wo_strain(bgc_without_strain, caplog): """Test add_bgc method with a BGC that does have strain.""" gcf = GCF("1") gcf.add_bgc(bgc_without_strain) - assert gcf.gcf_id == "1" + assert gcf.id == "1" assert gcf.bgcs == {bgc_without_strain} assert len(gcf.strains) == 0 assert "No strain specified for the BGC" in caplog.text diff --git a/tests/unit/genomics/test_mibig_loader.py b/tests/unit/genomics/test_mibig_loader.py index 0cb59103..1188971b 100644 --- a/tests/unit/genomics/test_mibig_loader.py +++ b/tests/unit/genomics/test_mibig_loader.py @@ -67,5 +67,5 @@ def test_parse_bgc_metadata_json(): json_file = DATA_DIR / "mibig" / "BGC0000001_v3.1.json" bgc = parse_bgc_metadata_json(str(json_file)) assert isinstance(bgc, BGC) - assert bgc.bgc_id == "BGC0000001" + assert bgc.id == "BGC0000001" assert bgc.mibig_bgc_class == ("Polyketide",) diff --git a/tests/unit/metabolomics/test_gnps_molecular_family_loader.py b/tests/unit/metabolomics/test_gnps_molecular_family_loader.py index 9f3ad4f3..99dd3f1a 100644 --- a/tests/unit/metabolomics/test_gnps_molecular_family_loader.py +++ b/tests/unit/metabolomics/test_gnps_molecular_family_loader.py @@ -22,5 +22,5 @@ def test_gnps_molecular_family_loader( actual = loader.get_mfs(keep_singleton=keep_singleton) assert len(actual) == num_families # test molecular family with 
id "1" has correct number of spectra ids - mf = [mf for mf in actual if mf.family_id == "1"][0] + mf = [mf for mf in actual if mf.id == "1"][0] assert len(mf.spectra_ids) == num_spectra diff --git a/tests/unit/metabolomics/test_molecular_family.py b/tests/unit/metabolomics/test_molecular_family.py index 7713e189..eb6fcd26 100644 --- a/tests/unit/metabolomics/test_molecular_family.py +++ b/tests/unit/metabolomics/test_molecular_family.py @@ -8,7 +8,7 @@ @pytest.fixture() def spectrum1(): """Return a Spectrum object.""" - spec = Spectrum(spectrum_id="spec001", mz=[1.0], intensity=[1.0], precursor_mz=100.0) + spec = Spectrum(id="spec001", mz=[1.0], intensity=[1.0], precursor_mz=100.0) spec.strains = StrainCollection() spec.strains.add(Strain("strain001")) yield spec @@ -17,7 +17,7 @@ def spectrum1(): @pytest.fixture() def spectrum2(): """Return a Spectrum object.""" - spec = Spectrum(spectrum_id="spec002", mz=[1.0], intensity=[1.0], precursor_mz=100.0) + spec = Spectrum(id="spec002", mz=[1.0], intensity=[1.0], precursor_mz=100.0) spec.strains = StrainCollection() spec.strains.add(Strain("strain002")) yield spec @@ -26,7 +26,7 @@ def spectrum2(): def test_init(): """Test MolecularFamily class initialization.""" mf = MolecularFamily("mf001") - assert mf.family_id == "mf001" + assert mf.id == "mf001" assert mf.spectra_ids == set() assert mf.spectra == set() assert mf.strains == StrainCollection() @@ -35,10 +35,7 @@ def test_init(): def test_str_repr(): """Test __str__ and __repr__ methods.""" mf = MolecularFamily("mf001") - assert ( - str(mf) - == "MolecularFamily(family_id=mf001, #Spectrum_objects=0, #spectrum_ids=0, #strains=0)" - ) + assert str(mf) == "MolecularFamily(id=mf001, #Spectrum_objects=0, #spectrum_ids=0, #strains=0)" assert repr(mf) == str(mf) @@ -80,11 +77,11 @@ def test_add_spectrum(spectrum1, spectrum2): mf = MolecularFamily("mf001") mf.add_spectrum(spectrum1) assert spectrum1 in mf.spectra - assert spectrum1.spectrum_id in mf.spectra_ids + assert 
spectrum1.id in mf.spectra_ids assert Strain("strain001") in mf.strains mf.add_spectrum(spectrum2) assert spectrum2 in mf.spectra - assert spectrum2.spectrum_id in mf.spectra_ids + assert spectrum2.id in mf.spectra_ids assert Strain("strain002") in mf.strains assert len(mf.spectra) == 2 assert len(mf.spectra_ids) == 2 @@ -98,12 +95,12 @@ def test_detach_spectrum(spectrum1, spectrum2): mf.add_spectrum(spectrum2) mf.detach_spectrum(spectrum1) assert spectrum1 not in mf.spectra - assert spectrum1.spectrum_id not in mf.spectra_ids + assert spectrum1.id not in mf.spectra_ids assert Strain("strain001") not in mf.strains assert Strain("strain002") in mf.strains mf.detach_spectrum(spectrum2) assert spectrum2 not in mf.spectra - assert spectrum2.spectrum_id not in mf.spectra_ids + assert spectrum2.id not in mf.spectra_ids assert Strain("strain002") not in mf.strains assert len(mf.spectra) == 0 assert len(mf.spectra_ids) == 0 diff --git a/tests/unit/metabolomics/test_spectrum.py b/tests/unit/metabolomics/test_spectrum.py index 3c4d22ed..e984eaba 100644 --- a/tests/unit/metabolomics/test_spectrum.py +++ b/tests/unit/metabolomics/test_spectrum.py @@ -16,7 +16,7 @@ def test_init(rt, metadata, expected_metadata): """Test the initialization of the Spectrum class.""" spec = Spectrum("spec1", [100, 200], [0.1, 0.2], 150, rt, metadata) - assert spec.spectrum_id == "spec1" + assert spec.id == "spec1" assert spec.mz == [100, 200] assert spec.intensity == [0.1, 0.2] assert spec.precursor_mz == 150 @@ -33,8 +33,8 @@ def test_init(rt, metadata, expected_metadata): def test_str_repr(): """Test the __str__ and __repr__ methods.""" spec = Spectrum("spec1", [100, 200], [0.1, 0.2], 150) - assert str(spec) == "Spectrum(spectrum_id=spec1, #strains=0)" - assert repr(spec) == "Spectrum(spectrum_id=spec1, #strains=0)" + assert str(spec) == "Spectrum(id=spec1, #strains=0)" + assert repr(spec) == "Spectrum(id=spec1, #strains=0)" def test_eq(): diff --git a/tests/unit/scoring/conftest.py 
b/tests/unit/scoring/conftest.py index bea4edb3..cb750407 100644 --- a/tests/unit/scoring/conftest.py +++ b/tests/unit/scoring/conftest.py @@ -77,9 +77,9 @@ def npl(gcfs, spectra, mfs, strains, tmp_path) -> NPLinker: npl._spectra = spectra npl._molfams = mfs npl._strains = strains - npl._gcf_lookup = {gcf.gcf_id: gcf for gcf in gcfs} - npl._mf_lookup = {mf.family_id: mf for mf in mfs} - npl._spec_lookup = {spec.spectrum_id: spec for spec in spectra} + npl._gcf_lookup = {gcf.id: gcf for gcf in gcfs} + npl._mf_lookup = {mf.id: mf for mf in mfs} + npl._spec_lookup = {spec.id: spec for spec in spectra} return npl diff --git a/tests/unit/scoring/test_nplinker_scoring.py b/tests/unit/scoring/test_nplinker_scoring.py index a76c2231..3b65f911 100644 --- a/tests/unit/scoring/test_nplinker_scoring.py +++ b/tests/unit/scoring/test_nplinker_scoring.py @@ -16,7 +16,7 @@ def test_get_links_gcf_standardised_false(npl, mc, gcfs, spectra, mfs, strains_l assert isinstance(links, LinkCollection) links = links.links # dict of link values assert len(links) == 3 - assert {i.gcf_id for i in links.keys()} == {"gcf1", "gcf2", "gcf3"} + assert {i.id for i in links.keys()} == {"gcf1", "gcf2", "gcf3"} assert isinstance(links[gcfs[0]][spectra[0]], ObjectLink) assert links[gcfs[0]][spectra[0]].data(mc) == 12 assert links[gcfs[1]][spectra[0]].data(mc) == -9 @@ -30,7 +30,7 @@ def test_get_links_gcf_standardised_false(npl, mc, gcfs, spectra, mfs, strains_l links = npl.get_links(list(gcfs), mc, and_mode=True) assert isinstance(links, LinkCollection) links = links.links - assert {i.gcf_id for i in links.keys()} == {"gcf1", "gcf2", "gcf3"} + assert {i.id for i in links.keys()} == {"gcf1", "gcf2", "gcf3"} assert isinstance(links[gcfs[0]][spectra[0]], ObjectLink) # test scores assert links[gcfs[0]][spectra[0]].data(mc) == 12 @@ -57,7 +57,7 @@ def test_get_links_spec_standardised_false(npl, mc, gcfs, spectra, strains_list) assert isinstance(links, LinkCollection) links = links.links # dict of link 
values assert len(links) == 3 - assert {i.spectrum_id for i in links.keys()} == {"spectrum1", "spectrum2", "spectrum3"} + assert {i.id for i in links.keys()} == {"spectrum1", "spectrum2", "spectrum3"} assert isinstance(links[spectra[0]][gcfs[0]], ObjectLink) assert links[spectra[0]][gcfs[0]].data(mc) == 12 assert links[spectra[0]][gcfs[1]].data(mc) == -9 @@ -68,7 +68,7 @@ def test_get_links_spec_standardised_false(npl, mc, gcfs, spectra, strains_list) assert isinstance(links, LinkCollection) links = links.links # dict of link values assert len(links) == 3 - assert {i.spectrum_id for i in links.keys()} == {"spectrum1", "spectrum2", "spectrum3"} + assert {i.id for i in links.keys()} == {"spectrum1", "spectrum2", "spectrum3"} assert isinstance(links[spectra[0]][gcfs[0]], ObjectLink) assert links[spectra[0]][gcfs[0]].data(mc) == 12 assert links[spectra[0]].get(gcfs[1]) is None @@ -91,7 +91,7 @@ def test_get_links_mf_standardised_false(npl, mc, gcfs, mfs, strains_list): assert isinstance(links, LinkCollection) links = links.links assert len(links) == 3 - assert {i.family_id for i in links.keys()} == {"mf1", "mf2", "mf3"} + assert {i.id for i in links.keys()} == {"mf1", "mf2", "mf3"} assert isinstance(links[mfs[0]][gcfs[0]], ObjectLink) assert links[mfs[0]][gcfs[0]].data(mc) == 12 assert links[mfs[0]][gcfs[1]].data(mc) == -9 @@ -102,7 +102,7 @@ def test_get_links_mf_standardised_false(npl, mc, gcfs, mfs, strains_list): assert isinstance(links, LinkCollection) links = links.links assert len(links) == 3 - assert {i.family_id for i in links.keys()} == {"mf1", "mf2", "mf3"} + assert {i.id for i in links.keys()} == {"mf1", "mf2", "mf3"} assert isinstance(links[mfs[0]][gcfs[0]], ObjectLink) assert links[mfs[0]][gcfs[0]].data(mc) == 12 assert links[mfs[0]].get(gcfs[1]) is None