diff --git a/src/nplinker/annotations.py b/src/nplinker/annotations.py index 970576a7..6449b6d2 100644 --- a/src/nplinker/annotations.py +++ b/src/nplinker/annotations.py @@ -14,10 +14,9 @@ import csv import os - from deprecated import deprecated - -from nplinker.metabolomics.spectrum import Spectrum, GNPS_KEY +from nplinker.metabolomics.spectrum import GNPS_KEY +from nplinker.metabolomics.spectrum import Spectrum from .logconfig import LogConfig @@ -61,14 +60,14 @@ def create_gnps_annotation(spec: Spectrum, gnps_anno: dict): @deprecated(version="1.3.3", reason="Use GNPSAnnotationLoader class instead.") -def load_annotations(root: str | os.PathLike, config: str | os.PathLike, spectra: list[Spectrum], spec_dict: dict[int, Spectrum]) -> list[Spectrum]: +def load_annotations(root: str | os.PathLike, config: str | os.PathLike, spectra: list[Spectrum], spec_dict: dict[str, Spectrum]) -> list[Spectrum]: """Load the annotations from the GNPS annotation file present in root to the spectra. Args: root(str | os.PathLike): Path to the downloaded and extracted GNPS results. config(str | os.PathLike): Path to config file for custom file locations. spectra(list[Spectrum]): List of spectra to annotate. - spec_dict(dict[int, Spectrum]): Dictionary mapping to spectra passed in `spectra` variable. + spec_dict(dict[str, Spectrum]): Dictionary mapping to spectra passed in `spectra` variable. Raises: Exception: Raises exception if custom annotation config file has invalid content. @@ -76,7 +75,7 @@ def load_annotations(root: str | os.PathLike, config: str | os.PathLike, spectra Returns: list[Spectrum]: List of annotated spectra. 
""" - + if not os.path.exists(root): logger.debug(f'Annotation directory not found ({root})') return spectra @@ -89,7 +88,7 @@ def load_annotations(root: str | os.PathLike, config: str | os.PathLike, spectra logger.debug('Found {} annotations .tsv files in {}'.format( len(annotation_files), root)) - + for af in annotation_files: with open(af) as f: rdr = csv.reader(f, delimiter='\t') @@ -105,7 +104,7 @@ def load_annotations(root: str | os.PathLike, config: str | os.PathLike, spectra # each line should be a different spec ID here for line in rdr: # read the scan ID column and get the corresponding Spectrum object - scan_id = int(line[scans_index]) + scan_id = line[scans_index] if scan_id not in spec_dict: logger.warning( 'Unknown spectrum ID found in GNPS annotation file (ID={})' @@ -147,7 +146,7 @@ def load_annotations(root: str | os.PathLike, config: str | os.PathLike, spectra # note that might have multiple lines for the same spec ID! spec_annotations = {} for line in rdr: - scan_id = int(line[spec_id_index]) + scan_id = line[spec_id_index] if scan_id not in spec_dict: logger.warning( 'Unknown spectrum ID found in annotation file "{}", ID is "{}"' diff --git a/src/nplinker/class_info/chem_classes.py b/src/nplinker/class_info/chem_classes.py index 8aeb0190..eb0e0c99 100644 --- a/src/nplinker/class_info/chem_classes.py +++ b/src/nplinker/class_info/chem_classes.py @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +from collections import Counter import glob import os -from collections import Counter from canopus import Canopus from canopus.classifications_to_gnps import analyse_canopus from ..logconfig import LogConfig @@ -399,17 +399,17 @@ class prediction for a level. 
When no class is present, instead of Tuple it will molfam_classes = {} for molfam in molfams: - fid = str(molfam.family_id) # the key + fid = molfam.family_id # the key spectra = molfam.spectra - # if singleton family, format like '-1_spectrum-id' - if fid == '-1': + # if singleton family, format like 'fid_spectrum-id' + if fid.startswith('singleton-'): spec_id = spectra[0].spectrum_id fid += f'_{spec_id}' len_molfam = len(spectra) classes_per_spectra = [] for spec in spectra: - spec_classes = self.spectra_classes.get(str(spec.spectrum_id)) + spec_classes = self.spectra_classes.get(spec.spectrum_id) if spec_classes: # account for spectra without prediction classes_per_spectra.append(spec_classes) @@ -555,6 +555,7 @@ def _read_cf_classes(self, mne_dir): nr_nodes = line.pop(0) # todo: make it easier to query classes of singleton families # if singleton family, format like '-1_spectrum-id' like canopus results + # CG: Note that the singleton families id is "singleton-" + uuid. if nr_nodes == '1': component = f'-1_{cluster}' class_info = [] diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py index 7eb7921a..668d159c 100644 --- a/src/nplinker/genomics/bgc.py +++ b/src/nplinker/genomics/bgc.py @@ -4,6 +4,7 @@ from nplinker.logconfig import LogConfig from .aa_pred import predict_aa + if TYPE_CHECKING: from ..strains import Strain from .gcf import GCF @@ -121,7 +122,7 @@ def strain(self, strain: Strain) -> None: self._strain = strain @property - def bigscape_classes(self) -> set[str]: + def bigscape_classes(self) -> set[str | None]: """Get BiG-SCAPE's BGC classes. 
BiG-SCAPE's BGC classes are similar to those defined in MiBIG but have diff --git a/src/nplinker/genomics/gcf.py b/src/nplinker/genomics/gcf.py index 5ea61486..1bf413f1 100644 --- a/src/nplinker/genomics/gcf.py +++ b/src/nplinker/genomics/gcf.py @@ -35,7 +35,6 @@ def __init__(self, gcf_id: str, /) -> None: self.bigscape_class: str | None = None # CG TODO: remove attribute id, see issue 103 # https://github.com/NPLinker/nplinker/issues/103 - self.id: int | None = None self.bgc_ids: set[str] = set() self.strains: StrainCollection = StrainCollection() diff --git a/src/nplinker/genomics/genomics.py b/src/nplinker/genomics/genomics.py index c4e694b0..2ed1ab62 100644 --- a/src/nplinker/genomics/genomics.py +++ b/src/nplinker/genomics/genomics.py @@ -245,7 +245,8 @@ def _filter_gcfs( for bgc in bgcs_to_remove: bgcs.remove(bgc) - strains.remove(bgc.strain) + if bgc.strain is not None: + strains.remove(bgc.strain) logger.info( 'Remove GCFs that has only MIBiG BGCs: removing {} GCFs and {} BGCs'. diff --git a/src/nplinker/loader.py b/src/nplinker/loader.py index c753fe24..b25952c4 100644 --- a/src/nplinker/loader.py +++ b/src/nplinker/loader.py @@ -659,12 +659,6 @@ def _load_genomics(self): antismash_bgc_loader.get_files(), self._bigscape_cutoff) - # CG TODO: remove the gcf.id, see issue 103 - # https://github.com/NPLinker/nplinker/issues/103 - # This is only place to set gcf.id value. 
- for i, gcf in enumerate(self.gcfs): - gcf.id = i - #---------------------------------------------------------------------- # CG: write unknown strains in genomics to file #---------------------------------------------------------------------- @@ -680,6 +674,7 @@ def _load_genomics(self): return True + # TODO CG: replace deprecated load_dataset with GPNSLoader def _load_metabolomics(self): spec_dict, self.spectra, self.molfams, unknown_strains = load_dataset( self.strains, diff --git a/src/nplinker/metabolomics/gnps/gnps_annotation_loader.py b/src/nplinker/metabolomics/gnps/gnps_annotation_loader.py index 13cb5028..aade4de2 100644 --- a/src/nplinker/metabolomics/gnps/gnps_annotation_loader.py +++ b/src/nplinker/metabolomics/gnps/gnps_annotation_loader.py @@ -2,9 +2,9 @@ from os import PathLike from pathlib import Path from typing import Any - from nplinker.metabolomics.abc import AnnotationLoaderBase + GNPS_URL_FORMAT = 'https://metabolomics-usi.ucsd.edu/{}/?usi=mzspec:GNPSLIBRARY:{}' class GNPSAnnotationLoader(AnnotationLoaderBase): @@ -15,28 +15,28 @@ def __init__(self, file: str | PathLike): file(str | PathLike): The GNPS annotation file. """ self._file = Path(file) - self._annotations : dict[int, dict] = dict() + self._annotations : dict[str, dict] = {} with open(self._file, mode='rt', encoding='UTF-8') as f: header = f.readline().split('\t') dict_reader = csv.DictReader(f, header, delimiter='\t') for row in dict_reader: - scan_id = int(row.pop('#Scan#')) + scan_id = row.pop('#Scan#') self._annotations[scan_id] = row - + # also insert useful URLs for t in ['png', 'json', 'svg', 'spectrum']: self._annotations[scan_id][f'{t}_url'] = GNPS_URL_FORMAT.format(t, row['SpectrumID']) - - def get_annotations(self) -> dict[int, dict]: + + def get_annotations(self) -> dict[str, dict]: """Get annotations. Returns: - dict[int, dict]: Spectra indices are keys and values are the annotations for this spectrum. 
+ dict[str, dict]: Spectra indices are keys and values are the annotations for this spectrum. Examples: >>> print(loader.annotations()[100]) """ - return self._annotations \ No newline at end of file + return self._annotations diff --git a/src/nplinker/metabolomics/gnps/gnps_molecular_family_loader.py b/src/nplinker/metabolomics/gnps/gnps_molecular_family_loader.py index 47783e15..acbe5547 100644 --- a/src/nplinker/metabolomics/gnps/gnps_molecular_family_loader.py +++ b/src/nplinker/metabolomics/gnps/gnps_molecular_family_loader.py @@ -17,44 +17,44 @@ def __init__(self, file: str | PathLike): file(str | PathLike): str or PathLike object pointing towards the GNPS molecular families file to load. """ self._families: list[MolecularFamily | SingletonFamily] = [] - + for family_id, spectra_ids in _load_molecular_families(file).items(): - if family_id == -1: + if family_id == '-1': # the "-1" is from GNPS result for spectrum_id in spectra_ids: - family = SingletonFamily() + family = SingletonFamily() ## uuid as family id family.spectra_ids = set([spectrum_id]) self._families.append(family) else: family = MolecularFamily(family_id) family.spectra_ids = spectra_ids self._families.append(family) - + def families(self) -> list[MolecularFamily]: return self._families -def _load_molecular_families(file: str | PathLike) -> dict[int, set[int]]: +def _load_molecular_families(file: str | PathLike) -> dict[str, set[str]]: """Load ids of molecular families and corresponding spectra from GNPS output file. Args: file(str | PathLike): path to the GNPS file to load molecular families. Returns: - dict[int, set[int]]: Mapping from molecular family/cluster id to the spectra ids. + dict[str, set[str]]: Mapping from molecular family/cluster id to the spectra ids. 
""" logger.debug('loading edges file: %s', file) families: dict = {} - + with open(file, mode='rt', encoding='utf-8') as f: reader = csv.reader(f, delimiter='\t') headers = next(reader) cid1_index, cid2_index, fam_index = _sniff_column_indices(file, headers) for line in reader: - spec1_id = int(line[cid1_index]) - spec2_id = int(line[cid2_index]) - family_id = int(line[fam_index]) + spec1_id = line[cid1_index] + spec2_id = line[cid2_index] + family_id = line[fam_index] if families.get(family_id) is None: families[family_id] = set([spec1_id, spec2_id]) @@ -84,5 +84,5 @@ def _sniff_column_indices(file: str | PathLike, headers: list[str]) -> tuple[int except ValueError as ve: message = f'Unknown or missing column(s) in edges file: {file}' raise Exception(message) from ve - + return cid1_index,cid2_index,fam_index diff --git a/src/nplinker/metabolomics/gnps/gnps_spectrum_loader.py b/src/nplinker/metabolomics/gnps/gnps_spectrum_loader.py index 59acd06f..0a1431cf 100644 --- a/src/nplinker/metabolomics/gnps/gnps_spectrum_loader.py +++ b/src/nplinker/metabolomics/gnps/gnps_spectrum_loader.py @@ -18,7 +18,7 @@ def __init__(self, file: str | PathLike): ms1, ms2, metadata = LoadMGF(name_field='scans').load_spectra([str(file)]) logger.info('%d molecules parsed from MGF file', len(ms1)) self._spectra = _mols_to_spectra(ms2, metadata) - + def spectra(self) -> list[Spectrum]: """Get the spectra loaded from the file. @@ -26,7 +26,7 @@ def spectra(self) -> list[Spectrum]: list[Spectrum]: the loaded spectra as a list of `Spectrum` objects. """ return self._spectra - + def _mols_to_spectra(ms2: list, metadata: dict[str, dict[str, str]]) -> list[Spectrum]: """Function to convert ms2 object and metadata to `Spectrum` objects. @@ -39,14 +39,16 @@ def _mols_to_spectra(ms2: list, metadata: dict[str, dict[str, str]]) -> list[Spe list[Spectrum]: List of mass spectra obtained from ms2 and metadata. 
""" ms2_dict = {} + # an example of m: + # (118.487999, 0.0, 18.753, , 'spectra.mgf', 0.0) for m in ms2: - if not m[3] in ms2_dict: + if not m[3] in ms2_dict: # m[3] is `nplinker.parsers.mgf.MS1` object ms2_dict[m[3]] = [] ms2_dict[m[3]].append((m[0], m[2])) spectra = [] - for i, m in enumerate(ms2_dict.keys()): - new_spectrum = Spectrum(i, ms2_dict[m], int(m.name), + for i, m in enumerate(ms2_dict.keys()): # m is `nplinker.parsers.mgf.MS1` object + new_spectrum = Spectrum(i, ms2_dict[m], m.name, metadata[m.name]['precursormass'], metadata[m.name]['parentmass']) new_spectrum.metadata = metadata[m.name] diff --git a/src/nplinker/metabolomics/load_gnps.py b/src/nplinker/metabolomics/load_gnps.py index 0266df56..14a5ec2c 100644 --- a/src/nplinker/metabolomics/load_gnps.py +++ b/src/nplinker/metabolomics/load_gnps.py @@ -4,10 +4,11 @@ from typing import Any from deprecated import deprecated from nplinker.logconfig import LogConfig +from nplinker.metabolomics.gnps.gnps_format import gnps_format_from_file_mapping +from nplinker.metabolomics.gnps.gnps_format import GNPSFormat from nplinker.metabolomics.spectrum import Spectrum from nplinker.strain_collection import StrainCollection from nplinker.strains import Strain -from nplinker.metabolomics.gnps.gnps_format import gnps_format_from_file_mapping ,GNPSFormat logger = LogConfig.getLogger(__name__) @@ -102,7 +103,7 @@ def _parse_mzxml_header(hdr: str, strains: StrainCollection, md_table: dict[str, the strain_mappings file. finally it also tries to extract the growth medium label as given in the metadata_table file, again using the strain label which should match between the two files. 
- >>> + >>> """ @@ -160,14 +161,14 @@ def _parse_mzxml_header(hdr: str, strains: StrainCollection, md_table: dict[str, return (strain_name, growth_medium, strain_name not in strains) -def _load_clusterinfo_old(gnps_format: str, strains: StrainCollection, file: str, spec_dict: dict[int, Spectrum]) -> dict[str, int]: +def _load_clusterinfo_old(gnps_format: str, strains: StrainCollection, file: str, spec_dict: dict[str, Spectrum]) -> dict[str, int]: """ Load info about clusters from old style GNPS files. Args: gnps_format(str): Identifier for the GNPS format of the file. Has to be one of [GNPS_FORMAT_OLD_ALLFILES, GNPS_FORMAT_OLD_UNIQUEFILES] strains(StrainCollection): StrainCollection in which to search for the detected strains. file(str): Path to file from which to load the cluster information. - spec_dict(dict[int, Spectrum]): Dictionary with already loaded spectra into which the metadata read from the file will be inserted. + spec_dict(dict[str, Spectrum]): Dictionary with already loaded spectra into which the metadata read from the file will be inserted. Raises: Exception: Raises exception if not supported GNPS format was detected. 
@@ -194,7 +195,7 @@ def _load_clusterinfo_old(gnps_format: str, strains: StrainCollection, file: str for line in reader: # get the values of the important columns - clu_index = int(line[clu_index_index]) + clu_index = line[clu_index_index] if gnps_format == GNPSFormat.UniqueFiles: mzxmls = line[mzxml_index].split('|') else: @@ -314,7 +315,7 @@ def _parse_metadata_table(file: str) -> dict[str, dict[str, str|None]]: def _load_clusterinfo_fbmn(strains: StrainCollection, nodes_file: str, extra_nodes_file: str, - md_table_file: str, spec_dict: dict[int, Spectrum], ext_metadata_parsing: bool) -> tuple[dict[int, dict[str, str|None]], dict[str, int]]: + md_table_file: str, spec_dict: dict[str, Spectrum], ext_metadata_parsing: bool) -> tuple[dict[str, dict[str, str|None]], dict[str, str]]: """Load the clusterinfo from a feature based molecular networking run output from GNPS. Args: @@ -322,11 +323,11 @@ def _load_clusterinfo_fbmn(strains: StrainCollection, nodes_file: str, extra_nod nodes_file(str): File from which to load the cluster information. extra_nodes_file(str): Unknown. md_table_file(str): Path to metadata table. Deprecated. - spec_dict(dict[int, Spectrum]): Dictionary with already loaded spectra. + spec_dict(dict[str, Spectrum]): Dictionary with already loaded spectra. ext_metadata_parsing(bool): Whether to use extended metadata parsing. Returns: - tuple[dict[int, dict], dict[str, int]]: Spectra info mapping from spectrum id to all columns in the nodes file and unknown strain mapping from file identifier to spectrum id. + tuple[dict[str, dict], dict[str, str]]: Spectra info mapping from spectrum id to all columns in the nodes file and unknown strain mapping from file identifier to spectrum id. 
""" spec_info = {} @@ -347,7 +348,7 @@ def _load_clusterinfo_fbmn(strains: StrainCollection, nodes_file: str, extra_nod tmp = {} for i, v in enumerate(line): tmp[headers[i]] = v - spec_info[int(line[ci_index])] = tmp + spec_info[line[ci_index]] = tmp with open(extra_nodes_file) as f: reader = csv.reader(f, delimiter=',') @@ -358,7 +359,7 @@ def _load_clusterinfo_fbmn(strains: StrainCollection, nodes_file: str, extra_nod # nodes_file to the "row ID" from this file, and update the per-row dict # with the extra columns from this file for line in reader: - ri = int(line[ri_index]) + ri = line[ri_index] tmp = {} for i, v in enumerate(line): tmp[headers[i]] = v @@ -437,7 +438,7 @@ def _load_clusterinfo_fbmn(strains: StrainCollection, nodes_file: str, extra_nod @deprecated(version="1.3.3", reason="Use the GNPSFileMappingLoader class instead.") def load_gnps(strains: StrainCollection, nodes_file: str, quant_table_file: str, metadata_table_file: str, - ext_metadata_parsing: bool, spec_dict: dict[int, Spectrum]) -> dict[str, int]: + ext_metadata_parsing: bool, spec_dict: dict[str, Spectrum]) -> dict[str, int]: """Wrapper function to load information from GNPS outputs. Args: @@ -446,13 +447,13 @@ def load_gnps(strains: StrainCollection, nodes_file: str, quant_table_file: str, quant_table_file(str): Path to the quantification table. metadata_table_file(str): Path to the metadata table. ext_metadata_parsing(bool): Whether to use extended metadata parsing. - spec_dict(dict[int, Spectrum]): Mapping from int to spectra loaded from file. + spec_dict(dict[str, Spectrum]): Mapping from int to spectra loaded from file. Raises: Exception: Raises exception if an unknown GNPS format is encountered. Returns: - dict[str, int]: Returns a mapping from unknown strains which are found to spectra ids which occur in these unknown strains. + dict[str, int]: Returns a mapping from unknown strains which are found to spectra ids which occur in these unknown strains. 
""" gnps_format = gnps_format_from_file_mapping(nodes_file, quant_table_file is not None) diff --git a/src/nplinker/metabolomics/metabolomics.py b/src/nplinker/metabolomics/metabolomics.py index 1c4cde5d..79b1c7d5 100644 --- a/src/nplinker/metabolomics/metabolomics.py +++ b/src/nplinker/metabolomics/metabolomics.py @@ -35,7 +35,7 @@ def _mols_to_spectra(ms2, metadata): spectra = [] for i, m in enumerate(ms2_dict.keys()): - new_spectrum = Spectrum(i, ms2_dict[m], int(m.name), + new_spectrum = Spectrum(i, ms2_dict[m], m.name, metadata[m.name]['precursormass'], metadata[m.name]['parentmass']) new_spectrum.metadata = metadata[m.name] @@ -53,15 +53,15 @@ def _mols_to_spectra(ms2, metadata): return spectra @deprecated(version="1.3.3", reason="Use the GNPSMolecularFamilyLoader class instead.") -def load_edges(edges_file: str, spec_dict: dict[int, Spectrum]): +def load_edges(edges_file: str | PathLike, spec_dict: dict[str, Spectrum]): """Insert information about the molecular family into the spectra. Args: edges_file(str): File containing the molecular families. - spec_dict(dict[int, Spectrum]): Dictionary with mapping from spectra_id to Spectrum. + spec_dict(dict[str, Spectrum]): Dictionary with mapping from spectra_id to Spectrum. Raises: - Exception: Raises exception if the edges file doesn't contain the correct columns. + Exception: Raises exception if the edges file doesn't contain the correct columns. 
""" logger.debug('loading edges file: {} [{} spectra from MGF]'.format( edges_file, len(spec_dict))) @@ -79,16 +79,16 @@ def load_edges(edges_file: str, spec_dict: dict[int, Spectrum]): edges_file)) for line in reader: - spec1_id = int(line[cid1_index]) - spec2_id = int(line[cid2_index]) + spec1_id = line[cid1_index] + spec2_id = line[cid2_index] cosine = float(line[cos_index]) - family = int(line[fam_index]) + family = line[fam_index] if spec1_id in spec_dict and spec2_id in spec_dict: spec1 = spec_dict[spec1_id] spec2 = spec_dict[spec2_id] - if family != -1: # singletons + if family != '-1': # singletons spec1.family_id = family spec2.family_id = family @@ -134,7 +134,7 @@ def load_dataset(strains, # spec_dict = {spec.spectrum_id: spec for spec in spectra} # add edges info to the spectra - molfams = make_families(spec_dict.values()) + molfams = make_families(list(spec_dict.values())) # molfams = GNPSMolecularFamilyLoader(edges_file).families() unknown_strains = load_gnps(strains, nodes_file, quant_table_file, @@ -145,7 +145,7 @@ def load_dataset(strains, @deprecated(version="1.3.3", reason="Use the GNPSSpectrumLoader class instead.") -def load_spectra(mgf_file: str | PathLike, edges_file: str | PathLike) -> dict[int, Spectrum]: +def load_spectra(mgf_file: str | PathLike, edges_file: str | PathLike) -> dict[str, Spectrum]: """Wrapper function to load spectra and init the molecular family links. Args: @@ -153,14 +153,14 @@ def load_spectra(mgf_file: str | PathLike, edges_file: str | PathLike) -> dict[i edges_file(str | PathLike): File storing the molecular family information in .selfloop or .pairsinfo format. Returns: - dict[int, Spectrum]: Indexed dict of mass spectra. + dict[str, Spectrum]: Indexed dict of mass spectra. 
""" ms1, ms2, metadata = LoadMGF(name_field='scans').load_spectra([str(mgf_file)]) logger.info('%d molecules parsed from MGF file', len(ms1)) spectra = _mols_to_spectra(ms2, metadata) # above returns a list, create a dict indexed by spectrum_id to make # the rest of the parsing a bit simpler - spec_dict: dict[int, Spectrum] = {spec.spectrum_id: spec for spec in spectra} + spec_dict: dict[str, Spectrum] = {spec.spectrum_id: spec for spec in spectra} load_edges(edges_file, spec_dict) return spec_dict @@ -181,7 +181,7 @@ def make_families(spectra: list[Spectrum]) -> list[MolecularFamily]: fams, singles = 0, 0 for spectrum in spectra: family_id = spectrum.family_id - if family_id == -1: # singleton + if family_id == '-1': # singleton new_family = SingletonFamily() new_family.id = family_index family_index += 1 diff --git a/src/nplinker/metabolomics/molecular_family.py b/src/nplinker/metabolomics/molecular_family.py index a55a4df9..c4e584c5 100644 --- a/src/nplinker/metabolomics/molecular_family.py +++ b/src/nplinker/metabolomics/molecular_family.py @@ -1,30 +1,23 @@ from typing_extensions import Self - from nplinker.metabolomics.spectrum import Spectrum from nplinker.strain_collection import StrainCollection from nplinker.strains import Strain + class MolecularFamily(): - def __init__(self, family_id: int): + def __init__(self, family_id: str): """Class to model molecular families. Args: - family_id(int): Id for the molecular family. + family_id(str): Id for the molecular family. """ self.id: int = -1 - self.family_id: int = family_id + self.family_id: str = family_id self.spectra: list[Spectrum] = [] - self.family = None - self.spectra_ids: set[int] = set() - - # def has_strain(self, strain): - # for spectrum in self.spectra: - # if spectrum.has_strain(strain): - # return True - - # return False + self.spectra_ids: set[str] = set() + # TODO: change property to attibute @property def strains(self) -> StrainCollection: """Get strains of spectra in the molecular family. 
@@ -38,6 +31,18 @@ def strains(self) -> StrainCollection: strains.add(strain) return strains + def has_strain(self, strain: str | Strain) -> bool: + """Check if the given strain exists. + + Args: + strain(str | Strain): strain id or `Strain` object. + + Returns: + bool: True when the given strain exist. + """ + return strain in self.strains + + # TODO: update the logics, mf should also be added to the spectrum object def add_spectrum(self, spectrum: Spectrum): """Add a spectrum to the spectra list. @@ -50,11 +55,10 @@ def __str__(self) -> str: return 'MolFam(family_id={}, spectra={})'.format( self.family_id, len(self.spectra)) - def __eq__(self, other: Self) -> bool: + def __eq__(self, other) -> bool: if isinstance(other, MolecularFamily): return (self.id == other.id - and self.family_id == other.family_id - and set(self.spectra) == set(other.spectra)) + and self.family_id == other.family_id) return NotImplemented def __hash__(self) -> int: diff --git a/src/nplinker/metabolomics/singleton_family.py b/src/nplinker/metabolomics/singleton_family.py index a742b807..94457049 100644 --- a/src/nplinker/metabolomics/singleton_family.py +++ b/src/nplinker/metabolomics/singleton_family.py @@ -1,10 +1,11 @@ +import uuid from .molecular_family import MolecularFamily + class SingletonFamily(MolecularFamily): def __init__(self): - super().__init__(-1) + super().__init__("singleton-" + str(uuid.uuid4())) def __str__(self): return f"Singleton molecular family (id={self.id})" - diff --git a/src/nplinker/metabolomics/spectrum.py b/src/nplinker/metabolomics/spectrum.py index 78a8e8ce..12663871 100644 --- a/src/nplinker/metabolomics/spectrum.py +++ b/src/nplinker/metabolomics/spectrum.py @@ -2,6 +2,7 @@ from nplinker.strain_collection import StrainCollection from nplinker.utils import sqrt_normalise + GNPS_KEY = 'gnps' JCAMP = '##TITLE={}\\n' +\ @@ -22,7 +23,7 @@ class Spectrum(): def __init__(self, id, peaks, - spectrum_id, + spectrum_id: str, precursor_mz, parent_mz=None, rt=None): 
@@ -34,8 +35,7 @@ def __init__(self, intensity for mz, intensity in self.peaks) self.total_ms2_intensity = sum( intensity for mz, intensity in self.peaks) - assert (isinstance(spectrum_id, int)) - self.spectrum_id = spectrum_id # == metadata.get('cluster_index') + self.spectrum_id = spectrum_id # MS1.name self.rt = rt self.precursor_mz = precursor_mz self.parent_mz = parent_mz @@ -47,7 +47,8 @@ def __init__(self, # this is a dict indexed by Strain objects (the strains found in this Spectrum), with # the values being dicts of the form {growth_medium: peak intensity} for the parent strain self.growth_media = {} - self.family_id = -1 + # TODO CG: self.family_id should be removed, used in deprecated make_families method + self.family_id = '-1' self.family = None # a dict indexed by filename, or "gnps" self.annotations = {} diff --git a/src/nplinker/nplinker.py b/src/nplinker/nplinker.py index 1019abe5..6e58981f 100644 --- a/src/nplinker/nplinker.py +++ b/src/nplinker/nplinker.py @@ -1,20 +1,8 @@ -# Copyright 2021 The NPLinker Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- +from __future__ import annotations import copy import logging import sys +from typing import TYPE_CHECKING from .config import Args from .config import Config from .genomics import BGC @@ -28,7 +16,12 @@ from .scoring.metcalf_scoring import MetcalfScoring from .scoring.np_class_scoring import NPClassScoring from .scoring.rosetta_scoring import RosettaScoring +from .strain_collection import StrainCollection + +if TYPE_CHECKING: + from collections.abc import Sequence + from .strains import Strain logger = LogConfig.getLogger(__name__) @@ -130,7 +123,9 @@ def __init__(self, userconfig=None): self._class_matches = None self._bgc_lookup = {} + self._gcf_lookup = {} self._spec_lookup = {} + self._mf_lookup = {} self._scoring_methods = {} config_methods = self._config.config.get('scoring_methods', []) @@ -260,6 +255,7 @@ def load_data(self, new_bigscape_cutoff=None, met_only=False): Returns: bool: True if successful, False otherwise """ + # TODO: the met_only is useless, remove it. NPlinker will stop working if met_only=True # typical case where load_data is being called with no params if new_bigscape_cutoff is None: logger.debug( @@ -272,8 +268,7 @@ def load_data(self, new_bigscape_cutoff=None, met_only=False): else: # CG: only reload genomics data when changing bigscape cutoff # TODO: this part should be removed, reload everything if bigscape data changes. - logger.debug( - f'load_data with new cutoff = {new_bigscape_cutoff}') + logger.debug(f'load_data with new cutoff = {new_bigscape_cutoff}') # 1. change the cutoff (which by itself doesn't do anything) self._loader._bigscape_cutoff = new_bigscape_cutoff # 2. 
reload the strain mappings (MiBIG filtering may have removed strains @@ -294,29 +289,23 @@ def load_data(self, new_bigscape_cutoff=None, met_only=False): self._class_matches = self._loader.class_matches logger.debug('Generating lookup tables: genomics') - self._bgc_lookup = {} - for i, bgc in enumerate(self._bgcs): - self._bgc_lookup[bgc.bgc_id] = i - - self._gcf_lookup = {} - for i, gcf in enumerate(self._gcfs): - self._gcf_lookup[gcf.gcf_id] = i + self._bgc_lookup = {bgc.bgc_id: bgc for bgc in self._bgcs} + self._gcf_lookup = {gcf.gcf_id: gcf for gcf in self._gcfs} # don't need to do these two if cutoff changed (indicating genomics data # was reloaded but not metabolomics) if new_bigscape_cutoff is None: logger.debug('Generating lookup tables: metabolomics') - self._spec_lookup = {} - for i, spec in enumerate(self._spectra): - self._spec_lookup[spec.spectrum_id] = i - - self._molfam_lookup = {} - for i, molfam in enumerate(self._molfams): - self._molfam_lookup[molfam.id] = i + self._spec_lookup = { + spec.spectrum_id: spec + for spec in self._spectra + } + self._mf_lookup = {mf.family_id: mf for mf in self._molfams} logger.debug('load_data: completed') return True + # TODO CG: refactor this method and update its unit tests def get_links(self, input_objects, scoring_methods, and_mode=True): """Find links for a set of input objects (BGCs/GCFs/Spectra/MolFams) @@ -407,8 +396,8 @@ def get_links(self, input_objects, scoring_methods, and_mode=True): objects_for_method = input_objects[i] logger.debug('Calling scoring method {} on {} objects'.format( method.name, len(objects_for_method))) - link_collection = method.get_links(objects_for_method, - link_collection) + link_collection = method.get_links(*objects_for_method, + link_collection=link_collection) if not self._datalinks: logger.debug('Creating internal datalinks object') @@ -432,20 +421,16 @@ def get_links(self, input_objects, scoring_methods, and_mode=True): targets = list( filter(lambda x: not isinstance(x, 
BGC), link_data.keys())) if len(targets) > 0: - shared_strains = self._datalinks.common_strains([source], - targets, True) - for objpair in shared_strains.keys(): - shared_strains[objpair] = [ - self._strains.lookup_index(x) - for x in shared_strains[objpair] - ] - if isinstance(source, GCF): + shared_strains = self._datalinks.get_common_strains( + targets, [source], True) for target, link in link_data.items(): if (target, source) in shared_strains: link.shared_strains = shared_strains[(target, source)] else: + shared_strains = self._datalinks.get_common_strains( + [source], targets, True) for target, link in link_data.items(): if (source, target) in shared_strains: link.shared_strains = shared_strains[(source, @@ -457,51 +442,32 @@ def get_links(self, input_objects, scoring_methods, and_mode=True): len(link_collection))) return link_collection - def get_common_strains(self, objects_a, objects_b, filter_no_shared=True): - """Retrive strains shared by arbitrary pairs of objects. + def get_common_strains( + self, + met: Sequence[Spectrum] | Sequence[MolecularFamily], + gcfs: Sequence[GCF], + filter_no_shared: bool = True + ) -> dict[tuple[Spectrum | MolecularFamily, GCF], list[Strain]]: + """Get common strains between given spectra/molecular families and GCFs. - Two lists of objects are required as input. Typically one list will be - MolecularFamily or Spectrum objects and the other GCF (which list is which - doesn't matter). - - The return value is a dict mapping pairs of objects to lists of Strain objects - shared by that pair. This list may be empty if ``filter_no_shared`` is False, - indicating no shared strains were found. - - If ``filter_no_shared`` is True, every entry in the dict with no shared strains - will be removed before it is returned, so the only entries will be those for - which shared strains exist. + Note that SingletonFamily objects are excluded from given molecular families. 
Args: - objects_a (list): a list of Spectrum/MolecularFamily/GCF objects - objects_b (list): a list of Spectrum/MolecularFamily/GCF objects - filter_no_shared (bool): if True, remove result entries for which no shared strains exist + met(Sequence[Spectrum] | Sequence[MolecularFamily]): + A list of Spectrum or MolecularFamily objects. + gcfs(Sequence[GCF]): A list of GCF objects. + filter_no_shared(bool): If True, the pairs of spectrum/mf and GCF + without common strains will be removed from the returned dict; Returns: - A dict mapping pairs of objects (obj1, obj2) to lists of Strain objects. - - NOTE: The ordering of the pairs is *fixed* to be (metabolomic, genomic). In - other words, if objects_a = [GCF1, GC2, ...] and objects_b = [Spectrum1, - Spectrum2, ...], the object pairs will be (Spectrum1, GCF1), (Spectrum2, - GCF2), and so on. The same applies if objects_a and objects_b are swapped, - the metabolomic objects (Spectrum or MolecularFamily) will be the obj1 - entry in each pair. + dict: A dict where the keys are tuples of (Spectrum/MolecularFamily, GCF) + and values are a list of shared Strain objects. """ if not self._datalinks: self._datalinks = self.scoring_method( MetcalfScoring.NAME).datalinks - - # this is a dict with structure: - # (Spectrum/MolecularFamily, GCF) => list of strain indices - common_strains = self._datalinks.common_strains( - objects_a, objects_b, filter_no_shared) - - # replace the lists of strain indices with actual strain objects - for objpair in common_strains.keys(): - common_strains[objpair] = [ - self._strains.lookup_index(x) for x in common_strains[objpair] - ] - + common_strains = self._datalinks.get_common_strains( + met, gcfs, filter_no_shared) return common_strains def has_bgc(self, bgc_id): @@ -510,15 +476,19 @@ def has_bgc(self, bgc_id): def lookup_bgc(self, bgc_id): """If BGC ``bgc_id`` exists, return it. 
Otherwise return None""" - return self._bgcs[self._bgc_lookup[bgc_id]] if self.has_bgc(bgc_id) else None + return self._bgc_lookup.get(bgc_id, None) def lookup_gcf(self, gcf_id): """If GCF ``gcf_id`` exists, return it. Otherwise return None""" - return self._gcfs[self._gcf_lookup[gcf_id]] if gcf_id in self._gcf_lookup else None + return self._gcf_lookup.get(gcf_id, None) - def lookup_spectrum(self, name): + def lookup_spectrum(self, spectrum_id): """If Spectrum ``name`` exists, return it. Otherwise return None""" - return self._spectra[self._spec_lookup[name]] if name in self._spec_lookup else None + return self._spec_lookup.get(spectrum_id, None) + + def lookup_mf(self, mf_id): + """If MolecularFamily `family_id` exists, return it. Otherwise return None""" + return self._mf_lookup.get(mf_id, None) @property def strains(self): @@ -596,55 +566,3 @@ def scoring_method(self, name): self._scoring_methods_setup_complete[name] = True return self._scoring_methods.get(name, None)(self) - - -if __name__ == "__main__": - # can set default logging configuration this way... 
- LogConfig.setLogLevel(logging.DEBUG) - - # initialise NPLinker from the command-line arguments - npl = NPLinker(Args().get_args()) - - # load the dataset - if not npl.load_data(): - print('Failed to load the dataset!') - sys.exit(-1) - - # create a metcalf scoring object - mc = npl.scoring_method('metcalf') - if mc is not None: - # set a scoring cutoff threshold - mc.cutoff = 0.5 - - # pick some GCFs to get links for - test_gcfs = npl.gcfs[:10] - - # tell nplinker to find links for this set of GCFs using metcalf - # scoring - results = npl.get_links(test_gcfs, mc) - - # check if any links were found - if len(results) == 0: - print('No links found!') - sys.exit(0) - - # the "result" object will be a LinkCollection, holding all the information - # returned by the scoring method(s) used - print(f'{len(results)} total links found') - - # display some information about each object and its links - for obj, result in results.links.items(): - print('Results for object: {}, {} total links, {} methods used'. - format(obj, len(result), results.method_count)) - - # get links for this object, sorted by metcalf score - sorted_links = results.get_sorted_links(mc, obj) - for link_data in sorted_links: - print(' --> [{}] {} | {} | # shared_strains = {}'.format( - ','.join(method.name for method in link_data.methods), - link_data.target, link_data[mc], - len(link_data.shared_strains))) - - rs = npl.scoring_method('rosetta') - if rs is not None: - print('OK') diff --git a/src/nplinker/process_output.py b/src/nplinker/process_output.py index 42420867..27f23929 100644 --- a/src/nplinker/process_output.py +++ b/src/nplinker/process_output.py @@ -51,7 +51,7 @@ def get_sig_spec(data_link, sig_links, scores, gcf_pos, min_n_strains=2): # Check if there are *any* strains in the GCF # No strains = MiBIG # Can also filter if only (e.g. 
2 strains) - strain_sum = data_link.M_gcf_strain[gcf_pos, :].sum() + strain_sum = data_link.occurrence_gcf_strain[gcf_pos, :].sum() if strain_sum < min_n_strains: return [] col = sig_links[:, gcf_pos] # get the column diff --git a/src/nplinker/scoring/__init__.py b/src/nplinker/scoring/__init__.py index e69de29b..0aaefeaf 100644 --- a/src/nplinker/scoring/__init__.py +++ b/src/nplinker/scoring/__init__.py @@ -0,0 +1,10 @@ +import logging +from .link_collection import LinkCollection +from .metcalf_scoring import MetcalfScoring +from .methods import ScoringMethod +from .object_link import ObjectLink + + +logging.getLogger(__name__).addHandler(logging.NullHandler()) + +__all__ = ["LinkCollection", "MetcalfScoring", "ScoringMethod", "ObjectLink"] diff --git a/src/nplinker/scoring/linking/__init__.py b/src/nplinker/scoring/linking/__init__.py new file mode 100644 index 00000000..f33d043b --- /dev/null +++ b/src/nplinker/scoring/linking/__init__.py @@ -0,0 +1,14 @@ +import logging +from .data_links import DataLinks +from .data_links import LINK_TYPES +from .link_finder import LinkFinder +from .utils import calc_correlation_matrix +from .utils import isinstance_all + + +logging.getLogger(__name__).addHandler(logging.NullHandler()) + +__all__ = [ + "DataLinks", "LINK_TYPES", "LinkFinder", "calc_correlation_matrix", + "isinstance_all" +] diff --git a/src/nplinker/scoring/linking/data_linking.py b/src/nplinker/scoring/linking/data_linking.py deleted file mode 100644 index a6f181f0..00000000 --- a/src/nplinker/scoring/linking/data_linking.py +++ /dev/null @@ -1,411 +0,0 @@ -# Copyright 2021 The NPLinker Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Methods to find correlations between spectra/molecular families and -# gene clusters/families (BGCs/GCFs) -# -# (still at very much protoype/exploration stage) -# -# Naming: -# M_* stands for a matrix format -# map_* stands for a simple mapping lookup table -# spec stands for spectrum -# fam stands for molecular family - -from collections import Counter -from typing import Sequence -# import packages -import numpy as np -import pandas as pd - -from nplinker.metabolomics.molecular_family import MolecularFamily -from nplinker.genomics.gcf import GCF -from nplinker.metabolomics.spectrum import Spectrum -from .data_linking_functions import calc_correlation_matrix - - -SCORING_METHODS = ['metcalf', 'likescore', 'hg'] - -from nplinker.logconfig import LogConfig - - -logger = LogConfig.getLogger(__name__) - - -class DataLinks(): - """ - DataLinks collects and structures co-occurence data - 1) Co-occurences of spectra, families, and GCFs with respect to strains - 2) Mappings: Lookup-tables that link different ids and categories - 3) Correlation matrices that show how often spectra/families and GCFs co-occur - """ - - def __init__(self): - # matrices that store co-occurences with respect to strains - # values = 1 where gcf/spec/fam occur in strain - # values = 0 where gcf/spec/fam do not occur in strain - self.M_gcf_strain = [] - self.M_spec_strain = [] - self.M_fam_strain = [] - - # mappings (lookup lists to map between different ids and categories - self.mapping_spec = pd.DataFrame() - self.mapping_gcf = pd.DataFrame() - self.mapping_fam = pd.DataFrame() # 
labels for strain-family matrix - self.mapping_strain = pd.DataFrame() - self.family_members = [] - - # correlation matrices for spectra <-> GCFs - self.M_spec_gcf = [ - ] # = int: Number of strains where spec_x and gcf_y co_occure - self.M_spec_notgcf = [ - ] # = int: Number of strains where spec_x and NOT-gcf_y co_occure - self.M_notspec_gcf = [ - ] # = int: Number of strains where NOT-spec_x and gcf_y co_occure - # and the same for mol.families <-> GCFs - self.M_fam_gcf = [] - self.M_fam_notgcf = [] - self.M_notfam_gcf = [] - - def get_spec_pos(self, spec_id): - # get the position in the arrays of a spectrum - row = self.mapping_spec.loc[self.mapping_spec['original spec-id'] == - float(spec_id)] - return int(row.iloc[0]['spec-id']) - - def get_gcf_pos(self, gcf_id): - # TODO: fix this so the original ID is present in case of re-ordering - pass - - def load_data(self, spectra, gcf_list, strain_list, molfams): - # load data from spectra, GCFs, and strains - logger.debug("Create mappings between spectra, gcfs, and strains.") - self.collect_mappings_spec(spectra) - # self.collect_mappings_spec_v2(molfams) - self.collect_mappings_gcf(gcf_list) - logger.debug( - "Create co-occurence matrices: spectra<->strains + and gcfs<->strains." 
- ) - self.matrix_strain_gcf(gcf_list, strain_list) - self.matrix_strain_spec(spectra, strain_list) - - def find_correlations(self, include_singletons=False): - # collect correlations/ co-occurences - logger.debug("Create correlation matrices: spectra<->gcfs.") - self.correlation_matrices(type='spec-gcf') - logger.debug("Create correlation matrices: mol-families<->gcfs.") - self.data_family_mapping(include_singletons=include_singletons) - self.correlation_matrices(type='fam-gcf') - - def collect_mappings_spec(self, obj: Sequence[Spectrum]|Sequence[MolecularFamily]): - if isinstance(obj[0], Spectrum): - mapping_spec = self._collect_mappings_from_spectra(obj) - elif isinstance(obj[0], MolecularFamily): - mapping_spec = self._collect_mappings_from_molecular_families(obj) - - # extend mapping tables: - self.mapping_spec["spec-id"] = mapping_spec[:, 0] - self.mapping_spec["original spec-id"] = mapping_spec[:, 1] - self.mapping_spec["fam-id"] = mapping_spec[:, 2] - - def _collect_mappings_from_spectra(self, spectra) -> np.ndarray[np.float64]: - # Collect most import mapping tables from input data - mapping_spec = np.zeros((len(spectra), 3)) - mapping_spec[:, 0] = np.arange(0, len(spectra)) - - for i, spectrum in enumerate(spectra): - mapping_spec[i, 1] = spectrum.id - mapping_spec[i, 2] = spectrum.family.family_id - - return mapping_spec - - def _collect_mappings_from_molecular_families(self, molfams: Sequence[MolecularFamily]) -> np.ndarray[np.float64]: - num_spectra = sum(len(x.spectra_ids) for x in molfams) - mapping_spec = np.zeros((num_spectra, 3)) - mapping_spec[:, 0] = np.arange(0, num_spectra) - - inverted_mappings = {} - for item in molfams: - for spectrum_id in item.spectra_ids: - inverted_mappings[spectrum_id] = item.family_id - - for i, key in enumerate(inverted_mappings): - mapping_spec[i, 1] = key - mapping_spec[i, 2] = inverted_mappings[key] - - return mapping_spec - - def collect_mappings_gcf(self, gcf_list): - """ - Find classes of gene cluster (nrps, 
pksi etc.) - collect most likely class (most occuring name, preferentially not "Others") - additional score shows fraction of chosen class among all given ones - """ - - # TODO: not only collect bigclass types but also product predictions - bigscape_bestguess = [] - for i, gcf in enumerate(gcf_list): - #bigscape_class = [] - #for i, bgc in enumerate(gcf_list[i].bgcs): - # # bigscape_class.append(gcf_list[i].bgc_list[m].bigscape_class) - # bigscape_class.append(bgc.bigscape_class) - # class_counter = Counter(bigscape_class) - - # TODO: this might need properly rewritten due to changes in the way GCF/BGC - # objects store bigscape class information and handle hybrid BGCs (see genomics.py). the - # original version i've left above iterates over every BGC in the current GCF - # and extracts its .bigscape_class attribute, but now each BGC can have multiple - # classes if it happens to be a hybrid and i'm not sure the version below - # still makes sense. - # - # doesn't seem to be very important for calculating the metcalf scores though so not urgent for now. 
- bigscape_class = [] - for bgc in gcf.bgcs: - bigscape_class.extend(bgc.bigscape_classes) - class_counter = Counter(bigscape_class) - - # try not to select "Others": - if class_counter.most_common(1)[0][0] is None: - bigscape_bestguess.append(["Others", 0]) - elif class_counter.most_common( - 1)[0][0] == "Others" and class_counter.most_common( - 1)[0][1] < len(bigscape_class): - if class_counter.most_common(2)[1][0] is None: - bigscape_bestguess.append([ - class_counter.most_common(1)[0][0], - class_counter.most_common(1)[0][1] / - len(bigscape_class) - ]) - else: - bigscape_bestguess.append([ - class_counter.most_common(2)[1][0], - class_counter.most_common(2)[1][1] / - len(bigscape_class) - ]) - else: - bigscape_bestguess.append([ - class_counter.most_common(1)[0][0], - class_counter.most_common(1)[0][1] / len(bigscape_class) - ]) - - # extend mapping tables: - self.mapping_gcf["gcf-id"] = np.arange(0, len(bigscape_bestguess)) - bigscape_guess, bigscape_guessscore = zip(*bigscape_bestguess) - self.mapping_gcf["bgc class"] = bigscape_guess - self.mapping_gcf["bgc class score"] = bigscape_guessscore - - def matrix_strain_gcf(self, gcf_list, strain_list): - # Collect co-ocurences in M_spec_strain matrix - M_gcf_strain = np.zeros((len(gcf_list), len(strain_list))) - - for i, strain in enumerate(strain_list): - for m, gcf in enumerate(gcf_list): - if gcf.has_strain(strain): - M_gcf_strain[m, i] = 1 - - self.M_gcf_strain = M_gcf_strain - # extend mapping tables: - self.mapping_gcf["no of strains"] = np.sum(self.M_gcf_strain, axis=1) - self.mapping_strain["no of gcfs"] = np.sum(self.M_gcf_strain, axis=0) - - def matrix_strain_spec(self, spectra, strain_list): - # Collect co-ocurences in M_strains_specs matrix - - M_spec_strain = np.zeros((len(spectra), len(strain_list))) - for i, spectrum in enumerate(spectra): - for j, s in enumerate(strain_list): - if spectrum.has_strain(s): - M_spec_strain[i, j] = 1 - self.M_spec_strain = M_spec_strain - - # extend mapping tables: - 
self.mapping_spec["no of strains"] = np.sum(self.M_spec_strain, axis=1) - self.mapping_strain["no of spectra"] = np.sum(self.M_spec_strain, - axis=0) - self.mapping_strain["strain name"] = [str(s) for s in strain_list] - - def data_family_mapping(self, include_singletons=False): - # Create M_fam_strain matrix that gives co-occurences between mol. families and strains - # matrix dimensions are: number of families x number of strains - - family_ids = np.unique( - self.mapping_spec["fam-id"]) # get unique family ids - - # if singletons are included, check if there are a non-zero number of them (singleton - # families all have -1 as a family ID number) - if include_singletons and np.where( - self.mapping_spec["fam-id"] == -1)[0].shape[0] > 0: - # in this case the number of unique families is the number of singletons - # plus the number of normal families. the "-1" is (I think) to account for - # the single "-1" entry that will be present in "family_ids". - num_of_singletons = np.where( - self.mapping_spec["fam-id"] == -1)[0].shape[0] - num_unique_fams = num_of_singletons + len(family_ids) - 1 - else: - # if no singletons included or present in the dataset, just take the number - # of regular molfams instead - num_of_singletons = 0 - num_unique_fams = len(family_ids) - - M_fam_strain = np.zeros((num_unique_fams, self.M_spec_strain.shape[1])) - strain_fam_labels = [] - strain_fam_index = [] - - if num_of_singletons > 0: # if singletons exist + included - M_fam_strain[( - num_unique_fams - - num_of_singletons):, :] = self.M_spec_strain[np.where( - self.mapping_spec["fam-id"][:, 0] == -1)[0], :] - - # go through families (except singletons) and collect member strain occurences - self.family_members = [] - for i, fam_id in enumerate( - family_ids[np.where(family_ids != -1)].astype(int)): - family_members = np.where( - np.array(self.mapping_spec["fam-id"]) == fam_id) - self.family_members.append(family_members) - M_fam_strain[i, :] = np.sum(self.M_spec_strain[family_members, 
:], - axis=1) - strain_fam_labels.append(fam_id) - strain_fam_index.append(i) - - add_singleton_entries = -1 in family_ids - # TODO: i think this breaks stuff below due to mismatches in the number of rows - # in the dataframes and matrices if there are no -1 family ids. - # discovered when trying to write some code to test scoring. is this ever - # likely to happen with a real dataset?? - if add_singleton_entries: - strain_fam_labels.append([-1] * num_of_singletons) - strain_fam_index.append(i + 1) - - # only looking for co-occurence, hence only 1 or 0 - M_fam_strain[M_fam_strain > 1] = 1 - - self.M_fam_strain = M_fam_strain - # extend mapping table: - self.mapping_fam["family id"] = strain_fam_index - self.mapping_fam["original family id"] = strain_fam_labels - self.mapping_fam["no of strains"] = np.sum(self.M_fam_strain, axis=1) - num_members = [x[0].shape[0] for x in self.family_members] - # see above - if add_singleton_entries: - num_members.append(num_of_singletons) - self.mapping_fam["no of members"] = num_members - return self.family_members - - def common_strains(self, objects_a, objects_b, filter_no_shared=False): - """ - Obtain the set of common strains between all pairs from the lists objects_a - and objects_b. - - The two parameters can be either lists or single instances of the 3 supported - object types (GCF, Spectrum, MolecularFamily). It's possible to use a single - object together with a list as well. - - Returns a dict indexed by tuples of (Spectrum/MolecularFamily, GCF), where - the values are lists of strain indices which appear in both objects, which - can then be looked up in NPLinker.strains. - """ - - # TODO make this work for BGCs too? 
- - is_list_a = isinstance(objects_a, list) - is_list_b = isinstance(objects_b, list) - - type_a = type(objects_a[0]) if is_list_a else type(objects_a) - type_b = type(objects_b[0]) if is_list_b else type(objects_b) - - if type_a == type_b: - raise Exception('Must supply objects with different types!') - - # to keep things slightly simpler, ensure the GCFs are always "b" - if type_a == GCF: - type_a, type_b = type_b, type_a - is_list_a, is_list_b = is_list_b, is_list_a - objects_a, objects_b = objects_b, objects_a - - if not is_list_a: - objects_a = [objects_a] - if not is_list_b: - objects_b = [objects_b] - - # retrieve object IDs - # TODO: issue #103 stop using gcf.id, but note that the ids_b should be - # a list of int - ids_b = [gcf.id for gcf in objects_b] - # these might be MolFams or Spectra, either way they'll have a .id attribute - ids_a = [obj.id for obj in objects_a] - - data_a = self.M_spec_strain if type_a == Spectrum else self.M_fam_strain - data_b = self.M_gcf_strain - - results = {} - for a, obj_a in enumerate(objects_a): - for b, obj_b in enumerate(objects_b): - # just AND both arrays and extract the indices with positive results - result = np.where( - np.logical_and(data_a[ids_a[a]], data_b[ids_b[b]]))[0] - # if we want to exclude results with no shared strains - if (filter_no_shared - and len(result) > 0) or not filter_no_shared: - results[(obj_a, obj_b)] = result - - return results - - def correlation_matrices(self, type='spec-gcf'): - """ - Collect co-occurrences accros strains: - IF type='spec-gcf': - number of co-occurences of spectra and GCFS - --> Output: M_spec_gcf matrix - IF type='fam-gcf': - number of co-occurences of mol.families and GCFS - --> Output: M_fam_gcf matrix - """ - - # Make selection for scenario spec<->gcf or fam<->gcf - if type == 'spec-gcf': - M_type1_strain = self.M_spec_strain - elif type == 'fam-gcf': - M_type1_strain = self.M_fam_strain - elif type == 'spec-bgc' or type == 'fam-bgc': - raise Exception("Given types 
are not yet supported... ") - else: - raise Exception( - "Wrong correlation 'type' given. Must be one of 'spec-gcf', 'fam-gcf', ..." - ) - - logger.debug( - f"Calculating correlation matrices of type: {type}") - - # Calculate correlation matrix from co-occurence matrices - M_type1_gcf, M_type1_notgcf, M_nottype1_gcf, M_nottype1_notgcf = calc_correlation_matrix( - M_type1_strain, self.M_gcf_strain) - - # return results: - if type == 'spec-gcf': - self.M_spec_gcf = M_type1_gcf - self.M_spec_notgcf = M_type1_notgcf - self.M_notspec_gcf = M_nottype1_gcf - self.M_notspec_notgcf = M_nottype1_notgcf - elif type == 'fam-gcf': - self.M_fam_gcf = M_type1_gcf - self.M_fam_notgcf = M_type1_notgcf - self.M_notfam_gcf = M_nottype1_gcf - self.M_notfam_notgcf = M_nottype1_notgcf - else: - raise Exception("No correct correlation matrix was created.") - - # class data_links OUTPUT functions - # TODO add output functions (e.g. to search for mappings of individual specs, gcfs etc.) diff --git a/src/nplinker/scoring/linking/data_links.py b/src/nplinker/scoring/linking/data_links.py new file mode 100644 index 00000000..30e7c1b8 --- /dev/null +++ b/src/nplinker/scoring/linking/data_links.py @@ -0,0 +1,256 @@ +from __future__ import annotations +from typing import Sequence, TYPE_CHECKING +import numpy as np +import pandas as pd +from nplinker.genomics.gcf import GCF +from nplinker.logconfig import LogConfig +from nplinker.metabolomics.molecular_family import MolecularFamily +from nplinker.metabolomics.singleton_family import SingletonFamily +from nplinker.metabolomics.spectrum import Spectrum +from .utils import calc_correlation_matrix +from .utils import isinstance_all + + +if TYPE_CHECKING: + from nplinker.strain_collection import StrainCollection + from nplinker.strains import Strain + +logger = LogConfig.getLogger(__name__) + +LINK_TYPES = ['spec-gcf', 'mf-gcf'] + + +class DataLinks(): + + def __init__(self, gcfs: Sequence[GCF], spectra: Sequence[Spectrum], + mfs: 
Sequence[MolecularFamily], strains: StrainCollection): + """DataLinks class to store occurrence and co-occurrence information. + + Occurrence refers to the presence of a spectrum/gcf/mf in a strain. + Co-occurrence refers to the presence of a spectrum/mf and a gcf in a strain. + + Args: + gcfs(Sequence[GCF]): A list of GCF objects. + spectra(Sequence[Spectrum]): A list of Spectrum objects. + mfs(Sequence[MolecularFamily]): A list of MolecularFamily objects. + strains(StrainCollection): A StrainCollection object. + + Attributes: + occurrence_gcf_strain(pd.DataFrame): A DataFrame to store occurrence of + gcfs with respect to strains. + occurrence_spec_strain(pd.DataFrame): A DataFrame to store occurrence of + spectra with respect to strains. + occurrence_mf_strain(pd.DataFrame): A DataFrame to store occurrence of + molecular families with respect to strains. + cooccurrence_spec_gcf(pd.DataFrame): A DataFrame to store co-occurrence + of the presence of spectra and the presence of gcfs with respect + to strains. + cooccurrence_spec_notgcf(pd.DataFrame): A DataFrame to store co-occurrence + of the presence of spectra and the absence of gcfs with respect + to strains. "notgcf" means the absence of gcfs. + cooccurrence_notspec_gcf(pd.DataFrame): A DataFrame to store co-occurrence + of the absence of spectra and the presence of gcfs with respect + to strains. "notspec" means the absence of spectra. + cooccurrence_notspec_notgcf(pd.DataFrame): A DataFrame to store co-occurrence + of the absence of spectra and the absence of gcfs with respect + to strains. + cooccurrence_mf_gcf(pd.DataFrame): A DataFrame to store co-occurrence + of the presence of molecular families and the presence of gcfs + with respect to strains. + cooccurrence_mf_notgcf(pd.DataFrame): A DataFrame to store co-occurrence + of the presence of molecular families and the absence of gcfs + with respect to strains. "notgcf" means the absence of gcfs. 
+ cooccurrence_notmf_gcf(pd.DataFrame): A DataFrame to store co-occurrence + of the absence of molecular families and the presence of gcfs + with respect to strains. "notmf" means the absence of molecular + families. + cooccurrence_notmf_notgcf(pd.DataFrame): A DataFrame to store co-occurrence + of the absence of molecular families and the absence of gcfs + with respect to strains. + """ + self._strains = strains + + logger.debug( + "Create occurrence dataframes: spectra<->strains, gcfs<->strains and mfs<->strains." + ) + # DataFrame to store occurrence of gcfs/spectra/mfs with respect to strains + # values = 1 where gcf/spec/fam occur in strain, 0 otherwise + self.occurrence_gcf_strain = self._get_occurrence_gcf_strain( + gcfs, strains) + self.occurrence_spec_strain = self._get_occurrence_spec_strain( + spectra, strains) + self.occurrence_mf_strain = self._get_occurrence_mf_strain( + mfs, strains) + + # DataFrame to store co-occurrence of "spectra<->gcf" or "mfs<->gcf" + logger.debug("Create correlation matrices: spectra<->gcfs.") + (self.cooccurrence_spec_gcf, self.cooccurrence_spec_notgcf, + self.cooccurrence_notspec_gcf, + self.cooccurrence_notspec_notgcf) = self._get_cooccurrence( + link_type='spec-gcf') + logger.debug("Create correlation matrices: mol-families<->gcfs.") + (self.cooccurrence_mf_gcf, self.cooccurrence_mf_notgcf, + self.cooccurrence_notmf_gcf, + self.cooccurrence_notmf_notgcf) = self._get_cooccurrence( + link_type='mf-gcf') + + def get_common_strains( + self, + spectra_or_mfs: Sequence[Spectrum | MolecularFamily], + gcfs: Sequence[GCF], + filter_no_shared: bool = False + ) -> dict[tuple[Spectrum | MolecularFamily, GCF], list[Strain]]: + """Get common strains between given spectra/molecular families and GCFs. + + Note that SingletonFamily objects are excluded from given `spectra_or_mfs`. + + Args: + spectra_or_mfs(Sequence[Spectrum | MolecularFamily]): + A list of Spectrum and/or MolecularFamily objects. 
+ gcfs(Sequence[GCF]): A list of GCF objects. + filter_no_shared(bool): If True, the pairs of spectrum/mf and GCF + without common strains will be removed from the returned dict; + + Returns: + dict: A dict where the keys are tuples of (Spectrum/MolecularFamily, GCF) + and values are a list of shared Strain objects. + + Raises: + ValueError: If given `spectra_or_mfs` or `gcfs` is empty. + TypeError: If given `spectra_or_mfs` or `gcfs` is not a list of + Spectrum/MolecularFamily or GCF objects, respectively. + """ + # Check input arguments + if len(spectra_or_mfs) == 0 or len(gcfs) == 0: + raise ValueError('Empty list for first or second argument.') + if not isinstance_all(*spectra_or_mfs, + objtype=(Spectrum, MolecularFamily)): + raise TypeError( + 'First argument must be Spectrum and/or MolecularFamily objects.' + ) + if not isinstance_all(*gcfs, objtype=GCF): + raise TypeError('Second argument must be GCF objects.') + + # Assume that 3 occurrence dataframes have same df.columns (strain ids) + strain_ids = self.occurrence_gcf_strain.columns + results = {} + for obj in spectra_or_mfs: + if isinstance(obj, SingletonFamily): + continue + for gcf in gcfs: + if isinstance(obj, Spectrum): + shared_strains = strain_ids[np.logical_and( + self.occurrence_spec_strain.loc[obj.spectrum_id], + self.occurrence_gcf_strain.loc[gcf.gcf_id])] + else: + shared_strains = strain_ids[np.logical_and( + self.occurrence_mf_strain.loc[obj.family_id], + self.occurrence_gcf_strain.loc[gcf.gcf_id])] + if filter_no_shared and len(shared_strains) == 0: + continue + results[(obj, gcf)] = [ + self._strains.lookup(strain_id) + for strain_id in shared_strains + ] + return results + + def _get_occurrence_gcf_strain(self, gcfs: Sequence[GCF], + strains: StrainCollection) -> pd.DataFrame: + """Get the occurence of strains in gcfs. + + The occurence is a DataFrame with gcfs as rows and strains as columns, + where index is `gcf.gcf_id` and column name is `strain.id`. 
The values + are 1 if the gcf contains the strain and 0 otherwise. + """ + df_gcf_strain = pd.DataFrame(np.zeros((len(gcfs), len(strains))), + index=[gcf.gcf_id for gcf in gcfs], + columns=[strain.id for strain in strains], + dtype=int) + for gcf in gcfs: + for strain in strains: + if gcf.has_strain(strain): + df_gcf_strain.loc[gcf.gcf_id, strain.id] = 1 + return df_gcf_strain + + def _get_occurrence_spec_strain(self, spectra: Sequence[Spectrum], + strains: StrainCollection) -> pd.DataFrame: + """Get the occurence of strains in spectra. + + The occurence is a DataFrame with spectra as rows and strains as columns, + where index is `spectrum.spectrum_id` and column name is `strain.id`. + The values are 1 if the spectrum contains the strain and 0 otherwise. + """ + df_spec_strain = pd.DataFrame( + np.zeros((len(spectra), len(strains))), + index=[spectrum.spectrum_id for spectrum in spectra], + columns=[strain.id for strain in strains], + dtype=int) + for spectrum in spectra: + for strain in strains: + if spectrum.has_strain(strain): + df_spec_strain.loc[spectrum.spectrum_id, strain.id] = 1 + return df_spec_strain + + def _get_occurrence_mf_strain(self, mfs: Sequence[MolecularFamily], + strains: StrainCollection) -> pd.DataFrame: + """Get the occurence of strains in molecular families. + + The occurence is a DataFrame with molecular families as rows and + strains as columns, where index is `mf.family_id` and column name is + `strain.id`. The values are 1 if the molecular family contains the + strain and 0 otherwise. + + Note that SingletonFamily objects are excluded from given `mfs`. 
+ """ + # remove SingletonFamily objects + mfs = [mf for mf in mfs if not isinstance(mf, SingletonFamily)] + + df_mf_strain = pd.DataFrame(np.zeros((len(mfs), len(strains))), + index=[mf.family_id for mf in mfs], + columns=[strain.id for strain in strains], + dtype=int) + for mf in mfs: + for strain in strains: + if mf.has_strain(strain): + df_mf_strain.loc[mf.family_id, strain.id] = 1 + return df_mf_strain + + def _get_cooccurrence( + self, + link_type: str = 'spec-gcf' + ) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]: + """Calculate co-occurrence for given link types across strains. + + Args: + link_type(str): Type of link to calculate co-occurrence for, + either 'spec-gcf' or 'mf-gcf'. + """ + if link_type == 'spec-gcf': + met_strain_occurrence = self.occurrence_spec_strain + elif link_type == 'mf-gcf': + met_strain_occurrence = self.occurrence_mf_strain + else: + raise ValueError( + f"Link type {link_type} is not supported. Use 'spec-gcf' or 'mf-gcf' instead." + ) + logger.debug(f"Calculating correlation matrices of type: {link_type}") + m1, m2, m3, m4 = calc_correlation_matrix(met_strain_occurrence, + self.occurrence_gcf_strain) + df_met_gcf = pd.DataFrame(m1, + index=met_strain_occurrence.index, + columns=self.occurrence_gcf_strain.index, + dtype=int) + df_met_notgcf = pd.DataFrame(m2, + index=met_strain_occurrence.index, + columns=self.occurrence_gcf_strain.index, + dtype=int) + df_notmet_gcf = pd.DataFrame(m3, + index=met_strain_occurrence.index, + columns=self.occurrence_gcf_strain.index, + dtype=int) + df_notmet_notgcf = pd.DataFrame( + m4, + index=met_strain_occurrence.index, + columns=self.occurrence_gcf_strain.index, + dtype=int) + return df_met_gcf, df_met_notgcf, df_notmet_gcf, df_notmet_notgcf diff --git a/src/nplinker/scoring/linking/link_finder.py b/src/nplinker/scoring/linking/link_finder.py index 2e58acd9..7a400bd9 100644 --- a/src/nplinker/scoring/linking/link_finder.py +++ b/src/nplinker/scoring/linking/link_finder.py @@ 
-1,768 +1,199 @@ - +from __future__ import annotations +from typing import TYPE_CHECKING import numpy as np import pandas as pd from scipy.stats import hypergeom from nplinker.genomics.gcf import GCF -from nplinker.metabolomics.spectrum import Spectrum -from nplinker.scoring.linking.data_linking import SCORING_METHODS -from .data_linking_functions import pair_prob_approx -from .data_linking_functions import pair_prob_hg - - -# CG: TODO get_links function does not work any more, need to update its logics - - -# CG: TODO get_links function does not work any more, need to update its logics - - -# import packages for plotting -# TODO move plotting to separate module? -try: - import seaborn as sns - from matplotlib import pyplot as plt -except ImportError: - print( - 'Warning: plotting functionality will not be available (missing matplotlib and/or seaborn)' - ) - from nplinker.logconfig import LogConfig from nplinker.metabolomics.molecular_family import MolecularFamily +from nplinker.metabolomics.spectrum import Spectrum +from . import LINK_TYPES +from .utils import isinstance_all + +if TYPE_CHECKING: + from . import DataLinks logger = LogConfig.getLogger(__file__) +# TODO CG: this class could be merged to MetcalfScoring class? class LinkFinder(): - """ - Class to: - 1) Score potential links based on collected information from: - DataLinks, LinkLikelihood (and potentially other resources) - Different scores can be used for this! - - 2) Rank and output selected candidates - 3) Create output plots and tables - """ - - def __init__(self): - """ - Create tables of prospective link candidates. 
- Separate tables will exist for different linking scenarios, such as - gcfs <-> spectra OR gcf <-> mol.families - """ - - # metcalf scores - self.metcalf_spec_gcf = [] - self.metcalf_fam_gcf = [] - - # likelihood scores - self.likescores_spec_gcf = [] - self.likescores_fam_gcf = [] - - # hg scores - self.hg_spec_gcf = [] - self.hg_fam_gcf = [] - - # link candidate tables - self.link_candidates_gcf_spec = [] - self.link_candidates_gcf_fam = [] - - # metcalf caching - self.metcalf_expected = None - self.metcalf_variance = None - - def get_scores(self, method, type_): - if method == 'metcalf': - if type_ == 'spec-gcf': - return self.metcalf_spec_gcf - elif type_ == 'fam-gcf': - return self.metcalf_fam_gcf - elif method == 'likescore': - if type_ == 'spec-gcf': - return self.likescores_spec_gcf - elif type_ == 'fam-gcf': - return self.likescores_fam_gcf - elif method == 'hg': - if type_ == 'spec-gcf': - return self.hg_spec_gcf - elif type_ == 'fam-gcf': - return self.hg_fam_gcf - - raise Exception( - 'Unknown method or type (method="{}", type="{}")'.format( - method, type_)) - - def metcalf_scoring(self, - data_links, - both=10, - type1_not_gcf=-10, - gcf_not_type1=0, - not_type1_not_gcf=1, - type='spec-gcf'): - """ - Calculate metcalf scores from DataLinks() co-occurence matrices - """ - - # Compute the expected values for all possible values of spec and gcf strains - # we need the total number of strains - _, n_strains = data_links.M_gcf_strain.shape - if self.metcalf_expected is None: - sz = (n_strains + 1, n_strains + 1) - self.metcalf_expected = np.zeros(sz) - self.metcalf_variance = np.zeros(sz) - - for n in range(n_strains + 1): - for m in range(n_strains + 1): - max_overlap = min(n, m) - min_overlap = max( - 0, - n + m - n_strains) # minimum possible strain overlap - expected_value = 0 - expected_sq = 0 - for o in range(min_overlap, max_overlap + 1): - o_prob = hypergeom.pmf(o, n_strains, n, m) - # compute metcalf for n strains in type 1 and m in gcf - score = 
o * both - score += type1_not_gcf * (n - o) - score += gcf_not_type1 * (m - o) - score += not_type1_not_gcf * (n_strains - (n + m - o)) - expected_value += o_prob * score - expected_sq += o_prob * (score**2) - - self.metcalf_expected[n, m] = expected_value - expected_sq = expected_sq - expected_value**2 - if expected_sq < 1e-09: - expected_sq = 1 - self.metcalf_variance[n, m] = expected_sq - - self.metcalf_variance_sqrt = np.sqrt(self.metcalf_variance) - - # now, we would like an option to take any actual score an subtract the - # expected value and then divide by the square root of the variance - # e.g. if we have a score computed between a type 1 object that has - # 3 strains, and a gcf with 6 strains, we should use the expected value - # at expected_metcalf[3,6] and sqrt of the variance in the same position - - if type == 'spec-gcf': - metcalf_scores = np.zeros(data_links.M_spec_gcf.shape) - metcalf_scores = (data_links.M_spec_gcf * both + - data_links.M_spec_notgcf * type1_not_gcf + - data_links.M_notspec_gcf * gcf_not_type1 + - data_links.M_notspec_notgcf * not_type1_not_gcf) - self.metcalf_spec_gcf = metcalf_scores - - elif type == 'fam-gcf': - metcalf_scores = np.zeros(data_links.M_fam_gcf.shape) - metcalf_scores = (data_links.M_fam_gcf * both + - data_links.M_fam_notgcf * type1_not_gcf + - data_links.M_notfam_gcf * gcf_not_type1 + - data_links.M_notfam_notgcf * not_type1_not_gcf) - - self.metcalf_fam_gcf = metcalf_scores - return metcalf_scores - - def hg_scoring(self, data_links, type='spec-gcf'): - """ - Calculate metcalf scores from DataLinks() co-occurence matrices - """ - - # NOTE:can't use the correlation matrices directly for this scoring method because - # it seems to require more inclusive counts of the strains in each object. - # Instead of "number of strains only in GCF", it requires "number of strains in the - # GCF PLUS the number shared between the GCF and the other object". - # e.g. 
if a spectrum has 3 strains, a GCF has 1 strain and there is 1 shared strain, - # M_spec_gcf will correctly contain "1", but M_type1_notgcf will contain "2" instead - # of "3", because the spectrum only has 2 distinct strains vs the GCF. - # To fix this the M_spec_gcf/M_fam_gcf matrix can just be added onto the others to give - # the correct totals. - - if type == 'spec-gcf': - num_strains = np.ones( - data_links.M_spec_gcf.shape) * data_links.M_gcf_strain.shape[1] - overlap_counts = data_links.M_spec_gcf - gcf_counts = overlap_counts + data_links.M_notspec_gcf - spec_counts = overlap_counts + data_links.M_spec_notgcf - hg_scores = hypergeom.sf(overlap_counts, - num_strains, - gcf_counts, - spec_counts, - loc=1) - self.hg_spec_gcf = hg_scores - elif type == 'fam-gcf': - num_strains = np.ones( - data_links.M_fam_gcf.shape) * data_links.M_gcf_strain.shape[1] - overlap_counts = data_links.M_fam_gcf - gcf_counts = overlap_counts + data_links.M_notfam_gcf - fam_counts = overlap_counts + data_links.M_fam_notgcf - hg_scores = hypergeom.sf(overlap_counts, - num_strains, - gcf_counts, - fam_counts, - loc=1) - self.hg_fam_gcf = hg_scores - - return hg_scores - - def likelihood_scoring(self, - data_links, - likelihoods, - alpha_weighing=0.5, - type='spec-gcf'): - """ - Calculate likelihood scores from DataLinks() co-occurence matrices. - - Idea: - Score reflect the directionality BGC-->compound-->spectrum, which - suggests that the most relevant likelihoods are: - P(gcf|type1) - If type1 is the result of only one particular gene cluster, - this value should be high (close or equal to 1) - P(type1|not gcf) - Following the same logic, this value should be very - small or 0 ("no gene cluster, no compound") - - Score: - Score = P(gcf|type1) * (1 - P(type1|not gcf) * weighing function - - weighing function is here a function of the number of strains they co-occur. 
- weighing function = (1 - exp(-alpha_weighing * num_of_co-occurrences) - """ - - if type == 'spec-gcf': - likelihood_scores = np.zeros(data_links.M_spec_gcf.shape) - likelihood_scores = ( - likelihoods.P_gcf_given_spec * - (1 - likelihoods.P_spec_not_gcf) * - (1 - np.exp(-alpha_weighing * data_links.M_spec_gcf))) - - self.likescores_spec_gcf = likelihood_scores - - elif type == 'fam-gcf': - likelihood_scores = np.zeros(data_links.M_fam_gcf.shape) - likelihood_scores = ( - likelihoods.P_gcf_given_fam * (1 - likelihoods.P_fam_not_gcf) * - (1 - np.exp(-alpha_weighing * data_links.M_fam_gcf))) - - self.likescores_fam_gcf = likelihood_scores - return likelihood_scores - - def select_link_candidates(self, - data_links, - likelihoods, - P_cutoff=0.8, - main_score='likescore', - score_cutoff=0, - type='fam-gcf'): - """ - Look for potential best candidate for links between - IF type='spec-gcf': GCFs and spectra - IF type='fam-gcf': GCFs and mol.families - - Parameters - ---------- - data_links: DataLinks() object - containing co-occurence matrices - likelihood: LinkLikelihood() object - containing co-occurence likelihoods - P_cutoff: float - Thresholds to conly consider candidates for which: - P_gcf_given_type1 >= P_cutoff - main_score: str - Which main score to use ('metcalf', 'likescore') - score_cutoff: - Thresholds to conly consider candidates for which: - score >= score_cutoff - """ - - # Select scenario: spec<->gcf or fam<->gcf - if type == 'spec-gcf': - P_gcf_given_type1 = likelihoods.P_gcf_given_spec - P_gcf_not_type1 = likelihoods.P_gcf_not_spec - P_type1_given_gcf = likelihoods.P_spec_given_gcf - P_type1_not_gcf = likelihoods.P_spec_not_gcf - M_type1_gcf = data_links.M_spec_gcf - metcalf_scores = self.metcalf_spec_gcf - likescores = self.likescores_spec_gcf - index_names = [ - "spectrum_id", "GCF id", "P(gcf|spec)", "P(spec|gcf)", - "P(gcf|not spec)", "P(spec|not gcf)", "co-occur in # strains", - "metcalf score", "likelihood score", "HG prob", "link prob", - 
"link prob specific" - ] - - elif type == 'fam-gcf': - P_gcf_given_type1 = likelihoods.P_gcf_given_fam - P_gcf_not_type1 = likelihoods.P_gcf_not_fam - P_type1_given_gcf = likelihoods.P_fam_given_gcf - P_type1_not_gcf = likelihoods.P_fam_not_gcf - M_type1_gcf = data_links.M_fam_gcf - metcalf_scores = self.metcalf_fam_gcf - likescores = self.likescores_fam_gcf - index_names = [ - "family_id", "GCF id", "P(gcf|fam)", "P(fam|gcf)", - "P(gcf|not fam)", "P(fam|not gcf)", "co-occur in # strains", - "metcalf score", "likelihood score", "HG prob", "link prob", - "link prob specific" - ] - - elif type == 'spec-bgc' or type == 'fam-bgc': - raise Exception("Given types are not yet supported... ") - else: - raise Exception( - "Wrong correlation 'type' given. Must be one of 'spec-gcf', 'fam-gcf'..." - ) - - dim1, dim2 = P_gcf_given_type1.shape - - # PRE-SELECTION: - # Select candidates with P_gcf_given_spec >= P_cutoff AND score >= score_cutoff - if main_score == 'likescore': - candidate_ids = np.where((P_gcf_given_type1[:, :] >= P_cutoff) - & (likescores >= score_cutoff)) - elif main_score == 'metcalf': - candidate_ids = np.where((P_gcf_given_type1[:, :] >= P_cutoff) - & (metcalf_scores >= score_cutoff)) - else: - raise Exception( - 'Wrong scoring type given. 
Must be one of: {}'.format( - SCORING_METHODS)) - - logger.debug(candidate_ids[0].shape[0], " candidates selected with ", - index_names[2], " >= ", P_cutoff, " and a link score >= ", - score_cutoff, ".") - - link_candidates = np.zeros((12, candidate_ids[0].shape[0])) - link_candidates[0, :] = candidate_ids[0] # spectrum/fam number - link_candidates[1, :] = candidate_ids[1] # gcf id - link_candidates[2, :] = P_gcf_given_type1[candidate_ids] - link_candidates[3, :] = P_type1_given_gcf[candidate_ids] - link_candidates[4, :] = P_gcf_not_type1[candidate_ids] - link_candidates[5, :] = P_type1_not_gcf[candidate_ids] - link_candidates[6, :] = M_type1_gcf[candidate_ids] - link_candidates[7, :] = metcalf_scores[candidate_ids] - link_candidates[8, :] = likescores[candidate_ids] - - # Calculate probability to find similar link by chance - Nx_list = data_links.mapping_gcf["no of strains"] - if type == 'spec-gcf': - Ny_list = data_links.mapping_spec["no of strains"] - elif type == 'fam-gcf': - Ny_list = data_links.mapping_fam["no of strains"] - - # Calculate probabilities of finding a spectrum in a certain strain - P_str = np.array(data_links.mapping_strain["no of spectra"]) - P_str = P_str / np.sum(P_str) - num_strains = data_links.M_gcf_strain.shape[1] + def __init__(self) -> None: + """Initialise LinkFinder object. + + Attributes: + raw_score_spec_gcf (pd.DataFrame): The raw Metcalf scores for + spectrum-GCF links. + raw_score_mf_gcf (pd.DataFrame): The raw Metcalf scores for + molecular family-GCF links. + metcalf_mean (np.ndarray | None): The mean value used for + standardising Metcalf scores. + metcalf_std (np.ndarray | None): The standard deviation value used + for standardising Metcalf scores. + """ + self.raw_score_spec_gcf = pd.DataFrame() + self.raw_score_mf_gcf = pd.DataFrame() + self.metcalf_mean = None + self.metcalf_std = None + + # TODO CG: calc_score method could be integrated to __init__? 
+ def calc_score( + self, + data_links: DataLinks, + link_type: str = 'spec-gcf', + scoring_weights: tuple[int, int, int, int] = (10, -10, 0, 1) + ) -> None: + """Calculate Metcalf scores. + + Args: + data_links(DataLinks): The DataLinks object to use for scoring. + link_type(str): The type of link to score. Must be 'spec-gcf' or + 'mf-gcf'. Defaults to 'spec-gcf'. + scoring_weights(tuple[int,int,int,int]): The weights to + use for Metcalf scoring. The weights are applied to + '(met_gcf, met_not_gcf, gcf_not_met, not_met_not_gcf)'. + Defaults to (10, -10, 0, 1). + + Raises: + ValueError: If an invalid link type is provided. + """ + if link_type not in LINK_TYPES: + raise ValueError( + f'Invalid link type: {link_type}. Must be one of {LINK_TYPES}') - # Calculate the hypergeometric probability (as before) - for i in range(link_candidates.shape[1]): - link_candidates[9, i] = pair_prob_hg( - link_candidates[6, i], num_strains, - Nx_list[link_candidates[1, i]], - Ny_list[int(link_candidates[0, i])]) - - # Calculate the GCF specific probability - for i in range(link_candidates.shape[1]): - id_spec = int(link_candidates[0, i]) - id_gcf = int(link_candidates[1, i]) - - # find set of strains which contain GCF with id link_candidates[1,i] - XG = np.where(data_links.M_gcf_strain[id_gcf, :] == 1)[0] - - link_candidates[10, - i] = pair_prob_approx(P_str, XG, - int(Ny_list[id_spec]), - int(link_candidates[6, i])) - # Calculate the link specific probability - # Find strains where GCF and spectra/family co-occur - if type == 'spec-gcf': - XGS = np.where((data_links.M_gcf_strain[id_gcf, :] == 1) & - (data_links.M_spec_strain[id_spec, :] == 1))[0] - elif type == 'fam-gcf': - XGS = np.where((data_links.M_gcf_strain[id_gcf, :] == 1) - & (data_links.M_fam_strain[id_spec, :] == 1))[0] - link_candidates[11, - i] = link_prob(P_str, XGS, int(Nx_list[id_gcf]), - int(Ny_list[id_spec]), num_strains) - - # Transform into pandas Dataframe (to avoid index confusions): - link_candidates_pd = 
pd.DataFrame(link_candidates.transpose(1, 0), - columns=index_names) - - # add other potentially relevant knowdledge - # If this will grow to more collected information -> create separate function/class - bgc_class = [] - for i in link_candidates_pd["GCF id"].astype(int): - bgc_class.append(data_links.mapping_gcf["bgc class"][i]) - link_candidates_pd["BGC class"] = bgc_class - - # Change some columns to int - link_candidates_pd.iloc[:, [0, 1, 6, 7]] = link_candidates_pd.iloc[:, [ - 0, 1, 6, 7 - ]].astype(int) - - # return results - if type == 'spec-gcf': - self.link_candidates_gcf_spec = link_candidates_pd - elif type == 'fam-gcf': - self.link_candidates_gcf_fam = link_candidates_pd - else: - raise Exception("No candidate selection was created.") - return link_candidates_pd + if link_type == 'spec-gcf': + self.raw_score_spec_gcf = ( + data_links.cooccurrence_spec_gcf * scoring_weights[0] + + data_links.cooccurrence_spec_notgcf * scoring_weights[1] + + data_links.cooccurrence_notspec_gcf * scoring_weights[2] + + data_links.cooccurrence_notspec_notgcf * scoring_weights[3]) + if link_type == 'mf-gcf': + self.raw_score_mf_gcf = ( + data_links.cooccurrence_mf_gcf * scoring_weights[0] + + data_links.cooccurrence_mf_notgcf * scoring_weights[1] + + data_links.cooccurrence_notmf_gcf * scoring_weights[2] + + data_links.cooccurrence_notmf_notgcf * scoring_weights[3]) + + # TODO CG: this part should be moved outside of this method + n_strains = data_links.occurrence_gcf_strain.shape[1] + if self.metcalf_mean is None or self.metcalf_std is None: + self.metcalf_mean, self.metcalf_std = self._calc_mean_std( + n_strains, scoring_weights) + + # TODO CG: read paper and check the logics of this method + def _calc_mean_std( + self, n_strains: int, scoring_weights: tuple[int, int, int, int] + ) -> tuple[np.ndarray, np.ndarray]: + sz = (n_strains + 1, n_strains + 1) + mean = np.zeros(sz) + variance = np.zeros(sz) + for n in range(n_strains + 1): + for m in range(n_strains + 1): + 
max_overlap = min(n, m) + min_overlap = max(0, n + m - n_strains) + expected_value = 0 + expected_sq = 0 + for o in range(min_overlap, max_overlap + 1): + o_prob = hypergeom.pmf(o, n_strains, n, m) + # compute metcalf for n strains in type 1 and m in gcf + score = o * scoring_weights[0] + score += scoring_weights[1] * (n - o) + score += scoring_weights[2] * (m - o) + score += scoring_weights[3] * (n_strains - (n + m - o)) + expected_value += o_prob * score + expected_sq += o_prob * (score**2) + mean[n, m] = expected_value + expected_sq = expected_sq - expected_value**2 + if expected_sq < 1e-09: + expected_sq = 1 + variance[n, m] = expected_sq + return mean, np.sqrt(variance) def get_links(self, - data_links, - input_object, - main_score='likescore', - score_cutoff=0.5): - """ - Output likely links for 'input_object' - - Parameters - ---------- - input_objects: object() - Object or list of objects of either class: spectra, families, or GCFs - main_score: str - Which main score to use ('metcalf', 'likescore') - score_cutoff: - Thresholds to conly consider candidates for which: - score >= score_cutoff - """ - - if main_score not in SCORING_METHODS: - raise Exception( - 'Wrong scoring type given. Must be one of: {}'.format( - SCORING_METHODS)) - - # Check if input is list: - if isinstance(input_object, list): - query_size = len(input_object) + *objects: tuple[GCF, ...] | tuple[Spectrum, ...] + | tuple[MolecularFamily, ...], + score_cutoff: float = 0.5) -> list[pd.DataFrame]: + """Get links and scores for given objects. + + Args: + objects(tuple): A list of GCF, Spectrum or MolecularFamily objects + and all objects must be of the same type. + score_cutoff(float): Minimum score to consider a link (≥score_cutoff). + Default is 0.5. + Returns: + list: List of data frames containing the ids of the linked objects + and the score. 
The data frame has index names of + 'source', 'target' and 'score': + - the 'source' row contains the ids of the input/source objects, + - the 'target' row contains the ids of the target objects, + - the 'score' row contains the scores. + + Raises: + ValueError: If input objects are empty. + TypeError: If input objects are not GCF, Spectrum or MolecularFamily objects. + """ + if len(objects) == 0: + raise ValueError('Empty input objects.') + + if isinstance_all(*objects, objtype=GCF): + obj_type = 'gcf' + elif isinstance_all(*objects, objtype=Spectrum): + obj_type = 'spec' + elif isinstance_all(*objects, objtype=MolecularFamily): + obj_type = 'mf' else: - input_object = [input_object] - query_size = 1 - - # Check type of input_object: - # If GCF: - if isinstance(input_object[0], GCF): - input_type = "gcf" - link_levels = [0, 1] - - # Get necessary ids - # CG: TODO update the logics here: - # don't use integer gcf.id, use string gcf.gcf_id instead. - input_ids = np.array([gcf.id for gcf in input_object], - dtype=np.int32) - - if main_score == 'likescore': - likescores = [ - self.likescores_spec_gcf[:, input_ids], - self.likescores_fam_gcf[:, input_ids] - ] - elif main_score == 'metcalf': - metcalf_scores = [ - self.metcalf_spec_gcf[:, input_ids], - self.metcalf_fam_gcf[:, input_ids] - ] - elif main_score == 'hg': - hg_scores = [ - self.hg_spec_gcf[:, input_ids], self.hg_fam_gcf[:, - input_ids] - ] - - # If Spectrum: - elif isinstance(input_object[0], Spectrum): - # Get necessary ids - input_ids = np.array([spec.id for spec in input_object], - dtype=np.int32) - - input_type = "spec" - link_levels = [0] - if main_score == 'likescore': - likescores = [self.likescores_spec_gcf[input_ids, :], []] - elif main_score == 'metcalf': - metcalf_scores = [self.metcalf_spec_gcf[input_ids, :], []] - elif main_score == 'hg': - hg_scores = [self.hg_spec_gcf[input_ids, :], []] - # If MolecularFamily: - elif isinstance(input_object[0], MolecularFamily): - - # Get necessary ids - # 
TODO: include Singletons, maybe optinal - #input_ids = np.zeros(query_size) - #mapping_fam_id = data_links.mapping_fam["original family id"] - #for i, family in enumerate(input_object): - # input_ids[i] = np.where(mapping_fam_id == int(family.family_id))[0] - #input_ids = input_ids.astype(int) - input_ids = np.array([mf.id for mf in input_object], - dtype=np.int32) - - input_type = "fam" - link_levels = [1] - if main_score == 'likescore': - likescores = [[], self.likescores_fam_gcf[input_ids, :]] - elif main_score == 'metcalf': - metcalf_scores = [[], self.metcalf_fam_gcf[input_ids, :]] - elif main_score == 'hg': - hg_scores = [[], self.hg_fam_gcf[input_ids, :]] - else: - raise Exception( - "Input_object must be Spectrum, MolecularFamily, or GCF object (single or list)." + types = [type(i) for i in objects] + raise TypeError( + f'Invalid type {set(types)}. Input objects must be GCF, Spectrum or MolecularFamily objects.' ) links = [] - for linklevel in link_levels: - if score_cutoff is not None: - if main_score == 'likescore': - candidate_ids = np.where( - likescores[linklevel] >= score_cutoff) - elif main_score == 'metcalf': - candidate_ids = np.where( - metcalf_scores[linklevel] >= score_cutoff) - elif main_score == 'hg': - candidate_ids = np.where( - hg_scores[linklevel] >= score_cutoff) - else: - # should never happen - raise Exception( - f'Unknown scoring type! "{main_score}"') - else: - # TODO is this best way to get same output as above code? 
- # to keep the remainder of the method identical in the case of no cutoff - # being supplied, while still returning all the candidate links, I'm - # currently abusing np.where like this - candidate_ids = np.where(metcalf_scores[linklevel] != np.nan) - - link_candidates = np.zeros((3, candidate_ids[0].shape[0])) - - # this is supposed to construct a (3, x) array, where: - # - 1st index gives list of source/input object IDs - # - 2nd index gives list of destination/link object IDs - # - 3rd index gives list of scores for the link between the given pair of objects - # - x = number of links found - - # if there is only a single object given as input, things are pretty simple here: - if query_size == 1: - # first, can set every index of the input object ID array to the - # single object ID we've been give - link_candidates[0, :] = input_ids - # then, based on input type copy the other object IDs from candidate_ids - if input_type == 'gcf': - link_candidates[1, :] = candidate_ids[0].astype(int) - else: - link_candidates[1, :] = candidate_ids[1].astype(int) - else: - # if there is a list of input objects, things are slightly more complex - # - the "input IDs" element of the output array now needs to be set by - # a lookup into the original input_ids array based on candidate_ids - # - the "output IDs" element is taken directly from the other element - # of candidate IDs - if input_type == 'gcf': - link_candidates[0, :] = input_ids[candidate_ids[1].astype( - int)] - link_candidates[1, :] = candidate_ids[0].astype(int) - else: - link_candidates[0, :] = input_ids[candidate_ids[0].astype( - int)] - link_candidates[1, :] = candidate_ids[1].astype(int) - - # finally, copy in the actual scores too - if main_score == 'likescore': - link_candidates[2, :] = likescores[linklevel][candidate_ids] - elif main_score == 'metcalf': - link_candidates[ - 2, :] = metcalf_scores[linklevel][candidate_ids] - elif main_score == 'hg': - link_candidates[2, :] = hg_scores[linklevel][candidate_ids] 
- - links.append(link_candidates) - + if obj_type == 'gcf': + # TODO CG: the hint and mypy warnings will be gone after renaming all + # string ids to `.id` + obj_ids = [gcf.gcf_id for gcf in objects] + # spec-gcf + scores = self.raw_score_spec_gcf.loc[:, obj_ids] + df = self._get_scores_source_gcf(scores, score_cutoff) + df.name = LINK_TYPES[0] + links.append(df) + # mf-gcf + scores = self.raw_score_mf_gcf.loc[:, obj_ids] + df = self._get_scores_source_gcf(scores, score_cutoff) + df.name = LINK_TYPES[1] + links.append(df) + + if obj_type == 'spec': + obj_ids = [spec.spectrum_id for spec in objects] + scores = self.raw_score_spec_gcf.loc[obj_ids, :] + df = self._get_scores_source_met(scores, score_cutoff) + df.name = LINK_TYPES[0] + links.append(df) + + if obj_type == 'mf': + obj_ids = [mf.family_id for mf in objects] + scores = self.raw_score_mf_gcf.loc[obj_ids, :] + df = self._get_scores_source_met(scores, score_cutoff) + df.name = LINK_TYPES[1] + links.append(df) return links - def create_cytoscape_files(self, - data_links, - network_filename, - link_type='fam-gcf', - score_type='metcalf'): - """ - Create network file for import into Cytoscape. - Network file will be generated using networkx. - The type of network created here is a bipartite network. - mass spec side --> bipartite = 0 - gene cluster side --> bipartite = 1 - Output format is a graphml file. - - Parameters - ---------- - data_links: DataLinks() object - containing co-occurence matrices - likelihood: LinkLikelihood() object - containing co-occurence likelihoods - network_filename: str - Filename to save generated model as graphml file. 
- """ - - import networkx as nx - NPlinker_net = nx.Graph() - - if link_type == 'fam-gcf': - link_candidates = self.link_candidates_gcf_fam - type1str = 'family_id' - elif link_type == 'spec-gcf': - link_candidates = self.link_candidates_gcf_spec - type1str = 'spectrum_id' - else: - raise Exception("Wrong link-type given.") - - # Select score type - if score_type == 'metcalf': - scorestr = 'metcalf score' - elif score_type == 'likescore': - scorestr = 'likelihood score' - else: - raise Exception( - "Wrong score_type given. Must be one of: 'metcalf', 'likescore' ." - ) - - # Add nodes (all partners from link_candidate table): - # mass spec side --> bipartite = 0 - type1_names = [] - for type1 in link_candidates[type1str].astype(int): - type1_names.append(type1str + str(type1)) - - type1_names_unique = list(set(type1_names)) - NPlinker_net.add_nodes_from(type1_names_unique, bipartite=0) - - # gene cluster side --> bipartite = 1 - type2_names = [] - for type2 in link_candidates['GCF id']: - type2_names.append("GCF_" + str(type2)) - - type2_names_unique = list(set(type2_names)) - NPlinker_net.add_nodes_from(type2_names_unique, bipartite=1) - - # Add edges: - for i in range(0, link_candidates.shape[0]): - NPlinker_net.add_edge(type1_names[i], - type2_names[i], - weight=float(link_candidates[scorestr][i])) - - # Add edges between molecular family members - if link_type == 'spec-gcf': - type1_unique = (np.unique(link_candidates[type1str])).astype(int) - map_spec_fam = data_links.mapping_spec["fam-id"] - for type1 in type1_unique: - if map_spec_fam[type1] > 0: # if no singleton - members = data_links.family_members[int( - map_spec_fam[type1])][0] - - # select other family members if among link candidates - members_present = [ - x for x in list(members) if x in list(type1_unique) - ] - for member in members_present: - NPlinker_net.add_edge(type1str + str(type1), - type1str + str(member)) - - # export graph for drawing (e.g. 
using Cytoscape) - nx.write_graphml(NPlinker_net, network_filename) - - def plot_candidates(self, - P_cutoff=0.8, - score_type='likescore', - score_cutoff=0, - type='fam-gcf'): - """ - Plot best rated correlations between gcfs and spectra/families - plot in form of seaborn clustermap - """ - - # Select score type - if score_type == 'metcalf': - scorestr = 'metcalf score' - elif score_type == 'likescore': - scorestr = 'likelihood score' - else: - raise Exception( - "Wrong score_type given. Must be one of: 'metcalf', 'likescore' ." - ) - - if type == 'spec-gcf': - link_candidates = self.link_candidates_gcf_spec - selected_ids = np.where( - (link_candidates["P(gcf|spec)"] > P_cutoff) - & (link_candidates[scorestr] > score_cutoff))[0] - elif type == 'fam-gcf': - link_candidates = self.link_candidates_gcf_fam - selected_ids = np.where( - (link_candidates["P(gcf|fam)"] > P_cutoff) - & (link_candidates[scorestr] > score_cutoff))[0] - else: - raise Exception("Wrong correlation 'type' given.") - - mapping_fams = np.unique(link_candidates.iloc[selected_ids, 0]) - mapping_gcfs = np.unique(link_candidates.iloc[selected_ids, 1]) - unique_fams = len(mapping_fams) - unique_gcfs = len(mapping_gcfs) - - M_links = np.zeros((unique_fams, unique_gcfs)) - - # define colors for different BGC classes - bigscape_classes_dict = { - "Others": "C0", - "NRPS": "C1", - "PKS-NRP_Hybrids": "C2", - "PKSother": "C3", - "PKSI": "C4", - "RiPPs": "C5", - "Saccharides": "C6", - "Terpene": "C7" - } - col_colors = [] - - # Create matrix with relevant link scores - # TODO replace by better numpy method... 
- for i in range(0, link_candidates.shape[0]): - x = np.where(mapping_fams == link_candidates.iloc[i, 0])[0] - y = np.where(mapping_gcfs == link_candidates.iloc[i, 1])[0] - M_links[x, y] = link_candidates[scorestr][i] - - # make pandas dataframe from numpy array - M_links = pd.DataFrame(M_links, - index=mapping_fams.astype(int), - columns=mapping_gcfs.astype(int)) - if type == 'spec-gcf': - M_links.index.name = 'spectrum number' - elif type == 'fam-gcf': - M_links.index.name = 'molecular family number' - M_links.columns.name = 'gene cluster family (GCF)' - - # add color label representing gene cluster class - for bgc_class in link_candidates["BGC class"]: - if bgc_class is None: - col_colors.append((0, 0, 0)) - else: - col_colors.append(bigscape_classes_dict[bgc_class]) - - # bgc_type_colors = pd.Series(mapping_gcfs.astype(int), index=M_links.columns).map(col_colors) - graph = sns.clustermap( - M_links, - metric="correlation", - method="weighted", - cmap= - "Reds", #sns.cubehelix_palette(8, start=0, rot=.3, dark=0, light=1), - vmin=0, - vmax=np.max(link_candidates[scorestr]), - col_cluster=True, - col_colors=col_colors, - robust=True) - graph.fig.suptitle('Correlation map') - - # Rotate labels - plt.setp(graph.ax_heatmap.xaxis.get_majorticklabels(), rotation=90) - plt.setp(graph.ax_heatmap.yaxis.get_majorticklabels(), rotation=0) - - # Make labels smaller - plt.setp(graph.ax_heatmap.xaxis.get_majorticklabels(), fontsize=7) - plt.setp(graph.ax_heatmap.yaxis.get_majorticklabels(), fontsize=7) - - plt.ylabel("scoring index") - - return M_links + def _get_scores_source_gcf(self, scores: pd.DataFrame, + score_cutoff: float) -> pd.DataFrame: + row_indexes, col_indexes = np.where(scores >= score_cutoff) + src_obj_ids = scores.columns[col_indexes].to_list() + target_obj_ids = scores.index[row_indexes].to_list() + scores_candidate = scores.values[row_indexes, col_indexes].tolist() + return pd.DataFrame([src_obj_ids, target_obj_ids, scores_candidate], + index=['source', 
'target', 'score']) + + def _get_scores_source_met(self, scores: pd.DataFrame, + score_cutoff: float) -> pd.DataFrame: + row_indexes, col_indexes = np.where(scores >= score_cutoff) + src_obj_ids = scores.index[row_indexes].to_list() + target_obj_ids = scores.columns[col_indexes].to_list() + scores_candidate = scores.values[row_indexes, col_indexes].tolist() + return pd.DataFrame([src_obj_ids, target_obj_ids, scores_candidate], + index=['source', 'target', 'score']) diff --git a/src/nplinker/scoring/linking/link_likelihood.py b/src/nplinker/scoring/linking/link_likelihood.py index fa68f952..b13e1e97 100644 --- a/src/nplinker/scoring/linking/link_likelihood.py +++ b/src/nplinker/scoring/linking/link_likelihood.py @@ -1,11 +1,12 @@ +from deprecated import deprecated from nplinker.logconfig import LogConfig -from nplinker.scoring.linking.data_linking_functions import \ +from nplinker.scoring.linking.utils import \ calc_likelihood_matrix logger = LogConfig.getLogger(__file__) - +@deprecated(version='1.3.3', reason="It's unused and will be removed in 2.0.0") class LinkLikelihood(): """ Class to: @@ -39,27 +40,27 @@ def calculate_likelihoods(self, data_links, type='spec-gcf'): IF type='spec-gcf': P(GCF_x | spec_y), P(spec_y | GCF_x), P(GCF_x | not spec_y), P(spec_y | not GCF_x) - IF type='fam-gcf': + IF type='mf-gcf': P(GCF_x | fam_y), P(fam_y | GCF_x), P(GCF_x | not fam_y), P(fam_y | not GCF_x) """ - # Make selection for scenario spec<->gcf or fam<->gcf + # Make selection for scenario spec<->gcf or mf<->gcf if type == 'spec-gcf': - M_type1_type2 = data_links.M_spec_gcf - M_type1_nottype2 = data_links.M_spec_notgcf - M_nottype1_type2 = data_links.M_notspec_gcf - M_type1_cond = data_links.M_spec_strain - elif type == 'fam-gcf': - M_type1_type2 = data_links.M_fam_gcf - M_type1_nottype2 = data_links.M_fam_notgcf - M_nottype1_type2 = data_links.M_notfam_gcf - M_type1_cond = data_links.M_fam_strain + M_type1_type2 = data_links.cooccurrence_spec_gcf + M_type1_nottype2 = 
data_links.cooccurrence_spec_notgcf + M_nottype1_type2 = data_links.cooccurrence_notspec_gcf + M_type1_cond = data_links.occurrence_spec_strain + elif type == 'mf-gcf': + M_type1_type2 = data_links.cooccurrence_mf_gcf + M_type1_nottype2 = data_links.cooccurrence_mf_notgcf + M_nottype1_type2 = data_links.cooccurrence_notmf_gcf + M_type1_cond = data_links.occurrence_mf_strain elif type == 'spec-bgc' or type == 'fam-bgc': raise Exception("Given types are not yet supported... ") else: raise Exception( - "Wrong correlation 'type' given. Must be one of 'spec-gcf', 'fam-gcf'..." + "Wrong correlation 'type' given. Must be one of 'spec-gcf', 'mf-gcf'..." ) logger.debug( @@ -67,7 +68,7 @@ def calculate_likelihoods(self, data_links, type='spec-gcf'): # Calculate likelihood matrices using calc_likelihood_matrix() P_type2_given_type1, P_type2_not_type1, P_type1_given_type2, \ P_type1_not_type2 = calc_likelihood_matrix(M_type1_cond, - data_links.M_gcf_strain, + data_links.occurrence_gcf_strain, M_type1_type2, M_type1_nottype2, M_nottype1_type2) @@ -76,7 +77,7 @@ def calculate_likelihoods(self, data_links, type='spec-gcf'): self.P_gcf_not_spec = P_type2_not_type1 self.P_spec_given_gcf = P_type1_given_type2 self.P_spec_not_gcf = P_type1_not_type2 - elif type == 'fam-gcf': + elif type == 'mf-gcf': self.P_gcf_given_fam = P_type2_given_type1 self.P_gcf_not_fam = P_type2_not_type1 self.P_fam_given_gcf = P_type1_given_type2 diff --git a/src/nplinker/scoring/linking/data_linking_functions.py b/src/nplinker/scoring/linking/utils.py similarity index 98% rename from src/nplinker/scoring/linking/data_linking_functions.py rename to src/nplinker/scoring/linking/utils.py index 20824c62..044627ad 100644 --- a/src/nplinker/scoring/linking/data_linking_functions.py +++ b/src/nplinker/scoring/linking/utils.py @@ -18,6 +18,10 @@ import numpy as np +def isinstance_all(*objects, objtype) -> bool: + """Check if all objects are of the given type.""" + return all(isinstance(x, objtype) for x in objects) 
+ def calc_correlation_matrix(M_type1_cond, M_type2_cond): """ Calculate correlation matrices from co-occurence matrices diff --git a/src/nplinker/scoring/metcalf_scoring.py b/src/nplinker/scoring/metcalf_scoring.py index 65c45a5f..53e6f062 100644 --- a/src/nplinker/scoring/metcalf_scoring.py +++ b/src/nplinker/scoring/metcalf_scoring.py @@ -1,39 +1,68 @@ +from __future__ import annotations import os +from typing import TYPE_CHECKING import numpy as np -from nplinker.genomics import BGC +import pandas as pd from nplinker.genomics import GCF from nplinker.logconfig import LogConfig from nplinker.metabolomics.molecular_family import MolecularFamily from nplinker.metabolomics.spectrum import Spectrum from nplinker.pickler import load_pickled_data from nplinker.pickler import save_pickled_data -from nplinker.scoring.methods import ScoringMethod -from nplinker.scoring.object_link import ObjectLink -from nplinker.scoring.linking.data_linking import DataLinks -from nplinker.scoring.linking.link_finder import LinkFinder +from .linking import DataLinks +from .linking import isinstance_all +from .linking import LINK_TYPES +from .linking import LinkFinder +from .methods import ScoringMethod +from .object_link import ObjectLink + + +if TYPE_CHECKING: + from . import LinkCollection + from ..nplinker import NPLinker logger = LogConfig.getLogger(__name__) class MetcalfScoring(ScoringMethod): + """Metcalf scoring method. + + Attributes: + DATALINKS (DataLinks): The DataLinks object to use for scoring. + LINKFINDER (LinkFinder): The LinkFinder object to use for scoring. + NAME (str): The name of the scoring method. This is set to 'metcalf'. 
+ """ DATALINKS = None LINKFINDER = None NAME = 'metcalf' - # enumeration for accessing results of LinkFinder.get_links, which are (3, num_links) arrays: - # - R_SRC_ID: the ID of an object that was supplied as input to get_links - # - R_DST_ID: the ID of an object that was discovered to have a link to an input object - # - R_SCORE: the score for the link between a pair of objects - R_SRC_ID, R_DST_ID, R_SCORE = range(3) + def __init__(self, npl: NPLinker) -> None: + """Create a MetcalfScoring object. + + Args: + npl (NPLinker): The NPLinker object to use for scoring. - def __init__(self, npl): + Attributes: + cutoff (float): The cutoff value to use for scoring. Scores below + this value will be discarded. Defaults to 1.0. + standardised (bool): Whether to use standardised scores. Defaults + to True. + name (str): The name of the scoring method. It's set to a fixed value + 'metcalf'. + """ super().__init__(npl) self.cutoff = 1.0 self.standardised = True + # TODO CG: not sure why using staticmethod here. Check later and refactor if possible + # TODO CG: refactor this method and extract code for cache file to a separate method @staticmethod - def setup(npl): + def setup(npl: NPLinker): + """Setup the MetcalfScoring object. + + DataLinks and LinkFinder objects are created and cached for later use. 
+ """ logger.info( 'MetcalfScoring.setup (bgcs={}, gcfs={}, spectra={}, molfams={}, strains={})' .format(len(npl.bgcs), len(npl.gcfs), len(npl.spectra), @@ -45,7 +74,6 @@ def setup(npl): # the metcalf preprocessing can take a long time for large datasets, so it's # better to cache as the data won't change unless the number of objects does - dataset_counts = [ len(npl.bgcs), len(npl.gcfs), @@ -76,250 +104,225 @@ def setup(npl): logger.info( 'MetcalfScoring.setup preprocessing dataset (this may take some time)' ) - MetcalfScoring.DATALINKS = DataLinks() - MetcalfScoring.DATALINKS.load_data(npl._spectra, npl._gcfs, - npl._strains, npl.molfams) - # TODO fix crash with this set to True, see https://github.com/sdrogers/nplinker/issues/57 - MetcalfScoring.DATALINKS.find_correlations( - include_singletons=False) + MetcalfScoring.DATALINKS = DataLinks(npl.gcfs, npl.spectra, + npl.molfams, npl.strains) MetcalfScoring.LINKFINDER = LinkFinder() - MetcalfScoring.LINKFINDER.metcalf_scoring(MetcalfScoring.DATALINKS, - type='spec-gcf') - MetcalfScoring.LINKFINDER.metcalf_scoring(MetcalfScoring.DATALINKS, - type='fam-gcf') + MetcalfScoring.LINKFINDER.calc_score(MetcalfScoring.DATALINKS, + link_type=LINK_TYPES[0]) + MetcalfScoring.LINKFINDER.calc_score(MetcalfScoring.DATALINKS, + link_type=LINK_TYPES[1]) logger.debug('MetcalfScoring.setup caching results') save_pickled_data((dataset_counts, MetcalfScoring.DATALINKS, MetcalfScoring.LINKFINDER), cache_file) logger.info('MetcalfScoring.setup completed') + # TODO CG: is it needed? 
remove it if not @property - def datalinks(self): + def datalinks(self) -> DataLinks: return MetcalfScoring.DATALINKS - def _metcalf_postprocess_met(self, linkfinder, results, input_type): - logger.debug( - 'Postprocessing results for standardised Metcalf scores (met input)' - ) - # results will be links from EITHER Spectrum OR MolFam => GCF here - - # need to know if the metabolomic objects given as input are Spectrum/MolFam - met_objs = self.npl.spectra if input_type == Spectrum else self.npl.molfams - new_src, new_dst, new_sco = [], [], [] - - # go through each pair of input objects and calculate their standardised scores - for i in range(len(results[0][self.R_SRC_ID])): - met_obj = met_objs[int(results[0][self.R_SRC_ID][i])] - # met_obj will now be either a Spectrum or a MolecularFamily, but - # doesn't matter which (in this implementation at least) because they - # both have a .strains attribute which is the only thing we need. For - # Spectra it's the number of strains, for a MolFam it's the total - # number of *unique* strains across all Spectra in that family. 
- met_strains = len(met_obj.strains) - gcf = self.npl.gcfs[int(results[0][self.R_DST_ID][i])] - gen_strains = len(gcf.strains) - - # lookup expected + variance values based on strain counts - expected = linkfinder.metcalf_expected[met_strains][gen_strains] - variance_sqrt = linkfinder.metcalf_variance_sqrt[met_strains][ - gen_strains] - - # calculate the final score based on the basic Metcalf score for these two - # particular objects - final_score = (results[0][self.R_SCORE][i] - - expected) / variance_sqrt - - # finally apply the scoring cutoff and store the result - if self.cutoff is None or (final_score >= self.cutoff): - new_src.append(int(results[0][self.R_SRC_ID][i])) - new_dst.append(int(results[0][self.R_DST_ID][i])) - new_sco.append(final_score) - - # overwrite original "results" with equivalent new data structure - return [np.array([new_src, new_dst, new_sco])] - - def _metcalf_postprocess_gen(self, linkfinder, results, input_type): - logger.debug( - 'Postprocessing results for standardised Metcalf scores (gen input)' - ) - # results will be links from GCF to BOTH Spectrum and MolFams here (first - # element Spectra, second MolFams) - - new_results = [] - met_objs_list = [self.npl.spectra, self.npl.molfams] - - # iterate over the Spectrum results and then the MolFam results - for m, met_objs in enumerate(met_objs_list): - new_src, new_dst, new_sco = [], [], [] - - # go through each pair of input objects and calculate their standardised scores - for i in range(len(results[m][self.R_SRC_ID])): - gcf = self.npl.gcfs[int(results[m][self.R_SRC_ID][i])] - gen_strains = len(gcf.strains) - - # met_obj will now be either a Spectrum or a MolecularFamily, but - # doesn't matter which (in this implementation at least) because they - # both have a .strains attribute which is the only thing we need. For - # Spectra it's the number of strains, for a MolFam it's the total - # number of *unique* strains across all Spectra in that family. 
- met_obj = met_objs[int(results[m][self.R_DST_ID][i])] - met_strains = len(met_obj.strains) - - # lookup expected + variance values based on strain counts - expected = linkfinder.metcalf_expected[met_strains][ - gen_strains] - variance_sqrt = linkfinder.metcalf_variance_sqrt[met_strains][ - gen_strains] - - # calculate the final score based on the basic Metcalf score for these two - # particular objects - final_score = (results[m][self.R_SCORE][i] - - expected) / variance_sqrt - - # finally apply the scoring cutoff and store the result - if self.cutoff is None or (final_score >= self.cutoff): - new_src.append(int(results[m][self.R_SRC_ID][i])) - new_dst.append(int(results[m][self.R_DST_ID][i])) - new_sco.append(final_score) - - # overwrite original "results" with equivalent new data structure - new_results.append(np.array([new_src, new_dst, new_sco])) - - return new_results - - def get_links(self, objects, link_collection): - # enforce constraint that the list must contain a set of identically typed objects - if not all(isinstance(x, type(objects[0])) for x in objects): - raise Exception( - 'MetcalfScoring: uniformly-typed list of objects is required') - - # also can't handle BGCs here, must be one of the other 3 types (GCF/Spectrum/MolecularFamily) - if isinstance(objects[0], BGC): - raise Exception( - 'MetcalfScoring requires input type GCF/Spectrum/MolecularFamily, not BGC' + def get_links(self, *objects: GCF | Spectrum | MolecularFamily, + link_collection: LinkCollection) -> LinkCollection: + """Get links for the given objects and add them to the given LinkCollection. + + The given objects are treated as input or source objects, which must + be GCF, Spectrum or MolecularFamily objects. + + Args: + objects: The objects to get links for. Must be GCF, Spectrum + or MolecularFamily objects. + link_collection: The LinkCollection object to add the links to. + + Returns: + LinkCollection: The LinkCollection object with the new links added. 
+ + Raises: + ValueError: If the input objects are empty. + TypeError: If the input objects are not of the correct type. + ValueError: If LinkFinder instance has not been created + (MetcalfScoring object has not been setup). + """ + if len(objects) == 0: + raise ValueError('Empty input objects.') + + if isinstance_all(*objects, objtype=GCF): + obj_type = 'gcf' + elif isinstance_all(*objects, objtype=Spectrum): + obj_type = 'spec' + elif isinstance_all(*objects, objtype=MolecularFamily): + obj_type = 'mf' + else: + types = [type(i) for i in objects] + raise TypeError( + f'Invalid type {set(types)}. Input objects must be GCF, Spectrum or MolecularFamily objects.' ) - datalinks = MetcalfScoring.DATALINKS - linkfinder = MetcalfScoring.LINKFINDER - input_type = type(objects[0]) + if self.LINKFINDER is None: + raise ValueError(( + 'LinkFinder object not found. Have you called `MetcalfScoring.setup(npl)`?' + )) - logger.debug('MetcalfScoring: standardised = {}'.format( - self.standardised)) + logger.debug(f'MetcalfScoring: standardised = {self.standardised}') if not self.standardised: - results = linkfinder.get_links(datalinks, objects, self.name, - self.cutoff) + scores_list = self.LINKFINDER.get_links(*objects, + score_cutoff=self.cutoff) + # TODO CG: verify the logics of standardised score and add unit tests else: - # get the basic Metcalf scores BUT ignore the cutoff value here by setting - # it to None. The actual user-supplied cutoff value is applied further down - # once the standardised scores for these results have been calculated. 
- results = linkfinder.get_links(datalinks, objects, self.name, None) - - # The "results" object varies slightly depending on the input provided - # to the LinkFinder class: - # - given Spectra/MolFam input, it will be a single element list containing - # a (3, x) array, where the first row contains source (input) object - # IDs, the second contains destination (linked) object IDs, and the - # third contains regular Metcalf scores for those pairs of objects. - # - however for GCF input, "results" is instead a 2-element list where - # each entry has the same structure as described above, with the first - # entry describing GCF-Spectrum links and the second GCF-MolFam links. - - gcf_input = (input_type == GCF) - - if not gcf_input: - results = self._metcalf_postprocess_met( - linkfinder, results, input_type) + # use negative infinity as the score cutoff to ensure we get all links + # the self.cutoff will be applied later in the postprocessing step + scores_list = self.LINKFINDER.get_links(*objects, + score_cutoff=np.NINF) + if obj_type == 'gcf': + scores_list = self._calc_standardised_score_gen( + self.LINKFINDER, scores_list) else: - results = self._metcalf_postprocess_gen( - linkfinder, results, input_type) + scores_list = self._calc_standardised_score_met( + self.LINKFINDER, scores_list) - scores_found = set() - metcalf_results = {} - - if input_type == GCF: + link_scores: dict[GCF | Spectrum | MolecularFamily, + dict[GCF | Spectrum | MolecularFamily, + ObjectLink]] = {} + if obj_type == 'gcf': logger.debug( - 'MetcalfScoring: input_type=GCF, result_type=Spec/MolFam, inputs={}, results={}' - .format(len(objects), results[0].shape)) - # for GCF input, results contains two arrays of shape (3, x), - # which contain spec-gcf and fam-gcf links respectively - result_gcf_spec, result_gcf_fam = results[0], results[1] - - for res, type_ in [(result_gcf_spec, Spectrum), - (result_gcf_fam, MolecularFamily)]: - if res.shape[1] == 0: - if type_ != MolecularFamily: - 
logger.debug( - 'Found no links for {} input objects (type {})'. - format(len(objects), type_)) - continue # no results - - # for each entry in the results (each Spectrum or MolecularFamily) - for j in range(res.shape[1]): - # extract the ID of the object and get the object itself - obj_id = int(res[self.R_DST_ID, j]) - obj = self.npl._spectra[ - obj_id] if type_ == Spectrum else self.npl._molfams[ - obj_id] - - # retrieve the GCF object too (can use its internal ID to index - # directly into the .gcfs list) - gcf = self.npl._gcfs[int(res[self.R_SRC_ID][j])] - - # record that this GCF has at least one link associated with it - scores_found.add(gcf) - - # save the scores - if gcf not in metcalf_results: - metcalf_results[gcf] = {} - metcalf_results[gcf][obj] = ObjectLink( - gcf, obj, self, res[self.R_SCORE, j]) - + f'MetcalfScoring: input_type=GCF, result_type=Spec/MolFam, ' + f'#inputs={len(objects)}.') + for scores in scores_list: + # when no links found + if scores.shape[1] == 0: + logger.debug( + f'MetcalfScoring: found no "{scores.name}" links') + else: + # when links found + for col_index in range(scores.shape[1]): + gcf = self.npl.lookup_gcf(scores.loc['source', + col_index]) + if scores.name == LINK_TYPES[0]: + met = self.npl.lookup_spectrum( + scores.loc['target', col_index]) + else: + met = self.npl.lookup_mf(scores.loc['target', + col_index]) + if gcf not in link_scores: + link_scores[gcf] = {} + # TODO CG: use id instead of object for gcf, met and self? + link_scores[gcf][met] = ObjectLink( + gcf, met, self, scores.loc['score', col_index]) + logger.debug( + f'MetcalfScoring: found {len(link_scores)} {scores.name} links.' 
+ ) else: logger.debug( - 'MetcalfScoring: input_type=Spec/MolFam, result_type=GCF, inputs={}, results={}' - .format(len(objects), results[0].shape)) - # for non-GCF input, result is a list containing a single array, shape (3, x) - # where x is the total number of links found - results = results[0] - if results.shape[1] == 0: - logger.debug('Found no links for {} input objects'.format( - len(objects))) - link_collection._add_links_from_method(self, metcalf_results) - # can just bail out here in this case - logger.debug('MetcalfScoring: completed') - return link_collection - - # for each entry in the results (each GCF) - for j in range(results.shape[1]): - # extract the ID of the GCF and use that to get the object itself - gcf = self.npl._gcfs[int(results[self.R_DST_ID, j])] - - # retrieve the Spec/MolFam object too (can use its internal ID to index - # directly into the appropriate list) - obj_id = int(results[self.R_SRC_ID, j]) - obj = self.npl._spectra[ - obj_id] if input_type == Spectrum else self.npl._molfams[ - obj_id] - - # record that this Spectrum or MolecularFamily has at least one link associated with it - scores_found.add(obj) - - # save the scores - if obj not in metcalf_results: - metcalf_results[obj] = {} - metcalf_results[obj][gcf] = ObjectLink( - obj, gcf, self, results[self.R_SCORE, j]) - - logger.debug('MetcalfScoring found {} results'.format( - len(metcalf_results))) - link_collection._add_links_from_method(self, metcalf_results) + f'MetcalfScoring: input_type=Spec/MolFam, result_type=GCF, ' + f'#inputs={len(objects)}.') + scores = scores_list[0] + # when no links found + if scores.shape[1] == 0: + logger.debug( + f'MetcalfScoring: found no links "{scores.name}" for input objects' + ) + else: + for col_index in range(scores.shape[1]): + gcf = self.npl.lookup_gcf(scores.loc['target', col_index]) + if scores.name == LINK_TYPES[0]: + met = self.npl.lookup_spectrum(scores.loc['source', + col_index]) + else: + met = 
self.npl.lookup_mf(scores.loc['source', + col_index]) + if met not in link_scores: + link_scores[met] = {} + link_scores[met][gcf] = ObjectLink( + met, gcf, self, scores.loc['score', col_index]) + logger.debug( + f'MetcalfScoring: found {len(link_scores)} {scores.name} links.' + ) + + link_collection._add_links_from_method(self, link_scores) logger.debug('MetcalfScoring: completed') return link_collection + def _calc_standardised_score_met(self, linkfinder: LinkFinder, + results: list) -> list[pd.DataFrame]: + if linkfinder.metcalf_mean is None or linkfinder.metcalf_std is None: + raise ValueError( + 'Metcalf mean and std not found. Have you called `MetcalfScoring.setup(npl)`?' + ) + logger.debug('Calculating standardised Metcalf scores (met input)') + raw_score = results[0] + z_scores = [] + for col_index in range(raw_score.shape[1]): + gcf = self.npl.lookup_gcf(raw_score.loc['target', col_index]) + if raw_score.name == LINK_TYPES[0]: + met = self.npl.lookup_spectrum(raw_score.at['source', + col_index]) + else: + met = self.npl.lookup_mf(raw_score.at['source', col_index]) + + num_gcf_strains = len(gcf.strains) + num_met_strains = len(met.strains) + mean = linkfinder.metcalf_mean[num_met_strains][num_gcf_strains] + sqrt = linkfinder.metcalf_std[num_met_strains][num_gcf_strains] + z_score = (raw_score.at['score', col_index] - mean) / sqrt + z_scores.append(z_score) + + z_scores = np.array(z_scores) + mask = z_scores >= self.cutoff + + scores_df = pd.DataFrame([ + raw_score.loc['source'].values[mask], + raw_score.loc['target'].values[mask], z_scores[mask] + ], + index=raw_score.index) + scores_df.name = raw_score.name + + return [scores_df] + + def _calc_standardised_score_gen(self, linkfinder: LinkFinder, + results: list) -> list[pd.DataFrame]: + if linkfinder.metcalf_mean is None or linkfinder.metcalf_std is None: + raise ValueError( + 'Metcalf mean and std not found. Have you called `MetcalfScoring.setup(npl)`?' 
+ ) + logger.debug('Calculating standardised Metcalf scores (gen input)') + postprocessed_scores = [] + for raw_score in results: + z_scores = [] + for col_index in range(raw_score.shape[1]): + gcf = self.npl.lookup_gcf(raw_score.loc['source', col_index]) + if raw_score.name == LINK_TYPES[0]: + met = self.npl.lookup_spectrum(raw_score.at['target', + col_index]) + else: + met = self.npl.lookup_mf(raw_score.at['target', col_index]) + + num_gcf_strains = len(gcf.strains) + num_met_strains = len(met.strains) + mean = linkfinder.metcalf_mean[num_met_strains][ + num_gcf_strains] + sqrt = linkfinder.metcalf_std[num_met_strains][num_gcf_strains] + z_score = (raw_score.at['score', col_index] - mean) / sqrt + z_scores.append(z_score) + + z_scores = np.array(z_scores) + mask = z_scores >= self.cutoff + + scores_df = pd.DataFrame([ + raw_score.loc['source'].values[mask], + raw_score.loc['target'].values[mask], z_scores[mask] + ], + index=raw_score.index) + scores_df.name = raw_score.name + postprocessed_scores.append(scores_df) + + return postprocessed_scores + + # TODO CG: refactor this method def format_data(self, data): # for metcalf the data will just be a floating point value (i.e. the score) return f'{data:.4f}' + # TODO CG: refactor this method def sort(self, objects, reverse=True): # sort based on score return sorted(objects, diff --git a/src/nplinker/scoring/methods.py b/src/nplinker/scoring/methods.py index ff6e7f54..e42b6cdd 100644 --- a/src/nplinker/scoring/methods.py +++ b/src/nplinker/scoring/methods.py @@ -12,9 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import annotations +from typing import TYPE_CHECKING from nplinker.logconfig import LogConfig +if TYPE_CHECKING: + from . 
import LinkCollection logger = LogConfig.getLogger(__name__) @@ -35,7 +39,7 @@ def setup(npl): """Perform any one-off initialisation required (will only be called once)""" pass - def get_links(self, objects, link_collection): + def get_links(self, *objects, link_collection: LinkCollection) -> LinkCollection: """Given a set of objects, return link information""" return link_collection diff --git a/src/nplinker/scoring/np_class_scoring.py b/src/nplinker/scoring/np_class_scoring.py index 51335c00..84975344 100644 --- a/src/nplinker/scoring/np_class_scoring.py +++ b/src/nplinker/scoring/np_class_scoring.py @@ -261,7 +261,7 @@ def _get_met_classes(self, spec_like, method='mix'): # list of list of tuples/None - todo: add to spectrum object? # take only 'best' (first) classification per ontology level all_classes = self.npl.chem_classes.canopus. \ - spectra_classes.get(str(spec_like.spectrum_id)) + spectra_classes.get(spec_like.spectrum_id) if all_classes: spec_like_classes = [ cls_per_lvl for lvl in all_classes @@ -270,8 +270,8 @@ def _get_met_classes(self, spec_like, method='mix'): spec_like_classes_names_inds = self.npl.chem_classes.canopus. \ spectra_classes_names_inds else: # molfam - fam_id = str(spec_like.family_id) - if fam_id == '-1': # account for singleton families + fam_id = spec_like.family_id + if fam_id.startswith("singleton-"): # account for singleton families fam_id += f'_{spec_like.spectra[0].spectrum_id}' all_classes = self.npl.chem_classes.canopus.molfam_classes.get( fam_id) @@ -288,8 +288,8 @@ def _get_met_classes(self, spec_like, method='mix'): spec_like_classes = self.npl.chem_classes.molnetenhancer. \ spectra_classes(spec_like.spectrum_id) else: # molfam - fam_id = str(spec_like.family_id) - if fam_id == '-1': # account for singleton families + fam_id = spec_like.family_id + if fam_id.startswith("singleton"): # account for singleton families fam_id += f'_{spec_like.spectra[0].spectrum_id}' spec_like_classes = self.npl.chem_classes.molnetenhancer. 
\ molfam_classes.get(fam_id) @@ -337,10 +337,12 @@ def get_links(self, objects, link_collection): if not self.npl._datalinks: self.npl._datalinks = self.npl.scoring_method( MetcalfScoring.NAME).datalinks - # this is a dict with structure: - # tup(Spectrum/MolecularFamily, GCF) => array of strain indices - common_strains = self.npl._datalinks.common_strains( - objects, targets, True) + if obj_is_gen: + common_strains = self.npl.get_common_strains( + targets, objects) + else: + common_strains = self.npl.get_common_strains( + objects, targets) logger.info(f"Calculating NPClassScore for {len(objects)} objects to " f"{len(targets)} targets ({len(common_strains)} pairwise " f"interactions that share at least 1 strain). This might " diff --git a/src/nplinker/scoring/rosetta/spec_lib.py b/src/nplinker/scoring/rosetta/spec_lib.py index 27bb9723..1a332a5e 100644 --- a/src/nplinker/scoring/rosetta/spec_lib.py +++ b/src/nplinker/scoring/rosetta/spec_lib.py @@ -13,8 +13,8 @@ # limitations under the License. 
from sortedcontainers import SortedList -from ...logconfig import LogConfig from nplinker.metabolomics.gnps.gnps_spectrum_loader import GNPSSpectrumLoader +from ...logconfig import LogConfig from .rosetta_functions import fast_cosine diff --git a/src/nplinker/strain_collection.py b/src/nplinker/strain_collection.py index 00603e96..42387c9b 100644 --- a/src/nplinker/strain_collection.py +++ b/src/nplinker/strain_collection.py @@ -1,5 +1,4 @@ import csv -import os from os import PathLike from pathlib import Path from typing import Iterator @@ -18,8 +17,8 @@ class StrainCollection(): def __init__(self): """A collection of Strain objects.""" self._strains: list[Strain] = [] - self._strain_dict_id: dict[str, Strain] = {} - self._strain_dict_index: dict[int, Strain] = {} + # dict of strain name (id and alias) to primary strain object + self._strain_dict_name: dict[str, Strain] = {} def __repr__(self) -> str: return str(self) @@ -37,18 +36,19 @@ def __len__(self) -> int: def __eq__(self, other) -> bool: if isinstance(other, StrainCollection): return (self._strains == other._strains - and self._strain_dict_id == other._strain_dict_id - and self._strain_dict_index == other._strain_dict_index) + and self._strain_dict_name == other._strain_dict_name) return NotImplemented - def __contains__(self, strain: str | Strain) -> bool: - if isinstance(strain, str): - value = strain in self._strain_dict_id - elif isinstance(strain, Strain): - value = strain.id in self._strain_dict_id - else: - raise TypeError(f"Expected Strain or str, got {type(strain)}") - return value + def __contains__(self, item: str | Strain) -> bool: + """Check if the strain collection contains the given strain. + + The given strain could be a Strain object, or a strain id or alias. 
+ """ + if isinstance(item, str): + return item in self._strain_dict_name + if isinstance(item, Strain): + return item.id in self._strain_dict_name + raise TypeError(f"Expected Strain or str, got {type(item)}") def __iter__(self) -> Iterator[Strain]: return iter(self._strains) @@ -62,17 +62,16 @@ def add(self, strain: Strain) -> None: strain(Strain): The strain to add. """ # if the strain exists, merge the aliases - if strain.id in self._strain_dict_id: + if strain.id in self._strain_dict_name: existing: Strain = self.lookup(strain.id) for alias in strain.aliases: existing.add_alias(alias) - self._strain_dict_id[alias] = existing + self._strain_dict_name[alias] = existing else: - self._strain_dict_index[len(self)] = strain self._strains.append(strain) - self._strain_dict_id[strain.id] = strain + self._strain_dict_name[strain.id] = strain for alias in strain.aliases: - self._strain_dict_id[alias] = strain + self._strain_dict_name[alias] = strain def remove(self, strain: Strain): """Remove a strain from the collection. @@ -80,12 +79,11 @@ def remove(self, strain: Strain): Args: strain(Strain): The strain to remove. """ - if strain.id in self._strain_dict_id: + if strain.id in self._strain_dict_name: self._strains.remove(strain) - # remove from dict id - del self._strain_dict_id[strain.id] + del self._strain_dict_name[strain.id] for alias in strain.aliases: - del self._strain_dict_id[alias] + del self._strain_dict_name[alias] def filter(self, strain_set: set[Strain]): """ @@ -96,24 +94,11 @@ def filter(self, strain_set: set[Strain]): if strain not in strain_set: self.remove(strain) - def lookup_index(self, index: int) -> Strain: - """Return the strain from lookup by index. - - Args: - index(int): Position index from which to retrieve the strain - - Returns: - Strain: Strain identified by the given index. - """ - return self._strain_dict_index[index] - def lookup(self, name: str) -> Strain: """Lookup a strain by name (id or alias). 
- If the name is found, return the strain object; Otherwise, raise a - KeyError. - Args: + name(str): Strain name (id or alias) to lookup. Returns: @@ -122,9 +107,9 @@ def lookup(self, name: str) -> Strain: Raises: KeyError: If the strain name is not found. """ - if name not in self._strain_dict_id: - raise KeyError(f"Strain {name} not found in strain collection.") - return self._strain_dict_id[name] + if name in self: + return self._strain_dict_name[name] + raise KeyError(f"Strain {name} not found in strain collection.") def add_from_file(self, file: str | PathLike) -> None: """Add strains from a strain mapping file. diff --git a/src/nplinker/strains.py b/src/nplinker/strains.py index 661c7268..7b07a588 100644 --- a/src/nplinker/strains.py +++ b/src/nplinker/strains.py @@ -27,8 +27,7 @@ def __str__(self) -> str: def __eq__(self, other) -> bool: if isinstance(other, Strain): - return (self.id == other.id - and self.aliases == other.aliases) + return self.id == other.id return NotImplemented def __hash__(self) -> int: @@ -39,6 +38,11 @@ def __hash__(self) -> int: """ return hash(self.id) + def __contains__(self, alias: str) -> bool: + if not isinstance(alias, str): + raise TypeError(f'Expected str, got {type(alias)}') + return alias in self._aliases + @property def aliases(self) -> set[str]: """Get the set of known aliases. @@ -54,6 +58,8 @@ def add_alias(self, alias: str) -> None: Args: alias(str): The alias to add to the list of known aliases. 
""" + if not isinstance(alias, str): + raise TypeError(f'Expected str, got {type(alias)}') if len(alias) == 0: logger.warning( 'Refusing to add an empty-string alias to strain {%s}', self) diff --git a/tests/conftest.py b/tests/conftest.py index cac0255c..123e45d4 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -36,7 +36,7 @@ def prepare_data(): @pytest.fixture -def spec_dict() -> dict[int, Spectrum]: +def spec_dict() -> dict[str, Spectrum]: mgf_file = DATA_DIR / "spectra.mgf" edges_file = DATA_DIR / "edges.pairsinfo" return load_spectra(mgf_file, edges_file) diff --git a/tests/metabolomics/test_gnps_annotation_loader.py b/tests/metabolomics/test_gnps_annotation_loader.py index 352be137..24f7cf99 100644 --- a/tests/metabolomics/test_gnps_annotation_loader.py +++ b/tests/metabolomics/test_gnps_annotation_loader.py @@ -32,7 +32,7 @@ def test_reads_all_annotations(file, expected): assert len(sut.get_annotations()) == expected -def test_annotations_are_equal(spec_dict: dict[int, Spectrum]): +def test_annotations_are_equal(spec_dict: dict[str, Spectrum]): annotations_dir = DATA_DIR / "annotations" annotations_file = annotations_dir / "gnps_annotations.tsv" @@ -44,7 +44,7 @@ def test_annotations_are_equal(spec_dict: dict[int, Spectrum]): spectra, spec_dict ) - expected: dict[int, dict] = {} + expected = {} for x in sut: if x.has_annotations(): expected[x.spectrum_id] = x.gnps_annotations diff --git a/tests/metabolomics/test_gnps_molecular_family_loader.py b/tests/metabolomics/test_gnps_molecular_family_loader.py index e8ac05a1..6e9cd531 100644 --- a/tests/metabolomics/test_gnps_molecular_family_loader.py +++ b/tests/metabolomics/test_gnps_molecular_family_loader.py @@ -1,42 +1,34 @@ import os -import numpy import pytest from nplinker.metabolomics.gnps.gnps_molecular_family_loader import \ GNPSMolecularFamilyLoader -from nplinker.metabolomics.metabolomics import make_families -from nplinker.metabolomics.molecular_family import MolecularFamily from .. 
import DATA_DIR -@pytest.fixture -def molecular_families(spec_dict) -> list[MolecularFamily]: - return make_families(spec_dict.values()) - -@pytest.mark.parametrize("filename", [ - os.path.join(DATA_DIR, "edges.pairsinfo"), - DATA_DIR / "edges.pairsinfo" -]) +@pytest.mark.parametrize( + "filename", + [os.path.join(DATA_DIR, "edges.pairsinfo"), DATA_DIR / "edges.pairsinfo"]) def test_has_molecular_families(filename): sut = GNPSMolecularFamilyLoader(filename) actual = sut.families() assert len(actual) == 25769 - assert len(actual[0].spectra_ids) == 19 - - -def test_families_are_identical(spec_dict, molecular_families): - filename = os.path.join(DATA_DIR, "edges.pairsinfo") - actual = GNPSMolecularFamilyLoader(filename).families() - - actual.sort(key= lambda x: min(x.spectra_ids)) - - for i, x in enumerate(actual): - x.id = i - for spec_id in x.spectra_ids: - x.add_spectrum(spec_dict[spec_id]) - - - for x in molecular_families: - for spec in x.spectra: - x.spectra_ids.add(spec.spectrum_id) - numpy.testing.assert_array_equal(actual, molecular_families) + assert [mf.family_id for mf in actual[:29]] == [ + '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', + '14', '15', '16', '17', '18', '20', '21', '22', '23', '24', '26', '28', + '30', '31', '32', '33' + ] + for i in range(30, 25769): + assert actual[i].family_id.startswith('singleton-') + + assert [len(mf.spectra_ids) for mf in actual[:30]] == [ + 19, 48, 3, 3, 11, 4, 9, 3, 15, 3, 5, 2, 3, 3, 5, 3, 14, 4, 2, 2, 12, 2, + 3, 5, 2, 4, 2, 2, 2, 1 + ] + for i in range(30, 25769): + assert len(actual[i].spectra_ids) == 1 + + assert actual[0].spectra_ids == set( + ('13170', '13662', '15316', '15364', '16341', '17201', '17270', + '18120', '18172', '18748', '18831', '19005', '19673', '19719', + '20320', '20738', '21163', '21593', '23566')) diff --git a/tests/metabolomics/test_load_gnps.py b/tests/metabolomics/test_load_gnps.py index f3c4961b..354895c6 100644 --- a/tests/metabolomics/test_load_gnps.py +++ 
b/tests/metabolomics/test_load_gnps.py @@ -1,13 +1,13 @@ from itertools import chain - import pytest -from nplinker.metabolomics.load_gnps import _messy_strain_naming_lookup, _parse_mzxml_header -from nplinker.metabolomics.gnps.gnps_format import gnps_format_from_file_mapping, GNPSFormat +from nplinker.metabolomics.gnps.gnps_format import gnps_format_from_file_mapping +from nplinker.metabolomics.gnps.gnps_format import GNPSFormat from nplinker.metabolomics.load_gnps import _load_clusterinfo_old +from nplinker.metabolomics.load_gnps import _messy_strain_naming_lookup +from nplinker.metabolomics.load_gnps import _parse_mzxml_header from nplinker.metabolomics.load_gnps import load_gnps from nplinker.strain_collection import StrainCollection from nplinker.utils import get_headers - from .. import DATA_DIR @@ -51,7 +51,7 @@ def test_load_clusterinfo_old(spec_dict): for spectrum_id, spec in spec_dict.items(): metadata = spec.metadata assert len(metadata.get('files')) > 1 - assert isinstance(metadata.get('cluster_index'), int) + assert isinstance(metadata.get('cluster_index'), str) assert spectrum_id == metadata.get('cluster_index') diff --git a/tests/metabolomics/test_spectrum.py b/tests/metabolomics/test_spectrum.py index bcb82d36..30473f90 100644 --- a/tests/metabolomics/test_spectrum.py +++ b/tests/metabolomics/test_spectrum.py @@ -1,13 +1,13 @@ import pytest - from nplinker.metabolomics.spectrum import Spectrum + @pytest.fixture def spectrum() -> Spectrum: spec = Spectrum( 1, peaks=[[10, 100], [20, 150]], - spectrum_id=2, + spectrum_id="2", precursor_mz=30, parent_mz=50, rt= 100 diff --git a/tests/scoring/conftest.py b/tests/scoring/conftest.py new file mode 100644 index 00000000..b64a5694 --- /dev/null +++ b/tests/scoring/conftest.py @@ -0,0 +1,109 @@ +from pytest import fixture +from nplinker.genomics import GCF +from nplinker.metabolomics.molecular_family import MolecularFamily +from nplinker.metabolomics.spectrum import Spectrum +from nplinker.nplinker import 
NPLinker +from nplinker.scoring import MetcalfScoring +from nplinker.scoring.linking import DataLinks +from nplinker.scoring.linking import LinkFinder +from nplinker.strain_collection import StrainCollection +from nplinker.strains import Strain +from .. import DATA_DIR + + +@fixture(scope='session') +def strains_list() -> tuple[Strain, Strain, Strain]: + return Strain('strain1'), Strain('strain2'), Strain('strain3') + + +@fixture(scope='session') +def strains(strains_list) -> StrainCollection: + strains = StrainCollection() + for strain in strains_list: + strains.add(strain) + return strains + + +@fixture(scope='session') +def gcfs(strains_list) -> tuple[GCF, GCF, GCF]: + gcf1 = GCF('gcf1') + gcf1.strains.add(strains_list[0]) + gcf2 = GCF('gcf2') + gcf2.strains.add(strains_list[1]) + gcf3 = GCF('gcf3') + gcf3.strains.add(strains_list[0]) + gcf3.strains.add(strains_list[1]) + return gcf1, gcf2, gcf3 + + +@fixture(scope='session') +def spectra(strains_list) -> tuple[Spectrum, Spectrum, Spectrum]: + spectrum1 = Spectrum(1, [(1, 1)], "spectrum1", None) + spectrum1.strains.add(strains_list[0]) + spectrum2 = Spectrum(2, [(1, 1)], "spectrum2", None) + spectrum2.strains.add(strains_list[1]) + spectrum3 = Spectrum(3, [(1, 1)], "spectrum3", None) + spectrum3.strains.add(strains_list[0]) + spectrum3.strains.add(strains_list[1]) + return spectrum1, spectrum2, spectrum3 + + +@fixture(scope='session') +def mfs(spectra) -> tuple[MolecularFamily, MolecularFamily, MolecularFamily]: + """For simplicity, we just use one Spectrum object for each MolecularFamily + object, and notice that they are not SingletonFamily object. + """ + mf1 = MolecularFamily('mf1') + mf1.add_spectrum(spectra[0]) + mf2 = MolecularFamily('mf2') + mf2.add_spectrum(spectra[1]) + mf3 = MolecularFamily('mf3') + mf3.add_spectrum(spectra[2]) + return mf1, mf2, mf3 + + +@fixture(scope='module') +def datalinks(gcfs, spectra, mfs, strains) -> DataLinks: + """DataLinks object. 
See `test_data_links.py` for its values.""" + return DataLinks(gcfs, spectra, mfs, strains) + + +@fixture(scope='module') +def linkfinder(datalinks) -> LinkFinder: + """LinkFinder object. See `test_link_finder.py` for its values.""" + linkfinder = LinkFinder() + linkfinder.calc_score(datalinks, link_type='spec-gcf') + linkfinder.calc_score(datalinks, link_type='mf-gcf') + return linkfinder + + +@fixture(scope='module') +def npl(gcfs, spectra, mfs, strains, tmp_path_factory) -> NPLinker: + """Constructed NPLinker object. + + This NPLinker object does not do loading `npl.load_data()`, instead we + manually set its attributes to the values we want to test. + + The config file `nplinker_demo1.toml` does not affect the tests, just + making sure the NPLinker object can be created succesfully. + """ + npl = NPLinker(str(DATA_DIR / 'nplinker_demo1.toml')) + npl._gcfs = gcfs + npl._spectra = spectra + npl._molfams = mfs + npl._strains = strains + npl._gcf_lookup = {gcf.gcf_id: gcf for gcf in gcfs} + npl._mf_lookup = {mf.family_id: mf for mf in mfs} + npl._spec_lookup = {spec.spectrum_id: spec for spec in spectra} + # tmp path to store 'metcalf/metcalf_scores.pckl' file + # Must use `tmp_path_factory` (session scope) instead of `tmp_path` (function scope) + npl._loader._root = tmp_path_factory.mktemp('npl_test') + return npl + + +@fixture(scope='module') +def mc(npl) -> MetcalfScoring: + """MetcalfScoring object.""" + mc = MetcalfScoring(npl) + mc.setup(npl) + return mc diff --git a/tests/scoring/test_data_links.py b/tests/scoring/test_data_links.py index 004db682..22a9c21a 100644 --- a/tests/scoring/test_data_links.py +++ b/tests/scoring/test_data_links.py @@ -1,44 +1,221 @@ -import os +import pandas as pd +from pandas.util.testing import assert_frame_equal import pytest -from nplinker.metabolomics.gnps.gnps_molecular_family_loader import \ - GNPSMolecularFamilyLoader -from nplinker.metabolomics.metabolomics import make_families -from nplinker.metabolomics.spectrum import 
Spectrum -from nplinker.scoring.linking.data_linking import DataLinks -from .. import DATA_DIR +from nplinker.metabolomics.singleton_family import SingletonFamily -@pytest.fixture -def spec_with_families(spec_dict) -> dict[int, Spectrum]: - make_families(spec_dict.values()) - return spec_dict +def test_init(datalinks): + """Test that the DataLinks object is initialised correctly. -@pytest.fixture -def molecular_families_gnps(): - filename = os.path.join(DATA_DIR, "edges.pairsinfo") - sut = GNPSMolecularFamilyLoader(filename) - return sut.families() + Multiple private methods are called in the init method, so we test + that the correct dataframes are created. + """ + # test occorrences + col_names = ['strain1', 'strain2', 'strain3'] + assert_frame_equal( + datalinks.occurrence_gcf_strain, + pd.DataFrame([[1, 0, 0], [0, 1, 0], [1, 1, 0]], + index=['gcf1', 'gcf2', 'gcf3'], + columns=col_names)) + assert_frame_equal( + datalinks.occurrence_spec_strain, + pd.DataFrame([[1, 0, 0], [0, 1, 0], [1, 1, 0]], + index=['spectrum1', 'spectrum2', 'spectrum3'], + columns=col_names)) + assert_frame_equal( + datalinks.occurrence_mf_strain, + pd.DataFrame([[1, 0, 0], [0, 1, 0], [1, 1, 0]], + index=['mf1', 'mf2', 'mf3'], + columns=col_names)) + # test co-occorrences spec-gcf + col_names = ['gcf1', 'gcf2', 'gcf3'] + assert_frame_equal( + datalinks.cooccurrence_spec_gcf, + pd.DataFrame([[1, 0, 1], [0, 1, 1], [1, 1, 2]], + index=['spectrum1', 'spectrum2', 'spectrum3'], + columns=col_names)) + assert_frame_equal( + datalinks.cooccurrence_spec_notgcf, + pd.DataFrame([[0, 1, 0], [1, 0, 0], [1, 1, 0]], + index=['spectrum1', 'spectrum2', 'spectrum3'], + columns=col_names)) + assert_frame_equal( + datalinks.cooccurrence_notspec_gcf, + pd.DataFrame([[0, 1, 1], [1, 0, 1], [0, 0, 0]], + index=['spectrum1', 'spectrum2', 'spectrum3'], + columns=col_names)) + assert_frame_equal( + datalinks.cooccurrence_notspec_notgcf, + pd.DataFrame([[2, 1, 1], [1, 2, 1], [1, 1, 1]], + index=['spectrum1', 
'spectrum2', 'spectrum3'], + columns=col_names)) + # test co-occorrences mf-gcf + assert_frame_equal( + datalinks.cooccurrence_mf_gcf, + pd.DataFrame([[1, 0, 1], [0, 1, 1], [1, 1, 2]], + index=['mf1', 'mf2', 'mf3'], + columns=col_names)) + assert_frame_equal( + datalinks.cooccurrence_mf_notgcf, + pd.DataFrame([[0, 1, 0], [1, 0, 0], [1, 1, 0]], + index=['mf1', 'mf2', 'mf3'], + columns=col_names)) + assert_frame_equal( + datalinks.cooccurrence_notmf_gcf, + pd.DataFrame([[0, 1, 1], [1, 0, 1], [0, 0, 0]], + index=['mf1', 'mf2', 'mf3'], + columns=col_names)) + assert_frame_equal( + datalinks.cooccurrence_notmf_notgcf, + pd.DataFrame([[2, 1, 1], [1, 2, 1], [1, 1, 1]], + index=['mf1', 'mf2', 'mf3'], + columns=col_names)) -def test_collect_mappings_from_spectra(spec_with_families): - sut = DataLinks() - actual = sut._collect_mappings_from_spectra(spec_with_families.values()) - assert actual.shape == (25935,3) +def test_get_common_strains_spec(datalinks, spectra, gcfs, strains_list): + """Test get_common_strains method for input spectra and gcfs.""" + sut = datalinks.get_common_strains(spectra[:2], gcfs) + expected = { + (spectra[0], gcfs[0]): [strains_list[0]], + (spectra[0], gcfs[1]): [], + (spectra[0], gcfs[2]): [strains_list[0]], + (spectra[1], gcfs[0]): [], + (spectra[1], gcfs[1]): [strains_list[1]], + (spectra[1], gcfs[2]): [strains_list[1]] + } + assert sut == expected + sut = datalinks.get_common_strains(spectra[:2], + gcfs, + filter_no_shared=True) + expected = { + (spectra[0], gcfs[0]): [strains_list[0]], + (spectra[0], gcfs[2]): [strains_list[0]], + (spectra[1], gcfs[1]): [strains_list[1]], + (spectra[1], gcfs[2]): [strains_list[1]] + } + assert sut == expected -def test_collect_mappings_from_molecular_families(molecular_families_gnps): - sut = DataLinks() - actual = sut._collect_mappings_from_molecular_families(molecular_families_gnps) - assert actual.shape == (25935,3) +def test_get_common_strains_mf(datalinks, mfs, gcfs, strains_list): + """Test 
get_common_strains method for input mfs and gcfs.""" + sut = datalinks.get_common_strains(mfs[:2], gcfs) + expected = { + (mfs[0], gcfs[0]): [strains_list[0]], + (mfs[0], gcfs[1]): [], + (mfs[0], gcfs[2]): [strains_list[0]], + (mfs[1], gcfs[0]): [], + (mfs[1], gcfs[1]): [strains_list[1]], + (mfs[1], gcfs[2]): [strains_list[1]] + } + assert sut == expected + sut = datalinks.get_common_strains(mfs[:2], gcfs, filter_no_shared=True) + expected = { + (mfs[0], gcfs[0]): [strains_list[0]], + (mfs[0], gcfs[2]): [strains_list[0]], + (mfs[1], gcfs[1]): [strains_list[1]], + (mfs[1], gcfs[2]): [strains_list[1]] + } + assert sut == expected -def test_mappings_are_equal(spec_with_families, molecular_families_gnps): - sut = DataLinks() - sut._collect_mappings_from_spectra(spec_with_families.values()) - actual = sut.mapping_spec - sut._collect_mappings_from_molecular_families(molecular_families_gnps) - expected = sut.mapping_spec +def test_get_common_strains_spec_mf(datalinks, spectra, mfs, gcfs, + strains_list): + """Test get_common_strains method for mixed input of spectra and mfs.""" + mixed_input = (*spectra[:2], *mfs[:2]) + sut = datalinks.get_common_strains(mixed_input, gcfs) + expected = { + (spectra[0], gcfs[0]): [strains_list[0]], + (spectra[0], gcfs[1]): [], + (spectra[0], gcfs[2]): [strains_list[0]], + (spectra[1], gcfs[0]): [], + (spectra[1], gcfs[1]): [strains_list[1]], + (spectra[1], gcfs[2]): [strains_list[1]], + (mfs[0], gcfs[0]): [strains_list[0]], + (mfs[0], gcfs[1]): [], + (mfs[0], gcfs[2]): [strains_list[0]], + (mfs[1], gcfs[0]): [], + (mfs[1], gcfs[1]): [strains_list[1]], + (mfs[1], gcfs[2]): [strains_list[1]] + } + assert sut == expected - assert actual.eq(expected).all(axis=None) + sut = datalinks.get_common_strains(mixed_input, + gcfs, + filter_no_shared=True) + expected = { + (spectra[0], gcfs[0]): [strains_list[0]], + (spectra[0], gcfs[2]): [strains_list[0]], + (spectra[1], gcfs[1]): [strains_list[1]], + (spectra[1], gcfs[2]): [strains_list[1]], + 
(mfs[0], gcfs[0]): [strains_list[0]], + (mfs[0], gcfs[2]): [strains_list[0]], + (mfs[1], gcfs[1]): [strains_list[1]], + (mfs[1], gcfs[2]): [strains_list[1]] + } + assert sut == expected + + +def test_get_common_strains_sf(datalinks, mfs, gcfs, strains_list): + """Test get_common_strains method for input SingletonFamily.""" + smf = SingletonFamily() + + sut = datalinks.get_common_strains([smf], gcfs) + assert sut == {} + + # the expected are same as `test_get_common_strains_mf` + mfs_mix = (*mfs[:2], smf) + sut = datalinks.get_common_strains(mfs_mix, gcfs) + expected = { + (mfs[0], gcfs[0]): [strains_list[0]], + (mfs[0], gcfs[1]): [], + (mfs[0], gcfs[2]): [strains_list[0]], + (mfs[1], gcfs[0]): [], + (mfs[1], gcfs[1]): [strains_list[1]], + (mfs[1], gcfs[2]): [strains_list[1]] + } + assert sut == expected + + sut = datalinks.get_common_strains(mfs_mix, gcfs, filter_no_shared=True) + expected = { + (mfs[0], gcfs[0]): [strains_list[0]], + (mfs[0], gcfs[2]): [strains_list[0]], + (mfs[1], gcfs[1]): [strains_list[1]], + (mfs[1], gcfs[2]): [strains_list[1]] + } + assert sut == expected + + +def test_get_common_strains_invalid_value(datalinks, spectra, gcfs): + """Test get_common_strains method for empty arguments.""" + with pytest.raises(ValueError) as e: + datalinks.get_common_strains([], gcfs) + assert "Empty list for first or second argument." in str(e.value) + with pytest.raises(ValueError) as e: + datalinks.get_common_strains(spectra, []) + assert "Empty list for first or second argument." 
in str(e.value) + + +@pytest.mark.parametrize("first_arg, expected", [ + ([1], "First argument must be Spectrum and/or MolecularFamily objects."), + ([1, 2 + ], "First argument must be Spectrum and/or MolecularFamily objects."), + ("12", "First argument must be Spectrum and/or MolecularFamily objects.") +]) +def test_get_common_strains_invalid_type_first_arg(datalinks, gcfs, first_arg, + expected): + """Test get_common_strains method for invalid 1st arugment.""" + with pytest.raises(TypeError) as e: + datalinks.get_common_strains(first_arg, gcfs) + assert expected in str(e.value) + + +@pytest.mark.parametrize("second_arg, expected", + [([1], "Second argument must be GCF objects.")]) +def test_get_common_strains_invalid_type_second_arg(datalinks, spectra, + second_arg, expected): + """Test get_common_strains method for invalid 2nd argument.""" + with pytest.raises(TypeError) as e: + datalinks.get_common_strains(spectra, second_arg) + assert expected in str(e.value) diff --git a/tests/scoring/test_link_finder.py b/tests/scoring/test_link_finder.py new file mode 100644 index 00000000..30d84980 --- /dev/null +++ b/tests/scoring/test_link_finder.py @@ -0,0 +1,189 @@ +import numpy as np +import pandas as pd +from pandas.util.testing import assert_frame_equal +import pytest +from pytest import fixture +from nplinker.scoring.linking import LinkFinder + + +@fixture(scope='module') +def linkfinder() -> LinkFinder: + return LinkFinder() + + +def test_init(linkfinder): + assert_frame_equal(linkfinder.raw_score_spec_gcf, pd.DataFrame()) + assert_frame_equal(linkfinder.raw_score_mf_gcf, pd.DataFrame()) + assert linkfinder.metcalf_mean is None + assert linkfinder.metcalf_std is None + + +def test_calc_score_raw_score(linkfinder, datalinks): + """Test `calc_score` method for `raw_score_spec_gcf` and `raw_score_mf_gcf`. + + The expected values are calculated manually by using values from `test_init` + of `test_data_links.py` and the default scoring weights. 
+ """ + # link type = 'spec-gcf' + linkfinder.calc_score(datalinks, link_type='spec-gcf') + assert_frame_equal( + linkfinder.raw_score_spec_gcf, + pd.DataFrame([[12, -9, 11], [-9, 12, 11], [1, 1, 21]], + index=['spectrum1', 'spectrum2', 'spectrum3'], + columns=['gcf1', 'gcf2', 'gcf3'])) + # link type = 'mf-gcf' + linkfinder.calc_score(datalinks, link_type='mf-gcf') + assert_frame_equal( + linkfinder.raw_score_mf_gcf, + pd.DataFrame([[12, -9, 11], [-9, 12, 11], [1, 1, 21]], + index=['mf1', 'mf2', 'mf3'], + columns=['gcf1', 'gcf2', 'gcf3'])) + + +def test_calc_score_mean_std(linkfinder, datalinks): + """Test `calc_score` method for `metcalf_mean` and `metcalf_std`.""" + linkfinder.calc_score(datalinks, link_type='spec-gcf') + assert isinstance(linkfinder.metcalf_mean, np.ndarray) + assert isinstance(linkfinder.metcalf_std, np.ndarray) + assert linkfinder.metcalf_mean.shape == (4, 4 + ) # (n_strains+1 , n_strains+1) + assert linkfinder.metcalf_mean.shape == (4, 4) + # TODO CG: add tests for values after refactoring _calc_mean_std method + # assert linkfinder.metcalf_mean == expected_array + + +def test_get_links_gcf(linkfinder, datalinks, gcfs): + """Test `get_links` method for input GCF objects.""" + linkfinder.calc_score(datalinks, link_type='spec-gcf') + linkfinder.calc_score(datalinks, link_type='mf-gcf') + index_names = ['source', 'target', 'score'] + + # cutoff = negative infinity (float) + links = linkfinder.get_links(*gcfs, score_cutoff=np.NINF) + assert len(links) == 2 + # expected values got from `test_calc_score_raw_score` + assert_frame_equal( + links[0], + pd.DataFrame([['gcf1', 'gcf2', 'gcf3'] * 3, + [ + *['spectrum1'] * 3, + *['spectrum2'] * 3, + *['spectrum3'] * 3, + ], [12, -9, 11, -9, 12, 11, 1, 1, 21]], + index=index_names)) + assert_frame_equal( + links[1], + pd.DataFrame([['gcf1', 'gcf2', 'gcf3'] * 3, + [ + *['mf1'] * 3, + *['mf2'] * 3, + *['mf3'] * 3, + ], [12, -9, 11, -9, 12, 11, 1, 1, 21]], + index=index_names)) + + # cutoff = 0 + links = 
linkfinder.get_links(*gcfs, score_cutoff=0) + assert len(links) == 2 + assert_frame_equal( + links[0], + pd.DataFrame([['gcf1', 'gcf3', 'gcf2', 'gcf3', 'gcf1', 'gcf2', 'gcf3'], + [ + *['spectrum1'] * 2, + *['spectrum2'] * 2, + *['spectrum3'] * 3, + ], [12, 11, 12, 11, 1, 1, 21]], + index=index_names)) + assert_frame_equal( + links[1], + pd.DataFrame([['gcf1', 'gcf3', 'gcf2', 'gcf3', 'gcf1', 'gcf2', 'gcf3'], + [ + *['mf1'] * 2, + *['mf2'] * 2, + *['mf3'] * 3, + ], [12, 11, 12, 11, 1, 1, 21]], + index=index_names)) + + +def test_get_links_spec(linkfinder, datalinks, spectra): + """Test `get_links` method for input Spectrum objects.""" + linkfinder.calc_score(datalinks, link_type='spec-gcf') + linkfinder.calc_score(datalinks, link_type='mf-gcf') + index_names = ['source', 'target', 'score'] + # cutoff = negative infinity (float) + links = linkfinder.get_links(*spectra, score_cutoff=np.NINF) + assert len(links) == 1 + assert_frame_equal( + links[0], + pd.DataFrame([[ + *['spectrum1'] * 3, + *['spectrum2'] * 3, + *['spectrum3'] * 3, + ], ['gcf1', 'gcf2', 'gcf3'] * 3, [12, -9, 11, -9, 12, 11, 1, 1, 21]], + index=index_names)) + # cutoff = 0 + links = linkfinder.get_links(*spectra, score_cutoff=0) + assert_frame_equal( + links[0], + pd.DataFrame([[ + *['spectrum1'] * 2, + *['spectrum2'] * 2, + *['spectrum3'] * 3, + ], ['gcf1', 'gcf3', 'gcf2', 'gcf3', 'gcf1', 'gcf2', 'gcf3'], + [12, 11, 12, 11, 1, 1, 21]], + index=index_names)) + + +def test_get_links_mf(linkfinder, datalinks, mfs): + """Test `get_links` method for input MolecularFamily objects.""" + linkfinder.calc_score(datalinks, link_type='spec-gcf') + linkfinder.calc_score(datalinks, link_type='mf-gcf') + index_names = ['source', 'target', 'score'] + # cutoff = negative infinity (float) + links = linkfinder.get_links(*mfs, score_cutoff=np.NINF) + assert len(links) == 1 + assert_frame_equal( + links[0], + pd.DataFrame([[ + *['mf1'] * 3, + *['mf2'] * 3, + *['mf3'] * 3, + ], ['gcf1', 'gcf2', 'gcf3'] * 3, [12, -9, 11, -9, 
12, 11, 1, 1, 21]], + index=index_names)) + # cutoff = 0 + links = linkfinder.get_links(*mfs, score_cutoff=0) + assert_frame_equal( + links[0], + pd.DataFrame([[ + *['mf1'] * 2, + *['mf2'] * 2, + *['mf3'] * 3, + ], ['gcf1', 'gcf3', 'gcf2', 'gcf3', 'gcf1', 'gcf2', 'gcf3'], + [12, 11, 12, 11, 1, 1, 21]], + index=index_names)) + + +@pytest.mark.parametrize("objects, expected", [([], "Empty input objects"), + ("", "Empty input objects")]) +def test_get_links_invalid_value(linkfinder, objects, expected): + with pytest.raises(ValueError) as e: + linkfinder.get_links(*objects) + assert expected in str(e.value) + + +@pytest.mark.parametrize("objects, expected", + [([1], "Invalid type {}"), + ([1, 2], "Invalid type {}"), + ("12", "Invalid type {}")]) +def test_get_links_invalid_type(linkfinder, objects, expected): + with pytest.raises(TypeError) as e: + linkfinder.get_links(*objects) + assert expected in str(e.value) + + +def test_get_links_invalid_mixed_types(linkfinder, spectra, mfs): + objects = (*spectra, *mfs) + with pytest.raises(TypeError) as e: + linkfinder.get_links(*objects) + assert "Invalid type" in str(e.value) + assert ".MolecularFamily" in str(e.value) + assert ".Spectrum" in str(e.value) diff --git a/tests/scoring/test_data_linking_functions.py b/tests/scoring/test_linking_utils.py similarity index 89% rename from tests/scoring/test_data_linking_functions.py rename to tests/scoring/test_linking_utils.py index 9f55d1ac..200c198e 100644 --- a/tests/scoring/test_data_linking_functions.py +++ b/tests/scoring/test_linking_utils.py @@ -15,7 +15,7 @@ # test functions import numpy as np -from nplinker.scoring.linking.data_linking_functions import calc_correlation_matrix +from nplinker.scoring.linking import calc_correlation_matrix def test_calc_correlation_matrix(): @@ -37,7 +37,7 @@ def test_calc_correlation_matrix(): [1, 2, 2, 2, 2, 3], [1, 2, 2, 2, 2, 3]])) -from nplinker.scoring.linking.data_linking_functions import calc_likelihood_matrix +from 
nplinker.scoring.linking.utils import calc_likelihood_matrix def test_calc_likelihood_matrix(): @@ -59,7 +59,7 @@ def test_calc_likelihood_matrix(): assert LBA.shape == (len(A), len(B)) # must have shape len(A), len(B) -from nplinker.scoring.linking.data_linking_functions import pair_prob_hg +from nplinker.scoring.linking.utils import pair_prob_hg def test_pair_prob_hg(): @@ -70,7 +70,7 @@ def test_pair_prob_hg(): assert pair_prob_hg(1, 100, 2, 2) == 98 / 100 * 2 / 99 + 2 / 100 * 98 / 99 -from nplinker.scoring.linking.data_linking_functions import hit_prob_dist +from nplinker.scoring.linking.utils import hit_prob_dist def test_hit_prob_dist(): @@ -82,7 +82,7 @@ def test_hit_prob_dist(): assert pks[0][0] == 0.99**100 -from nplinker.scoring.linking.data_linking_functions import permutation_unique +from nplinker.scoring.linking.utils import permutation_unique def test_permutation_unique(): @@ -101,7 +101,7 @@ def test_permutation_unique(): testlist))) == 20 # math.factorial(5)/math.factorial(3) -from nplinker.scoring.linking.data_linking_functions import pair_prob +from nplinker.scoring.linking.utils import pair_prob def test_pair_prob(): @@ -124,7 +124,7 @@ def test_pair_prob(): assert pair_prob(P_str, XG, Ny, hits) < (2 / 90 + 0.00000001) -from nplinker.scoring.linking.data_linking_functions import link_prob +from nplinker.scoring.linking.utils import link_prob def test_link_prob(): diff --git a/tests/scoring/test_metcalf_scoring.py b/tests/scoring/test_metcalf_scoring.py new file mode 100644 index 00000000..471c073c --- /dev/null +++ b/tests/scoring/test_metcalf_scoring.py @@ -0,0 +1,208 @@ +import numpy as np +from numpy.testing import assert_array_equal +from pandas.util.testing import assert_frame_equal +import pytest +from nplinker.scoring import LinkCollection +from nplinker.scoring import MetcalfScoring +from nplinker.scoring import ObjectLink +from nplinker.scoring.linking import DataLinks +from nplinker.scoring.linking import LinkFinder + + +def 
test_init(npl): + mc = MetcalfScoring(npl) + assert mc.npl == npl + assert mc.name == 'metcalf' + assert mc.cutoff == 1.0 + assert mc.standardised is True + assert mc.DATALINKS is None + assert mc.LINKFINDER is None + + +def test_setup(mc, datalinks, linkfinder): + """Test `setup` method when cache file does not exist.""" + assert isinstance(mc.DATALINKS, DataLinks) + assert isinstance(mc.LINKFINDER, LinkFinder) + + assert_frame_equal(mc.DATALINKS.occurrence_gcf_strain, + datalinks.occurrence_gcf_strain) + assert_frame_equal(mc.DATALINKS.cooccurrence_spec_gcf, + datalinks.cooccurrence_spec_gcf) + + assert_frame_equal(mc.LINKFINDER.raw_score_spec_gcf, + linkfinder.raw_score_spec_gcf) + assert_frame_equal(mc.LINKFINDER.raw_score_mf_gcf, + linkfinder.raw_score_mf_gcf) + assert_array_equal(mc.LINKFINDER.metcalf_mean, linkfinder.metcalf_mean) + assert_array_equal(mc.LINKFINDER.metcalf_std, linkfinder.metcalf_std) + + +def test_setup_load_cache(mc, npl, datalinks, linkfinder, caplog): + """Test `setup` method when cache file exists.""" + mc.setup(npl) + assert "MetcalfScoring.setup loading cached data" in caplog.text + assert "MetcalfScoring.setup caching results" not in caplog.text + + assert isinstance(mc.DATALINKS, DataLinks) + assert isinstance(mc.LINKFINDER, LinkFinder) + + assert_frame_equal(mc.DATALINKS.occurrence_gcf_strain, + datalinks.occurrence_gcf_strain) + assert_frame_equal(mc.DATALINKS.cooccurrence_spec_gcf, + datalinks.cooccurrence_spec_gcf) + + assert_frame_equal(mc.LINKFINDER.raw_score_spec_gcf, + linkfinder.raw_score_spec_gcf) + assert_frame_equal(mc.LINKFINDER.raw_score_mf_gcf, + linkfinder.raw_score_mf_gcf) + assert_array_equal(mc.LINKFINDER.metcalf_mean, linkfinder.metcalf_mean) + assert_array_equal(mc.LINKFINDER.metcalf_std, linkfinder.metcalf_std) + + +def test_get_links_gcf_standardised_false(mc, gcfs, spectra, mfs): + """Test `get_links` method when input is GCF objects and `standardised` is False.""" + # test raw scores (no standardisation) + 
mc.standardised = False + + # when cutoff is negative infinity, i.e. taking all scores + mc.cutoff = np.NINF + links = mc.get_links(*gcfs, link_collection=LinkCollection()) + assert isinstance(links, LinkCollection) + links = links.links # dict of link values + assert len(links) == 3 + assert {i.gcf_id for i in links.keys()} == {'gcf1', 'gcf2', 'gcf3'} + assert isinstance(links[gcfs[0]][spectra[0]], ObjectLink) + # expected values are from `test_get_links_gcf` of test_link_finder.py + assert links[gcfs[0]][spectra[0]].data(mc) == 12 + assert links[gcfs[1]][spectra[0]].data(mc) == -9 + assert links[gcfs[2]][spectra[0]].data(mc) == 11 + assert links[gcfs[0]][mfs[0]].data(mc) == 12 + assert links[gcfs[1]][mfs[1]].data(mc) == 12 + assert links[gcfs[2]][mfs[2]].data(mc) == 21 + + # when the cutoff is 0, i.e. taking scores >= 0 + mc.cutoff = 0 + links = mc.get_links(*gcfs, link_collection=LinkCollection()) + assert isinstance(links, LinkCollection) + links = links.links + assert {i.gcf_id for i in links.keys()} == {'gcf1', 'gcf2', 'gcf3'} + assert isinstance(links[gcfs[0]][spectra[0]], ObjectLink) + assert links[gcfs[0]][spectra[0]].data(mc) == 12 + assert links[gcfs[1]].get(spectra[0]) is None + assert links[gcfs[2]][spectra[0]].data(mc) == 11 + assert links[gcfs[0]][mfs[0]].data(mc) == 12 + assert links[gcfs[1]][mfs[1]].data(mc) == 12 + assert links[gcfs[2]][mfs[2]].data(mc) == 21 + + +@pytest.mark.skip(reason='To add after refactoring relevant code.') +def test_get_links_gcf_standardised_true(mc, gcfs, spectra, mfs): + """Test `get_links` method when input is GCF objects and `standardised` is True.""" + mc.standardised = True + ... 
+ + +def test_get_links_spec_standardised_false(mc, gcfs, spectra): + """Test `get_links` method when input is Spectrum objects and `standardised` is False.""" + mc.standardised = False + + mc.cutoff = np.NINF + links = mc.get_links(*spectra, link_collection=LinkCollection()) + assert isinstance(links, LinkCollection) + links = links.links # dict of link values + assert len(links) == 3 + assert {i.spectrum_id + for i in links.keys()} == {'spectrum1', 'spectrum2', 'spectrum3'} + assert isinstance(links[spectra[0]][gcfs[0]], ObjectLink) + assert links[spectra[0]][gcfs[0]].data(mc) == 12 + assert links[spectra[0]][gcfs[1]].data(mc) == -9 + assert links[spectra[0]][gcfs[2]].data(mc) == 11 + + mc.cutoff = 0 + links = mc.get_links(*spectra, link_collection=LinkCollection()) + assert isinstance(links, LinkCollection) + links = links.links # dict of link values + assert len(links) == 3 + assert {i.spectrum_id + for i in links.keys()} == {'spectrum1', 'spectrum2', 'spectrum3'} + assert isinstance(links[spectra[0]][gcfs[0]], ObjectLink) + assert links[spectra[0]][gcfs[0]].data(mc) == 12 + assert links[spectra[0]].get(gcfs[1]) is None + assert links[spectra[0]][gcfs[2]].data(mc) == 11 + + +@pytest.mark.skip(reason='To add after refactoring relevant code.') +def test_get_links_spec_standardised_true(mc, gcfs, spectra): + """Test `get_links` method when input is Spectrum objects and `standardised` is True.""" + mc.standardised = True + ... 
+ + +def test_get_links_mf_standardised_false(mc, gcfs, mfs): + """Test `get_links` method when input is MolecularFamily objects and `standardised` is False.""" + mc.standardised = False + + mc.cutoff = np.NINF + links = mc.get_links(*mfs, link_collection=LinkCollection()) + assert isinstance(links, LinkCollection) + links = links.links + assert len(links) == 3 + assert {i.family_id for i in links.keys()} == {'mf1', 'mf2', 'mf3'} + assert isinstance(links[mfs[0]][gcfs[0]], ObjectLink) + assert links[mfs[0]][gcfs[0]].data(mc) == 12 + assert links[mfs[0]][gcfs[1]].data(mc) == -9 + assert links[mfs[0]][gcfs[2]].data(mc) == 11 + + mc.cutoff = 0 + links = mc.get_links(*mfs, link_collection=LinkCollection()) + assert isinstance(links, LinkCollection) + links = links.links + assert len(links) == 3 + assert {i.family_id for i in links.keys()} == {'mf1', 'mf2', 'mf3'} + assert isinstance(links[mfs[0]][gcfs[0]], ObjectLink) + assert links[mfs[0]][gcfs[0]].data(mc) == 12 + assert links[mfs[0]].get(gcfs[1]) is None + assert links[mfs[0]][gcfs[2]].data(mc) == 11 + + +@pytest.mark.skip(reason='To add after refactoring relevant code.') +def test_get_links_mf_standardised_true(mc, gcfs, mfs): + """Test `get_links` method when input is MolecularFamily objects and `standardised` is True.""" + mc.standardised = True + ... 
+ + +@pytest.mark.parametrize("objects, expected", [([], "Empty input objects"), + ("", "Empty input objects")]) +def test_get_links_invalid_input_value(mc, objects, expected): + with pytest.raises(ValueError) as e: + mc.get_links(*objects, link_collection=LinkCollection()) + assert expected in str(e.value) + + +@pytest.mark.parametrize("objects, expected", + [([1], "Invalid type {}"), + ([1, 2], "Invalid type {}"), + ("12", "Invalid type {}")]) +def test_get_links_invalid_input_type(mc, objects, expected): + with pytest.raises(TypeError) as e: + mc.get_links(*objects, link_collection=LinkCollection()) + assert expected in str(e.value) + + +def test_get_links_invalid_mixed_types(mc, spectra, mfs): + objects = (*spectra, *mfs) + with pytest.raises(TypeError) as e: + mc.get_links(*objects, link_collection=LinkCollection()) + assert "Invalid type" in str(e.value) + assert ".MolecularFamily" in str(e.value) + assert ".Spectrum" in str(e.value) + + +def test_get_links_no_linkfinder(npl, gcfs): + """Test `get_links` method when no LinkFinder object is found.""" + mc = MetcalfScoring(npl) + mc.LINKFINDER = None + with pytest.raises(ValueError) as e: + mc.get_links(*gcfs, link_collection=LinkCollection()) + assert "LinkFinder object not found." in str(e.value) diff --git a/tests/scoring/test_nplinker_scoring.py b/tests/scoring/test_nplinker_scoring.py new file mode 100644 index 00000000..77743dbb --- /dev/null +++ b/tests/scoring/test_nplinker_scoring.py @@ -0,0 +1,145 @@ +import numpy as np +import pytest +from nplinker.scoring import LinkCollection +from nplinker.scoring import ObjectLink + + +def test_get_links_gcf_standardised_false(npl, mc, gcfs, spectra, mfs, + strains_list): + """Test `get_links` method when input is GCF objects and `standardised` is False.""" + # test raw scores (no standardisation) + mc.standardised = False + + # when cutoff is negative infinity, i.e. 
taking all scores + mc.cutoff = np.NINF + links = npl.get_links(list(gcfs), mc, and_mode=True) + assert isinstance(links, LinkCollection) + links = links.links # dict of link values + assert len(links) == 3 + assert {i.gcf_id for i in links.keys()} == {'gcf1', 'gcf2', 'gcf3'} + assert isinstance(links[gcfs[0]][spectra[0]], ObjectLink) + # expected values are from `test_get_links_gcf` of test_link_finder.py + assert links[gcfs[0]][spectra[0]].data(mc) == 12 + assert links[gcfs[1]][spectra[0]].data(mc) == -9 + assert links[gcfs[2]][spectra[0]].data(mc) == 11 + assert links[gcfs[0]][mfs[0]].data(mc) == 12 + assert links[gcfs[1]][mfs[1]].data(mc) == 12 + assert links[gcfs[2]][mfs[2]].data(mc) == 21 + # expected values are from `test_get_common_strains_spec` of test_data_links.py + assert links[gcfs[0]][spectra[0]].shared_strains == [strains_list[0]] + assert links[gcfs[1]][spectra[0]].shared_strains == [] + assert links[gcfs[2]][spectra[0]].shared_strains == [strains_list[0]] + assert links[gcfs[0]][mfs[0]].shared_strains == [strains_list[0]] + assert links[gcfs[1]][mfs[1]].shared_strains == [strains_list[1]] + assert set(links[gcfs[2]][mfs[2]].shared_strains) == set(strains_list[0:2]) + + # when test cutoff is 0, i.e. 
taking scores >= 0 + mc.cutoff = 0 + links = npl.get_links(list(gcfs), mc, and_mode=True) + assert isinstance(links, LinkCollection) + links = links.links + assert {i.gcf_id for i in links.keys()} == {'gcf1', 'gcf2', 'gcf3'} + assert isinstance(links[gcfs[0]][spectra[0]], ObjectLink) + # test scores + assert links[gcfs[0]][spectra[0]].data(mc) == 12 + assert links[gcfs[1]].get(spectra[0]) is None + assert links[gcfs[2]][spectra[0]].data(mc) == 11 + assert links[gcfs[0]][mfs[0]].data(mc) == 12 + assert links[gcfs[1]][mfs[1]].data(mc) == 12 + assert links[gcfs[2]][mfs[2]].data(mc) == 21 + # test shared strains + assert links[gcfs[0]][spectra[0]].shared_strains == [strains_list[0]] + assert links[gcfs[2]][spectra[0]].shared_strains == [strains_list[0]] + assert links[gcfs[0]][mfs[0]].shared_strains == [strains_list[0]] + assert links[gcfs[1]][mfs[1]].shared_strains == [strains_list[1]] + assert set(links[gcfs[2]][mfs[2]].shared_strains) == set(strains_list[0:2]) + + +@pytest.mark.skip(reason='To add after refactoring relevant code.') +def test_get_links_gcf_standardised_true(npl, mc, gcfs, spectra, mfs, + strains_list): + """Test `get_links` method when input is GCF objects and `standardised` is True.""" + mc.standardised = True + ... 
+ + +def test_get_links_spec_standardised_false(npl, mc, gcfs, spectra, + strains_list): + """Test `get_links` method when input is Spectrum objects and `standardised` is False.""" + mc.standardised = False + + mc.cutoff = np.NINF + links = npl.get_links(list(spectra), mc, and_mode=True) + assert isinstance(links, LinkCollection) + links = links.links # dict of link values + assert len(links) == 3 + assert {i.spectrum_id + for i in links.keys()} == {'spectrum1', 'spectrum2', 'spectrum3'} + assert isinstance(links[spectra[0]][gcfs[0]], ObjectLink) + assert links[spectra[0]][gcfs[0]].data(mc) == 12 + assert links[spectra[0]][gcfs[1]].data(mc) == -9 + assert links[spectra[0]][gcfs[2]].data(mc) == 11 + assert links[spectra[0]][gcfs[0]].shared_strains == [strains_list[0]] + assert links[spectra[0]][gcfs[1]].shared_strains == [] + assert links[spectra[0]][gcfs[2]].shared_strains == [strains_list[0]] + + mc.cutoff = 0 + links = npl.get_links(list(spectra), mc, and_mode=True) + assert isinstance(links, LinkCollection) + links = links.links # dict of link values + assert len(links) == 3 + assert {i.spectrum_id + for i in links.keys()} == {'spectrum1', 'spectrum2', 'spectrum3'} + assert isinstance(links[spectra[0]][gcfs[0]], ObjectLink) + assert links[spectra[0]][gcfs[0]].data(mc) == 12 + assert links[spectra[0]].get(gcfs[1]) is None + assert links[spectra[0]][gcfs[2]].data(mc) == 11 + assert links[spectra[0]][gcfs[0]].shared_strains == [strains_list[0]] + assert links[spectra[0]][gcfs[2]].shared_strains == [strains_list[0]] + + +@pytest.mark.skip(reason='To add after refactoring relevant code.') +def test_get_links_spec_standardised_true(npl, mc, gcfs, spectra, + strains_list): + """Test `get_links` method when input is Spectrum objects and `standardised` is True.""" + mc.standardised = True + ... 
+ + +def test_get_links_mf_standardised_false(npl, mc, gcfs, mfs, strains_list): + """Test `get_links` method when input is MolecularFamily objects and `standardised` is False.""" + mc.standardised = False + + mc.cutoff = np.NINF + links = npl.get_links(list(mfs), mc, and_mode=True) + assert isinstance(links, LinkCollection) + links = links.links + assert len(links) == 3 + assert {i.family_id for i in links.keys()} == {'mf1', 'mf2', 'mf3'} + assert isinstance(links[mfs[0]][gcfs[0]], ObjectLink) + assert links[mfs[0]][gcfs[0]].data(mc) == 12 + assert links[mfs[0]][gcfs[1]].data(mc) == -9 + assert links[mfs[0]][gcfs[2]].data(mc) == 11 + assert links[mfs[0]][gcfs[0]].shared_strains == [strains_list[0]] + assert links[mfs[0]][gcfs[1]].shared_strains == [] + assert links[mfs[0]][gcfs[2]].shared_strains == [strains_list[0]] + + mc.cutoff = 0 + links = npl.get_links(list(mfs), mc, and_mode=True) + assert isinstance(links, LinkCollection) + links = links.links + assert len(links) == 3 + assert {i.family_id for i in links.keys()} == {'mf1', 'mf2', 'mf3'} + assert isinstance(links[mfs[0]][gcfs[0]], ObjectLink) + assert links[mfs[0]][gcfs[0]].data(mc) == 12 + assert links[mfs[0]].get(gcfs[1]) is None + assert links[mfs[0]][gcfs[2]].data(mc) == 11 + assert links[mfs[0]][gcfs[0]].shared_strains == [strains_list[0]] + assert links[mfs[0]][gcfs[2]].shared_strains == [strains_list[0]] + + +@pytest.mark.skip(reason='To add after refactoring relevant code.') +def test_get_links_mf_standardised_true(npl, mc, gcfs, mfs, strains_list): + """Test `get_links` method when input is MolecularFamily objects and `standardised` is True.""" + mc.standardised = True + ... 
diff --git a/tests/scoring/test_scoring.py b/tests/scoring/test_scoring.py deleted file mode 100644 index fe75611f..00000000 --- a/tests/scoring/test_scoring.py +++ /dev/null @@ -1,136 +0,0 @@ -import numpy as np -import pandas as pd -from nplinker.genomics import BGC -from nplinker.genomics import GCF -from nplinker.metabolomics.spectrum import Spectrum -from nplinker.scoring.linking.data_linking import DataLinks -from nplinker.scoring.linking.link_finder import LinkFinder -from nplinker.scoring.linking.misc_deprecated import hg_scoring -from nplinker.scoring.linking.misc_deprecated import metcalf_scoring -from nplinker.strains import Strain - - -np.random.seed(50) - - -def create_strains(n=10): - return [Strain(f'strain_{x:02d}') for x in range(n)] - - -def create_gcfs(strains, n=3): - gcfs = [] - for i in range(n): - gcf = GCF(f'fake_gcf_{i}') - num_strains = np.random.randint(1, len(strains)) - randoms = list(range(len(strains))) - np.random.shuffle(randoms) - randoms = randoms[:num_strains] - - for j in range(num_strains): - bgc = BGC(j, strains[randoms[j]]) - gcf.add_bgc(bgc) - - gcfs.append(gcf) - - return gcfs - - -def create_spectra(strains, n=3): - spectra = [] - - families = np.random.randint(0, n * 2, n) - - for i in range(n): - # (id, peaks, spectrum_id, precursor_mz, parent_mz=None, rt=None): - spec = Spectrum(i, [(1, 2), (3, 4)], i, np.random.random()) - num_strains = np.random.randint(1, len(strains)) - randoms = list(range(len(strains))) - np.random.shuffle(randoms) - randoms = randoms[:num_strains] - spec.family = int(families[i]) - for j in range(num_strains): - spec.add_strain(strains[randoms[j]], 'foo', 1) - - spectra.append(spec) - - return spectra - - -def do_scoring_old(gcfs, spectra, strains, standardised): - scores = {} - for gcf in gcfs: - scores[gcf] = {} - for spec in spectra: - score = metcalf_scoring(spec, - gcf, - strains, - standardised=standardised) - scores[gcf][spec] = score - - return scores - - -def do_scoring_new(gcfs, 
spectra, strains, standardised): - datalinks = DataLinks() - datalinks.load_data(spectra, gcfs, strains) - datalinks.find_correlations() - lf = LinkFinder() - scores = lf.metcalf_scoring(datalinks) - - # print(lf.metcalf_expected) - if standardised: - # standardised scoring thing - for i, gcf in enumerate(gcfs): - for j, spec in enumerate(spectra): - # get expected score, variance for objects with the current combo of strain counts - # (note that spectrum = type 1 object here) - met_strains = len(spec.strains) - gen_strains = len(gcf.strains) - expected = lf.metcalf_expected[met_strains][gen_strains] - variance = lf.metcalf_variance[met_strains][gen_strains] - scores[j][i] = (scores[j][i] - expected) / np.sqrt(variance) - return scores - - -def do_scoring_old_hg(gcfs, spectra, strains): - scores = {} - for gcf in gcfs: - scores[gcf] = {} - for spec in spectra: - score, _ = hg_scoring(spec, gcf, strains) - scores[gcf][spec] = score - - return scores - - -def do_scoring_new_hg(gcfs, spectra, strains): - datalinks = DataLinks() - datalinks.load_data(spectra, gcfs, strains) - datalinks.find_correlations() - lf = LinkFinder() - scores = lf.hg_scoring(datalinks) - return scores - - -def run_metcalf_test(n_strains=3, n_gcfs=5, n_spectra=4, standardised=False): - strains = create_strains(n_strains) - gcfs = create_gcfs(strains, n_gcfs) - spectra = create_spectra(strains, n_spectra) - - old_scores = do_scoring_old(gcfs, spectra, strains, standardised) - new_scores = do_scoring_new(gcfs, spectra, strains, standardised) - - dfdata = {'nonvec_score': [], 'vec_score': [], 'gcf': [], 'spec': []} - - for i, gcf in enumerate(gcfs): - for j, spec in enumerate(spectra): - dfdata['nonvec_score'].append(old_scores[gcf][spec]) - dfdata['vec_score'].append(new_scores[j][i]) - dfdata['gcf'].append(gcf) - dfdata['spec'].append(spec) - - return pd.DataFrame(data=dfdata) - - -if __name__ == "__main__": - run_metcalf_test() diff --git a/tests/test_nplinker.py b/tests/test_nplinker.py deleted 
file mode 100644 index 3b9171af..00000000 --- a/tests/test_nplinker.py +++ /dev/null @@ -1,33 +0,0 @@ -import pytest -from nplinker.nplinker import NPLinker -import os - -from . import DATA_DIR - -@pytest.fixture(scope='module') -def instance() -> NPLinker: - npl = NPLinker(str(DATA_DIR / 'nplinker_demo1.toml')) - npl.load_data() - return npl - - -@pytest.mark.skipif(os.environ.get('CI') == 'true', reason="Skip when running on CI") -def test_load_data(instance: NPLinker): - assert len(instance.bgcs) == 390 - assert len(instance.gcfs) == 113 - assert len(instance.spectra) == 25935 - assert len(instance.molfams) == 25769 - - -@pytest.mark.skipif(os.environ.get('CI') == 'true' , reason="Skip when running on CI") -def test_get_links(instance: NPLinker): - mc = instance.scoring_method('metcalf') - mc.cutoff = 3.5 - mc.standardised = True - - actual = instance.get_links(instance.gcfs, mc, and_mode=True) - assert len(actual) == len(actual.sources) == len(actual.links) == 101 - - actual.filter_links(lambda link: link[mc] > 5.0) - assert len(actual.links) == 60 - diff --git a/tests/test_nplinker_local.py b/tests/test_nplinker_local.py new file mode 100644 index 00000000..77640e6f --- /dev/null +++ b/tests/test_nplinker_local.py @@ -0,0 +1,30 @@ +import os +from pathlib import Path +import pytest +from nplinker.nplinker import NPLinker +from . import DATA_DIR + + +# NOTE: This file only contains tests that run locally and are skipped on CI. +# Basically, only tests related to data loading should be put here. +# For tests on scoring/links, add them to `scoring/test_nplinker_scoring.py`. 
+ + +@pytest.fixture(scope='module') +def npl() -> NPLinker: + npl = NPLinker(str(DATA_DIR / 'nplinker_demo1.toml')) + npl.load_data() + # remove cached results before running tests + root_dir = Path(npl.root_dir) + score_cache = root_dir / 'metcalf' / 'metcalf_scores.pckl' + score_cache.unlink(missing_ok=True) + return npl + + +@pytest.mark.skipif(os.environ.get('CI') == 'true', + reason="Skip when running on CI") +def test_load_data(npl: NPLinker): + assert len(npl.bgcs) == 390 + assert len(npl.gcfs) == 113 + assert len(npl.spectra) == 25935 + assert len(npl.molfams) == 25769 diff --git a/tests/test_strain_collection.py b/tests/test_strain_collection.py index 26ab2767..dcdad3df 100644 --- a/tests/test_strain_collection.py +++ b/tests/test_strain_collection.py @@ -41,24 +41,21 @@ def test_iter(collection: StrainCollection, strain: Strain): for actual in collection: assert actual == strain + def test_add(strain: Strain): sut = StrainCollection() sut.add(strain) assert strain in sut for alias in strain.aliases: assert alias in sut - assert sut._strain_dict_index[0] == strain def test_remove(collection: StrainCollection, strain: Strain): assert strain in collection collection.remove(strain) with pytest.raises(KeyError): - _ = collection._strain_dict_id[strain.id] + _ = collection._strain_dict_name[strain.id] assert strain not in collection - # TODO: issue #90 - # with pytest.raises(KeyError): - # collection.lookup_index(0) def test_filter(collection: StrainCollection, strain: Strain): @@ -70,13 +67,6 @@ def test_filter(collection: StrainCollection, strain: Strain): assert len(collection) == 1 -def test_lookup_index(collection: StrainCollection, strain: Strain): - actual = collection.lookup_index(0) - assert actual == strain - with pytest.raises(KeyError): - collection.lookup_index(1) - - def test_lookup(collection: StrainCollection, strain: Strain): assert collection.lookup(strain.id) == strain for alias in strain.aliases: @@ -89,7 +79,6 @@ def test_add_from_file(): 
sut = StrainCollection() sut.add_from_file(DATA_DIR / "strain_mappings.csv") assert len(sut) == 27 - assert len(sut.lookup_index(1).aliases) == 29 def test_save_to_file(collection: StrainCollection, tmp_path):