From f6c763b72937291504fea7896e65a264c9bd2d8d Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Mon, 20 Nov 2023 11:43:00 +0100 Subject: [PATCH 1/2] remove deprecated function `load_gcfs` --- src/nplinker/genomics/__init__.py | 2 - src/nplinker/genomics/genomics.py | 172 ------------------------------ 2 files changed, 174 deletions(-) diff --git a/src/nplinker/genomics/__init__.py b/src/nplinker/genomics/__init__.py index 5b1721bb..7710f337 100644 --- a/src/nplinker/genomics/__init__.py +++ b/src/nplinker/genomics/__init__.py @@ -6,7 +6,6 @@ from .genomics import generate_mappings_genome_id_bgc_id from .genomics import get_bgcs_from_gcfs from .genomics import get_strains_from_bgcs -from .genomics import load_gcfs from .genomics import map_bgc_to_gcf from .genomics import map_strain_to_bgc @@ -21,7 +20,6 @@ "generate_mappings_genome_id_bgc_id", "get_bgcs_from_gcfs", "get_strains_from_bgcs", - "load_gcfs", "map_bgc_to_gcf", "map_strain_to_bgc", ] diff --git a/src/nplinker/genomics/genomics.py b/src/nplinker/genomics/genomics.py index 19b6d696..53676be1 100644 --- a/src/nplinker/genomics/genomics.py +++ b/src/nplinker/genomics/genomics.py @@ -1,9 +1,7 @@ from __future__ import annotations -import csv import json from os import PathLike from pathlib import Path -from deprecated import deprecated from jsonschema import validate from nplinker.globals import GENOME_BGC_MAPPINGS_FILENAME from nplinker.logconfig import LogConfig @@ -144,173 +142,3 @@ def get_strains_from_bgcs(bgcs: list[BGC]) -> StrainCollection: else: logger.warning("Strain is None for BGC %s", bgc.bgc_id) return sc - - -@deprecated( - version="1.3.3", - reason="It is split to separate functions: " - "map_strain_to_bgc, map_bgc_to_gcf, filter_mibig_only_gcf, " - "get_bgcs_from_gcfs and get_strains_from_bgcs.", -) -def load_gcfs( - bigscape_dir: str | PathLike, - strains: StrainCollection, - mibig_bgc_dict: dict[str, BGC], - antismash_bgc_dict: dict[str, BGC], - antismash_file_dict: dict[str, str], - 
bigscape_cutoff: int, -): - bigscape_dir = Path(bigscape_dir) - product_class_cluster_file = ( - bigscape_dir / "mix" / f"mix_clustering_c0.{bigscape_cutoff:02d}.tsv" - ) - network_annotations_file = bigscape_dir / "Network_Annotations_Full.tsv" - - new_bgc: BGC - num_mibig: int = 0 - bgc_list: list[BGC] = [] - - gcf_dict: dict[str, GCF] = {} - gcf_list: list[GCF] = [] - - used_strains: StrainCollection = StrainCollection() - unknown_strains: dict[str, str] = {} - - # CG: bigscape data - # parse the annotation files (/bigscape//Network_Annotations_.tsv - # these contain fields: - # - BGC name/ID [0] - # - "Accession ID" [1] - # - Description [2] - # - Product prediction [3] - # - Bigscape product type/class [4] - # - Organism [5] - # - Taxonomy [6] - metadata = {} - with open(network_annotations_file) as f: - reader = csv.reader(f, delimiter="\t") - next(reader) # skip headers - for line in reader: - metadata[line[0]] = line - - # CG: bigscape data - # "cluster files" are the various _clustering_c0.xx.tsv files - # - BGC name - # - cluster ID - with open(product_class_cluster_file, "rt") as f: - reader = csv.reader(f, delimiter="\t") - next(reader) # skip headers - for line in reader: - bgc_name = line[0] - family_id = line[1] - - # TODO: is it necessary to keep bigscape_class for GCF class? 
- # get bgc annotations from bigscape file - metadata_line = metadata[bgc_name] - bigscape_class = metadata_line[4] - - # check strain - try: - strain = strains.lookup(bgc_name) - except KeyError: - logger.warning(f"Unknown strain ID: {bgc_name}") - unknown_strains[bgc_name] = antismash_file_dict[bgc_name] - continue - - # build new bgc - if strain.id.startswith("BGC"): - try: - new_bgc = mibig_bgc_dict[strain.id] - except KeyError: - raise KeyError(f"Unknown MiBIG: {strain.id}") - num_mibig += 1 - else: - try: - new_bgc = antismash_bgc_dict[bgc_name] - except KeyError: - raise KeyError(f"Unknown AntiSMASH BGC: {bgc_name}") - - new_bgc.strain = strain - bgc_list.append(new_bgc) - - # build new gcf - if family_id not in gcf_dict: - new_gcf = GCF(family_id) - gcf_dict[family_id] = new_gcf - gcf_list.append(new_gcf) - - # link bgc to gcf - gcf_dict[family_id].add_bgc(new_bgc) - - # add strain to used strains - used_strains.add(strain) - - logger.info( - "# MiBIG BGCs = {}, non-MiBIG BGCS = {}, total bgcs = {}, GCFs = {}, strains={}".format( - num_mibig, len(bgc_list) - num_mibig, len(bgc_list), len(gcf_dict), len(strains) - ) - ) - - # filter out MiBIG-only GCFs) - gcf_list, bgc_list, used_strains = _filter_gcfs(gcf_list, bgc_list, used_strains) - logger.info( - "# after filtering, total bgcs = {}, GCFs = {}, strains={}, unknown_strains={}".format( - len(bgc_list), len(gcf_list), len(used_strains), len(unknown_strains) - ) - ) - - return gcf_list, bgc_list, used_strains, unknown_strains - - -@deprecated( - version="1.3.3", - reason="It is split to separate functions: " - "filter_mibig_only_gcf, get_bgcs_from_gcfs and get_strains_from_bgcs.", -) -def _filter_gcfs( - gcfs: list[GCF], bgcs: list[BGC], strains: StrainCollection -) -> tuple[list[GCF], list[BGC], StrainCollection]: - """Remove a GCF from given GCF list if it only has MIBiG BGC members, - correspondingly remove relevant BGC and strain from given list/collection. 
- - GCF and BGC internal id is updated to keep ids consectutive in a list. - - Args: - gcfs(list[GCF]): list of GCF objects - bgcs(list[BGC]): list of BGC objects - strains(StrainCollection): StrainCollection object - - Returns: - tuple[list[GCF], list[BGC], StrainCollection]: updated list of GCF - objects, updated list of BGC objects and updated StrainCollection - object. - """ - gcfs_to_remove = set() - bgcs_to_remove = set() - - for gcf in gcfs: - num_non_mibig_bgcs = len(list(filter(lambda bgc: not bgc.is_mibig(), gcf.bgcs))) - if num_non_mibig_bgcs == 0: - gcfs_to_remove.add(gcf) - for bgc in gcf.bgcs: - bgcs_to_remove.add(bgc) - - for bgc in bgcs: - if len(bgc.parents) == 0: - bgcs_to_remove.add(bgc) - - for gcf in gcfs_to_remove: - gcfs.remove(gcf) - - for bgc in bgcs_to_remove: - bgcs.remove(bgc) - if bgc.strain is not None: - strains.remove(bgc.strain) - - logger.info( - "Remove GCFs that has only MIBiG BGCs: removing {} GCFs and {} BGCs".format( - len(gcfs_to_remove), len(bgcs_to_remove) - ) - ) - - return gcfs, bgcs, strains From 87edc7e1db8c01020d62bfa62a8ffe2d749ce490 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Mon, 20 Nov 2023 11:46:01 +0100 Subject: [PATCH 2/2] remove function `filter_mibig_only_gcf` This function is not needed anymore. The GCF loader already has the functionality to filter out MIBiG-only GCFs during loading. 
--- src/nplinker/genomics/__init__.py | 2 -- src/nplinker/genomics/genomics.py | 9 --------- src/nplinker/loader.py | 3 +-- tests/genomics/test_genomics.py | 8 -------- 4 files changed, 1 insertion(+), 21 deletions(-) diff --git a/src/nplinker/genomics/__init__.py b/src/nplinker/genomics/__init__.py index 7710f337..cffff7c5 100644 --- a/src/nplinker/genomics/__init__.py +++ b/src/nplinker/genomics/__init__.py @@ -2,7 +2,6 @@ from .abc import BGCLoaderBase from .bgc import BGC from .gcf import GCF -from .genomics import filter_mibig_only_gcf from .genomics import generate_mappings_genome_id_bgc_id from .genomics import get_bgcs_from_gcfs from .genomics import get_strains_from_bgcs @@ -16,7 +15,6 @@ "BGCLoaderBase", "BGC", "GCF", - "filter_mibig_only_gcf", "generate_mappings_genome_id_bgc_id", "get_bgcs_from_gcfs", "get_strains_from_bgcs", diff --git a/src/nplinker/genomics/genomics.py b/src/nplinker/genomics/genomics.py index 53676be1..babadfe7 100644 --- a/src/nplinker/genomics/genomics.py +++ b/src/nplinker/genomics/genomics.py @@ -116,15 +116,6 @@ def map_bgc_to_gcf(bgcs: list[BGC], gcfs: list[GCF]): gcf.add_bgc(bgc) -def filter_mibig_only_gcf(gcfs: list[GCF]) -> list[GCF]: - """Filter out GCFs that contain only MIBiG BGC objects. - - This method returns a new list of GCFs that have at least one non-MIBiG - BGC object as its child. 
- """ - return [gcf for gcf in gcfs if gcf.has_mibig_only() is False] - - def get_bgcs_from_gcfs(gcfs: list[GCF]) -> list[BGC]: """Get all BGC objects from given GCF objects.""" s = set() diff --git a/src/nplinker/loader.py b/src/nplinker/loader.py index b6391c00..5155b723 100644 --- a/src/nplinker/loader.py +++ b/src/nplinker/loader.py @@ -8,7 +8,6 @@ from nplinker.genomics import generate_mappings_genome_id_bgc_id from nplinker.genomics.antismash import AntismashBGCLoader from nplinker.genomics.bigscape import BigscapeGCFLoader -from nplinker.genomics.genomics import filter_mibig_only_gcf from nplinker.genomics.genomics import get_bgcs_from_gcfs from nplinker.genomics.genomics import get_strains_from_bgcs from nplinker.genomics.genomics import map_bgc_to_gcf @@ -453,7 +452,7 @@ def _load_genomics(self): map_bgc_to_gcf(raw_bgcs, raw_gcfs) # Step 5: get clean GCF objects, BGC objects and Strain objects - self.gcfs = filter_mibig_only_gcf(raw_gcfs) + self.gcfs = raw_gcfs self.bgcs = get_bgcs_from_gcfs(self.gcfs) self.strains = get_strains_from_bgcs(self.bgcs) diff --git a/tests/genomics/test_genomics.py b/tests/genomics/test_genomics.py index f691d561..394f5a36 100644 --- a/tests/genomics/test_genomics.py +++ b/tests/genomics/test_genomics.py @@ -3,7 +3,6 @@ import pytest from nplinker.genomics import BGC from nplinker.genomics import GCF -from nplinker.genomics import filter_mibig_only_gcf from nplinker.genomics import generate_mappings_genome_id_bgc_id from nplinker.genomics import get_bgcs_from_gcfs from nplinker.genomics import get_strains_from_bgcs @@ -149,13 +148,6 @@ def test_map_bgc_to_gcf_error(bgc_list, gcf_list_error): assert "BGC id 'BGC_04' from GCF object '1' not found" in e.value.args[0] -def test_filter_mibig_only_gcf(bgc_list, gcf_list): - map_bgc_to_gcf(bgc_list, gcf_list) - gcfs = filter_mibig_only_gcf(gcf_list) - assert len(gcfs) == 1 - assert gcfs[0].gcf_id == "2" - - def test_get_bgcs_from_gcfs(bgc_list, gcf_list): map_bgc_to_gcf(bgc_list, 
gcf_list) bgcs = get_bgcs_from_gcfs(gcf_list)