Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor loading genomics #168

Merged
merged 21 commits into from
Aug 24, 2023
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
8e4dcd8
remove useless step
CunliangGeng Aug 11, 2023
4917161
remove completed TODO comment
CunliangGeng Aug 11, 2023
afd81e5
remove param `bgc_genome_mapping` from func `map_strain_to_bgc`
CunliangGeng Aug 11, 2023
21cd0fc
add property names to class `Strain`
CunliangGeng Aug 11, 2023
3485155
update the logic of method `__contains__` in StrainCollection
CunliangGeng Aug 15, 2023
1211657
add method `has_name` to class `StrainCollection`
CunliangGeng Aug 14, 2023
14b3453
update method `has_strain` in classes
CunliangGeng Aug 14, 2023
ab226d5
rename MibigBGCLoader to MibigLoader
CunliangGeng Aug 14, 2023
1058928
rename method `get_bgc_genome_mapping` to `get_strain_bgc_mapping`
CunliangGeng Aug 14, 2023
36fda27
refactor mibig loading related code to `_load_mibig` method
CunliangGeng Aug 14, 2023
c668ca8
remove mibig downloading in class `DatasetLoader`
CunliangGeng Aug 14, 2023
a9b808f
refactor bigscape running code to `_run_bigscape`
CunliangGeng Aug 14, 2023
682a0a5
refactor method `_load_genomics`
CunliangGeng Aug 14, 2023
0991a47
add TODO comments
CunliangGeng Aug 15, 2023
70e1640
add TODO comment
CunliangGeng Aug 15, 2023
10364ef
change `StrainCollection._strain_dict_name` dict value type to list
CunliangGeng Aug 16, 2023
43d7f23
update references of method `StrainCollection.lookup`
CunliangGeng Aug 16, 2023
c489986
change KeyError to ValueError in `StrainCollection.lookup` method
CunliangGeng Aug 24, 2023
f73c683
add ValueError to `StrainCollection.remove` method
CunliangGeng Aug 24, 2023
11c9563
change KeyError to ValueError for function `map_strain_to_bgc`
CunliangGeng Aug 24, 2023
bb9b7d6
update docstring for function `map_strain_to_bgc`
CunliangGeng Aug 24, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 3 additions & 4 deletions src/nplinker/genomics/gcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from nplinker.logconfig import LogConfig
from nplinker.strain_collection import StrainCollection


if TYPE_CHECKING:
from nplinker.strains import Strain
from .bgc import BGC
Expand Down Expand Up @@ -33,8 +34,6 @@ def __init__(self, gcf_id: str, /) -> None:
self.gcf_id = gcf_id
self._bgcs: set[BGC] = set()
self.bigscape_class: str | None = None
# CG TODO: remove attribute id, see issue 103
# https://github.com/NPLinker/nplinker/issues/103
self.bgc_ids: set[str] = set()
self.strains: StrainCollection = StrainCollection()

Expand Down Expand Up @@ -83,11 +82,11 @@ def detach_bgc(self, bgc: BGC) -> None:
return
self.strains.remove(bgc.strain)

def has_strain(self, strain: str | Strain) -> bool:
def has_strain(self, strain: Strain) -> bool:
"""Check if the given strain exists.

Args:
strain(str | Strain): strain id or `Strain` object.
strain(Strain): `Strain` object.

Returns:
bool: True when the given strain exists.
Expand Down
34 changes: 14 additions & 20 deletions src/nplinker/genomics/genomics.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,39 +69,31 @@ def generate_mappings_genome_id_bgc_id(
logger.info("Generated genome-BGC mappings file: %s", output_file)


def map_strain_to_bgc(strains: StrainCollection, bgcs: list[BGC],
bgc_genome_mapping: dict[str, str]):
def map_strain_to_bgc(strains: StrainCollection, bgcs: list[BGC]):
"""To set BGC object's strain with representative strain object.

This method changes the list `bgcs` in place.

It's assumed that a BGC's genome id is used as the strain's name or alias,
and the genome id is used to look up the representative strain.

Args:
strains(StrainCollection): A collection of all strain objects.
bgcs(list[BGC]): A list of BGC objects.
bgc_genome_mapping(dict[str, str]): The mappings from BGC id (key) to
genome id (value).

Raises:
KeyError: BGC id not found in the `bgc_genome_mapping` dict.
KeyError: Strain id not found in the strain collection.
"""
for bgc in bgcs:
try:
genome_id = bgc_genome_mapping[bgc.bgc_id]
strain_list = strains.lookup(bgc.bgc_id)
if len(strain_list) > 1:
raise KeyError(
f"Multiple strain objects found for BGC id '{bgc.bgc_id}'."
f"BGC object accept only one strain."
)
except KeyError as e:
raise KeyError(
f"Not found BGC id {bgc.bgc_id} in BGC-genome mappings."
) from e
try:
strain = strains.lookup(genome_id)
except KeyError as e:
raise KeyError(
f"Strain id {genome_id} from BGC object {bgc.bgc_id} "
"not found in the StrainCollection object.") from e
bgc.strain = strain
f"Strain id '{bgc.bgc_id}' from BGC object '{bgc.bgc_id}' "
"not found in the strain collection.") from e
bgc.strain = strain_list[0]


def map_bgc_to_gcf(bgcs: list[BGC], gcfs: list[GCF]):
Expand All @@ -122,8 +114,9 @@ def map_bgc_to_gcf(bgcs: list[BGC], gcfs: list[GCF]):
try:
bgc = bgc_dict[bgc_id]
except KeyError as e:
raise KeyError(f"BGC id {bgc_id} from GCF object {gcf.gcf_id} "
"not found in the list of BGC objects.") from e
raise KeyError(
f"BGC id '{bgc_id}' from GCF object '{gcf.gcf_id}' "
"not found in the list of BGC objects.") from e
gcf.add_bgc(bgc)


Expand Down Expand Up @@ -155,6 +148,7 @@ def get_strains_from_bgcs(bgcs: list[BGC]) -> StrainCollection:
return sc



@deprecated(version="1.3.3", reason="It is split to separate functions: " \
"map_strain_to_bgc, map_bgc_to_gcf, filter_mibig_only_gcf, " \
"get_bgcs_from_gcfs and get_strains_from_bgcs.")
Expand Down
4 changes: 3 additions & 1 deletion src/nplinker/genomics/mibig/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import logging
from .mibig_downloader import download_and_extract_mibig_metadata
from .mibig_loader import MibigBGCLoader, parse_bgc_metadata_json
from .mibig_loader import MibigLoader
from .mibig_loader import parse_bgc_metadata_json
from .mibig_metadata import MibigMetadata


logging.getLogger(__name__).addHandler(logging.NullHandler())
15 changes: 6 additions & 9 deletions src/nplinker/genomics/mibig/mibig_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
logger = LogConfig.getLogger(__name__)


class MibigBGCLoader:
class MibigLoader:

def __init__(self, data_dir: str):
"""Parse MIBiG metadata files and return BGC objects
Expand All @@ -26,16 +26,13 @@ def __init__(self, data_dir: str):
self._metadata_dict = self._parse_metadatas()
self._bgc_dict = self._parse_bgcs()

def get_bgc_genome_mapping(self) -> dict[str, str]:
"""Get the mapping from BGC to genome.
def get_strain_bgc_mapping(self) -> dict[str, str]:
"""Get the mapping from strain to BGC.

Note that for MIBiG BGC, same value is used for BGC id and genome id.
Users don't have to provide genome id for MIBiG BGCs in the
`strain_mappings.json` file.
Note that for MIBiG BGCs, the same value is used for the strain name and the BGC id.

Returns:
dict[str, str]: key is BGC id/accession, value is
genome id that uses the value of BGC accession.
dict[str, str]: key is strain name, value is BGC id.
"""
return {bid: bid for bid in self._file_dict}

Expand Down Expand Up @@ -129,4 +126,4 @@ def parse_bgc_metadata_json(file: str) -> BGC:


# register as virtual class to prevent metaclass conflicts
BGCLoaderBase.register(MibigBGCLoader)
BGCLoaderBase.register(MibigLoader)
Loading
Loading