Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update strain mappings generator #212

Merged
merged 1 commit into from
Mar 1, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 14 additions & 12 deletions src/nplinker/pairedomics/strain_mappings_generator.py
Original file line number Diff line number Diff line change
@@ -30,7 +30,7 @@ def podp_generate_strain_mappings(
podp_project_json_file: str | PathLike,
genome_status_json_file: str | PathLike,
genome_bgc_mappings_file: str | PathLike,
- gnps_file_mapping_tsv_file: str | PathLike,
+ gnps_file_mappings_file: str | PathLike,
output_json_file: str | PathLike,
) -> StrainCollection:
"""Generate strain mappings JSON file for PODP pipeline.
@@ -44,7 +44,7 @@ def podp_generate_strain_mappings(
- "original_genome_id <-> resolved_genome_id" is extracted from `genome_status_json_file`.
- "resolved_genome_id <-> bgc_id" is extracted from `genome_bgc_mappings_file`.
- "strain_id <-> MS_filename" is extracted from `podp_project_json_file`.
- - "MS_filename <-> spectrum_id" is extracted from `gnps_file_mapping_tsv_file`.
+ - "MS_filename <-> spectrum_id" is extracted from `gnps_file_mappings_file`.

Args:
podp_project_json_file(str | PathLike): The path to the PODP project
@@ -53,8 +53,8 @@ def podp_generate_strain_mappings(
JSON file.
genome_bgc_mappings_file(str | PathLike): The path to the genome BGC
mappings JSON file.
- gnps_file_mapping_tsv_file(str | PathLike): The path to the GNPS file
- mapping TSV file.
+ gnps_file_mappings_file(str | PathLike): The path to the GNPS file
+ mappings file (csv or tsv).
output_json_file(str | PathLike): The path to the output JSON file.

Returns:
@@ -84,7 +84,7 @@ def podp_generate_strain_mappings(
# Get mappings strain_id <-> MS_filename <-> spectrum_id
mappings_strain_id_spectrum_id = get_mappings_strain_id_spectrum_id(
extract_mappings_strain_id_ms_filename(podp_project_json_file),
- extract_mappings_ms_filename_spectrum_id(gnps_file_mapping_tsv_file),
+ extract_mappings_ms_filename_spectrum_id(gnps_file_mappings_file),
)

# Get mappings strain_id <-> bgc_id / spectrum_id
@@ -280,24 +280,26 @@ def extract_mappings_strain_id_ms_filename(
return mappings_dict


- def extract_mappings_ms_filename_spectrum_id(tsv_file: str | PathLike) -> dict[str, set[str]]:
+ def extract_mappings_ms_filename_spectrum_id(
+ gnps_file_mappings_file: str | PathLike
+ ) -> dict[str, set[str]]:
"""Extract mappings "MS_filename <-> spectrum_id".

Args:
- tsv_file(str | PathLike): The path to the GNPS file mapping TSV file.
+ gnps_file_mappings_file(str | PathLike): The path to the GNPS file mappings file (csv or
+ tsv).

Returns:
dict[str, set[str]]: Key is MS filename and value is a set of spectrum ids.

Notes:
- The `tsv_file` is generated by GNPS molecular networking. It's downloaded
- from GNPS website to a file with a default name defined in
- `GNPS_FILE_MAPPINGS_FILENAME`.
+ The `gnps_file_mappings_file` is generated by GNPS molecular networking. It's downloaded
+ from GNPS website to a file with a default name defined in `GNPS_FILE_MAPPINGS_FILENAME`.

See Also:
- `GNPSFileMappingLoader`: A class to load GNPS file mapping TSV file.
+ `GNPSFileMappingLoader`: A class to load GNPS file mappings file.
"""
- loader = GNPSFileMappingLoader(tsv_file)
+ loader = GNPSFileMappingLoader(gnps_file_mappings_file)
return loader.mapping_reversed


Expand Down
Loading