Skip to content

Replace completeness multitasker with d4tools perc_cov #349

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 18 commits into from
Sep 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
- `d4_genes_condensed_summary` coverage endpoint will not convert `nan` or `inf` coverage values to None, but to str(value)
- Updated the Dockerfile base image so it contains the latest d4tools (master branch)
- Updated tests workflow to cargo install the latest d4tools from git (master branch)
- Computing coverage completeness stats using d4tools `perc_cov` stat function (much quicker reports)
### Fixed
- Updated dependencies including `certifi` to address dependabot alert
- Update pytest to v.7.4.4 to address a `ReDoS` vulnerability
Expand Down
32 changes: 12 additions & 20 deletions src/chanjo2/endpoints/coverage.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,7 @@
from chanjo2.crud.intervals import get_genes, set_sql_intervals
from chanjo2.dbutil import get_session
from chanjo2.meta.handle_bed import bed_file_interval_id_coords
from chanjo2.meta.handle_completeness_tasks import (
coverage_completeness_multitasker,
get_d4tools_coverage_completeness,
)
from chanjo2.meta.handle_completeness_stats import get_completeness_stats
from chanjo2.meta.handle_d4 import (
get_d4tools_chromosome_mean_coverage,
get_d4tools_intervals_coverage,
Expand Down Expand Up @@ -61,21 +58,20 @@ def d4_interval_coverage(query: FileCoverageQuery):
)

interval += f"\t{query.start}\t{query.end}"
completeness_dict = {}

mean_coverage: float = get_d4tools_intervals_mean_coverage(
d4_file_path=query.coverage_file_path, intervals=[interval]
)[0]
get_d4tools_coverage_completeness(

completeness_stats = get_completeness_stats(
d4_file_path=query.coverage_file_path,
interval_ids_coords=[(interval, (query.chromosome, query.start, query.end))],
thresholds=query.completeness_thresholds,
return_dict=completeness_dict,
)

return IntervalCoverage(
mean_coverage=mean_coverage,
completeness=completeness_dict.get(interval),
completeness=completeness_stats[interval],
interval_id=f"{query.chromosome}:{query.start}-{query.end}",
)

Expand Down Expand Up @@ -107,12 +103,10 @@ def d4_intervals_coverage(query: FileCoverageIntervalsFileQuery):
intervals_coverage: List[float] = get_d4tools_intervals_coverage(
d4_file_path=query.coverage_file_path, bed_file_path=query.intervals_bed_path
)
intervals_completeness: Dict[str, Dict[int, float]] = (
coverage_completeness_multitasker(
d4_file_path=query.coverage_file_path,
thresholds=query.completeness_thresholds,
interval_ids_coords=interval_id_coords,
)
intervals_completeness: Dict[str, Dict[int, float]] = get_completeness_stats(
d4_file_path=query.coverage_file_path,
thresholds=query.completeness_thresholds,
interval_ids_coords=interval_id_coords,
)

results: List[IntervalCoverage] = []
Expand Down Expand Up @@ -176,12 +170,10 @@ def d4_genes_condensed_summary(
(interval.ensembl_id, (interval.chromosome, interval.start, interval.stop))
for interval in sql_intervals
]
genes_coverage_completeness: Dict[str, dict] = (
coverage_completeness_multitasker(
d4_file_path=sample.coverage_file_path,
thresholds=[query.coverage_threshold],
interval_ids_coords=interval_ids_coords,
)
genes_coverage_completeness: Dict[str, dict] = get_completeness_stats(
d4_file_path=sample.coverage_file_path,
thresholds=[query.coverage_threshold],
interval_ids_coords=interval_ids_coords,
)
genes_coverage_completeness_values: List[float] = [
value[query.coverage_threshold] * 100
Expand Down
77 changes: 77 additions & 0 deletions src/chanjo2/meta/handle_completeness_stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
import subprocess
import tempfile
from typing import Dict, List, Tuple

CHROM_INDEX = 0
START_INDEX = 1
STOP_INDEX = 2


def get_d4tools_intervals_completeness(
d4_file_path: str, bed_file_path: str, completeness_thresholds: List[int]
) -> List[Dict]:
"""Return coverage completeness over all intervals of a bed file using the perc_cov d4tools command."""
threshold_stats = []
d4tools_stats_perc_cov: str = subprocess.check_output(
[
"d4tools",
"stat",
"-s",
f"perc_cov={','.join(str(threshold) for threshold in completeness_thresholds)}",
"--region",
bed_file_path,
d4_file_path,
],
text=True,
)
for line in d4tools_stats_perc_cov.splitlines():
stats_dict: Dict = dict(
(
zip(
completeness_thresholds,
[float(stat) for stat in line.rstrip().split("\t")[3:]],
)
)
)
threshold_stats.append(stats_dict)

return threshold_stats


def get_completeness_stats(
d4_file_path: str,
thresholds: List[int],
interval_ids_coords: List[Tuple[str, tuple]],
) -> Dict[str, dict]:
"""Compute coverage and completeness over the given intervals of a d4 file."""

interval_ids_coords = sorted(
interval_ids_coords,
key=lambda interval_coord: (
interval_coord[1][CHROM_INDEX],
interval_coord[1][START_INDEX],
interval_coord[1][STOP_INDEX],
),
)
interval_id_completeness_stats: Dict[str:dict] = {}

bed_lines = [
f"{coords[CHROM_INDEX]}\t{coords[START_INDEX]}\t{coords[STOP_INDEX]}"
for _, coords in interval_ids_coords
]
# Write genomic intervals to a temporary file
with tempfile.NamedTemporaryFile(mode="w") as intervals_bed:
intervals_bed.write("\n".join(bed_lines))
intervals_bed.flush()
intervals_completeness = get_d4tools_intervals_completeness(
d4_file_path=d4_file_path,
bed_file_path=intervals_bed.name,
completeness_thresholds=thresholds,
)

for index, interval_id_coord in enumerate(interval_ids_coords):
interval_id_completeness_stats[interval_id_coord[0]] = intervals_completeness[
index
]

return interval_id_completeness_stats
74 changes: 0 additions & 74 deletions src/chanjo2/meta/handle_completeness_tasks.py

This file was deleted.

35 changes: 13 additions & 22 deletions src/chanjo2/meta/handle_d4.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import logging
import subprocess
import tempfile
from statistics import mean
Expand All @@ -8,7 +7,7 @@

from chanjo2.constants import CHROMOSOMES
from chanjo2.crud.intervals import get_gene_intervals, set_sql_intervals
from chanjo2.meta.handle_completeness_tasks import coverage_completeness_multitasker
from chanjo2.meta.handle_completeness_stats import get_completeness_stats
from chanjo2.models import SQLExon, SQLGene, SQLTranscript
from chanjo2.models.pydantic_models import (
GeneCoverage,
Expand All @@ -18,7 +17,6 @@
TranscriptTag,
)

LOG = logging.getLogger(__name__)
CHROM_INDEX = 0
START_INDEX = 1
STOP_INDEX = 2
Expand Down Expand Up @@ -90,10 +88,6 @@ def get_report_sample_interval_coverage(
) -> None:
"""Compute stats to populate a coverage report and coverage overview for one sample."""

if not intervals_coords:
intervals_coords = []
completeness_thresholds = []

# Compute intervals coverage
intervals_coverage: List[float] = get_d4tools_intervals_mean_coverage(
d4_file_path=d4_file_path, intervals=intervals_coords
Expand All @@ -105,12 +99,10 @@ def get_report_sample_interval_coverage(
(interval.ensembl_id, (interval.chromosome, interval.start, interval.stop))
for interval in sql_intervals
]
intervals_coverage_completeness: Dict[str, dict] = (
coverage_completeness_multitasker(
d4_file_path=d4_file_path,
thresholds=completeness_thresholds,
interval_ids_coords=interval_ids_coords,
)
intervals_coverage_completeness: Dict[str, dict] = get_completeness_stats(
d4_file_path=d4_file_path,
thresholds=completeness_thresholds,
interval_ids_coords=interval_ids_coords,
)

interval_ids = set()
Expand Down Expand Up @@ -156,9 +148,10 @@ def get_report_sample_interval_coverage(
interval_ids.add(interval.ensembl_id)

for threshold in completeness_thresholds:
completeness_row_dict[f"completeness_{threshold}"] = round(
mean(thresholds_dict[threshold]) * 100, 2
)
if thresholds_dict[threshold]:
completeness_row_dict[f"completeness_{threshold}"] = round(
mean(thresholds_dict[threshold]) * 100, 2
)

report_data["completeness_rows"].append((sample_name, completeness_row_dict))
report_data["incomplete_coverage_rows"] += incomplete_coverages_rows
Expand Down Expand Up @@ -200,12 +193,10 @@ def get_sample_interval_coverage(
(interval.ensembl_id, (interval.chromosome, interval.start, interval.stop))
for interval in sql_intervals
]
intervals_coverage_completeness: Dict[str, dict] = (
coverage_completeness_multitasker(
d4_file_path=d4_file_path,
thresholds=completeness_thresholds,
interval_ids_coords=interval_ids_coords,
)
intervals_coverage_completeness: Dict[str, dict] = get_completeness_stats(
d4_file_path=d4_file_path,
thresholds=completeness_thresholds,
interval_ids_coords=interval_ids_coords,
)

# Create GeneCoverage objects
Expand Down
Loading