Skip to content

Commit 000f53e

Browse files
Merge pull request #349 from Clinical-Genomics/remove_completeness_multitasker
Replace completeness multitasker with d4tools perc_cov
2 parents c669513 + 4263b2e commit 000f53e

File tree

5 files changed

+103
-116
lines changed

5 files changed

+103
-116
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
- `d4_genes_condensed_summary` coverage endpoint will not convert `nan` or `inf` coverage values to None, but to str(value)
1313
- Updated the Dockerfile base image so it contains the latest d4tools (master branch)
1414
- Updated tests workflow to cargo install the latest d4tools from git (master branch)
15+
- Computing coverage completeness stats using d4tools `perc_cov` stat function (much quicker reports)
1516
### Fixed
1617
- Updated dependencies including `certifi` to address dependabot alert
1718
- Update pytest to v.7.4.4 to address a `ReDoS` vulnerability

src/chanjo2/endpoints/coverage.py

Lines changed: 12 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,7 @@
1212
from chanjo2.crud.intervals import get_genes, set_sql_intervals
1313
from chanjo2.dbutil import get_session
1414
from chanjo2.meta.handle_bed import bed_file_interval_id_coords
15-
from chanjo2.meta.handle_completeness_tasks import (
16-
coverage_completeness_multitasker,
17-
get_d4tools_coverage_completeness,
18-
)
15+
from chanjo2.meta.handle_completeness_stats import get_completeness_stats
1916
from chanjo2.meta.handle_d4 import (
2017
get_d4tools_chromosome_mean_coverage,
2118
get_d4tools_intervals_coverage,
@@ -61,21 +58,20 @@ def d4_interval_coverage(query: FileCoverageQuery):
6158
)
6259

6360
interval += f"\t{query.start}\t{query.end}"
64-
completeness_dict = {}
6561

6662
mean_coverage: float = get_d4tools_intervals_mean_coverage(
6763
d4_file_path=query.coverage_file_path, intervals=[interval]
6864
)[0]
69-
get_d4tools_coverage_completeness(
65+
66+
completeness_stats = get_completeness_stats(
7067
d4_file_path=query.coverage_file_path,
7168
interval_ids_coords=[(interval, (query.chromosome, query.start, query.end))],
7269
thresholds=query.completeness_thresholds,
73-
return_dict=completeness_dict,
7470
)
7571

7672
return IntervalCoverage(
7773
mean_coverage=mean_coverage,
78-
completeness=completeness_dict.get(interval),
74+
completeness=completeness_stats[interval],
7975
interval_id=f"{query.chromosome}:{query.start}-{query.end}",
8076
)
8177

@@ -107,12 +103,10 @@ def d4_intervals_coverage(query: FileCoverageIntervalsFileQuery):
107103
intervals_coverage: List[float] = get_d4tools_intervals_coverage(
108104
d4_file_path=query.coverage_file_path, bed_file_path=query.intervals_bed_path
109105
)
110-
intervals_completeness: Dict[str, Dict[int, float]] = (
111-
coverage_completeness_multitasker(
112-
d4_file_path=query.coverage_file_path,
113-
thresholds=query.completeness_thresholds,
114-
interval_ids_coords=interval_id_coords,
115-
)
106+
intervals_completeness: Dict[str, Dict[int, float]] = get_completeness_stats(
107+
d4_file_path=query.coverage_file_path,
108+
thresholds=query.completeness_thresholds,
109+
interval_ids_coords=interval_id_coords,
116110
)
117111

118112
results: List[IntervalCoverage] = []
@@ -176,12 +170,10 @@ def d4_genes_condensed_summary(
176170
(interval.ensembl_id, (interval.chromosome, interval.start, interval.stop))
177171
for interval in sql_intervals
178172
]
179-
genes_coverage_completeness: Dict[str, dict] = (
180-
coverage_completeness_multitasker(
181-
d4_file_path=sample.coverage_file_path,
182-
thresholds=[query.coverage_threshold],
183-
interval_ids_coords=interval_ids_coords,
184-
)
173+
genes_coverage_completeness: Dict[str, dict] = get_completeness_stats(
174+
d4_file_path=sample.coverage_file_path,
175+
thresholds=[query.coverage_threshold],
176+
interval_ids_coords=interval_ids_coords,
185177
)
186178
genes_coverage_completeness_values: List[float] = [
187179
value[query.coverage_threshold] * 100
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
import subprocess
2+
import tempfile
3+
from typing import Dict, List, Tuple
4+
5+
CHROM_INDEX = 0
6+
START_INDEX = 1
7+
STOP_INDEX = 2
8+
9+
10+
def get_d4tools_intervals_completeness(
11+
d4_file_path: str, bed_file_path: str, completeness_thresholds: List[int]
12+
) -> List[Dict]:
13+
"""Return coverage completeness over all intervals of a bed file using the perc_cov d4tools command."""
14+
threshold_stats = []
15+
d4tools_stats_perc_cov: str = subprocess.check_output(
16+
[
17+
"d4tools",
18+
"stat",
19+
"-s",
20+
f"perc_cov={','.join(str(threshold) for threshold in completeness_thresholds)}",
21+
"--region",
22+
bed_file_path,
23+
d4_file_path,
24+
],
25+
text=True,
26+
)
27+
for line in d4tools_stats_perc_cov.splitlines():
28+
stats_dict: Dict = dict(
29+
(
30+
zip(
31+
completeness_thresholds,
32+
[float(stat) for stat in line.rstrip().split("\t")[3:]],
33+
)
34+
)
35+
)
36+
threshold_stats.append(stats_dict)
37+
38+
return threshold_stats
39+
40+
41+
def get_completeness_stats(
42+
d4_file_path: str,
43+
thresholds: List[int],
44+
interval_ids_coords: List[Tuple[str, tuple]],
45+
) -> Dict[str, dict]:
46+
"""Compute coverage and completeness over the given intervals of a d4 file."""
47+
48+
interval_ids_coords = sorted(
49+
interval_ids_coords,
50+
key=lambda interval_coord: (
51+
interval_coord[1][CHROM_INDEX],
52+
interval_coord[1][START_INDEX],
53+
interval_coord[1][STOP_INDEX],
54+
),
55+
)
56+
interval_id_completeness_stats: Dict[str:dict] = {}
57+
58+
bed_lines = [
59+
f"{coords[CHROM_INDEX]}\t{coords[START_INDEX]}\t{coords[STOP_INDEX]}"
60+
for _, coords in interval_ids_coords
61+
]
62+
# Write genomic intervals to a temporary file
63+
with tempfile.NamedTemporaryFile(mode="w") as intervals_bed:
64+
intervals_bed.write("\n".join(bed_lines))
65+
intervals_bed.flush()
66+
intervals_completeness = get_d4tools_intervals_completeness(
67+
d4_file_path=d4_file_path,
68+
bed_file_path=intervals_bed.name,
69+
completeness_thresholds=thresholds,
70+
)
71+
72+
for index, interval_id_coord in enumerate(interval_ids_coords):
73+
interval_id_completeness_stats[interval_id_coord[0]] = intervals_completeness[
74+
index
75+
]
76+
77+
return interval_id_completeness_stats

src/chanjo2/meta/handle_completeness_tasks.py

Lines changed: 0 additions & 74 deletions
This file was deleted.

src/chanjo2/meta/handle_d4.py

Lines changed: 13 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
import logging
21
import subprocess
32
import tempfile
43
from statistics import mean
@@ -8,7 +7,7 @@
87

98
from chanjo2.constants import CHROMOSOMES
109
from chanjo2.crud.intervals import get_gene_intervals, set_sql_intervals
11-
from chanjo2.meta.handle_completeness_tasks import coverage_completeness_multitasker
10+
from chanjo2.meta.handle_completeness_stats import get_completeness_stats
1211
from chanjo2.models import SQLExon, SQLGene, SQLTranscript
1312
from chanjo2.models.pydantic_models import (
1413
GeneCoverage,
@@ -18,7 +17,6 @@
1817
TranscriptTag,
1918
)
2019

21-
LOG = logging.getLogger(__name__)
2220
CHROM_INDEX = 0
2321
START_INDEX = 1
2422
STOP_INDEX = 2
@@ -90,10 +88,6 @@ def get_report_sample_interval_coverage(
9088
) -> None:
9189
"""Compute stats to populate a coverage report and coverage overview for one sample."""
9290

93-
if not intervals_coords:
94-
intervals_coords = []
95-
completeness_thresholds = []
96-
9791
# Compute intervals coverage
9892
intervals_coverage: List[float] = get_d4tools_intervals_mean_coverage(
9993
d4_file_path=d4_file_path, intervals=intervals_coords
@@ -105,12 +99,10 @@ def get_report_sample_interval_coverage(
10599
(interval.ensembl_id, (interval.chromosome, interval.start, interval.stop))
106100
for interval in sql_intervals
107101
]
108-
intervals_coverage_completeness: Dict[str, dict] = (
109-
coverage_completeness_multitasker(
110-
d4_file_path=d4_file_path,
111-
thresholds=completeness_thresholds,
112-
interval_ids_coords=interval_ids_coords,
113-
)
102+
intervals_coverage_completeness: Dict[str, dict] = get_completeness_stats(
103+
d4_file_path=d4_file_path,
104+
thresholds=completeness_thresholds,
105+
interval_ids_coords=interval_ids_coords,
114106
)
115107

116108
interval_ids = set()
@@ -156,9 +148,10 @@ def get_report_sample_interval_coverage(
156148
interval_ids.add(interval.ensembl_id)
157149

158150
for threshold in completeness_thresholds:
159-
completeness_row_dict[f"completeness_{threshold}"] = round(
160-
mean(thresholds_dict[threshold]) * 100, 2
161-
)
151+
if thresholds_dict[threshold]:
152+
completeness_row_dict[f"completeness_{threshold}"] = round(
153+
mean(thresholds_dict[threshold]) * 100, 2
154+
)
162155

163156
report_data["completeness_rows"].append((sample_name, completeness_row_dict))
164157
report_data["incomplete_coverage_rows"] += incomplete_coverages_rows
@@ -200,12 +193,10 @@ def get_sample_interval_coverage(
200193
(interval.ensembl_id, (interval.chromosome, interval.start, interval.stop))
201194
for interval in sql_intervals
202195
]
203-
intervals_coverage_completeness: Dict[str, dict] = (
204-
coverage_completeness_multitasker(
205-
d4_file_path=d4_file_path,
206-
thresholds=completeness_thresholds,
207-
interval_ids_coords=interval_ids_coords,
208-
)
196+
intervals_coverage_completeness: Dict[str, dict] = get_completeness_stats(
197+
d4_file_path=d4_file_path,
198+
thresholds=completeness_thresholds,
199+
interval_ids_coords=interval_ids_coords,
209200
)
210201

211202
# Create GeneCoverage objects

0 commit comments

Comments
 (0)