Skip to content

Commit 684dcab

Browse files
Merge pull request #394 from Clinical-Genomics/fix_mysql_create_tables
Modify the genes table, converting the ensembl_id string field to ensembl_ids, an array of strings
2 parents b8aa953 + 1e1eafa commit 684dcab

File tree

9 files changed

+162
-101
lines changed

9 files changed

+162
-101
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
### Changed
33
- Updated several libraries including schug (now v.7)
44
- Update project's Python version to 3.9
5+
- Modified the structure of the database table `genes`, converting the `ensembl_id` string field to `ensembl_ids`, an array of strings. This change addresses a recent change in MySQL: https://bugs.mysql.com/bug.php?id=114838
56
### Fixed
67
- The MariaDB healthcheck step in docker-compose-mysql.yml, which prevented the demo app from starting
78

src/chanjo2/crud/intervals.py

Lines changed: 37 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import logging
22
from typing import List, Optional, Union
33

4-
from sqlalchemy import delete, or_
4+
from sqlalchemy import delete, or_, text
55
from sqlalchemy.orm import Session, query
66
from sqlalchemy.sql.expression import Delete
77

@@ -71,9 +71,19 @@ def get_genes(
7171
) -> List[SQLGene]:
7272
"""Return genes according to specified fields."""
7373
genes: query.Query = db.query(SQLGene)
74-
7574
if ensembl_ids:
76-
genes: query.Query = genes.filter(SQLGene.ensembl_id.in_(ensembl_ids))
75+
ensembl_ids_placeholder = ", ".join(f"'{e}'" for e in ensembl_ids)
76+
genes = genes.filter(
77+
text(
78+
f"""
79+
EXISTS (
80+
SELECT 1
81+
FROM json_each(genes.ensembl_ids)
82+
WHERE value IN ({ensembl_ids_placeholder})
83+
)
84+
"""
85+
)
86+
)
7787
elif hgnc_ids:
7888
genes: query.Query = genes.filter(SQLGene.hgnc_id.in_(hgnc_ids))
7989
elif hgnc_symbols:
@@ -151,7 +161,9 @@ def set_sql_intervals(
151161
ensembl_ids=None,
152162
hgnc_ids=None,
153163
hgnc_symbols=None,
154-
ensembl_gene_ids=[gene.ensembl_id for gene in genes],
164+
ensembl_gene_ids=[
165+
ensembl_id for gene in genes for ensembl_id in gene.ensembl_ids
166+
],
155167
limit=None,
156168
transcript_tags=transcript_tags,
157169
)
@@ -171,27 +183,39 @@ def get_gene_intervals(
171183
) -> List[Union[SQLTranscript, SQLExon]]:
172184
"""Retrieve transcripts or exons from a list of genes."""
173185

174-
intervals: query.Query = db.query(interval_type).join(SQLGene)
186+
intervals = db.query(interval_type).filter(interval_type.build == build)
187+
188+
def get_ensembl_gene_ids_from_gene_filter(
189+
filter_value: List[Union[str, int]], filter_column: str
190+
) -> List[str]:
191+
"""Helper function to get ensembl_gene_ids from either hgnc_ids or hgnc_symbols."""
192+
genes = (
193+
db.query(SQLGene.ensembl_ids).filter(filter_column.in_(filter_value)).all()
194+
)
195+
return [ensembl_id for gene in genes for ensembl_id in gene.ensembl_ids]
196+
175197
if ensembl_ids:
176198
intervals: query.Query = intervals.filter(
177199
interval_type.ensembl_id.in_(ensembl_ids)
178200
)
179-
elif ensembl_gene_ids:
180-
intervals: query.Query = intervals.filter(
181-
interval_type.ensembl_gene_id.in_(ensembl_gene_ids)
182-
)
183201
elif hgnc_ids:
184-
intervals: query.Query = intervals.filter(SQLGene.hgnc_id.in_(hgnc_ids))
202+
ensembl_gene_ids = get_ensembl_gene_ids_from_gene_filter(
203+
hgnc_ids, SQLGene.hgnc_id
204+
)
185205
elif hgnc_symbols:
186-
intervals: query.Query = intervals.filter(SQLGene.hgnc_symbol.in_(hgnc_symbols))
206+
ensembl_gene_ids = get_ensembl_gene_ids_from_gene_filter(
207+
hgnc_symbols, SQLGene.hgnc_symbol
208+
)
209+
if ensembl_gene_ids:
210+
intervals = intervals.filter(
211+
interval_type.ensembl_gene_id.in_(ensembl_gene_ids)
212+
)
187213

188214
if interval_type == SQLTranscript and transcript_tags:
189215
intervals = _filter_transcripts_by_tag(
190216
transcripts=intervals, transcript_tags=transcript_tags
191217
)
192218

193-
intervals: query.Query = intervals.filter(interval_type.build == build)
194-
195219
if limit:
196220
return intervals.limit(limit).all()
197221

src/chanjo2/endpoints/coverage.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
get_d4tools_chromosome_mean_coverage,
2121
get_d4tools_intervals_mean_coverage,
2222
)
23-
from chanjo2.meta.handle_d4 import get_samples_sex_metrics
23+
from chanjo2.meta.handle_d4 import get_samples_sex_metrics, set_interval_ids_coords
2424
from chanjo2.meta.handle_report_contents import INTERVAL_TYPE_SQL_TYPE, get_mean
2525
from chanjo2.models import SQLGene
2626
from chanjo2.models.pydantic_models import (
@@ -167,10 +167,10 @@ def d4_genes_condensed_summary(
167167
detail=WRONG_COVERAGE_FILE_MSG,
168168
)
169169

170-
interval_ids_coords: List[Tuple[str, Tuple[str, int, int]]] = [
171-
(interval.ensembl_id, (interval.chromosome, interval.start, interval.stop))
172-
for interval in sql_intervals
173-
]
170+
interval_ids_coords: List[Tuple[str, Tuple[str, int, int]]] = (
171+
set_interval_ids_coords(sql_intervals=sql_intervals)
172+
)
173+
174174
# Sort intervals by chrom, start & stop
175175
interval_ids_coords = sort_interval_ids_coords(interval_ids_coords)
176176

src/chanjo2/meta/handle_d4.py

Lines changed: 78 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,26 @@
1919
LOG = logging.getLogger(__name__)
2020

2121

22+
def set_interval_ids_coords(
23+
sql_intervals: List[Union[SQLGene, SQLTranscript, SQLExon]]
24+
) -> List[Tuple[str, Tuple[str, int, int]]]:
25+
"""Returns tuples with an ensembl_id and coordinates from a list of SQL intervals."""
26+
27+
if not sql_intervals:
28+
return []
29+
if isinstance(sql_intervals[0], SQLGene):
30+
return [
31+
(ensembl_id, (interval.chromosome, interval.start, interval.stop))
32+
for interval in sql_intervals
33+
for ensembl_id in interval.ensembl_ids
34+
]
35+
else:
36+
return [
37+
(interval.ensembl_id, (interval.chromosome, interval.start, interval.stop))
38+
for interval in sql_intervals
39+
]
40+
41+
2242
def get_report_sample_interval_coverage(
2343
d4_file_path: str,
2444
sample_name: str,
@@ -31,11 +51,9 @@ def get_report_sample_interval_coverage(
3151
"""Compute stats to populate a coverage report for one sample."""
3252

3353
# Compute intervals coverage completeness
34-
interval_ids_coords: List[Tuple[str, Tuple[str, int, int]]] = [
35-
(interval.ensembl_id, (interval.chromosome, interval.start, interval.stop))
36-
for interval in sql_intervals
37-
]
38-
54+
interval_ids_coords: List[Tuple[str, Tuple[str, int, int]]] = (
55+
set_interval_ids_coords(sql_intervals=sql_intervals)
56+
)
3957
interval_ids_coords = sort_interval_ids_coords(interval_ids_coords)
4058

4159
# Compute intervals coverage
@@ -57,51 +75,61 @@ def get_report_sample_interval_coverage(
5775
nr_intervals_covered_under_custom_threshold: int = 0
5876
genes_covered_under_custom_threshold = set()
5977

60-
for interval_nr, interval in enumerate(sql_intervals):
61-
62-
if interval.ensembl_id in interval_ids:
63-
continue
64-
for threshold in completeness_thresholds:
65-
interval_coverage_at_threshold: float = intervals_coverage_completeness[
66-
interval.ensembl_id
67-
][threshold]
68-
thresholds_dict[threshold].append(interval_coverage_at_threshold)
69-
70-
# Collect intervals which are not completely covered at the custom threshold
71-
if threshold == default_threshold and interval_coverage_at_threshold < 1:
72-
nr_intervals_covered_under_custom_threshold += 1
73-
interval_ensembl_gene: str = (
74-
interval.ensembl_id
75-
if interval.ensembl_id.startswith("ENSG")
76-
else interval.ensembl_gene_id
77-
)
78-
interval_hgnc_id: int = gene_ids_mapping[interval_ensembl_gene][
79-
"hgnc_id"
80-
]
81-
interval_hgnc_symbol: str = gene_ids_mapping[interval_ensembl_gene][
82-
"hgnc_symbol"
83-
]
84-
genes_covered_under_custom_threshold.add(interval_hgnc_symbol)
85-
incomplete_coverages_rows.append(
86-
(
87-
interval_hgnc_symbol,
88-
interval_hgnc_id,
89-
interval.ensembl_id,
78+
for interval in sql_intervals:
79+
80+
if hasattr(interval, "ensembl_ids"):
81+
ensembl_ids = interval.ensembl_ids
82+
else:
83+
ensembl_ids = [interval.ensembl_id]
84+
85+
for ensembl_id in ensembl_ids:
86+
87+
if ensembl_id in interval_ids:
88+
continue
89+
for threshold in completeness_thresholds:
90+
interval_coverage_at_threshold: float = intervals_coverage_completeness[
91+
ensembl_id
92+
][threshold]
93+
thresholds_dict[threshold].append(interval_coverage_at_threshold)
94+
95+
# Collect intervals which are not completely covered at the custom threshold
96+
if (
97+
threshold == default_threshold
98+
and interval_coverage_at_threshold < 1
99+
):
100+
nr_intervals_covered_under_custom_threshold += 1
101+
interval_ensembl_gene: str = (
102+
ensembl_id
103+
if ensembl_id.startswith("ENSG")
104+
else interval.ensembl_gene_id
105+
)
106+
interval_hgnc_id: int = gene_ids_mapping[interval_ensembl_gene][
107+
"hgnc_id"
108+
]
109+
interval_hgnc_symbol: str = gene_ids_mapping[interval_ensembl_gene][
110+
"hgnc_symbol"
111+
]
112+
genes_covered_under_custom_threshold.add(interval_hgnc_symbol)
113+
incomplete_coverages_rows.append(
90114
(
91-
{
92-
"mane_select": interval.refseq_mane_select,
93-
"mane_plus_clinical": interval.refseq_mane_plus_clinical,
94-
"mrna": interval.refseq_mrna,
95-
}
96-
if isinstance(interval, SQLTranscript)
97-
else {}
98-
),
99-
sample_name,
100-
round(interval_coverage_at_threshold * 100, 2),
115+
interval_hgnc_symbol,
116+
interval_hgnc_id,
117+
ensembl_id,
118+
(
119+
{
120+
"mane_select": interval.refseq_mane_select,
121+
"mane_plus_clinical": interval.refseq_mane_plus_clinical,
122+
"mrna": interval.refseq_mrna,
123+
}
124+
if isinstance(interval, SQLTranscript)
125+
else {}
126+
),
127+
sample_name,
128+
round(interval_coverage_at_threshold * 100, 2),
129+
)
101130
)
102-
)
103131

104-
interval_ids.add(interval.ensembl_id)
132+
interval_ids.add(ensembl_id)
105133

106134
for threshold in completeness_thresholds:
107135
if thresholds_dict[threshold]:
@@ -166,10 +194,9 @@ def get_gene_overview_stats(
166194
completeness_thresholds: List[int],
167195
) -> Dict[str, list]:
168196
"""Returns stats to be included in the gene overview page."""
169-
interval_ids_coords: List[Tuple[str, Tuple[str, int, int]]] = [
170-
(interval.ensembl_id, (interval.chromosome, interval.start, interval.stop))
171-
for interval in sql_intervals
172-
]
197+
interval_ids_coords: List[Tuple[str, Tuple[str, int, int]]] = (
198+
set_interval_ids_coords(sql_intervals=sql_intervals)
199+
)
173200
interval_ids_coords = tuple(
174201
sort_interval_ids_coords(set(interval_ids_coords))
175202
) # removes duplicates and orders intervals by chromosome, start and stop

src/chanjo2/meta/handle_load_intervals.py

Lines changed: 27 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@
2323
from chanjo2.models.pydantic_models import (
2424
Builds,
2525
ExonBase,
26-
GeneBase,
2726
IntervalType,
2827
TranscriptBase,
2928
)
@@ -45,8 +44,11 @@ def read_resource_lines(build: Builds, interval_type: IntervalType) -> Iterator[
4544

4645

4746
def _replace_empty_cols(line: str, nr_expected_columns: int) -> List[Union[str, None]]:
48-
"""Split gene line into columns, replacing empty columns with None values."""
49-
cols = [None if col == "" else col.replace("HGNC:", "") for col in line.split("\t")]
47+
"""Split line into columns, replacing empty columns with None values."""
48+
cols = [
49+
None if cell == "" else cell.replace("HGNC:", "") for cell in line.split("\t")
50+
]
51+
5052
# Make sure that expected nr of cols are returned if last cols are blank
5153
cols += [None] * (nr_expected_columns - len(cols))
5254
return cols
@@ -57,6 +59,24 @@ async def update_genes(
5759
) -> Optional[int]:
5860
"""Loads genes into the database."""
5961

62+
def update_or_insert_gene(session, sql_gene):
63+
# Try to find the gene in the database
64+
65+
existing_gene = (
66+
session.query(SQLGene)
67+
.filter_by(
68+
chromosome=sql_gene.chromosome, start=sql_gene.start, stop=sql_gene.stop
69+
)
70+
.first()
71+
)
72+
73+
if existing_gene:
74+
# Gene exists, append the new ensembl_id to the existing ensembl_ids
75+
existing_gene.ensembl_ids.append(sql_gene.ensembl_ids[0])
76+
else:
77+
# Gene does not exist, add a new record
78+
session.add(sql_gene)
79+
6080
LOG.info(f"Loading gene intervals. Genome build --> {build}")
6181
if lines is None:
6282
lines: Iterator[str] = read_resource_lines(
@@ -82,16 +102,17 @@ async def update_genes(
82102
items: List = _replace_empty_cols(line=line, nr_expected_columns=len(header))
83103

84104
try:
85-
gene: GeneBase = GeneBase(
105+
sql_gene = SQLGene(
86106
build=build,
87107
chromosome=items[0],
88108
start=int(items[1]),
89109
stop=int(items[2]),
90-
ensembl_id=items[3],
110+
ensembl_ids=[items[3]],
91111
hgnc_symbol=items[4],
92112
hgnc_id=items[5],
93113
)
94-
genes_bulk.append(gene)
114+
115+
update_or_insert_gene(session, sql_gene) # Update or insert the gene
95116

96117
if len(genes_bulk) > MAX_NR_OF_RECORDS:
97118
bulk_insert_genes(db=session, genes=genes_bulk)

src/chanjo2/meta/handle_report_contents.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -123,8 +123,9 @@ def get_report_data(
123123
]
124124

125125
gene_ids_mapping: Dict[str, dict] = {
126-
gene.ensembl_id: {"hgnc_id": gene.hgnc_id, "hgnc_symbol": gene.hgnc_symbol}
126+
ensembl_id: {"hgnc_id": gene.hgnc_id, "hgnc_symbol": gene.hgnc_symbol}
127127
for gene in genes
128+
for ensembl_id in gene.ensembl_ids
128129
}
129130

130131
sql_intervals: list = []

src/chanjo2/models/pydantic_models.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,10 +63,13 @@ class Interval(IntervalBase):
6363

6464
class GeneBase(IntervalBase):
6565
build: Builds
66-
ensembl_id: str
66+
ensembl_ids: List[str]
6767
hgnc_id: Optional[int]
6868
hgnc_symbol: Optional[str]
6969

70+
class Config:
71+
orm_mode = True
72+
7073

7174
class GeneQuery(BaseModel):
7275
build: Builds

0 commit comments

Comments
 (0)