Skip to content

Commit 37f90bc

Browse files
authored
Merge branch 'main' into do_clump_step
2 parents 4ef0cd2 + a4a44da commit 37f90bc

23 files changed

+826
-265
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,3 +10,4 @@ docs/assets/schemas/
1010
src/airflow/logs/*
1111
!src/airflow/logs/.gitkeep
1212
site/
13+
.env

config/step/locus_to_gene.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ wandb_run_name: null
88
perform_cross_validation: false
99
model_path: ${datasets.l2g_model}
1010
predictions_path: ${datasets.l2g_predictions}
11-
study_locus_path: ${datasets.study_locus}
11+
credible_set_path: ${datasets.credible_set}
1212
variant_gene_path: ${datasets.v2g}
1313
colocalisation_path: ${datasets.colocalisation}
1414
study_index_path: ${datasets.catalog_study_index}

poetry.lock

Lines changed: 96 additions & 86 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ xgboost = "^1.7.3"
2121
scikit-learn = "^1.2.1"
2222
numpy = "^1.26.1"
2323
hail = "0.2.126"
24-
pyarrow = "^11.0.0"
24+
pyarrow = "^14.0.1"
2525
wandb = "^0.16.0"
2626

2727
[tool.poetry.dev-dependencies]
@@ -32,11 +32,11 @@ pep8-naming = "^0.13.2"
3232
interrogate = "^1.5.0"
3333
isort = "^5.12.0"
3434
darglint = "^1.8.1"
35-
ruff = "^0.1.3"
35+
ruff = "^0.1.6"
3636

3737
[tool.poetry.group.docs.dependencies]
3838
mkdocs = "^1.5.3"
39-
mkdocstrings-python = "^1.7.4"
39+
mkdocstrings-python = "^1.7.5"
4040
mkdocs-material = "*"
4141
mkdocs-section-index = "^0.3.4"
4242
mkdocs-git-revision-date-localized-plugin = "^1.2.1"
@@ -59,7 +59,7 @@ pytest-xdist = "^3.4.0"
5959

6060

6161
[tool.poetry.group.dev.dependencies]
62-
ipython = "^8.5.0"
62+
ipython = "^8.18.1"
6363
ipykernel = "^6.19.0"
6464
google-cloud-dataproc = "^5.7.0"
6565
apache-airflow = "^2.7.3"

src/airflow/dags/configs/dag.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@
1414
- "ukbiobank"
1515
- id: "locus_to_gene"
1616
prerequisites:
17+
- "gwas_catalog"
18+
- "ukbiobank"
1719
- "variant_index"
1820
- "v2g"
1921
- "study_locus_overlap"

src/otg/assets/schemas/l2g_gold_standard.json

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,12 @@
77
"nullable": false,
88
"metadata": {}
99
},
10+
{
11+
"name": "variantId",
12+
"type": "string",
13+
"nullable": false,
14+
"metadata": {}
15+
},
1016
{
1117
"name": "geneId",
1218
"type": "string",
@@ -22,7 +28,7 @@
2228
{
2329
"metadata": {},
2430
"name": "sources",
25-
"nullable": false,
31+
"nullable": true,
2632
"type": {
2733
"containsNull": true,
2834
"elementType": "string",

src/otg/assets/schemas/study_locus_overlap.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
{
1616
"metadata": {},
1717
"name": "chromosome",
18-
"nullable": false,
18+
"nullable": true,
1919
"type": "string"
2020
},
2121
{
@@ -27,7 +27,7 @@
2727
{
2828
"metadata": {},
2929
"name": "statistics",
30-
"nullable": false,
30+
"nullable": true,
3131
"type": {
3232
"fields": [
3333
{

src/otg/common/spark_helpers.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -250,6 +250,38 @@ def normalise_column(
250250
)
251251

252252

253+
def neglog_pvalue_to_mantissa_and_exponent(p_value: Column) -> tuple[Column, Column]:
    """Compute the p-value mantissa and exponent from the negative base-10 logarithm of the p-value.

    For `x = -log10(p)` the p-value can be written as
    `p = 10^-x = 10^(ceil(x) - x) * 10^-ceil(x)`. Because `0 <= ceil(x) - x < 1`,
    the first factor is a mantissa in the range [1, 10) and `-ceil(x)` is the exponent.

    Args:
        p_value (Column): Negative base-10 logarithm of the p-value (numeric)

    Returns:
        tuple[Column, Column]: mantissa (double, aliased `pValueMantissa`) and exponent (integer, aliased `pValueExponent`) of the p-value

    Examples:
        >>> mantissa, exponent = neglog_pvalue_to_mantissa_and_exponent(f.col('negLogPv'))
        >>> (
        ...    spark.createDataFrame([(4.56, 'a'),(2109.23, 'b')], ['negLogPv', 'label'])
        ...    .select('negLogPv', f.round(mantissa, 4).alias('pValueMantissa'), exponent)
        ...    .show()
        ... )
        +--------+--------------+--------------+
        |negLogPv|pValueMantissa|pValueExponent|
        +--------+--------------+--------------+
        |    4.56|        2.7542|            -5|
        | 2109.23|        5.8884|         -2110|
        +--------+--------------+--------------+
        <BLANKLINE>
    """
    exponent: Column = f.ceil(p_value)
    # The mantissa is the part of the p-value left after factoring out 10^-exponent:
    # 10^-x / 10^-ceil(x) = 10^(ceil(x) - x).
    mantissa: Column = f.pow(f.lit(10), (exponent - p_value))

    return (
        mantissa.cast(t.DoubleType()).alias("pValueMantissa"),
        (-1 * exponent).cast(t.IntegerType()).alias("pValueExponent"),
    )
283+
284+
253285
def calculate_neglog_pvalue(
254286
p_value_mantissa: Column, p_value_exponent: Column
255287
) -> Column:

src/otg/dataset/l2g_gold_standard.py

Lines changed: 146 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,13 @@
22
from __future__ import annotations
33

44
from dataclasses import dataclass
5-
from typing import TYPE_CHECKING
5+
from typing import TYPE_CHECKING, Type
6+
7+
import pyspark.sql.functions as f
8+
from pyspark.sql import Window
69

710
from otg.common.schemas import parse_spark_schema
11+
from otg.common.spark_helpers import get_record_with_maximum_value
812
from otg.dataset.dataset import Dataset
913

1014
if TYPE_CHECKING:
@@ -19,6 +23,10 @@
1923
class L2GGoldStandard(Dataset):
2024
"""L2G gold standard dataset."""
2125

26+
INTERACTION_THRESHOLD = 0.7
27+
GS_POSITIVE_LABEL = "positive"
28+
GS_NEGATIVE_LABEL = "negative"
29+
2230
@classmethod
2331
def from_otg_curation(
2432
cls: type[L2GGoldStandard],
@@ -42,8 +50,12 @@ def from_otg_curation(
4250
OpenTargetsL2GGoldStandard,
4351
)
4452

45-
return OpenTargetsL2GGoldStandard.as_l2g_gold_standard(
46-
gold_standard_curation, v2g, study_locus_overlap, interactions
53+
interactions_df = cls.process_gene_interactions(interactions)
54+
55+
return (
56+
OpenTargetsL2GGoldStandard.as_l2g_gold_standard(gold_standard_curation, v2g)
57+
.filter_unique_associations(study_locus_overlap)
58+
.remove_false_negatives(interactions_df)
4759
)
4860

4961
@classmethod
@@ -54,3 +66,134 @@ def get_schema(cls: type[L2GGoldStandard]) -> StructType:
5466
StructType: Spark schema for the L2GGoldStandard dataset
5567
"""
5668
return parse_spark_schema("l2g_gold_standard.json")
69+
70+
@classmethod
def process_gene_interactions(
    cls: Type[L2GGoldStandard], interactions: DataFrame
) -> DataFrame:
    """Keep the highest scoring interaction for every pair of genes in the Platform interactions dataset.

    Args:
        interactions (DataFrame): Gene-gene interactions dataset from the Open Targets Platform

    Returns:
        DataFrame: Top scoring gene-gene interaction per pair of genes

    Examples:
        >>> interactions = spark.createDataFrame([("gene1", "gene2", 0.8), ("gene1", "gene2", 0.5), ("gene2", "gene3", 0.7)], ["targetA", "targetB", "scoring"])
        >>> L2GGoldStandard.process_gene_interactions(interactions).show()
        +-------+-------+-----+
        |geneIdA|geneIdB|score|
        +-------+-------+-----+
        |  gene1|  gene2|  0.8|
        |  gene2|  gene3|  0.7|
        +-------+-------+-----+
        <BLANKLINE>
    """
    # A gene pair can be reported several times; retain only its best scoring record.
    gene_pair = ["targetA", "targetB"]
    top_interaction_per_pair = get_record_with_maximum_value(
        interactions, gene_pair, "scoring"
    )
    # Rename the Platform columns to the names used by the gold standard logic.
    return top_interaction_per_pair.select(
        f.col("targetA").alias("geneIdA"),
        f.col("targetB").alias("geneIdB"),
        f.col("scoring").alias("score"),
    )
102+
103+
def filter_unique_associations(
    self: L2GGoldStandard,
    study_locus_overlap: StudyLocusOverlap,
) -> L2GGoldStandard:
    """Refines the gold standard to filter out loci that are not independent.

    Intended rules:
        - If two loci point to the same gene, one positive and one negative, and have overlapping variants, we keep the positive one.
        - If two loci point to the same gene, both positive or negative, and have overlapping variants, we drop one.
        - If two loci point to different genes, and have overlapping variants, we keep both.

    Implementation: a gold standard row is dropped when its gene is linked to more than one
    study locus AND its (studyLocusId, variantId) takes part in any overlap. All other rows
    are returned exactly once.

    Args:
        study_locus_overlap (StudyLocusOverlap): A dataset detailing variants that overlap between StudyLocus.

    Returns:
        L2GGoldStandard: L2GGoldStandard updated to exclude false negatives and redundant positives.
    """
    # NOTE(review): the filter below does not look at the gold standard label, nor does it
    # check that the overlapping partner locus points to the same gene - confirm that this
    # matches the intended rules listed in the docstring.

    # One row per (study locus, tag variant) that takes part in any overlap. The squared
    # matrix holds one row per overlapping *pair*, so joining against it directly would
    # emit a gold standard row once per overlapping partner.
    overlapping_variants = (
        study_locus_overlap._convert_to_square_matrix()
        .df.select("leftStudyLocusId", "tagVariantId")
        .distinct()
    )
    unique_associations = (
        self.df.alias("left")
        # identify all the study loci that point to the same gene
        .withColumn(
            "sl_same_gene",
            f.collect_set("studyLocusId").over(Window.partitionBy("geneId")),
        )
        # flag the study loci that have an overlapping variant
        .join(
            overlapping_variants.alias("right"),
            (f.col("left.studyLocusId") == f.col("right.leftStudyLocusId"))
            & (f.col("left.variantId") == f.col("right.tagVariantId")),
            "left",
        )
        # the left join leaves the right-hand key null when there is no overlap
        .withColumn("overlaps", f.col("right.tagVariantId").isNotNull())
        # drop redundant rows: where the variantId overlaps and the gene is "explained" by more than one study locus
        .filter(~((f.size("sl_same_gene") > 1) & f.col("overlaps")))
        .select(*self.df.columns)
    )
    return L2GGoldStandard(_df=unique_associations, _schema=self.get_schema())
146+
147+
def remove_false_negatives(
    self: L2GGoldStandard,
    interactions_df: DataFrame,
) -> L2GGoldStandard:
    """Refines the gold standard to remove negative gold standard instances where the gene interacts with a positive gene.

    A negative row is removed when its gene has an interaction scoring strictly above
    `INTERACTION_THRESHOLD` with a different gene that is labelled positive anywhere in
    the gold standard. Positive rows, and negative rows without such a partner, are kept.

    Args:
        interactions_df (DataFrame): Top scoring gene-gene interaction per pair of genes

    Returns:
        L2GGoldStandard: A refined set of locus-to-gene associations with increased reliability, having excluded loci that were likely false negatives due to gene-gene interaction confounding.
    """
    # Keep confident interactions only and make them symmetric (A-B and B-A).
    strong_interactions = interactions_df.filter(
        f.col("score") > self.INTERACTION_THRESHOLD
    ).select("geneIdA", "geneIdB")
    squared_interactions = strong_interactions.unionByName(
        strong_interactions.selectExpr("geneIdB as geneIdA", "geneIdA as geneIdB")
    )

    # Genes that are a gold standard positive in at least one locus.
    positive_genes = (
        self.df.filter(f.col("goldStandardSet") == self.GS_POSITIVE_LABEL)
        .select(f.col("geneId").alias("geneIdB"))
        .distinct()
    )

    # Genes with at least one confident, non-self interaction with a positive gene.
    # The decision is made once per gene, so a negative is removed even when it also
    # interacts with other genes that are negative or absent from the gold standard.
    genes_interacting_with_positive = (
        squared_interactions.filter(f.col("geneIdA") != f.col("geneIdB"))
        .join(positive_genes, on="geneIdB", how="inner")
        .select(
            f.col("geneIdA").alias("geneId"),
            f.lit(True).alias("interactsWithPositive"),
        )
        .distinct()
    )

    # Explicit parentheses: `~` binds tighter than `&`, which binds tighter than `|`.
    is_false_negative = (f.col("goldStandardSet") == self.GS_NEGATIVE_LABEL) & (
        f.coalesce(f.col("interactsWithPositive"), f.lit(False))
    )
    df = (
        self.df.join(genes_interacting_with_positive, on="geneId", how="left")
        # coalesce so that a null condition keeps the row instead of silently dropping it
        .filter(~f.coalesce(is_false_negative, f.lit(False)))
        .select(*self.df.columns)
        .distinct()
    )
    return L2GGoldStandard(_df=df, _schema=self.get_schema())

src/otg/dataset/l2g_prediction.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ def get_schema(cls: type[L2GPrediction]) -> StructType:
4141
return parse_spark_schema("l2g_predictions.json")
4242

4343
@classmethod
44-
def from_study_locus(
44+
def from_credible_set(
4545
cls: Type[L2GPrediction],
4646
model_path: str,
4747
study_locus: StudyLocus,

src/otg/dataset/study_locus_overlap.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,3 +47,20 @@ def from_associations(
4747
StudyLocusOverlap: Study-locus overlap dataset
4848
"""
4949
return study_locus.find_overlaps(study_index)
50+
51+
def _convert_to_square_matrix(self: StudyLocusOverlap) -> StudyLocusOverlap:
    """Convert the dataset to a square matrix.

    Every overlap (left, right, tagVariantId) is complemented with its mirrored
    record (right, left, tagVariantId), and exact duplicate rows are removed.

    Returns:
        StudyLocusOverlap: Square matrix of the dataset
    """
    mirrored_overlaps = self.df.selectExpr(
        "leftStudyLocusId as rightStudyLocusId",
        "rightStudyLocusId as leftStudyLocusId",
        "tagVariantId",
    )
    return StudyLocusOverlap(
        # The mirrored rows only carry the three key columns. allowMissingColumns fills
        # any other column of the dataset with nulls for those rows, instead of failing
        # on the column mismatch.
        _df=self.df.unionByName(
            mirrored_overlaps, allowMissingColumns=True
        ).distinct(),
        _schema=self.get_schema(),
    )

0 commit comments

Comments
 (0)