Skip to content

Commit

Permalink
Merge branch 'main' into do_gwascat_harmonisation
Browse files Browse the repository at this point in the history
  • Loading branch information
d0choa authored Nov 27, 2023
2 parents 85cfff5 + a4a44da commit 019c52f
Show file tree
Hide file tree
Showing 14 changed files with 595 additions and 209 deletions.
2 changes: 1 addition & 1 deletion config/step/locus_to_gene.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ wandb_run_name: null
perform_cross_validation: false
model_path: ${datasets.l2g_model}
predictions_path: ${datasets.l2g_predictions}
study_locus_path: ${datasets.study_locus}
credible_set_path: ${datasets.credible_set}
variant_gene_path: ${datasets.v2g}
colocalisation_path: ${datasets.colocalisation}
study_index_path: ${datasets.catalog_study_index}
Expand Down
182 changes: 96 additions & 86 deletions poetry.lock

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ xgboost = "^1.7.3"
scikit-learn = "^1.2.1"
numpy = "^1.26.1"
hail = "0.2.126"
pyarrow = "^11.0.0"
pyarrow = "^14.0.1"
wandb = "^0.16.0"

[tool.poetry.dev-dependencies]
Expand All @@ -32,11 +32,11 @@ pep8-naming = "^0.13.2"
interrogate = "^1.5.0"
isort = "^5.12.0"
darglint = "^1.8.1"
ruff = "^0.1.3"
ruff = "^0.1.6"

[tool.poetry.group.docs.dependencies]
mkdocs = "^1.5.3"
mkdocstrings-python = "^1.7.4"
mkdocstrings-python = "^1.7.5"
mkdocs-material = "*"
mkdocs-section-index = "^0.3.4"
mkdocs-git-revision-date-localized-plugin = "^1.2.1"
Expand All @@ -59,7 +59,7 @@ pytest-xdist = "^3.4.0"


[tool.poetry.group.dev.dependencies]
ipython = "^8.5.0"
ipython = "^8.18.1"
ipykernel = "^6.19.0"
google-cloud-dataproc = "^5.7.0"
apache-airflow = "^2.7.3"
Expand Down
2 changes: 2 additions & 0 deletions src/airflow/dags/configs/dag.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
- "ukbiobank"
- id: "locus_to_gene"
prerequisites:
- "gwas_catalog"
- "ukbiobank"
- "variant_index"
- "v2g"
- "study_locus_overlap"
8 changes: 7 additions & 1 deletion src/otg/assets/schemas/l2g_gold_standard.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,12 @@
"nullable": false,
"metadata": {}
},
{
"name": "variantId",
"type": "string",
"nullable": false,
"metadata": {}
},
{
"name": "geneId",
"type": "string",
Expand All @@ -22,7 +28,7 @@
{
"metadata": {},
"name": "sources",
"nullable": false,
"nullable": true,
"type": {
"containsNull": true,
"elementType": "string",
Expand Down
4 changes: 2 additions & 2 deletions src/otg/assets/schemas/study_locus_overlap.json
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
{
"metadata": {},
"name": "chromosome",
"nullable": false,
"nullable": true,
"type": "string"
},
{
Expand All @@ -27,7 +27,7 @@
{
"metadata": {},
"name": "statistics",
"nullable": false,
"nullable": true,
"type": {
"fields": [
{
Expand Down
149 changes: 146 additions & 3 deletions src/otg/dataset/l2g_gold_standard.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,13 @@
from __future__ import annotations

from dataclasses import dataclass
from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Type

import pyspark.sql.functions as f
from pyspark.sql import Window

from otg.common.schemas import parse_spark_schema
from otg.common.spark_helpers import get_record_with_maximum_value
from otg.dataset.dataset import Dataset

if TYPE_CHECKING:
Expand All @@ -19,6 +23,10 @@
class L2GGoldStandard(Dataset):
"""L2G gold standard dataset."""

INTERACTION_THRESHOLD = 0.7
GS_POSITIVE_LABEL = "positive"
GS_NEGATIVE_LABEL = "negative"

@classmethod
def from_otg_curation(
cls: type[L2GGoldStandard],
Expand All @@ -42,8 +50,12 @@ def from_otg_curation(
OpenTargetsL2GGoldStandard,
)

return OpenTargetsL2GGoldStandard.as_l2g_gold_standard(
gold_standard_curation, v2g, study_locus_overlap, interactions
interactions_df = cls.process_gene_interactions(interactions)

return (
OpenTargetsL2GGoldStandard.as_l2g_gold_standard(gold_standard_curation, v2g)
.filter_unique_associations(study_locus_overlap)
.remove_false_negatives(interactions_df)
)

@classmethod
Expand All @@ -54,3 +66,134 @@ def get_schema(cls: type[L2GGoldStandard]) -> StructType:
StructType: Spark schema for the L2GGoldStandard dataset
"""
return parse_spark_schema("l2g_gold_standard.json")

@classmethod
def process_gene_interactions(
    cls: Type[L2GGoldStandard], interactions: DataFrame
) -> DataFrame:
    """Keep the best-scoring interaction record for every pair of genes.

    Args:
        interactions (DataFrame): Gene-gene interactions dataset from the Open Targets Platform

    Returns:
        DataFrame: Top scoring gene-gene interaction per pair of genes, with columns `geneIdA`, `geneIdB` and `score`

    Examples:
        >>> interactions = spark.createDataFrame([("gene1", "gene2", 0.8), ("gene1", "gene2", 0.5), ("gene2", "gene3", 0.7)], ["targetA", "targetB", "scoring"])
        >>> L2GGoldStandard.process_gene_interactions(interactions).show()
        +-------+-------+-----+
        |geneIdA|geneIdB|score|
        +-------+-------+-----+
        |  gene1|  gene2|  0.8|
        |  gene2|  gene3|  0.7|
        +-------+-------+-----+
        <BLANKLINE>
    """
    # Collapse repeated (targetA, targetB) pairs down to their highest `scoring` record.
    top_scoring = get_record_with_maximum_value(
        interactions,
        ["targetA", "targetB"],
        "scoring",
    )
    # Expose the result under the column names used by the gold standard code.
    renamed_columns = (
        "targetA as geneIdA",
        "targetB as geneIdB",
        "scoring as score",
    )
    return top_scoring.selectExpr(*renamed_columns)

def filter_unique_associations(
    self: L2GGoldStandard,
    study_locus_overlap: StudyLocusOverlap,
) -> L2GGoldStandard:
    """Drop gold standard rows that are not independent of another locus for the same gene.

    Intended rules:
        - If two loci point to the same gene, one positive and one negative, and have overlapping variants, we keep the positive one.
        - If two loci point to the same gene, both positive or negative, and have overlapping variants, we drop one.
        - If two loci point to different genes, and have overlapping variants, we keep both.

    A row is removed when its gene is linked to more than one study locus AND its
    (studyLocusId, variantId) pair appears as (leftStudyLocusId, tagVariantId) in the
    symmetric overlap table.

    NOTE(review): the filter never reads the positive/negative label, so it does not by
    itself guarantee that "the positive one" is the row that is kept - confirm against
    the intended rules above.

    Args:
        study_locus_overlap (StudyLocusOverlap): A dataset detailing variants that overlap between StudyLocus.

    Returns:
        L2GGoldStandard: L2GGoldStandard updated to exclude false negatives and redundant positives.
    """
    squared_overlaps = study_locus_overlap._convert_to_square_matrix()

    # All study loci that point to the same gene.
    per_gene = Window.partitionBy("geneId")
    # A gold standard row overlaps when its locus and variant are both found in the overlap table.
    same_locus_and_variant = (
        f.col("left.studyLocusId") == f.col("right.leftStudyLocusId")
    ) & (f.col("left.variantId") == f.col("right.tagVariantId"))

    annotated = (
        self.df.alias("left")
        .withColumn("sl_same_gene", f.collect_set("studyLocusId").over(per_gene))
        .join(squared_overlaps.df.alias("right"), same_locus_and_variant, "left")
        # the left join leaves tagVariantId null when nothing overlaps
        .withColumn("overlaps", f.col("right.tagVariantId").isNotNull())
    )

    # Redundant: the variant overlaps and the gene is "explained" by more than one study locus.
    is_redundant = (f.size("sl_same_gene") > 1) & (f.col("overlaps") == 1)
    unique_associations = annotated.filter(~is_redundant).select(*self.df.columns)

    return L2GGoldStandard(_df=unique_associations, _schema=self.get_schema())

def remove_false_negatives(
    self: L2GGoldStandard,
    interactions_df: DataFrame,
) -> L2GGoldStandard:
    """Remove negative gold standard instances whose gene interacts with a positive gene.

    A negative row is treated as a likely false negative when its gene has an
    interaction scoring above `INTERACTION_THRESHOLD` with a *different* gene that is
    labelled positive anywhere in the gold standard. Positive rows, and negative rows
    without such a partner, are always kept.

    The decision is taken per gene (via an anti-join) rather than per joined
    gene/partner row. The previous row-wise boolean filter negated only the
    "is negative" test because `~` binds tighter than `&`, evaluated to NULL (and so
    dropped the row) whenever the partner was absent from the gold standard, and let a
    negative survive through any additional non-positive partner.

    Args:
        interactions_df (DataFrame): Top scoring gene-gene interaction per pair of genes, with columns `geneIdA`, `geneIdB` and `score`

    Returns:
        L2GGoldStandard: A refined set of locus-to-gene associations with increased reliability, having excluded loci that were likely false negatives due to gene-gene interaction confounding.
    """
    # Interactions are stored once per pair; mirror them so every gene appears as geneIdA.
    squared_interactions = (
        interactions_df.unionByName(
            interactions_df.selectExpr(
                "geneIdB as geneIdA", "geneIdA as geneIdB", "score"
            )
        )
        .filter(f.col("score") > self.INTERACTION_THRESHOLD)
        # a self-interaction says nothing about a *different* positive gene
        .filter(f.col("geneIdA") != f.col("geneIdB"))
    )

    # Genes labelled positive at any locus of the gold standard.
    positive_genes = (
        self.df.filter(f.col("goldStandardSet") == self.GS_POSITIVE_LABEL)
        .select(f.col("geneId").alias("positiveGeneId"))
        .distinct()
    )

    # Genes with at least one strong interaction partner that is a gold standard positive.
    false_negative_genes = (
        squared_interactions.join(
            positive_genes,
            f.col("geneIdB") == f.col("positiveGeneId"),
            "left_semi",
        )
        .select(f.col("geneIdA").alias("falseNegativeGeneId"))
        .distinct()
    )

    # Drop only the *negative* rows of those genes; the anti-join keeps the original
    # columns and every row that finds no match.
    df = self.df.join(
        false_negative_genes,
        (f.col("geneId") == f.col("falseNegativeGeneId"))
        & (f.col("goldStandardSet") == self.GS_NEGATIVE_LABEL),
        "left_anti",
    ).distinct()

    return L2GGoldStandard(_df=df, _schema=self.get_schema())
2 changes: 1 addition & 1 deletion src/otg/dataset/l2g_prediction.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def get_schema(cls: type[L2GPrediction]) -> StructType:
return parse_spark_schema("l2g_predictions.json")

@classmethod
def from_study_locus(
def from_credible_set(
cls: Type[L2GPrediction],
model_path: str,
study_locus: StudyLocus,
Expand Down
17 changes: 17 additions & 0 deletions src/otg/dataset/study_locus_overlap.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,20 @@ def from_associations(
StudyLocusOverlap: Study-locus overlap dataset
"""
return study_locus.find_overlaps(study_index)

def _convert_to_square_matrix(self: StudyLocusOverlap) -> StudyLocusOverlap:
    """Convert the dataset to a square (symmetric) matrix.

    Every (left, right) overlap is complemented with its mirrored (right, left)
    record and exact duplicates are removed.

    All columns of the dataset are carried into the mirrored records; only the names
    of the two study locus id columns are swapped. `unionByName` requires both sides
    to expose the same set of columns, so mirroring just the three key columns fails
    as soon as the dataset holds any additional column.

    NOTE(review): side-specific values nested inside other columns (if any) are copied
    unchanged and are not mirrored - confirm whether consumers rely on them.

    Returns:
        StudyLocusOverlap: Square matrix of the dataset
    """
    swapped_name = {
        "leftStudyLocusId": "rightStudyLocusId",
        "rightStudyLocusId": "leftStudyLocusId",
    }
    # Same columns as the original frame, with the left/right id names exchanged.
    mirrored = self.df.selectExpr(
        *(
            f"`{column}` as `{swapped_name.get(column, column)}`"
            for column in self.df.columns
        )
    )
    return StudyLocusOverlap(
        _df=self.df.unionByName(mirrored).distinct(),
        _schema=self.get_schema(),
    )
Loading

0 comments on commit 019c52f

Please sign in to comment.