Merge branch 'main' into do_gwascat_harmonisation

opentargets · Nov 27, 2023 · 019c52f · 019c52f
2 parents 85cfff5 + a4a44da
commit 019c52f
Show file tree

Hide file tree

Showing 14 changed files with 595 additions and 209 deletions.
diff --git a/config/step/locus_to_gene.yaml b/config/step/locus_to_gene.yaml
@@ -8,7 +8,7 @@ wandb_run_name: null
 perform_cross_validation: false
 model_path: ${datasets.l2g_model}
 predictions_path: ${datasets.l2g_predictions}
-study_locus_path: ${datasets.study_locus}
+credible_set_path: ${datasets.credible_set}
 variant_gene_path: ${datasets.v2g}
 colocalisation_path: ${datasets.colocalisation}
 study_index_path: ${datasets.catalog_study_index}

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -21,7 +21,7 @@ xgboost = "^1.7.3"
 scikit-learn = "^1.2.1"
 numpy = "^1.26.1"
 hail = "0.2.126"
-pyarrow = "^11.0.0"
+pyarrow = "^14.0.1"
 wandb = "^0.16.0"
 
 [tool.poetry.dev-dependencies]
@@ -32,11 +32,11 @@ pep8-naming = "^0.13.2"
 interrogate = "^1.5.0"
 isort = "^5.12.0"
 darglint = "^1.8.1"
-ruff = "^0.1.3"
+ruff = "^0.1.6"
 
 [tool.poetry.group.docs.dependencies]
 mkdocs = "^1.5.3"
-mkdocstrings-python = "^1.7.4"
+mkdocstrings-python = "^1.7.5"
 mkdocs-material = "*"
 mkdocs-section-index = "^0.3.4"
 mkdocs-git-revision-date-localized-plugin = "^1.2.1"
@@ -59,7 +59,7 @@ pytest-xdist = "^3.4.0"
 
 
 [tool.poetry.group.dev.dependencies]
-ipython = "^8.5.0"
+ipython = "^8.18.1"
 ipykernel = "^6.19.0"
 google-cloud-dataproc = "^5.7.0"
 apache-airflow = "^2.7.3"

diff --git a/src/airflow/dags/configs/dag.yaml b/src/airflow/dags/configs/dag.yaml
@@ -14,6 +14,8 @@
     - "ukbiobank"
 - id: "locus_to_gene"
   prerequisites:
+    - "gwas_catalog"
+    - "ukbiobank"
     - "variant_index"
     - "v2g"
     - "study_locus_overlap"
diff --git a/src/otg/assets/schemas/l2g_gold_standard.json b/src/otg/assets/schemas/l2g_gold_standard.json
@@ -7,6 +7,12 @@
       "nullable": false,
       "metadata": {}
     },
+    {
+      "name": "variantId",
+      "type": "string",
+      "nullable": false,
+      "metadata": {}
+    },
     {
       "name": "geneId",
       "type": "string",
@@ -22,7 +28,7 @@
     {
       "metadata": {},
       "name": "sources",
-      "nullable": false,
+      "nullable": true,
       "type": {
         "containsNull": true,
         "elementType": "string",

diff --git a/src/otg/assets/schemas/study_locus_overlap.json b/src/otg/assets/schemas/study_locus_overlap.json
@@ -15,7 +15,7 @@
     {
       "metadata": {},
       "name": "chromosome",
-      "nullable": false,
+      "nullable": true,
       "type": "string"
     },
     {
@@ -27,7 +27,7 @@
     {
       "metadata": {},
       "name": "statistics",
-      "nullable": false,
+      "nullable": true,
       "type": {
         "fields": [
           {

diff --git a/src/otg/dataset/l2g_gold_standard.py b/src/otg/dataset/l2g_gold_standard.py
@@ -2,9 +2,13 @@
 from __future__ import annotations
 
 from dataclasses import dataclass
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Type
+
+import pyspark.sql.functions as f
+from pyspark.sql import Window
 
 from otg.common.schemas import parse_spark_schema
+from otg.common.spark_helpers import get_record_with_maximum_value
 from otg.dataset.dataset import Dataset
 
 if TYPE_CHECKING:
@@ -19,6 +23,10 @@
 class L2GGoldStandard(Dataset):
     """L2G gold standard dataset."""
 
+    INTERACTION_THRESHOLD = 0.7
+    GS_POSITIVE_LABEL = "positive"
+    GS_NEGATIVE_LABEL = "negative"
+
     @classmethod
     def from_otg_curation(
         cls: type[L2GGoldStandard],
@@ -42,8 +50,12 @@ def from_otg_curation(
             OpenTargetsL2GGoldStandard,
         )
 
-        return OpenTargetsL2GGoldStandard.as_l2g_gold_standard(
-            gold_standard_curation, v2g, study_locus_overlap, interactions
+        interactions_df = cls.process_gene_interactions(interactions)
+
+        return (
+            OpenTargetsL2GGoldStandard.as_l2g_gold_standard(gold_standard_curation, v2g)
+            .filter_unique_associations(study_locus_overlap)
+            .remove_false_negatives(interactions_df)
         )
 
     @classmethod
@@ -54,3 +66,134 @@ def get_schema(cls: type[L2GGoldStandard]) -> StructType:
             StructType: Spark schema for the L2GGoldStandard dataset
         """
         return parse_spark_schema("l2g_gold_standard.json")
+
+    @classmethod
+    def process_gene_interactions(
+        cls: Type[L2GGoldStandard], interactions: DataFrame
+    ) -> DataFrame:
+        """Extract top scoring gene-gene interaction from the interactions dataset of the Platform.
+
+        Args:
+            interactions (DataFrame): Gene-gene interactions dataset from the Open Targets Platform
+
+        Returns:
+            DataFrame: Top scoring gene-gene interaction per pair of genes
+
+        Examples:
+            >>> interactions = spark.createDataFrame([("gene1", "gene2", 0.8), ("gene1", "gene2", 0.5), ("gene2", "gene3", 0.7)], ["targetA", "targetB", "scoring"])
+            >>> L2GGoldStandard.process_gene_interactions(interactions).show()
+            +-------+-------+-----+
+            |geneIdA|geneIdB|score|
+            +-------+-------+-----+
+            |  gene1|  gene2|  0.8|
+            |  gene2|  gene3|  0.7|
+            +-------+-------+-----+
+            <BLANKLINE>
+        """
+        return get_record_with_maximum_value(
+            interactions,
+            ["targetA", "targetB"],
+            "scoring",
+        ).selectExpr(
+            "targetA as geneIdA",
+            "targetB as geneIdB",
+            "scoring as score",
+        )
+
+    def filter_unique_associations(
+        self: L2GGoldStandard,
+        study_locus_overlap: StudyLocusOverlap,
+    ) -> L2GGoldStandard:
+        """Refines the gold standard to filter out loci that are not independent.
+
+        Rules:
+        - If two loci point to the same gene, one positive and one negative, and have overlapping variants, we keep the positive one.
+        - If two loci point to the same gene, both positive or negative, and have overlapping variants, we drop one.
+        - If two loci point to different genes, and have overlapping variants, we keep both.
+
+        Args:
+            study_locus_overlap (StudyLocusOverlap): A dataset detailing variants that overlap between StudyLocus.
+
+        Returns:
+            L2GGoldStandard: L2GGoldStandard updated to exclude false negatives and redundant positives.
+        """
+        squared_overlaps = study_locus_overlap._convert_to_square_matrix()
+        unique_associations = (
+            self.df.alias("left")
+            # identify all the study loci that point to the same gene
+            .withColumn(
+                "sl_same_gene",
+                f.collect_set("studyLocusId").over(Window.partitionBy("geneId")),
+            )
+            # identify all the study loci that have an overlapping variant
+            .join(
+                squared_overlaps.df.alias("right"),
+                (f.col("left.studyLocusId") == f.col("right.leftStudyLocusId"))
+                & (f.col("left.variantId") == f.col("right.tagVariantId")),
+                "left",
+            )
+            .withColumn(
+                "overlaps",
+                f.when(f.col("right.tagVariantId").isNotNull(), f.lit(True)).otherwise(
+                    f.lit(False)
+                ),
+            )
+            # drop redundant rows: where the variantid overlaps and the gene is "explained" by more than one study locus
+            .filter(~((f.size("sl_same_gene") > 1) & (f.col("overlaps") == 1)))
+            .select(*self.df.columns)
+        )
+        return L2GGoldStandard(_df=unique_associations, _schema=self.get_schema())
+
+    def remove_false_negatives(
+        self: L2GGoldStandard,
+        interactions_df: DataFrame,
+    ) -> L2GGoldStandard:
+        """Refines the gold standard to remove negative gold standard instances where the gene interacts with a positive gene.
+
+        Args:
+            interactions_df (DataFrame): Top scoring gene-gene interaction per pair of genes
+
+        Returns:
+            L2GGoldStandard: A refined set of locus-to-gene associations with increased reliability, having excluded loci that were likely false negatives due to gene-gene interaction confounding.
+        """
+        squared_interactions = interactions_df.unionByName(
+            interactions_df.selectExpr(
+                "geneIdB as geneIdA", "geneIdA as geneIdB", "score"
+            )
+        ).filter(f.col("score") > self.INTERACTION_THRESHOLD)
+        df = (
+            self.df.alias("left")
+            .join(
+                # bring gene partners
+                squared_interactions.alias("right"),
+                f.col("left.geneId") == f.col("right.geneIdA"),
+                "left",
+            )
+            .withColumnRenamed("geneIdB", "interactorGeneId")
+            .join(
+                # bring gold standard status for gene partners
+                self.df.selectExpr(
+                    "geneId as interactorGeneId",
+                    "goldStandardSet as interactorGeneIdGoldStandardSet",
+                ),
+                "interactorGeneId",
+                "left",
+            )
+            # remove self-interactions
+            .filter(
+                (f.col("geneId") != f.col("interactorGeneId"))
+                | (f.col("interactorGeneId").isNull())
+            )
+            # remove false negatives
+            .filter(
+                # drop rows where the GS gene is negative but the interactor is a GS positive
+                ~(f.col("goldStandardSet") == "negative")
+                & (f.col("interactorGeneIdGoldStandardSet") == "positive")
+                |
+                # keep rows where the gene does not interact
+                (f.col("interactorGeneId").isNull())
+            )
+            .select(*self.df.columns)
+            .distinct()
+        )
+        return L2GGoldStandard(_df=df, _schema=self.get_schema())
diff --git a/src/otg/dataset/l2g_prediction.py b/src/otg/dataset/l2g_prediction.py
@@ -41,7 +41,7 @@ def get_schema(cls: type[L2GPrediction]) -> StructType:
         return parse_spark_schema("l2g_predictions.json")
 
     @classmethod
-    def from_study_locus(
+    def from_credible_set(
         cls: Type[L2GPrediction],
         model_path: str,
         study_locus: StudyLocus,

diff --git a/src/otg/dataset/study_locus_overlap.py b/src/otg/dataset/study_locus_overlap.py
@@ -47,3 +47,20 @@ def from_associations(
             StudyLocusOverlap: Study-locus overlap dataset
         """
         return study_locus.find_overlaps(study_index)
+
+    def _convert_to_square_matrix(self: StudyLocusOverlap) -> StudyLocusOverlap:
+        """Convert the dataset to a square matrix.
+
+        Returns:
+            StudyLocusOverlap: Square matrix of the dataset
+        """
+        return StudyLocusOverlap(
+            _df=self.df.unionByName(
+                self.df.selectExpr(
+                    "leftStudyLocusId as rightStudyLocusId",
+                    "rightStudyLocusId as leftStudyLocusId",
+                    "tagVariantId",
+                )
+            ).distinct(),
+            _schema=self.get_schema(),
+        )