chore(l2ggoldstandard): add studyId to schema (#305)

* chore(l2ggoldstandard): add studyId to schema
* fix: add `studyId` to gold standards testing fixtures

---------

Co-authored-by: David Ochoa <ochoa@ebi.ac.uk>
opentargets · Dec 5, 2023 · 52784dc
1 parent ae68d84
commit 52784dc
Show file tree

Hide file tree

Showing 3 changed files with 26 additions and 16 deletions.
diff --git a/src/otg/assets/schemas/l2g_gold_standard.json b/src/otg/assets/schemas/l2g_gold_standard.json
@@ -13,6 +13,12 @@
       "nullable": false,
       "metadata": {}
     },
+    {
+      "name": "studyId",
+      "type": "string",
+      "nullable": false,
+      "metadata": {}
+    },
     {
       "name": "geneId",
       "type": "string",

diff --git a/src/otg/datasource/open_targets/l2g_gold_standard.py b/src/otg/datasource/open_targets/l2g_gold_standard.py
@@ -116,10 +116,8 @@ def as_l2g_gold_standard(
             L2GGoldStandard: L2G Gold Standard dataset. False negatives have not yet been removed.
         """
         return L2GGoldStandard(
-            _df=cls.parse_positive_curation(gold_standard_curation)
-            .transform(cls.expand_gold_standard_with_negatives, v2g)
-            .drop(
-                "studyId",
+            _df=cls.parse_positive_curation(gold_standard_curation).transform(
+                cls.expand_gold_standard_with_negatives, v2g
             ),
             _schema=L2GGoldStandard.get_schema(),
         )
diff --git a/tests/dataset/test_l2g.py b/tests/dataset/test_l2g.py
@@ -40,27 +40,30 @@ def test_filter_unique_associations(spark: SparkSession) -> None:
     """Test filter_unique_associations."""
     mock_l2g_gs_df = spark.createDataFrame(
         [
-            (1, "variant1", "gene1", "positive"),
+            (1, "variant1", "study1", "gene1", "positive"),
             (
                 2,
                 "variant2",
+                "study1",
                 "gene1",
                 "negative",
             ),  # in the same locus as sl1 and pointing to same gene, has to be dropped
             (
                 3,
                 "variant3",
+                "study1",
                 "gene1",
                 "positive",
             ),  # in diff locus as sl1 and pointing to same gene, has to be kept
             (
                 4,
                 "variant4",
+                "study1",
                 "gene2",
                 "positive",
             ),  # in same locus as sl1 and pointing to diff gene, has to be kept
         ],
-        "studyLocusId LONG, variantId STRING, geneId STRING, goldStandardSet STRING",
+        "studyLocusId LONG, variantId STRING, studyId STRING, geneId STRING, goldStandardSet STRING",
     )
 
     mock_sl_overlap_df = spark.createDataFrame(
@@ -70,11 +73,11 @@ def test_filter_unique_associations(spark: SparkSession) -> None:
 
     expected_df = spark.createDataFrame(
         [
-            (1, "variant1", "gene1", "positive"),
-            (3, "variant3", "gene1", "positive"),
-            (4, "variant4", "gene2", "positive"),
+            (1, "variant1", "study1", "gene1", "positive"),
+            (3, "variant3", "study1", "gene1", "positive"),
+            (4, "variant4", "study1", "gene2", "positive"),
         ],
-        "studyLocusId LONG, variantId STRING, geneId STRING, goldStandardSet STRING",
+        "studyLocusId LONG, variantId STRING, studyId STRING, geneId STRING, goldStandardSet STRING",
     )
 
     mock_l2g_gs = L2GGoldStandard(
@@ -93,27 +96,30 @@ def test_remove_false_negatives(spark: SparkSession) -> None:
     """Test `remove_false_negatives`."""
     mock_l2g_gs_df = spark.createDataFrame(
         [
-            (1, "variant1", "gene1", "positive"),
+            (1, "variant1", "study1", "gene1", "positive"),
             (
                 2,
                 "variant2",
+                "study1",
                 "gene2",
                 "negative",
             ),  # gene2 is a partner of gene1, has to be dropped
             (
                 3,
                 "variant3",
+                "study1",
                 "gene3",
                 "negative",
             ),  # gene 3 is not a partner of gene1, has to be kept
             (
                 4,
                 "variant4",
+                "study1",
                 "gene4",
                 "positive",
             ),  # gene 4 is a partner of gene1, has to be kept because it's positive
         ],
-        "studyLocusId LONG, variantId STRING, geneId STRING, goldStandardSet STRING",
+        "studyLocusId LONG, variantId STRING, studyId STRING, geneId STRING, goldStandardSet STRING",
     )
 
     mock_interactions_df = spark.createDataFrame(
@@ -127,11 +133,11 @@ def test_remove_false_negatives(spark: SparkSession) -> None:
 
     expected_df = spark.createDataFrame(
         [
-            (1, "variant1", "gene1", "positive"),
-            (3, "variant3", "gene3", "negative"),
-            (4, "variant4", "gene4", "positive"),
+            (1, "variant1", "study1", "gene1", "positive"),
+            (3, "variant3", "study1", "gene3", "negative"),
+            (4, "variant4", "study1", "gene4", "positive"),
         ],
-        "studyLocusId LONG, variantId STRING, geneId STRING, goldStandardSet STRING",
+        "studyLocusId LONG, variantId STRING, studyId STRING, geneId STRING, goldStandardSet STRING",
     )
 
     mock_l2g_gs = L2GGoldStandard(