From 52784dc7647e9d05fec4681456b476efde940795 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Irene=20L=C3=B3pez?= <45119610+ireneisdoomed@users.noreply.github.com> Date: Tue, 5 Dec 2023 12:17:36 +0000 Subject: [PATCH] chore(l2ggoldstandard): add studyId to schema (#305) * chore(l2ggoldstandard): add studyId to schema * fix: add `studyId` to gold standards testing fixtures --------- Co-authored-by: David Ochoa --- src/otg/assets/schemas/l2g_gold_standard.json | 6 ++++ .../open_targets/l2g_gold_standard.py | 6 ++-- tests/dataset/test_l2g.py | 30 +++++++++++-------- 3 files changed, 26 insertions(+), 16 deletions(-) diff --git a/src/otg/assets/schemas/l2g_gold_standard.json b/src/otg/assets/schemas/l2g_gold_standard.json index ba494b9aa..cf19d6b52 100644 --- a/src/otg/assets/schemas/l2g_gold_standard.json +++ b/src/otg/assets/schemas/l2g_gold_standard.json @@ -13,6 +13,12 @@ "nullable": false, "metadata": {} }, + { + "name": "studyId", + "type": "string", + "nullable": false, + "metadata": {} + }, { "name": "geneId", "type": "string", diff --git a/src/otg/datasource/open_targets/l2g_gold_standard.py b/src/otg/datasource/open_targets/l2g_gold_standard.py index 532e382fe..b51099cff 100644 --- a/src/otg/datasource/open_targets/l2g_gold_standard.py +++ b/src/otg/datasource/open_targets/l2g_gold_standard.py @@ -116,10 +116,8 @@ def as_l2g_gold_standard( L2GGoldStandard: L2G Gold Standard dataset. False negatives have not yet been removed. """ return L2GGoldStandard( - _df=cls.parse_positive_curation(gold_standard_curation) - .transform(cls.expand_gold_standard_with_negatives, v2g) - .drop( - "studyId", + _df=cls.parse_positive_curation(gold_standard_curation).transform( + cls.expand_gold_standard_with_negatives, v2g ), _schema=L2GGoldStandard.get_schema(), ) diff --git a/tests/dataset/test_l2g.py b/tests/dataset/test_l2g.py index eb42d01c9..3bf5d472c 100644 --- a/tests/dataset/test_l2g.py +++ b/tests/dataset/test_l2g.py @@ -40,27 +40,30 @@ def test_filter_unique_associations(spark: SparkSession) -> None: """Test filter_unique_associations.""" mock_l2g_gs_df = spark.createDataFrame( [ - (1, "variant1", "gene1", "positive"), + (1, "variant1", "study1", "gene1", "positive"), ( 2, "variant2", + "study1", "gene1", "negative", ), # in the same locus as sl1 and pointing to same gene, has to be dropped ( 3, "variant3", + "study1", "gene1", "positive", ), # in diff locus as sl1 and pointing to same gene, has to be kept ( 4, "variant4", + "study1", "gene2", "positive", ), # in same locus as sl1 and pointing to diff gene, has to be kept ], - "studyLocusId LONG, variantId STRING, geneId STRING, goldStandardSet STRING", + "studyLocusId LONG, variantId STRING, studyId STRING, geneId STRING, goldStandardSet STRING", ) mock_sl_overlap_df = spark.createDataFrame( @@ -70,11 +73,11 @@ def test_filter_unique_associations(spark: SparkSession) -> None: expected_df = spark.createDataFrame( [ - (1, "variant1", "gene1", "positive"), - (3, "variant3", "gene1", "positive"), - (4, "variant4", "gene2", "positive"), + (1, "variant1", "study1", "gene1", "positive"), + (3, "variant3", "study1", "gene1", "positive"), + (4, "variant4", "study1", "gene2", "positive"), ], - "studyLocusId LONG, variantId STRING, geneId STRING, goldStandardSet STRING", + "studyLocusId LONG, variantId STRING, studyId STRING, geneId STRING, goldStandardSet STRING", ) mock_l2g_gs = L2GGoldStandard( @@ -93,27 +96,30 @@ def test_remove_false_negatives(spark: SparkSession) -> None: """Test `remove_false_negatives`.""" mock_l2g_gs_df = spark.createDataFrame( [ - (1, "variant1", "gene1", "positive"), + (1, "variant1", "study1", "gene1", "positive"), ( 2, "variant2", + "study1", "gene2", "negative", ), # gene2 is a partner of gene1, has to be dropped ( 3, "variant3", + "study1", "gene3", "negative", ), # gene 3 is not a partner of gene1, has to be kept ( 4, "variant4", + "study1", "gene4", "positive", ), # gene 4 is a partner of gene1, has to be kept because it's positive ], - "studyLocusId LONG, variantId STRING, geneId STRING, goldStandardSet STRING", + "studyLocusId LONG, variantId STRING, studyId STRING, geneId STRING, goldStandardSet STRING", ) mock_interactions_df = spark.createDataFrame( @@ -127,11 +133,11 @@ def test_remove_false_negatives(spark: SparkSession) -> None: expected_df = spark.createDataFrame( [ - (1, "variant1", "gene1", "positive"), - (3, "variant3", "gene3", "negative"), - (4, "variant4", "gene4", "positive"), + (1, "variant1", "study1", "gene1", "positive"), + (3, "variant3", "study1", "gene3", "negative"), + (4, "variant4", "study1", "gene4", "positive"), ], - "studyLocusId LONG, variantId STRING, geneId STRING, goldStandardSet STRING", + "studyLocusId LONG, variantId STRING, studyId STRING, geneId STRING, goldStandardSet STRING", ) mock_l2g_gs = L2GGoldStandard(