Skip to content

Commit

Permalink
chore(l2ggoldstandard): add studyId to schema (#305)
Browse files Browse the repository at this point in the history
* chore(l2ggoldstandard): add studyId to schema

* fix: add `studyId` to gold standards testing fixtures

---------

Co-authored-by: David Ochoa <ochoa@ebi.ac.uk>
  • Loading branch information
ireneisdoomed and d0choa authored Dec 5, 2023
1 parent ae68d84 commit 52784dc
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 16 deletions.
6 changes: 6 additions & 0 deletions src/otg/assets/schemas/l2g_gold_standard.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,12 @@
"nullable": false,
"metadata": {}
},
{
"name": "studyId",
"type": "string",
"nullable": false,
"metadata": {}
},
{
"name": "geneId",
"type": "string",
Expand Down
6 changes: 2 additions & 4 deletions src/otg/datasource/open_targets/l2g_gold_standard.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,10 +116,8 @@ def as_l2g_gold_standard(
L2GGoldStandard: L2G Gold Standard dataset. False negatives have not yet been removed.
"""
return L2GGoldStandard(
_df=cls.parse_positive_curation(gold_standard_curation)
.transform(cls.expand_gold_standard_with_negatives, v2g)
.drop(
"studyId",
_df=cls.parse_positive_curation(gold_standard_curation).transform(
cls.expand_gold_standard_with_negatives, v2g
),
_schema=L2GGoldStandard.get_schema(),
)
30 changes: 18 additions & 12 deletions tests/dataset/test_l2g.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,27 +40,30 @@ def test_filter_unique_associations(spark: SparkSession) -> None:
"""Test filter_unique_associations."""
mock_l2g_gs_df = spark.createDataFrame(
[
(1, "variant1", "gene1", "positive"),
(1, "variant1", "study1", "gene1", "positive"),
(
2,
"variant2",
"study1",
"gene1",
"negative",
), # in the same locus as sl1 and pointing to same gene, has to be dropped
(
3,
"variant3",
"study1",
"gene1",
"positive",
), # in a different locus from sl1 and pointing to the same gene, has to be kept
(
4,
"variant4",
"study1",
"gene2",
"positive",
), # in the same locus as sl1 and pointing to a different gene, has to be kept
],
"studyLocusId LONG, variantId STRING, geneId STRING, goldStandardSet STRING",
"studyLocusId LONG, variantId STRING, studyId STRING, geneId STRING, goldStandardSet STRING",
)

mock_sl_overlap_df = spark.createDataFrame(
Expand All @@ -70,11 +73,11 @@ def test_filter_unique_associations(spark: SparkSession) -> None:

expected_df = spark.createDataFrame(
[
(1, "variant1", "gene1", "positive"),
(3, "variant3", "gene1", "positive"),
(4, "variant4", "gene2", "positive"),
(1, "variant1", "study1", "gene1", "positive"),
(3, "variant3", "study1", "gene1", "positive"),
(4, "variant4", "study1", "gene2", "positive"),
],
"studyLocusId LONG, variantId STRING, geneId STRING, goldStandardSet STRING",
"studyLocusId LONG, variantId STRING, studyId STRING, geneId STRING, goldStandardSet STRING",
)

mock_l2g_gs = L2GGoldStandard(
Expand All @@ -93,27 +96,30 @@ def test_remove_false_negatives(spark: SparkSession) -> None:
"""Test `remove_false_negatives`."""
mock_l2g_gs_df = spark.createDataFrame(
[
(1, "variant1", "gene1", "positive"),
(1, "variant1", "study1", "gene1", "positive"),
(
2,
"variant2",
"study1",
"gene2",
"negative",
), # gene2 is a partner of gene1, has to be dropped
(
3,
"variant3",
"study1",
"gene3",
"negative",
), # gene3 is not a partner of gene1, has to be kept
(
4,
"variant4",
"study1",
"gene4",
"positive",
), # gene4 is a partner of gene1, has to be kept because it's positive
],
"studyLocusId LONG, variantId STRING, geneId STRING, goldStandardSet STRING",
"studyLocusId LONG, variantId STRING, studyId STRING, geneId STRING, goldStandardSet STRING",
)

mock_interactions_df = spark.createDataFrame(
Expand All @@ -127,11 +133,11 @@ def test_remove_false_negatives(spark: SparkSession) -> None:

expected_df = spark.createDataFrame(
[
(1, "variant1", "gene1", "positive"),
(3, "variant3", "gene3", "negative"),
(4, "variant4", "gene4", "positive"),
(1, "variant1", "study1", "gene1", "positive"),
(3, "variant3", "study1", "gene3", "negative"),
(4, "variant4", "study1", "gene4", "positive"),
],
"studyLocusId LONG, variantId STRING, geneId STRING, goldStandardSet STRING",
"studyLocusId LONG, variantId STRING, studyId STRING, geneId STRING, goldStandardSet STRING",
)

mock_l2g_gs = L2GGoldStandard(
Expand Down

0 comments on commit 52784dc

Please sign in to comment.