Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore(l2ggoldstandard): add studyId to schema #305

Merged
merged 8 commits on
Dec 5, 2023
6 changes: 6 additions & 0 deletions src/otg/assets/schemas/l2g_gold_standard.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,12 @@
"nullable": false,
"metadata": {}
},
{
"name": "studyId",
"type": "string",
"nullable": false,
"metadata": {}
},
{
"name": "geneId",
"type": "string",
Expand Down
6 changes: 2 additions & 4 deletions src/otg/datasource/open_targets/l2g_gold_standard.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,10 +116,8 @@ def as_l2g_gold_standard(
L2GGoldStandard: L2G Gold Standard dataset. False negatives have not yet been removed.
"""
return L2GGoldStandard(
_df=cls.parse_positive_curation(gold_standard_curation)
.transform(cls.expand_gold_standard_with_negatives, v2g)
.drop(
"studyId",
_df=cls.parse_positive_curation(gold_standard_curation).transform(
cls.expand_gold_standard_with_negatives, v2g
),
_schema=L2GGoldStandard.get_schema(),
)
30 changes: 18 additions & 12 deletions tests/dataset/test_l2g.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,27 +40,30 @@ def test_filter_unique_associations(spark: SparkSession) -> None:
"""Test filter_unique_associations."""
mock_l2g_gs_df = spark.createDataFrame(
[
(1, "variant1", "gene1", "positive"),
(1, "variant1", "study1", "gene1", "positive"),
(
2,
"variant2",
"study1",
"gene1",
"negative",
), # in the same locus as sl1 and pointing to same gene, has to be dropped
(
3,
"variant3",
"study1",
"gene1",
"positive",
), # in diff locus as sl1 and pointing to same gene, has to be kept
(
4,
"variant4",
"study1",
"gene2",
"positive",
), # in same locus as sl1 and pointing to diff gene, has to be kept
],
"studyLocusId LONG, variantId STRING, geneId STRING, goldStandardSet STRING",
"studyLocusId LONG, variantId STRING, studyId STRING, geneId STRING, goldStandardSet STRING",
)

mock_sl_overlap_df = spark.createDataFrame(
Expand All @@ -70,11 +73,11 @@ def test_filter_unique_associations(spark: SparkSession) -> None:

expected_df = spark.createDataFrame(
[
(1, "variant1", "gene1", "positive"),
(3, "variant3", "gene1", "positive"),
(4, "variant4", "gene2", "positive"),
(1, "variant1", "study1", "gene1", "positive"),
(3, "variant3", "study1", "gene1", "positive"),
(4, "variant4", "study1", "gene2", "positive"),
],
"studyLocusId LONG, variantId STRING, geneId STRING, goldStandardSet STRING",
"studyLocusId LONG, variantId STRING, studyId STRING, geneId STRING, goldStandardSet STRING",
)

mock_l2g_gs = L2GGoldStandard(
Expand All @@ -93,27 +96,30 @@ def test_remove_false_negatives(spark: SparkSession) -> None:
"""Test `remove_false_negatives`."""
mock_l2g_gs_df = spark.createDataFrame(
[
(1, "variant1", "gene1", "positive"),
(1, "variant1", "study1", "gene1", "positive"),
(
2,
"variant2",
"study1",
"gene2",
"negative",
), # gene2 is a partner of gene1, has to be dropped
(
3,
"variant3",
"study1",
"gene3",
"negative",
), # gene 3 is not a partner of gene1, has to be kept
(
4,
"variant4",
"study1",
"gene4",
"positive",
), # gene 4 is a partner of gene1, has to be kept because it's positive
],
"studyLocusId LONG, variantId STRING, geneId STRING, goldStandardSet STRING",
"studyLocusId LONG, variantId STRING, studyId STRING, geneId STRING, goldStandardSet STRING",
)

mock_interactions_df = spark.createDataFrame(
Expand All @@ -127,11 +133,11 @@ def test_remove_false_negatives(spark: SparkSession) -> None:

expected_df = spark.createDataFrame(
[
(1, "variant1", "gene1", "positive"),
(3, "variant3", "gene3", "negative"),
(4, "variant4", "gene4", "positive"),
(1, "variant1", "study1", "gene1", "positive"),
(3, "variant3", "study1", "gene3", "negative"),
(4, "variant4", "study1", "gene4", "positive"),
],
"studyLocusId LONG, variantId STRING, geneId STRING, goldStandardSet STRING",
"studyLocusId LONG, variantId STRING, studyId STRING, geneId STRING, goldStandardSet STRING",
)

mock_l2g_gs = L2GGoldStandard(
Expand Down