From 4389315f40985e1efa637b5eaeee6c2c63c8b549 Mon Sep 17 00:00:00 2001 From: vivienho <56025826+vivienho@users.noreply.github.com> Date: Tue, 24 Sep 2024 12:10:23 +0100 Subject: [PATCH 01/11] feat: change studyLocusId to string in schema --- src/gentropy/assets/schemas/colocalisation.json | 4 ++-- src/gentropy/assets/schemas/l2g_feature.json | 2 +- src/gentropy/assets/schemas/l2g_gold_standard.json | 2 +- src/gentropy/assets/schemas/l2g_predictions.json | 2 +- src/gentropy/assets/schemas/study_locus.json | 2 +- src/gentropy/assets/schemas/study_locus_overlap.json | 4 ++-- 6 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/gentropy/assets/schemas/colocalisation.json b/src/gentropy/assets/schemas/colocalisation.json index 7ff7453b9..953b5afa4 100644 --- a/src/gentropy/assets/schemas/colocalisation.json +++ b/src/gentropy/assets/schemas/colocalisation.json @@ -4,13 +4,13 @@ { "name": "leftStudyLocusId", "nullable": false, - "type": "long", + "type": "string", "metadata": {} }, { "name": "rightStudyLocusId", "nullable": false, - "type": "long", + "type": "string", "metadata": {} }, { diff --git a/src/gentropy/assets/schemas/l2g_feature.json b/src/gentropy/assets/schemas/l2g_feature.json index 3139a57e4..314b4dde0 100644 --- a/src/gentropy/assets/schemas/l2g_feature.json +++ b/src/gentropy/assets/schemas/l2g_feature.json @@ -3,7 +3,7 @@ "fields": [ { "name": "studyLocusId", - "type": "long", + "type": "string", "nullable": false, "metadata": {} }, diff --git a/src/gentropy/assets/schemas/l2g_gold_standard.json b/src/gentropy/assets/schemas/l2g_gold_standard.json index cf19d6b52..6af921d61 100644 --- a/src/gentropy/assets/schemas/l2g_gold_standard.json +++ b/src/gentropy/assets/schemas/l2g_gold_standard.json @@ -3,7 +3,7 @@ "fields": [ { "name": "studyLocusId", - "type": "long", + "type": "string", "nullable": false, "metadata": {} }, diff --git a/src/gentropy/assets/schemas/l2g_predictions.json b/src/gentropy/assets/schemas/l2g_predictions.json index 16b274207..238ff4087 100644 --- a/src/gentropy/assets/schemas/l2g_predictions.json +++ b/src/gentropy/assets/schemas/l2g_predictions.json @@ -3,7 +3,7 @@ "fields": [ { "name": "studyLocusId", - "type": "long", + "type": "string", "nullable": false, "metadata": {} }, diff --git a/src/gentropy/assets/schemas/study_locus.json b/src/gentropy/assets/schemas/study_locus.json index 11908f687..0d2307fad 100644 --- a/src/gentropy/assets/schemas/study_locus.json +++ b/src/gentropy/assets/schemas/study_locus.json @@ -4,7 +4,7 @@ "metadata": {}, "name": "studyLocusId", "nullable": false, - "type": "long" + "type": "string" }, { "metadata": {}, diff --git a/src/gentropy/assets/schemas/study_locus_overlap.json b/src/gentropy/assets/schemas/study_locus_overlap.json index 9a8e123cd..aab8408f7 100644 --- a/src/gentropy/assets/schemas/study_locus_overlap.json +++ b/src/gentropy/assets/schemas/study_locus_overlap.json @@ -4,13 +4,13 @@ "metadata": {}, "name": "leftStudyLocusId", "nullable": false, - "type": "long" + "type": "string" }, { "metadata": {}, "name": "rightStudyLocusId", "nullable": false, - "type": "long" + "type": "string" }, { "metadata": {}, From 7dc153afd74034a1564ce3eca310c39090840d5d Mon Sep 17 00:00:00 2001 From: vivienho <56025826+vivienho@users.noreply.github.com> Date: Tue, 24 Sep 2024 14:09:14 +0100 Subject: [PATCH 02/11] feat: change studyLocusId of example data to string in tests --- tests/gentropy/conftest.py | 6 +- tests/gentropy/dataset/test_colocalisation.py | 6 +- tests/gentropy/dataset/test_l2g.py | 46 ++++----- .../dataset/test_l2g_feature_matrix.py | 6 +- tests/gentropy/dataset/test_study_locus.py | 95 +++++++++---------- .../dataset/test_study_locus_overlap.py | 10 +- .../dataset/test_study_locus_overlaps.py | 22 ++--- .../test_gwas_catalog_associations.py | 4 +- .../datasource/open_targets/test_variants.py | 2 +- tests/gentropy/method/test_clump.py | 2 +- .../method/test_colocalisation_method.py | 22 ++--- 11 files changed, 110 insertions(+), 111 deletions(-) diff --git a/tests/gentropy/conftest.py b/tests/gentropy/conftest.py index 93ee38471..ded2b9e34 100644 --- a/tests/gentropy/conftest.py +++ b/tests/gentropy/conftest.py @@ -587,10 +587,10 @@ def mock_l2g_feature_matrix(spark: SparkSession) -> L2GFeatureMatrix: return L2GFeatureMatrix( _df=spark.createDataFrame( [ - (1, "gene1", 100.0, None), - (2, "gene2", 1000.0, 0.0), + ("1", "gene1", 100.0, None), + ("2", "gene2", 1000.0, 0.0), ], - "studyLocusId LONG, geneId STRING, distanceTssMean FLOAT, distanceTssMinimum FLOAT", + "studyLocusId STRING, geneId STRING, distanceTssMean FLOAT, distanceTssMinimum FLOAT", ), with_gold_standard=False, ) diff --git a/tests/gentropy/dataset/test_colocalisation.py b/tests/gentropy/dataset/test_colocalisation.py index 5371cf42c..672f5c00d 100644 --- a/tests/gentropy/dataset/test_colocalisation.py +++ b/tests/gentropy/dataset/test_colocalisation.py @@ -72,12 +72,12 @@ def _setup(self: TestAppendStudyMetadata, spark: SparkSession) -> None: _df=spark.createDataFrame( [ ( - 1, + "1", "var1", "gwas1", ), ( - 2, + "2", "var2", "eqtl1", ), @@ -100,7 +100,7 @@ def _setup(self: TestAppendStudyMetadata, spark: SparkSession) -> None: ) self.sample_colocalisation = Colocalisation( _df=spark.createDataFrame( - [(1, 2, "X", "COLOC", 1, 0.9)], + [("1", "2", "X", "COLOC", 1, 0.9)], [ "leftStudyLocusId", "rightStudyLocusId", diff --git a/tests/gentropy/dataset/test_l2g.py b/tests/gentropy/dataset/test_l2g.py index d37ce5a4a..2472ad445 100644 --- a/tests/gentropy/dataset/test_l2g.py +++ b/tests/gentropy/dataset/test_l2g.py @@ -43,44 +43,44 @@ def test_filter_unique_associations(spark: SparkSession) -> None: """Test filter_unique_associations.""" mock_l2g_gs_df = spark.createDataFrame( [ - (1, "variant1", "study1", "gene1", "positive"), + ("1", "variant1", "study1", "gene1", "positive"), ( - 2, + "2", "variant2", "study1", "gene1", "negative", ), # in the same locus as sl1 and pointing to same gene, has to be dropped ( - 3, + "3", "variant3", "study1", "gene1", "positive", ), # in diff locus as sl1 and pointing to same gene, has to be kept ( - 4, + "4", "variant4", "study1", "gene2", "positive", ), # in same locus as sl1 and pointing to diff gene, has to be kept ], - "studyLocusId LONG, variantId STRING, studyId STRING, geneId STRING, goldStandardSet STRING", + "studyLocusId STRING, variantId STRING, studyId STRING, geneId STRING, goldStandardSet STRING", ) mock_sl_overlap_df = spark.createDataFrame( - [(1, 2, "variant2"), (1, 4, "variant4")], - "leftStudyLocusId LONG, rightStudyLocusId LONG, tagVariantId STRING", + [("1", "2", "variant2"), ("1", "4", "variant4")], + "leftStudyLocusId STRING, rightStudyLocusId STRING, tagVariantId STRING", ) expected_df = spark.createDataFrame( [ - (1, "variant1", "study1", "gene1", "positive"), - (3, "variant3", "study1", "gene1", "positive"), - (4, "variant4", "study1", "gene2", "positive"), + ("1", "variant1", "study1", "gene1", "positive"), + ("3", "variant3", "study1", "gene1", "positive"), + ("4", "variant4", "study1", "gene2", "positive"), ], - "studyLocusId LONG, variantId STRING, studyId STRING, geneId STRING, goldStandardSet STRING", + "studyLocusId STRING, variantId STRING, studyId STRING, geneId STRING, goldStandardSet STRING", ) mock_l2g_gs = L2GGoldStandard( @@ -99,30 +99,30 @@ def test_remove_false_negatives(spark: SparkSession) -> None: """Test `remove_false_negatives`.""" mock_l2g_gs_df = spark.createDataFrame( [ - (1, "variant1", "study1", "gene1", "positive"), + ("1", "variant1", "study1", "gene1", "positive"), ( - 2, + "2", "variant2", "study1", "gene2", "negative", ), # gene2 is a partner of gene1, has to be dropped ( - 3, + "3", "variant3", "study1", "gene3", "negative", ), # gene 3 is not a partner of gene1, has to be kept ( - 4, + "4", "variant4", "study1", "gene4", "positive", ), # gene 4 is a partner of gene1, has to be kept because it's positive ], - "studyLocusId LONG, variantId STRING, studyId STRING, geneId STRING, goldStandardSet STRING", + "studyLocusId STRING, variantId STRING, studyId STRING, geneId STRING, goldStandardSet STRING", ) mock_interactions_df = spark.createDataFrame( @@ -136,11 +136,11 @@ def test_remove_false_negatives(spark: SparkSession) -> None: expected_df = spark.createDataFrame( [ - (1, "variant1", "study1", "gene1", "positive"), - (3, "variant3", "study1", "gene3", "negative"), - (4, "variant4", "study1", "gene4", "positive"), + ("1", "variant1", "study1", "gene1", "positive"), + ("3", "variant3", "study1", "gene3", "negative"), + ("4", "variant4", "study1", "gene4", "positive"), ], - "studyLocusId LONG, variantId STRING, studyId STRING, geneId STRING, goldStandardSet STRING", + "studyLocusId STRING, variantId STRING, studyId STRING, geneId STRING, goldStandardSet STRING", ) mock_l2g_gs = L2GGoldStandard( @@ -161,10 +161,10 @@ def test_l2g_feature_constructor_with_schema_mismatch( fm = L2GFeatureMatrix( _df=spark.createDataFrame( [ - (1, "gene1", 100.0), - (2, "gene2", 1000.0), + ("1", "gene1", 100.0), + ("2", "gene2", 1000.0), ], - "studyLocusId LONG, geneId STRING, distanceTssMean DOUBLE", + "studyLocusId STRING, geneId STRING, distanceTssMean DOUBLE", ), with_gold_standard=False, ) diff --git a/tests/gentropy/dataset/test_l2g_feature_matrix.py b/tests/gentropy/dataset/test_l2g_feature_matrix.py index 46384239c..6ab1d4ddb 100644 --- a/tests/gentropy/dataset/test_l2g_feature_matrix.py +++ b/tests/gentropy/dataset/test_l2g_feature_matrix.py @@ -81,7 +81,7 @@ def _setup(self: TestFromFeaturesList, spark: SparkSession) -> None: _df=spark.createDataFrame( [ ( - 1, + "1", "var1", "gwas1", [ @@ -90,7 +90,7 @@ def _setup(self: TestFromFeaturesList, spark: SparkSession) -> None: ], ), ( - 2, + "2", "var2", "eqtl1", [ @@ -136,7 +136,7 @@ def _setup(self: TestFromFeaturesList, spark: SparkSession) -> None: ) self.sample_colocalisation = Colocalisation( _df=spark.createDataFrame( - [(1, 2, "X", "COLOC", 1, 0.9)], + [("1", "2", "X", "COLOC", 1, 0.9)], [ "leftStudyLocusId", "rightStudyLocusId", diff --git a/tests/gentropy/dataset/test_study_locus.py b/tests/gentropy/dataset/test_study_locus.py index c89521b3c..45646c82c 100644 --- a/tests/gentropy/dataset/test_study_locus.py +++ b/tests/gentropy/dataset/test_study_locus.py @@ -12,7 +12,6 @@ ArrayType, BooleanType, DoubleType, - LongType, StringType, StructField, StructType, @@ -41,8 +40,8 @@ True, [ { - "leftStudyLocusId": 1, - "rightStudyLocusId": 2, + "leftStudyLocusId": "1", + "rightStudyLocusId": "2", "chromosome": "1", "tagVariantId": "commonTag", "statistics": { @@ -51,8 +50,8 @@ }, }, { - "leftStudyLocusId": 1, - "rightStudyLocusId": 2, + "leftStudyLocusId": "1", + "rightStudyLocusId": "2", "chromosome": "1", "tagVariantId": "nonCommonTag", "statistics": { @@ -76,7 +75,7 @@ def test_find_overlaps_semantic( # 2 associations with a common variant in the locus [ { - "studyLocusId": 1, + "studyLocusId": "1", "variantId": "lead1", "studyId": "study1", "locus": [ @@ -85,7 +84,7 @@ def test_find_overlaps_semantic( "chromosome": "1", }, { - "studyLocusId": 2, + "studyLocusId": "2", "variantId": "lead2", "studyId": "study2", "locus": [ @@ -105,7 +104,7 @@ def test_find_overlaps_semantic( # 2 associations with no common variants in the locus [ { - "studyLocusId": 1, + "studyLocusId": "1", "variantId": "lead1", "studyId": "study1", "locus": [ @@ -114,7 +113,7 @@ def test_find_overlaps_semantic( "chromosome": "1", }, { - "studyLocusId": 2, + "studyLocusId": "2", "variantId": "lead2", "studyId": "study2", "locus": None, @@ -181,13 +180,13 @@ def test_filter_by_study_type( [ { # from gwas - "studyLocusId": 1, + "studyLocusId": "1", "variantId": "lead1", "studyId": "study1", }, { # from eqtl - "studyLocusId": 2, + "studyLocusId": "2", "variantId": "lead2", "studyId": "study2", }, @@ -258,7 +257,7 @@ def test_assign_study_locus_id__null_variant_id(spark: SparkSession) -> None: # Locus is not null, should return union between variants in locus and lead variant [ ( - 1, + "1", "traitA", "22_varA", [ @@ -281,7 +280,7 @@ def test_assign_study_locus_id__null_variant_id(spark: SparkSession) -> None: ( # locus is null, should return lead variant [ - (1, "traitA", "22_varA", None), + ("1", "traitA", "22_varA", None), ], [ ( @@ -299,7 +298,7 @@ def test_unique_variants_in_locus( # assert isinstance(mock_study_locus.test_unique_variants_in_locus(), DataFrame) schema = StructType( [ - StructField("studyLocusId", LongType(), True), + StructField("studyLocusId", StringType(), True), StructField("studyId", StringType(), True), StructField("variantId", StringType(), True), StructField( @@ -342,7 +341,7 @@ def test_clump(mock_study_locus: StudyLocus) -> None: [ # Observed ( - 1, + "1", "traitA", "leadB", [{"variantId": "tagVariantA", "posteriorProbability": 1.0}], @@ -351,7 +350,7 @@ def test_clump(mock_study_locus: StudyLocus) -> None: [ # Expected ( - 1, + "1", "traitA", "leadB", [ @@ -370,7 +369,7 @@ def test_clump(mock_study_locus: StudyLocus) -> None: [ # Observed ( - 1, + "1", "traitA", "leadA", [ @@ -387,7 +386,7 @@ def test_clump(mock_study_locus: StudyLocus) -> None: [ # Expected ( - 1, + "1", "traitA", "leadA", [ @@ -442,7 +441,7 @@ def test_clump(mock_study_locus: StudyLocus) -> None: [ # Observed ( - 1, + "1", "traitA", "leadB", None, @@ -451,7 +450,7 @@ def test_clump(mock_study_locus: StudyLocus) -> None: [ # Expected ( - 1, + "1", "traitA", "leadB", None, @@ -463,7 +462,7 @@ def test_clump(mock_study_locus: StudyLocus) -> None: [ # Observed ( - 1, + "1", "traitA", "leadB", [], @@ -472,7 +471,7 @@ def test_clump(mock_study_locus: StudyLocus) -> None: [ # Expected ( - 1, + "1", "traitA", "leadB", None, @@ -487,7 +486,7 @@ def test_annotate_credible_sets( """Test annotate_credible_sets.""" schema = StructType( [ - StructField("studyLocusId", LongType(), True), + StructField("studyLocusId", StringType(), True), StructField("studyId", StringType(), True), StructField("variantId", StringType(), True), StructField( @@ -590,12 +589,12 @@ class TestStudyLocusVariantValidation: STUDYLOCUS_DATA = [ # First studylocus passes qc: - (1, "v1", "s1", "v1"), - (1, "v1", "s1", "v2"), - (1, "v1", "s1", "v3"), + ("1", "v1", "s1", "v1"), + ("1", "v1", "s1", "v2"), + ("1", "v1", "s1", "v3"), # Second studylocus passes qc: - (2, "v1", "s1", "v1"), - (2, "v1", "s1", "v5"), + ("2", "v1", "s1", "v1"), + ("2", "v1", "s1", "v5"), ] STUDYLOCUS_HEADER = ["studyLocusId", "variantId", "studyId", "tagVariantId"] @@ -612,7 +611,7 @@ def _setup(self: TestStudyLocusVariantValidation, spark: SparkSession) -> None: self.credible_set = StudyLocus( _df=( spark.createDataFrame(self.STUDYLOCUS_DATA, self.STUDYLOCUS_HEADER) - .withColumn("studyLocusId", f.col("studyLocusId").cast(t.LongType())) + .withColumn("studyLocusId", f.col("studyLocusId").cast(t.StringType())) .withColumn("qualityControls", f.array()) .groupBy("studyLocusId", "variantId", "studyId") .agg( @@ -653,7 +652,7 @@ def test_validation_correctness(self: TestStudyLocusVariantValidation) -> None: # Check that the right one is flagged: assert ( validated.filter( - (f.size("qualityControls") > 0) & (f.col("studyLocusId") == 2) + (f.size("qualityControls") > 0) & (f.col("studyLocusId") == "2") ).count() == 1 ) @@ -664,17 +663,17 @@ class TestStudyLocusValidation: STUDY_LOCUS_DATA = [ # Won't be flagged: - (1, "v1", "s1", 1.0, -8, []), + ("1", "v1", "s1", 1.0, -8, []), # Already flagged, needs to be tested if the flag reamins unique: - (2, "v2", "s2", 5.0, -4, [StudyLocusQualityCheck.SUBSIGNIFICANT_FLAG.value]), + ("2", "v2", "s2", 5.0, -4, [StudyLocusQualityCheck.SUBSIGNIFICANT_FLAG.value]), # To be flagged: - (3, "v3", "s3", 1.0, -4, []), - (4, "v4", "s4", 5.0, -3, []), + ("3", "v3", "s3", 1.0, -4, []), + ("4", "v4", "s4", 5.0, -3, []), ] STUDY_LOCUS_SCHEMA = t.StructType( [ - t.StructField("studyLocusId", t.LongType(), False), + t.StructField("studyLocusId", t.StringType(), False), t.StructField("variantId", t.StringType(), False), t.StructField("studyId", t.StringType(), False), t.StructField("pValueMantissa", t.FloatType(), False), @@ -805,23 +804,23 @@ class TestStudyLocusRedundancyFlagging: """Collection of tests related to flagging redundant credible sets.""" STUDY_LOCUS_DATA = [ - (1, "v1", "s1", "pics", []), - (2, "v2", "s1", "pics", [StudyLocusQualityCheck.TOP_HIT.value]), - (3, "v3", "s1", "pics", []), - (3, "v3", "s1", "pics", []), - (1, "v1", "s1", "pics", [StudyLocusQualityCheck.TOP_HIT.value]), - (1, "v1", "s2", "pics", [StudyLocusQualityCheck.TOP_HIT.value]), - (1, "v1", "s2", "pics", [StudyLocusQualityCheck.TOP_HIT.value]), - (1, "v1", "s3", "SuSie", []), - (1, "v1", "s3", "pics", [StudyLocusQualityCheck.TOP_HIT.value]), - (1, "v1", "s4", "pics", []), - (1, "v1", "s4", "SuSie", []), - (1, "v1", "s4", "pics", [StudyLocusQualityCheck.TOP_HIT.value]), + ("1", "v1", "s1", "pics", []), + ("2", "v2", "s1", "pics", [StudyLocusQualityCheck.TOP_HIT.value]), + ("3", "v3", "s1", "pics", []), + ("3", "v3", "s1", "pics", []), + ("1", "v1", "s1", "pics", [StudyLocusQualityCheck.TOP_HIT.value]), + ("1", "v1", "s2", "pics", [StudyLocusQualityCheck.TOP_HIT.value]), + ("1", "v1", "s2", "pics", [StudyLocusQualityCheck.TOP_HIT.value]), + ("1", "v1", "s3", "SuSie", []), + ("1", "v1", "s3", "pics", [StudyLocusQualityCheck.TOP_HIT.value]), + ("1", "v1", "s4", "pics", []), + ("1", "v1", "s4", "SuSie", []), + ("1", "v1", "s4", "pics", [StudyLocusQualityCheck.TOP_HIT.value]), ] STUDY_LOCUS_SCHEMA = t.StructType( [ - t.StructField("studyLocusId", t.LongType(), False), + t.StructField("studyLocusId", t.StringType(), False), t.StructField("variantId", t.StringType(), False), t.StructField("studyId", t.StringType(), False), t.StructField("finemappingMethod", t.StringType(), False), diff --git a/tests/gentropy/dataset/test_study_locus_overlap.py b/tests/gentropy/dataset/test_study_locus_overlap.py index e26b59c30..5ced1d7a5 100644 --- a/tests/gentropy/dataset/test_study_locus_overlap.py +++ b/tests/gentropy/dataset/test_study_locus_overlap.py @@ -19,19 +19,19 @@ def test_convert_to_square_matrix(spark: SparkSession) -> None: mock_sl_overlap = StudyLocusOverlap( _df=spark.createDataFrame( [ - (1, 2, "variant2"), + ("1", "2", "variant2"), ], - "leftStudyLocusId LONG, rightStudyLocusId LONG, tagVariantId STRING", + "leftStudyLocusId STRING, rightStudyLocusId STRING, tagVariantId STRING", ), _schema=StudyLocusOverlap.get_schema(), ) expected_df = spark.createDataFrame( [ - (1, 2, "variant2"), - (2, 1, "variant2"), + ("1", "2", "variant2"), + ("2", "1", "variant2"), ], - "leftStudyLocusId LONG, rightStudyLocusId LONG, tagVariantId STRING", + "leftStudyLocusId STRING, rightStudyLocusId STRING, tagVariantId STRING", ) observed_df = mock_sl_overlap._convert_to_square_matrix().df diff --git a/tests/gentropy/dataset/test_study_locus_overlaps.py b/tests/gentropy/dataset/test_study_locus_overlaps.py index bd3415959..971bbcfcf 100644 --- a/tests/gentropy/dataset/test_study_locus_overlaps.py +++ b/tests/gentropy/dataset/test_study_locus_overlaps.py @@ -38,21 +38,21 @@ def test_study_locus_overlap_from_associations( # observed - input DataFrame representing gwas and nongwas data to find overlapping signals [ { - "studyLocusId": 1, + "studyLocusId": "1", "studyId": "A", "studyType": "gwas", "chromosome": "1", "tagVariantId": "A", }, { - "studyLocusId": 2, + "studyLocusId": "2", "studyId": "B", "studyType": "eqtl", "chromosome": "1", "tagVariantId": "A", }, { - "studyLocusId": 3, + "studyLocusId": "3", "studyId": "C", "studyType": "gwas", "chromosome": "1", @@ -63,14 +63,14 @@ def test_study_locus_overlap_from_associations( False, # expected - output DataFrame with overlapping signals [ - {"leftStudyLocusId": 1, "rightStudyLocusId": 2, "chromosome": "1"}, + {"leftStudyLocusId": "1", "rightStudyLocusId": "2", "chromosome": "1"}, ], ), ( # observed - input DataFrame representing intra-study data to find overlapping signals in the same study [ { - "studyLocusId": 1, + "studyLocusId": "1", "studyId": "A", "studyType": "gwas", "chromosome": "1", @@ -78,7 +78,7 @@ def test_study_locus_overlap_from_associations( "tagVariantId": "A", }, { - "studyLocusId": 2, + "studyLocusId": "2", "studyId": "A", "studyType": "gwas", "chromosome": "1", @@ -86,7 +86,7 @@ def test_study_locus_overlap_from_associations( "tagVariantId": "A", }, { - "studyLocusId": 3, + "studyLocusId": "3", "studyId": "B", "studyType": "gwas", "chromosome": "1", @@ -97,7 +97,7 @@ def test_study_locus_overlap_from_associations( # intrastudy - bool of whether or not to use inter-study or intra-study logic True, # expected - output DataFrame with overlapping signals - [{"leftStudyLocusId": 2, "rightStudyLocusId": 1, "chromosome": "1"}], + [{"leftStudyLocusId": "2", "rightStudyLocusId": "1", "chromosome": "1"}], ), ], ) @@ -110,7 +110,7 @@ def test_overlapping_peaks( """Test overlapping signals between GWAS-GWAS and GWAS-Molecular trait to make sure that mQTLs are always on the right.""" mock_schema = t.StructType( [ - t.StructField("studyLocusId", t.LongType()), + t.StructField("studyLocusId", t.StringType()), t.StructField("studyId", t.StringType()), t.StructField("studyType", t.StringType()), t.StructField("chromosome", t.StringType()), @@ -120,8 +120,8 @@ def test_overlapping_peaks( ) expected_schema = t.StructType( [ - t.StructField("leftStudyLocusId", t.LongType()), - t.StructField("rightStudyLocusId", t.LongType()), + t.StructField("leftStudyLocusId", t.StringType()), + t.StructField("rightStudyLocusId", t.StringType()), t.StructField("chromosome", t.StringType()), ] ) diff --git a/tests/gentropy/datasource/gwas_catalog/test_gwas_catalog_associations.py b/tests/gentropy/datasource/gwas_catalog/test_gwas_catalog_associations.py index 130097f25..fe9608bf0 100644 --- a/tests/gentropy/datasource/gwas_catalog/test_gwas_catalog_associations.py +++ b/tests/gentropy/datasource/gwas_catalog/test_gwas_catalog_associations.py @@ -4,7 +4,7 @@ from pyspark.sql import DataFrame from pyspark.sql import functions as f -from pyspark.sql.types import LongType +from pyspark.sql.types import StringType from gentropy.dataset.variant_index import VariantIndex from gentropy.datasource.gwas_catalog.associations import ( @@ -71,7 +71,7 @@ def test_map_variants_to_variant_index( assert isinstance( GWASCatalogCuratedAssociationsParser._map_variants_to_gnomad_variants( sample_gwas_catalog_associations.withColumn( - "studyLocusId", f.monotonically_increasing_id().cast(LongType()) + "studyLocusId", f.monotonically_increasing_id().cast(StringType()) ), mock_variant_index, ), diff --git a/tests/gentropy/datasource/open_targets/test_variants.py b/tests/gentropy/datasource/open_targets/test_variants.py index 247a9d81e..6aa22e628 100644 --- a/tests/gentropy/datasource/open_targets/test_variants.py +++ b/tests/gentropy/datasource/open_targets/test_variants.py @@ -25,7 +25,7 @@ def test_as_vcf_df_credible_set( df_credible_set_df = spark.createDataFrame( [ { - "studyLocusId": 1, + "studyLocusId": "1", "variantId": "1_2_C_G", "studyId": "study1", "locus": [ diff --git a/tests/gentropy/method/test_clump.py b/tests/gentropy/method/test_clump.py index 1e754df3a..83a95e19f 100644 --- a/tests/gentropy/method/test_clump.py +++ b/tests/gentropy/method/test_clump.py @@ -88,7 +88,7 @@ class TestIsLeadLinked: SCHEMA = t.StructType( [ t.StructField("studyId", t.StringType(), True), - t.StructField("studyLocusId", t.LongType(), True), + t.StructField("studyLocusId", t.StringType(), True), t.StructField("chromosome", t.StringType(), True), t.StructField("variantId", t.StringType(), True), t.StructField("pValueMantissa", t.FloatType(), True), diff --git a/tests/gentropy/method/test_colocalisation_method.py b/tests/gentropy/method/test_colocalisation_method.py index d6798d831..f7ecdb88b 100644 --- a/tests/gentropy/method/test_colocalisation_method.py +++ b/tests/gentropy/method/test_colocalisation_method.py @@ -7,7 +7,7 @@ import pytest from pandas.testing import assert_frame_equal from pyspark.sql import SparkSession -from pyspark.sql.types import DoubleType, LongType, StringType, StructField, StructType +from pyspark.sql.types import DoubleType, StringType, StructField, StructType from gentropy.dataset.colocalisation import Colocalisation from gentropy.dataset.study_locus_overlap import StudyLocusOverlap @@ -27,8 +27,8 @@ def test_coloc(mock_study_locus_overlap: StudyLocusOverlap) -> None: # observed overlap [ { - "leftStudyLocusId": 1, - "rightStudyLocusId": 2, + "leftStudyLocusId": "1", + "rightStudyLocusId": "2", "chromosome": "1", "tagVariantId": "snp", "statistics": {"left_logBF": 10.3, "right_logBF": 10.5}, @@ -50,15 +50,15 @@ def test_coloc(mock_study_locus_overlap: StudyLocusOverlap) -> None: # observed overlap [ { - "leftStudyLocusId": 1, - "rightStudyLocusId": 2, + "leftStudyLocusId": "1", + "rightStudyLocusId": "2", "chromosome": "1", "tagVariantId": "snp1", "statistics": {"left_logBF": 10.3, "right_logBF": 10.5}, }, { - "leftStudyLocusId": 1, - "rightStudyLocusId": 2, + "leftStudyLocusId": "1", + "rightStudyLocusId": "2", "chromosome": "1", "tagVariantId": "snp2", "statistics": {"left_logBF": 10.3, "right_logBF": 10.5}, @@ -117,8 +117,8 @@ def test_coloc_no_logbf( spark.createDataFrame( [ { - "leftStudyLocusId": 1, - "rightStudyLocusId": 2, + "leftStudyLocusId": "1", + "rightStudyLocusId": "2", "chromosome": "1", "tagVariantId": "snp", "statistics": { @@ -129,8 +129,8 @@ def test_coloc_no_logbf( ], schema=StructType( [ - StructField("leftStudyLocusId", LongType(), False), - StructField("rightStudyLocusId", LongType(), False), + StructField("leftStudyLocusId", StringType(), False), + StructField("rightStudyLocusId", StringType(), False), StructField("chromosome", StringType(), False), StructField("tagVariantId", StringType(), False), StructField( From 7e62efdcdd7fa8630e10bd201e07a4f337826aec Mon Sep 17 00:00:00 2001 From: vivienho <56025826+vivienho@users.noreply.github.com> Date: Tue, 24 Sep 2024 15:45:23 +0100 Subject: [PATCH 03/11] feat: change hashing method to md5 --- src/gentropy/dataset/study_locus.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/gentropy/dataset/study_locus.py b/src/gentropy/dataset/study_locus.py index e8363aa4e..71388a6c7 100644 --- a/src/gentropy/dataset/study_locus.py +++ b/src/gentropy/dataset/study_locus.py @@ -449,9 +449,12 @@ def assign_study_locus_id( """ if finemapping_col is None: - finemapping_col = f.lit(None).cast(StringType()) - variant_id_col = f.coalesce(variant_id_col, f.rand().cast("string")) - return f.xxhash64(study_id_col, variant_id_col, finemapping_col).alias( + finemapping_col = f.lit("None") + columns = [study_id_col, variant_id_col, finemapping_col] + hashable_columns = [f.when(column.cast("string").isNull(), f.lit("None")) + .otherwise(column.cast("string")) + for column in columns] + return f.md5(f.concat(*hashable_columns)).alias( "studyLocusId" ) From dd354b405e401e139b5a182ed26adc98b54fe658 Mon Sep 17 00:00:00 2001 From: vivienho <56025826+vivienho@users.noreply.github.com> Date: Tue, 24 Sep 2024 15:51:06 +0100 Subject: [PATCH 04/11] test: remove test_assign_study_locus_id__null_variant_id as validation will have removed null ids --- tests/gentropy/dataset/test_study_locus.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/tests/gentropy/dataset/test_study_locus.py b/tests/gentropy/dataset/test_study_locus.py index 45646c82c..f81b34ff1 100644 --- a/tests/gentropy/dataset/test_study_locus.py +++ b/tests/gentropy/dataset/test_study_locus.py @@ -236,20 +236,6 @@ def test_filter_credible_set(mock_study_locus: StudyLocus) -> None: ) -def test_assign_study_locus_id__null_variant_id(spark: SparkSession) -> None: - """Test assign study locus id when variant id is null for the same study.""" - df = spark.createDataFrame( - [("GCST000001", None), ("GCST000001", None)], - schema="studyId: string, variantId: string", - ).withColumn( - "studyLocusId", - StudyLocus.assign_study_locus_id(f.col("studyId"), f.col("variantId")), - ) - assert ( - df.select("studyLocusId").distinct().count() == 2 - ), "studyLocusId is not unique when variantId is null" - - @pytest.mark.parametrize( ("observed", "expected"), [ From 4c7e146a85d44470a330f36d297359ea69bd2906 Mon Sep 17 00:00:00 2001 From: vivienho <56025826+vivienho@users.noreply.github.com> Date: Tue, 24 Sep 2024 15:57:23 +0100 Subject: [PATCH 05/11] fix: change studyLocusId to string in remaining files --- src/gentropy/datasource/gwas_catalog/associations.py | 4 ++-- tests/gentropy/dataset/test_l2g_feature_matrix.py | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/gentropy/datasource/gwas_catalog/associations.py b/src/gentropy/datasource/gwas_catalog/associations.py index dd9aa3fe2..5e84079a1 100644 --- a/src/gentropy/datasource/gwas_catalog/associations.py +++ b/src/gentropy/datasource/gwas_catalog/associations.py @@ -9,7 +9,7 @@ from typing import TYPE_CHECKING import pyspark.sql.functions as f -from pyspark.sql.types import DoubleType, FloatType, IntegerType, LongType +from pyspark.sql.types import DoubleType, FloatType, IntegerType, StringType from pyspark.sql.window import Window from gentropy.assets import data @@ -1109,7 +1109,7 @@ def from_source( """ return StudyLocusGWASCatalog( _df=gwas_associations.withColumn( - "studyLocusId", f.monotonically_increasing_id().cast(LongType()) + "studyLocusId", f.monotonically_increasing_id().cast(StringType()) ) .transform( # Map/harmonise variants to variant annotation dataset: diff --git a/tests/gentropy/dataset/test_l2g_feature_matrix.py b/tests/gentropy/dataset/test_l2g_feature_matrix.py index 6ab1d4ddb..1ad203848 100644 --- a/tests/gentropy/dataset/test_l2g_feature_matrix.py +++ b/tests/gentropy/dataset/test_l2g_feature_matrix.py @@ -8,7 +8,6 @@ from pyspark.sql.types import ( ArrayType, DoubleType, - LongType, StringType, StructField, StructType, @@ -100,7 +99,7 @@ def _setup(self: TestFromFeaturesList, spark: SparkSession) -> None: ], schema=StructType( [ - StructField("studyLocusId", LongType(), True), + StructField("studyLocusId", StringType(), True), StructField("variantId", StringType(), True), StructField("studyId", StringType(), True), StructField( From 1dec962ed7b715f2fb0019f4d36592fca798d4bc Mon Sep 17 00:00:00 2001 From: vivienho <56025826+vivienho@users.noreply.github.com> Date: Tue, 24 Sep 2024 16:41:14 +0100 Subject: [PATCH 06/11] fix: ensure inputs to assign_study_locus_id are columns and not strings --- src/gentropy/method/pics.py | 2 +- src/gentropy/susie_finemapper.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gentropy/method/pics.py b/src/gentropy/method/pics.py index 2de06f512..6889aaa26 100644 --- a/src/gentropy/method/pics.py +++ b/src/gentropy/method/pics.py @@ -257,7 +257,7 @@ def finemap( .withColumn( "studyLocusId", StudyLocus.assign_study_locus_id( - "studyId", "variantId", "finemappingMethod" + f.col("studyId"), f.col("variantId"), f.col("finemappingMethod") ), ) .drop("neglog_pvalue") diff --git a/src/gentropy/susie_finemapper.py b/src/gentropy/susie_finemapper.py index 587ea7963..a80591c60 100644 --- a/src/gentropy/susie_finemapper.py +++ b/src/gentropy/susie_finemapper.py @@ -95,7 +95,7 @@ def __init__( .df.withColumn( "studyLocusId", StudyLocus.assign_study_locus_id( - "studyId", "variantId", "finemappingMethod" + f.col("studyId"), f.col("variantId"), f.col("finemappingMethod") ), ) .collect()[0] From bcae23d1ecee35c1b54a57a52b997d6174f5ef75 Mon Sep 17 00:00:00 2001 From: vivienho <56025826+vivienho@users.noreply.github.com> Date: Tue, 24 Sep 2024 16:53:01 +0100 Subject: [PATCH 07/11] fix: change studyLocusId to string in remaining files --- tests/gentropy/dataset/test_study_locus.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/gentropy/dataset/test_study_locus.py b/tests/gentropy/dataset/test_study_locus.py index f4f80fbda..0ae7ef2d5 100644 --- a/tests/gentropy/dataset/test_study_locus.py +++ b/tests/gentropy/dataset/test_study_locus.py @@ -764,7 +764,7 @@ def _setup(self: TestStudyLocusWindowClumping, spark: SparkSession) -> None: ).withColumns( { "studyLocusId": f.monotonically_increasing_id().cast( - t.LongType() + t.StringType() ), "pValueMantissa": f.lit(1).cast(t.FloatType()), "variantId": f.concat( From 8057a5506137cd28472a0ae84f2709ec3670375d Mon Sep 17 00:00:00 2001 From: vivienho <56025826+vivienho@users.noreply.github.com> Date: Tue, 24 Sep 2024 17:07:22 +0100 Subject: [PATCH 08/11] chore: update assign_study_locus_id docstring with updated output --- src/gentropy/dataset/study_locus.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/gentropy/dataset/study_locus.py b/src/gentropy/dataset/study_locus.py index aa86267c6..6017f2d08 100644 --- a/src/gentropy/dataset/study_locus.py +++ b/src/gentropy/dataset/study_locus.py @@ -462,13 +462,13 @@ def assign_study_locus_id( Examples: >>> df = spark.createDataFrame([("GCST000001", "1_1000_A_C", "SuSiE-inf"), ("GCST000002", "1_1000_A_C", "pics")]).toDF("studyId", "variantId", "finemappingMethod") - >>> df.withColumn("study_locus_id", StudyLocus.assign_study_locus_id(f.col("studyId"), f.col("variantId"), f.col("finemappingMethod"))).show() - +----------+----------+-----------------+-------------------+ - | studyId| variantId|finemappingMethod| study_locus_id| - +----------+----------+-----------------+-------------------+ - |GCST000001|1_1000_A_C| SuSiE-inf|3801266831619496075| - |GCST000002|1_1000_A_C| pics|1581844826999194430| - +----------+----------+-----------------+-------------------+ + >>> df.withColumn("study_locus_id", StudyLocus.assign_study_locus_id(f.col("studyId"), f.col("variantId"), f.col("finemappingMethod"))).show(truncate=False) + ++----------+----------+-----------------+--------------------------------+ + +|studyId |variantId |finemappingMethod|study_locus_id | + ++----------+----------+-----------------+--------------------------------+ + +|GCST000001|1_1000_A_C|SuSiE-inf |109804fe1e20c94231a31bafd71b566e| + +|GCST000002|1_1000_A_C|pics |de310be4558e0482c9cc359c97d37773| + ++----------+----------+-----------------+--------------------------------+ """ if finemapping_col is None: From d8ab71999a0a112d14c1bb942f463b7ebf46968d Mon Sep 17 00:00:00 2001 From: vivienho <56025826+vivienho@users.noreply.github.com> Date: Tue, 24 Sep 2024 22:06:45 +0100 Subject: [PATCH 09/11] chore: update assign_study_locus_id docstring with updated output (again) --- src/gentropy/dataset/study_locus.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/gentropy/dataset/study_locus.py b/src/gentropy/dataset/study_locus.py index 6017f2d08..01edd044e 100644 --- a/src/gentropy/dataset/study_locus.py +++ b/src/gentropy/dataset/study_locus.py @@ -463,12 +463,12 @@ def assign_study_locus_id( Examples: >>> df = spark.createDataFrame([("GCST000001", "1_1000_A_C", "SuSiE-inf"), ("GCST000002", "1_1000_A_C", "pics")]).toDF("studyId", "variantId", "finemappingMethod") >>> df.withColumn("study_locus_id", StudyLocus.assign_study_locus_id(f.col("studyId"), f.col("variantId"), f.col("finemappingMethod"))).show(truncate=False) - ++----------+----------+-----------------+--------------------------------+ - +|studyId |variantId |finemappingMethod|study_locus_id | - ++----------+----------+-----------------+--------------------------------+ - +|GCST000001|1_1000_A_C|SuSiE-inf |109804fe1e20c94231a31bafd71b566e| - +|GCST000002|1_1000_A_C|pics |de310be4558e0482c9cc359c97d37773| - ++----------+----------+-----------------+--------------------------------+ + +----------+----------+-----------------+--------------------------------+ + |studyId |variantId |finemappingMethod|study_locus_id | + +----------+----------+-----------------+--------------------------------+ + |GCST000001|1_1000_A_C|SuSiE-inf |109804fe1e20c94231a31bafd71b566e| + |GCST000002|1_1000_A_C|pics |de310be4558e0482c9cc359c97d37773| + +----------+----------+-----------------+--------------------------------+ """ if finemapping_col is None: From e873353572979030e99c3439b07621d0c6d275bd Mon Sep 17 00:00:00 2001 From: vivienho <56025826+vivienho@users.noreply.github.com> Date: Tue, 24 Sep 2024 22:59:06 +0100 Subject: [PATCH 10/11] fix: change studyLocusId to string in recently merged files --- tests/gentropy/dataset/test_study_locus.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/gentropy/dataset/test_study_locus.py b/tests/gentropy/dataset/test_study_locus.py index a2048311a..3240cdb02 100644 --- a/tests/gentropy/dataset/test_study_locus.py +++ b/tests/gentropy/dataset/test_study_locus.py @@ -897,7 +897,7 @@ class TestStudyLocusSuSiERedundancyFlagging: STUDY_LOCUS_DATA: Any = [ # to be flagged due to v4 ( - 1, + "1", "v1", "s1", "X", @@ -913,7 +913,7 @@ class TestStudyLocusSuSiERedundancyFlagging: ), # to be flagged due to v4 ( - 2, + "2", "v2", "s1", "X", @@ -928,7 +928,7 @@ class TestStudyLocusSuSiERedundancyFlagging: ), # NOT to be flagged (outside regions) ( - 3, + "3", "v3", "s1", "X", @@ -943,7 +943,7 @@ class TestStudyLocusSuSiERedundancyFlagging: ), # NOT to be flagged (SuSie-Inf credible set) ( - 4, + "4", "v4", "s1", "X", @@ -955,7 +955,7 @@ class TestStudyLocusSuSiERedundancyFlagging: ), # NOT to be flagged (Unresolved LD) ( - 5, + "5", "v5", "s1", "X", @@ -969,7 +969,7 @@ class TestStudyLocusSuSiERedundancyFlagging: ), # NOT to be flagged (different study) ( - 6, + "6", "v6", "s2", "X", @@ -986,7 +986,7 @@ class TestStudyLocusSuSiERedundancyFlagging: STUDY_LOCUS_SCHEMA = t.StructType( [ - t.StructField("studyLocusId", t.LongType(), False), + t.StructField("studyLocusId", t.StringType(), False), t.StructField("variantId", t.StringType(), False), t.StructField("studyId", t.StringType(), False), t.StructField("chromosome", t.StringType(), False), From f1b0817214e06e2b4730d5cae4afb17fdda6313b Mon Sep 17 00:00:00 2001 From: vivienho <56025826+vivienho@users.noreply.github.com> Date: Thu, 26 Sep 2024 12:34:03 +0100 Subject: [PATCH 11/11] feat: move hashing logic to generate_identifier function in Dataset class --- src/gentropy/dataset/dataset.py | 15 +++++++++++ src/gentropy/dataset/study_locus.py | 25 +++++-------------- .../datasource/eqtl_catalogue/finemapping.py | 2 +- .../datasource/finngen/finemapping.py | 2 +- .../datasource/gwas_catalog/associations.py | 2 +- .../open_targets/l2g_gold_standard.py | 2 +- src/gentropy/l2g.py | 17 ++++++++----- src/gentropy/method/locus_breaker_clumping.py | 4 +-- src/gentropy/method/pics.py | 2 +- src/gentropy/method/window_based_clumping.py | 2 +- src/gentropy/susie_finemapper.py | 4 +-- 11 files changed, 42 insertions(+), 35 deletions(-) diff --git a/src/gentropy/dataset/dataset.py b/src/gentropy/dataset/dataset.py index e56ef2ecc..c822b592a 100644 --- a/src/gentropy/dataset/dataset.py +++ b/src/gentropy/dataset/dataset.py @@ -352,3 +352,18 @@ def flag_duplicates(test_column: Column) -> Column: ) > 1 ) + + @staticmethod + def generate_identifier(uniqueness_defining_columns: list[str]) -> Column: + """Hashes the provided columns to generate a unique identifier. + + Args: + uniqueness_defining_columns (list[str]): list of columns defining uniqueness + + Returns: + Column: column with a unique identifier + """ + hashable_columns = [f.when(f.col(column).cast("string").isNull(), f.lit("None")) + .otherwise(f.col(column).cast("string")) + for column in uniqueness_defining_columns] + return f.md5(f.concat(*hashable_columns)) diff --git a/src/gentropy/dataset/study_locus.py b/src/gentropy/dataset/study_locus.py index 1b3473148..a4d35e7d5 100644 --- a/src/gentropy/dataset/study_locus.py +++ b/src/gentropy/dataset/study_locus.py @@ -447,24 +447,18 @@ def _align_overlapping_tags( ) @staticmethod - def assign_study_locus_id( - study_id_col: Column, - variant_id_col: Column, - finemapping_col: Column = None, - ) -> Column: - """Hashes a column with a variant ID and a study ID to extract a consistent studyLocusId. + def assign_study_locus_id(uniqueness_defining_columns: list[str]) -> Column: + """Hashes the provided columns to extract a consistent studyLocusId. Args: - study_id_col (Column): column name with a study ID - variant_id_col (Column): column name with a variant ID - finemapping_col (Column, optional): column with fine mapping methodology + uniqueness_defining_columns (list[str]): list of columns defining uniqueness Returns: Column: column with a study locus ID Examples: >>> df = spark.createDataFrame([("GCST000001", "1_1000_A_C", "SuSiE-inf"), ("GCST000002", "1_1000_A_C", "pics")]).toDF("studyId", "variantId", "finemappingMethod") - >>> df.withColumn("study_locus_id", StudyLocus.assign_study_locus_id(f.col("studyId"), f.col("variantId"), f.col("finemappingMethod"))).show(truncate=False) + >>> df.withColumn("study_locus_id", StudyLocus.assign_study_locus_id(["studyId", "variantId", "finemappingMethod"])).show(truncate=False) +----------+----------+-----------------+--------------------------------+ |studyId |variantId |finemappingMethod|study_locus_id | +----------+----------+-----------------+--------------------------------+ @@ -473,15 +467,8 @@ def assign_study_locus_id( +----------+----------+-----------------+--------------------------------+ """ - if finemapping_col is None: - finemapping_col = f.lit("None") - columns = [study_id_col, variant_id_col, finemapping_col] - hashable_columns = [f.when(column.cast("string").isNull(), f.lit("None")) - .otherwise(column.cast("string")) - for column in columns] - return f.md5(f.concat(*hashable_columns)).alias( - "studyLocusId" - ) + return Dataset.generate_identifier(uniqueness_defining_columns).alias("studyLocusId") + @classmethod def calculate_credible_set_log10bf(cls: type[StudyLocus], logbfs: Column) -> Column: diff --git a/src/gentropy/datasource/eqtl_catalogue/finemapping.py b/src/gentropy/datasource/eqtl_catalogue/finemapping.py index 11ec5bef1..0808b7016 100644 --- a/src/gentropy/datasource/eqtl_catalogue/finemapping.py +++ b/src/gentropy/datasource/eqtl_catalogue/finemapping.py @@ -260,7 +260,7 @@ def from_susie_results( .select( *study_locus_cols, StudyLocus.assign_study_locus_id( - f.col("studyId"), f.col("variantId"), f.col("finemappingMethod") + ["studyId", "variantId", "finemappingMethod"] ), StudyLocus.calculate_credible_set_log10bf( f.col("locus.logBF") diff --git a/src/gentropy/datasource/finngen/finemapping.py b/src/gentropy/datasource/finngen/finemapping.py index 092a79372..3c83ba8ff 100644 --- a/src/gentropy/datasource/finngen/finemapping.py +++ b/src/gentropy/datasource/finngen/finemapping.py @@ -471,7 +471,7 @@ def from_finngen_susie_finemapping( ).withColumn( "studyLocusId", StudyLocus.assign_study_locus_id( - f.col("studyId"), f.col("variantId"), f.col("finemappingMethod") + ["studyId", "variantId", "finemappingMethod"] ), ) diff --git a/src/gentropy/datasource/gwas_catalog/associations.py b/src/gentropy/datasource/gwas_catalog/associations.py index 5e84079a1..b34944b11 100644 --- a/src/gentropy/datasource/gwas_catalog/associations.py +++ b/src/gentropy/datasource/gwas_catalog/associations.py @@ -1188,7 +1188,7 @@ def update_study_id( .drop("subStudyDescription", "updatedStudyId") ).withColumn( "studyLocusId", - StudyLocus.assign_study_locus_id(f.col("studyId"), f.col("variantId")), + StudyLocus.assign_study_locus_id(["studyId", "variantId"]), ) return self diff --git a/src/gentropy/datasource/open_targets/l2g_gold_standard.py b/src/gentropy/datasource/open_targets/l2g_gold_standard.py index 2cfcd62f8..26d5a0253 100644 --- a/src/gentropy/datasource/open_targets/l2g_gold_standard.py +++ b/src/gentropy/datasource/open_targets/l2g_gold_standard.py @@ -52,7 +52,7 @@ def parse_positive_curation( ) .withColumn( "studyLocusId", - StudyLocus.assign_study_locus_id(f.col("studyId"), f.col("variantId")), + StudyLocus.assign_study_locus_id(["studyId", "variantId"]), ) .groupBy("studyLocusId", "studyId", "variantId", "geneId") .agg(f.collect_set("source").alias("sources")) diff --git a/src/gentropy/l2g.py b/src/gentropy/l2g.py index 6f80d826e..ff8c6c8ff 100644 --- a/src/gentropy/l2g.py +++ b/src/gentropy/l2g.py @@ -207,17 +207,22 @@ def _generate_feature_matrix(self, write_feature_matrix: bool) -> L2GFeatureMatr study_locus_overlap = StudyLocus( _df=self.credible_set.df.join( f.broadcast( - self.gs_curation.select( - StudyLocus.assign_study_locus_id( - f.col("association_info.otg_id"), # studyId - f.concat_ws( # variantId + self.gs_curation + .withColumn( + "variantId", + f.concat_ws( "_", f.col("sentinel_variant.locus_GRCh38.chromosome"), f.col("sentinel_variant.locus_GRCh38.position"), f.col("sentinel_variant.alleles.reference"), f.col("sentinel_variant.alleles.alternative"), - ), - ).alias("studyLocusId"), + ) + ) + .select( + StudyLocus.assign_study_locus_id( + ["association_info.otg_id", # studyId + "variantId"] + ), ) ), "studyLocusId", diff --git a/src/gentropy/method/locus_breaker_clumping.py b/src/gentropy/method/locus_breaker_clumping.py index 0ca7ae29b..fd7661a22 100644 --- a/src/gentropy/method/locus_breaker_clumping.py +++ b/src/gentropy/method/locus_breaker_clumping.py @@ -112,8 +112,8 @@ def locus_breaker( .cast(t.ArrayType(t.StringType())) .alias("qualityControls"), StudyLocus.assign_study_locus_id( - f.col("studyId"), f.col("variantId") - ).alias("studyLocusId"), + ["studyId", "variantId"] + ), ) ), _schema=StudyLocus.get_schema(), diff --git a/src/gentropy/method/pics.py b/src/gentropy/method/pics.py index 6889aaa26..5fd084efd 100644 --- a/src/gentropy/method/pics.py +++ b/src/gentropy/method/pics.py @@ -257,7 +257,7 @@ def finemap( .withColumn( "studyLocusId", StudyLocus.assign_study_locus_id( - f.col("studyId"), f.col("variantId"), f.col("finemappingMethod") + ["studyId", "variantId", "finemappingMethod"] ), ) .drop("neglog_pvalue") diff --git a/src/gentropy/method/window_based_clumping.py b/src/gentropy/method/window_based_clumping.py index 9ef747abf..3ab15d42f 100644 --- a/src/gentropy/method/window_based_clumping.py +++ b/src/gentropy/method/window_based_clumping.py @@ -247,7 +247,7 @@ def clump( .withColumn( "studyLocusId", StudyLocus.assign_study_locus_id( - f.col("studyId"), f.col("variantId") + ["studyId", "variantId"] ), ) # Initialize QC column as array of strings: diff --git a/src/gentropy/susie_finemapper.py b/src/gentropy/susie_finemapper.py index a80591c60..26c73e20f 100644 --- a/src/gentropy/susie_finemapper.py +++ b/src/gentropy/susie_finemapper.py @@ -95,7 +95,7 @@ def __init__( .df.withColumn( "studyLocusId", StudyLocus.assign_study_locus_id( - f.col("studyId"), f.col("variantId"), f.col("finemappingMethod") + ["studyId", "variantId", "finemappingMethod"] ), ) .collect()[0] @@ -247,7 +247,7 @@ def susie_inf_to_studylocus( .withColumn( "studyLocusId", StudyLocus.assign_study_locus_id( - f.col("studyId"), f.col("variantId"), f.col("finemappingMethod") + ["studyId", "variantId", "finemappingMethod"] ), ) .select(