From 4389315f40985e1efa637b5eaeee6c2c63c8b549 Mon Sep 17 00:00:00 2001
From: vivienho <56025826+vivienho@users.noreply.github.com>
Date: Tue, 24 Sep 2024 12:10:23 +0100
Subject: [PATCH 01/11] feat: change studyLocusId to string in schema

---
 src/gentropy/assets/schemas/colocalisation.json      | 4 ++--
 src/gentropy/assets/schemas/l2g_feature.json         | 2 +-
 src/gentropy/assets/schemas/l2g_gold_standard.json   | 2 +-
 src/gentropy/assets/schemas/l2g_predictions.json     | 2 +-
 src/gentropy/assets/schemas/study_locus.json         | 2 +-
 src/gentropy/assets/schemas/study_locus_overlap.json | 4 ++--
 6 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/gentropy/assets/schemas/colocalisation.json b/src/gentropy/assets/schemas/colocalisation.json
index 7ff7453b9..953b5afa4 100644
--- a/src/gentropy/assets/schemas/colocalisation.json
+++ b/src/gentropy/assets/schemas/colocalisation.json
@@ -4,13 +4,13 @@
     {
       "name": "leftStudyLocusId",
       "nullable": false,
-      "type": "long",
+      "type": "string",
       "metadata": {}
     },
     {
       "name": "rightStudyLocusId",
       "nullable": false,
-      "type": "long",
+      "type": "string",
       "metadata": {}
     },
     {
diff --git a/src/gentropy/assets/schemas/l2g_feature.json b/src/gentropy/assets/schemas/l2g_feature.json
index 3139a57e4..314b4dde0 100644
--- a/src/gentropy/assets/schemas/l2g_feature.json
+++ b/src/gentropy/assets/schemas/l2g_feature.json
@@ -3,7 +3,7 @@
   "fields": [
     {
       "name": "studyLocusId",
-      "type": "long",
+      "type": "string",
       "nullable": false,
       "metadata": {}
     },
diff --git a/src/gentropy/assets/schemas/l2g_gold_standard.json b/src/gentropy/assets/schemas/l2g_gold_standard.json
index cf19d6b52..6af921d61 100644
--- a/src/gentropy/assets/schemas/l2g_gold_standard.json
+++ b/src/gentropy/assets/schemas/l2g_gold_standard.json
@@ -3,7 +3,7 @@
   "fields": [
     {
       "name": "studyLocusId",
-      "type": "long",
+      "type": "string",
       "nullable": false,
       "metadata": {}
     },
diff --git a/src/gentropy/assets/schemas/l2g_predictions.json b/src/gentropy/assets/schemas/l2g_predictions.json
index 16b274207..238ff4087 100644
--- a/src/gentropy/assets/schemas/l2g_predictions.json
+++ b/src/gentropy/assets/schemas/l2g_predictions.json
@@ -3,7 +3,7 @@
   "fields": [
     {
       "name": "studyLocusId",
-      "type": "long",
+      "type": "string",
       "nullable": false,
       "metadata": {}
     },
diff --git a/src/gentropy/assets/schemas/study_locus.json b/src/gentropy/assets/schemas/study_locus.json
index 11908f687..0d2307fad 100644
--- a/src/gentropy/assets/schemas/study_locus.json
+++ b/src/gentropy/assets/schemas/study_locus.json
@@ -4,7 +4,7 @@
       "metadata": {},
       "name": "studyLocusId",
       "nullable": false,
-      "type": "long"
+      "type": "string"
     },
     {
       "metadata": {},
diff --git a/src/gentropy/assets/schemas/study_locus_overlap.json b/src/gentropy/assets/schemas/study_locus_overlap.json
index 9a8e123cd..aab8408f7 100644
--- a/src/gentropy/assets/schemas/study_locus_overlap.json
+++ b/src/gentropy/assets/schemas/study_locus_overlap.json
@@ -4,13 +4,13 @@
       "metadata": {},
       "name": "leftStudyLocusId",
       "nullable": false,
-      "type": "long"
+      "type": "string"
     },
     {
       "metadata": {},
       "name": "rightStudyLocusId",
       "nullable": false,
-      "type": "long"
+      "type": "string"
     },
     {
       "metadata": {},

From 7dc153afd74034a1564ce3eca310c39090840d5d Mon Sep 17 00:00:00 2001
From: vivienho <56025826+vivienho@users.noreply.github.com>
Date: Tue, 24 Sep 2024 14:09:14 +0100
Subject: [PATCH 02/11] feat: change studyLocusId of example data to string in
 tests

---
 tests/gentropy/conftest.py                    |  6 +-
 tests/gentropy/dataset/test_colocalisation.py |  6 +-
 tests/gentropy/dataset/test_l2g.py            | 46 ++++-----
 .../dataset/test_l2g_feature_matrix.py        |  6 +-
 tests/gentropy/dataset/test_study_locus.py    | 95 +++++++++----------
 .../dataset/test_study_locus_overlap.py       | 10 +-
 .../dataset/test_study_locus_overlaps.py      | 22 ++---
 .../test_gwas_catalog_associations.py         |  4 +-
 .../datasource/open_targets/test_variants.py  |  2 +-
 tests/gentropy/method/test_clump.py           |  2 +-
 .../method/test_colocalisation_method.py      | 22 ++---
 11 files changed, 110 insertions(+), 111 deletions(-)

diff --git a/tests/gentropy/conftest.py b/tests/gentropy/conftest.py
index 93ee38471..ded2b9e34 100644
--- a/tests/gentropy/conftest.py
+++ b/tests/gentropy/conftest.py
@@ -587,10 +587,10 @@ def mock_l2g_feature_matrix(spark: SparkSession) -> L2GFeatureMatrix:
     return L2GFeatureMatrix(
         _df=spark.createDataFrame(
             [
-                (1, "gene1", 100.0, None),
-                (2, "gene2", 1000.0, 0.0),
+                ("1", "gene1", 100.0, None),
+                ("2", "gene2", 1000.0, 0.0),
             ],
-            "studyLocusId LONG, geneId STRING, distanceTssMean FLOAT, distanceTssMinimum FLOAT",
+            "studyLocusId STRING, geneId STRING, distanceTssMean FLOAT, distanceTssMinimum FLOAT",
         ),
         with_gold_standard=False,
     )
diff --git a/tests/gentropy/dataset/test_colocalisation.py b/tests/gentropy/dataset/test_colocalisation.py
index 5371cf42c..672f5c00d 100644
--- a/tests/gentropy/dataset/test_colocalisation.py
+++ b/tests/gentropy/dataset/test_colocalisation.py
@@ -72,12 +72,12 @@ def _setup(self: TestAppendStudyMetadata, spark: SparkSession) -> None:
             _df=spark.createDataFrame(
                 [
                     (
-                        1,
+                        "1",
                         "var1",
                         "gwas1",
                     ),
                     (
-                        2,
+                        "2",
                         "var2",
                         "eqtl1",
                     ),
@@ -100,7 +100,7 @@ def _setup(self: TestAppendStudyMetadata, spark: SparkSession) -> None:
         )
         self.sample_colocalisation = Colocalisation(
             _df=spark.createDataFrame(
-                [(1, 2, "X", "COLOC", 1, 0.9)],
+                [("1", "2", "X", "COLOC", 1, 0.9)],
                 [
                     "leftStudyLocusId",
                     "rightStudyLocusId",
diff --git a/tests/gentropy/dataset/test_l2g.py b/tests/gentropy/dataset/test_l2g.py
index d37ce5a4a..2472ad445 100644
--- a/tests/gentropy/dataset/test_l2g.py
+++ b/tests/gentropy/dataset/test_l2g.py
@@ -43,44 +43,44 @@ def test_filter_unique_associations(spark: SparkSession) -> None:
     """Test filter_unique_associations."""
     mock_l2g_gs_df = spark.createDataFrame(
         [
-            (1, "variant1", "study1", "gene1", "positive"),
+            ("1", "variant1", "study1", "gene1", "positive"),
             (
-                2,
+                "2",
                 "variant2",
                 "study1",
                 "gene1",
                 "negative",
             ),  # in the same locus as sl1 and pointing to same gene, has to be dropped
             (
-                3,
+                "3",
                 "variant3",
                 "study1",
                 "gene1",
                 "positive",
             ),  # in diff locus as sl1 and pointing to same gene, has to be kept
             (
-                4,
+                "4",
                 "variant4",
                 "study1",
                 "gene2",
                 "positive",
             ),  # in same locus as sl1 and pointing to diff gene, has to be kept
         ],
-        "studyLocusId LONG, variantId STRING, studyId STRING, geneId STRING, goldStandardSet STRING",
+        "studyLocusId STRING, variantId STRING, studyId STRING, geneId STRING, goldStandardSet STRING",
     )
 
     mock_sl_overlap_df = spark.createDataFrame(
-        [(1, 2, "variant2"), (1, 4, "variant4")],
-        "leftStudyLocusId LONG, rightStudyLocusId LONG, tagVariantId STRING",
+        [("1", "2", "variant2"), ("1", "4", "variant4")],
+        "leftStudyLocusId STRING, rightStudyLocusId STRING, tagVariantId STRING",
     )
 
     expected_df = spark.createDataFrame(
         [
-            (1, "variant1", "study1", "gene1", "positive"),
-            (3, "variant3", "study1", "gene1", "positive"),
-            (4, "variant4", "study1", "gene2", "positive"),
+            ("1", "variant1", "study1", "gene1", "positive"),
+            ("3", "variant3", "study1", "gene1", "positive"),
+            ("4", "variant4", "study1", "gene2", "positive"),
         ],
-        "studyLocusId LONG, variantId STRING, studyId STRING, geneId STRING, goldStandardSet STRING",
+        "studyLocusId STRING, variantId STRING, studyId STRING, geneId STRING, goldStandardSet STRING",
     )
 
     mock_l2g_gs = L2GGoldStandard(
@@ -99,30 +99,30 @@ def test_remove_false_negatives(spark: SparkSession) -> None:
     """Test `remove_false_negatives`."""
     mock_l2g_gs_df = spark.createDataFrame(
         [
-            (1, "variant1", "study1", "gene1", "positive"),
+            ("1", "variant1", "study1", "gene1", "positive"),
             (
-                2,
+                "2",
                 "variant2",
                 "study1",
                 "gene2",
                 "negative",
             ),  # gene2 is a partner of gene1, has to be dropped
             (
-                3,
+                "3",
                 "variant3",
                 "study1",
                 "gene3",
                 "negative",
             ),  # gene 3 is not a partner of gene1, has to be kept
             (
-                4,
+                "4",
                 "variant4",
                 "study1",
                 "gene4",
                 "positive",
             ),  # gene 4 is a partner of gene1, has to be kept because it's positive
         ],
-        "studyLocusId LONG, variantId STRING, studyId STRING, geneId STRING, goldStandardSet STRING",
+        "studyLocusId STRING, variantId STRING, studyId STRING, geneId STRING, goldStandardSet STRING",
     )
 
     mock_interactions_df = spark.createDataFrame(
@@ -136,11 +136,11 @@ def test_remove_false_negatives(spark: SparkSession) -> None:
 
     expected_df = spark.createDataFrame(
         [
-            (1, "variant1", "study1", "gene1", "positive"),
-            (3, "variant3", "study1", "gene3", "negative"),
-            (4, "variant4", "study1", "gene4", "positive"),
+            ("1", "variant1", "study1", "gene1", "positive"),
+            ("3", "variant3", "study1", "gene3", "negative"),
+            ("4", "variant4", "study1", "gene4", "positive"),
         ],
-        "studyLocusId LONG, variantId STRING, studyId STRING, geneId STRING, goldStandardSet STRING",
+        "studyLocusId STRING, variantId STRING, studyId STRING, geneId STRING, goldStandardSet STRING",
     )
 
     mock_l2g_gs = L2GGoldStandard(
@@ -161,10 +161,10 @@ def test_l2g_feature_constructor_with_schema_mismatch(
     fm = L2GFeatureMatrix(
         _df=spark.createDataFrame(
             [
-                (1, "gene1", 100.0),
-                (2, "gene2", 1000.0),
+                ("1", "gene1", 100.0),
+                ("2", "gene2", 1000.0),
             ],
-            "studyLocusId LONG, geneId STRING, distanceTssMean DOUBLE",
+            "studyLocusId STRING, geneId STRING, distanceTssMean DOUBLE",
         ),
         with_gold_standard=False,
     )
diff --git a/tests/gentropy/dataset/test_l2g_feature_matrix.py b/tests/gentropy/dataset/test_l2g_feature_matrix.py
index 46384239c..6ab1d4ddb 100644
--- a/tests/gentropy/dataset/test_l2g_feature_matrix.py
+++ b/tests/gentropy/dataset/test_l2g_feature_matrix.py
@@ -81,7 +81,7 @@ def _setup(self: TestFromFeaturesList, spark: SparkSession) -> None:
             _df=spark.createDataFrame(
                 [
                     (
-                        1,
+                        "1",
                         "var1",
                         "gwas1",
                         [
@@ -90,7 +90,7 @@ def _setup(self: TestFromFeaturesList, spark: SparkSession) -> None:
                         ],
                     ),
                     (
-                        2,
+                        "2",
                         "var2",
                         "eqtl1",
                         [
@@ -136,7 +136,7 @@ def _setup(self: TestFromFeaturesList, spark: SparkSession) -> None:
         )
         self.sample_colocalisation = Colocalisation(
             _df=spark.createDataFrame(
-                [(1, 2, "X", "COLOC", 1, 0.9)],
+                [("1", "2", "X", "COLOC", 1, 0.9)],
                 [
                     "leftStudyLocusId",
                     "rightStudyLocusId",
diff --git a/tests/gentropy/dataset/test_study_locus.py b/tests/gentropy/dataset/test_study_locus.py
index c89521b3c..45646c82c 100644
--- a/tests/gentropy/dataset/test_study_locus.py
+++ b/tests/gentropy/dataset/test_study_locus.py
@@ -12,7 +12,6 @@
     ArrayType,
     BooleanType,
     DoubleType,
-    LongType,
     StringType,
     StructField,
     StructType,
@@ -41,8 +40,8 @@
             True,
             [
                 {
-                    "leftStudyLocusId": 1,
-                    "rightStudyLocusId": 2,
+                    "leftStudyLocusId": "1",
+                    "rightStudyLocusId": "2",
                     "chromosome": "1",
                     "tagVariantId": "commonTag",
                     "statistics": {
@@ -51,8 +50,8 @@
                     },
                 },
                 {
-                    "leftStudyLocusId": 1,
-                    "rightStudyLocusId": 2,
+                    "leftStudyLocusId": "1",
+                    "rightStudyLocusId": "2",
                     "chromosome": "1",
                     "tagVariantId": "nonCommonTag",
                     "statistics": {
@@ -76,7 +75,7 @@ def test_find_overlaps_semantic(
                 # 2 associations with a common variant in the locus
                 [
                     {
-                        "studyLocusId": 1,
+                        "studyLocusId": "1",
                         "variantId": "lead1",
                         "studyId": "study1",
                         "locus": [
@@ -85,7 +84,7 @@ def test_find_overlaps_semantic(
                         "chromosome": "1",
                     },
                     {
-                        "studyLocusId": 2,
+                        "studyLocusId": "2",
                         "variantId": "lead2",
                         "studyId": "study2",
                         "locus": [
@@ -105,7 +104,7 @@ def test_find_overlaps_semantic(
                 # 2 associations with no common variants in the locus
                 [
                     {
-                        "studyLocusId": 1,
+                        "studyLocusId": "1",
                         "variantId": "lead1",
                         "studyId": "study1",
                         "locus": [
@@ -114,7 +113,7 @@ def test_find_overlaps_semantic(
                         "chromosome": "1",
                     },
                     {
-                        "studyLocusId": 2,
+                        "studyLocusId": "2",
                         "variantId": "lead2",
                         "studyId": "study2",
                         "locus": None,
@@ -181,13 +180,13 @@ def test_filter_by_study_type(
             [
                 {
                     # from gwas
-                    "studyLocusId": 1,
+                    "studyLocusId": "1",
                     "variantId": "lead1",
                     "studyId": "study1",
                 },
                 {
                     # from eqtl
-                    "studyLocusId": 2,
+                    "studyLocusId": "2",
                     "variantId": "lead2",
                     "studyId": "study2",
                 },
@@ -258,7 +257,7 @@ def test_assign_study_locus_id__null_variant_id(spark: SparkSession) -> None:
             # Locus is not null, should return union between variants in locus and lead variant
             [
                 (
-                    1,
+                    "1",
                     "traitA",
                     "22_varA",
                     [
@@ -281,7 +280,7 @@ def test_assign_study_locus_id__null_variant_id(spark: SparkSession) -> None:
         (
             # locus is null, should return lead variant
             [
-                (1, "traitA", "22_varA", None),
+                ("1", "traitA", "22_varA", None),
             ],
             [
                 (
@@ -299,7 +298,7 @@ def test_unique_variants_in_locus(
     # assert isinstance(mock_study_locus.test_unique_variants_in_locus(), DataFrame)
     schema = StructType(
         [
-            StructField("studyLocusId", LongType(), True),
+            StructField("studyLocusId", StringType(), True),
             StructField("studyId", StringType(), True),
             StructField("variantId", StringType(), True),
             StructField(
@@ -342,7 +341,7 @@ def test_clump(mock_study_locus: StudyLocus) -> None:
             [
                 # Observed
                 (
-                    1,
+                    "1",
                     "traitA",
                     "leadB",
                     [{"variantId": "tagVariantA", "posteriorProbability": 1.0}],
@@ -351,7 +350,7 @@ def test_clump(mock_study_locus: StudyLocus) -> None:
             [
                 # Expected
                 (
-                    1,
+                    "1",
                     "traitA",
                     "leadB",
                     [
@@ -370,7 +369,7 @@ def test_clump(mock_study_locus: StudyLocus) -> None:
             [
                 # Observed
                 (
-                    1,
+                    "1",
                     "traitA",
                     "leadA",
                     [
@@ -387,7 +386,7 @@ def test_clump(mock_study_locus: StudyLocus) -> None:
             [
                 # Expected
                 (
-                    1,
+                    "1",
                     "traitA",
                     "leadA",
                     [
@@ -442,7 +441,7 @@ def test_clump(mock_study_locus: StudyLocus) -> None:
             [
                 # Observed
                 (
-                    1,
+                    "1",
                     "traitA",
                     "leadB",
                     None,
@@ -451,7 +450,7 @@ def test_clump(mock_study_locus: StudyLocus) -> None:
             [
                 # Expected
                 (
-                    1,
+                    "1",
                     "traitA",
                     "leadB",
                     None,
@@ -463,7 +462,7 @@ def test_clump(mock_study_locus: StudyLocus) -> None:
             [
                 # Observed
                 (
-                    1,
+                    "1",
                     "traitA",
                     "leadB",
                     [],
@@ -472,7 +471,7 @@ def test_clump(mock_study_locus: StudyLocus) -> None:
             [
                 # Expected
                 (
-                    1,
+                    "1",
                     "traitA",
                     "leadB",
                     None,
@@ -487,7 +486,7 @@ def test_annotate_credible_sets(
     """Test annotate_credible_sets."""
     schema = StructType(
         [
-            StructField("studyLocusId", LongType(), True),
+            StructField("studyLocusId", StringType(), True),
             StructField("studyId", StringType(), True),
             StructField("variantId", StringType(), True),
             StructField(
@@ -590,12 +589,12 @@ class TestStudyLocusVariantValidation:
 
     STUDYLOCUS_DATA = [
         # First studylocus passes qc:
-        (1, "v1", "s1", "v1"),
-        (1, "v1", "s1", "v2"),
-        (1, "v1", "s1", "v3"),
+        ("1", "v1", "s1", "v1"),
+        ("1", "v1", "s1", "v2"),
+        ("1", "v1", "s1", "v3"),
         # Second studylocus passes qc:
-        (2, "v1", "s1", "v1"),
-        (2, "v1", "s1", "v5"),
+        ("2", "v1", "s1", "v1"),
+        ("2", "v1", "s1", "v5"),
     ]
     STUDYLOCUS_HEADER = ["studyLocusId", "variantId", "studyId", "tagVariantId"]
 
@@ -612,7 +611,7 @@ def _setup(self: TestStudyLocusVariantValidation, spark: SparkSession) -> None:
         self.credible_set = StudyLocus(
             _df=(
                 spark.createDataFrame(self.STUDYLOCUS_DATA, self.STUDYLOCUS_HEADER)
-                .withColumn("studyLocusId", f.col("studyLocusId").cast(t.LongType()))
+                .withColumn("studyLocusId", f.col("studyLocusId").cast(t.StringType()))
                 .withColumn("qualityControls", f.array())
                 .groupBy("studyLocusId", "variantId", "studyId")
                 .agg(
@@ -653,7 +652,7 @@ def test_validation_correctness(self: TestStudyLocusVariantValidation) -> None:
         # Check that the right one is flagged:
         assert (
             validated.filter(
-                (f.size("qualityControls") > 0) & (f.col("studyLocusId") == 2)
+                (f.size("qualityControls") > 0) & (f.col("studyLocusId") == "2")
             ).count()
             == 1
         )
@@ -664,17 +663,17 @@ class TestStudyLocusValidation:
 
     STUDY_LOCUS_DATA = [
         # Won't be flagged:
-        (1, "v1", "s1", 1.0, -8, []),
+        ("1", "v1", "s1", 1.0, -8, []),
         # Already flagged, needs to be tested if the flag reamins unique:
-        (2, "v2", "s2", 5.0, -4, [StudyLocusQualityCheck.SUBSIGNIFICANT_FLAG.value]),
+        ("2", "v2", "s2", 5.0, -4, [StudyLocusQualityCheck.SUBSIGNIFICANT_FLAG.value]),
         # To be flagged:
-        (3, "v3", "s3", 1.0, -4, []),
-        (4, "v4", "s4", 5.0, -3, []),
+        ("3", "v3", "s3", 1.0, -4, []),
+        ("4", "v4", "s4", 5.0, -3, []),
     ]
 
     STUDY_LOCUS_SCHEMA = t.StructType(
         [
-            t.StructField("studyLocusId", t.LongType(), False),
+            t.StructField("studyLocusId", t.StringType(), False),
             t.StructField("variantId", t.StringType(), False),
             t.StructField("studyId", t.StringType(), False),
             t.StructField("pValueMantissa", t.FloatType(), False),
@@ -805,23 +804,23 @@ class TestStudyLocusRedundancyFlagging:
     """Collection of tests related to flagging redundant credible sets."""
 
     STUDY_LOCUS_DATA = [
-        (1, "v1", "s1", "pics", []),
-        (2, "v2", "s1", "pics", [StudyLocusQualityCheck.TOP_HIT.value]),
-        (3, "v3", "s1", "pics", []),
-        (3, "v3", "s1", "pics", []),
-        (1, "v1", "s1", "pics", [StudyLocusQualityCheck.TOP_HIT.value]),
-        (1, "v1", "s2", "pics", [StudyLocusQualityCheck.TOP_HIT.value]),
-        (1, "v1", "s2", "pics", [StudyLocusQualityCheck.TOP_HIT.value]),
-        (1, "v1", "s3", "SuSie", []),
-        (1, "v1", "s3", "pics", [StudyLocusQualityCheck.TOP_HIT.value]),
-        (1, "v1", "s4", "pics", []),
-        (1, "v1", "s4", "SuSie", []),
-        (1, "v1", "s4", "pics", [StudyLocusQualityCheck.TOP_HIT.value]),
+        ("1", "v1", "s1", "pics", []),
+        ("2", "v2", "s1", "pics", [StudyLocusQualityCheck.TOP_HIT.value]),
+        ("3", "v3", "s1", "pics", []),
+        ("3", "v3", "s1", "pics", []),
+        ("1", "v1", "s1", "pics", [StudyLocusQualityCheck.TOP_HIT.value]),
+        ("1", "v1", "s2", "pics", [StudyLocusQualityCheck.TOP_HIT.value]),
+        ("1", "v1", "s2", "pics", [StudyLocusQualityCheck.TOP_HIT.value]),
+        ("1", "v1", "s3", "SuSie", []),
+        ("1", "v1", "s3", "pics", [StudyLocusQualityCheck.TOP_HIT.value]),
+        ("1", "v1", "s4", "pics", []),
+        ("1", "v1", "s4", "SuSie", []),
+        ("1", "v1", "s4", "pics", [StudyLocusQualityCheck.TOP_HIT.value]),
     ]
 
     STUDY_LOCUS_SCHEMA = t.StructType(
         [
-            t.StructField("studyLocusId", t.LongType(), False),
+            t.StructField("studyLocusId", t.StringType(), False),
             t.StructField("variantId", t.StringType(), False),
             t.StructField("studyId", t.StringType(), False),
             t.StructField("finemappingMethod", t.StringType(), False),
diff --git a/tests/gentropy/dataset/test_study_locus_overlap.py b/tests/gentropy/dataset/test_study_locus_overlap.py
index e26b59c30..5ced1d7a5 100644
--- a/tests/gentropy/dataset/test_study_locus_overlap.py
+++ b/tests/gentropy/dataset/test_study_locus_overlap.py
@@ -19,19 +19,19 @@ def test_convert_to_square_matrix(spark: SparkSession) -> None:
     mock_sl_overlap = StudyLocusOverlap(
         _df=spark.createDataFrame(
             [
-                (1, 2, "variant2"),
+                ("1", "2", "variant2"),
             ],
-            "leftStudyLocusId LONG, rightStudyLocusId LONG, tagVariantId STRING",
+            "leftStudyLocusId STRING, rightStudyLocusId STRING, tagVariantId STRING",
         ),
         _schema=StudyLocusOverlap.get_schema(),
     )
 
     expected_df = spark.createDataFrame(
         [
-            (1, 2, "variant2"),
-            (2, 1, "variant2"),
+            ("1", "2", "variant2"),
+            ("2", "1", "variant2"),
         ],
-        "leftStudyLocusId LONG, rightStudyLocusId LONG, tagVariantId STRING",
+        "leftStudyLocusId STRING, rightStudyLocusId STRING, tagVariantId STRING",
     )
     observed_df = mock_sl_overlap._convert_to_square_matrix().df
 
diff --git a/tests/gentropy/dataset/test_study_locus_overlaps.py b/tests/gentropy/dataset/test_study_locus_overlaps.py
index bd3415959..971bbcfcf 100644
--- a/tests/gentropy/dataset/test_study_locus_overlaps.py
+++ b/tests/gentropy/dataset/test_study_locus_overlaps.py
@@ -38,21 +38,21 @@ def test_study_locus_overlap_from_associations(
             # observed - input DataFrame representing gwas and nongwas data to find overlapping signals
             [
                 {
-                    "studyLocusId": 1,
+                    "studyLocusId": "1",
                     "studyId": "A",
                     "studyType": "gwas",
                     "chromosome": "1",
                     "tagVariantId": "A",
                 },
                 {
-                    "studyLocusId": 2,
+                    "studyLocusId": "2",
                     "studyId": "B",
                     "studyType": "eqtl",
                     "chromosome": "1",
                     "tagVariantId": "A",
                 },
                 {
-                    "studyLocusId": 3,
+                    "studyLocusId": "3",
                     "studyId": "C",
                     "studyType": "gwas",
                     "chromosome": "1",
@@ -63,14 +63,14 @@ def test_study_locus_overlap_from_associations(
             False,
             # expected - output DataFrame with overlapping signals
             [
-                {"leftStudyLocusId": 1, "rightStudyLocusId": 2, "chromosome": "1"},
+                {"leftStudyLocusId": "1", "rightStudyLocusId": "2", "chromosome": "1"},
             ],
         ),
         (
             # observed - input DataFrame representing intra-study data to find overlapping signals in the same study
             [
                 {
-                    "studyLocusId": 1,
+                    "studyLocusId": "1",
                     "studyId": "A",
                     "studyType": "gwas",
                     "chromosome": "1",
@@ -78,7 +78,7 @@ def test_study_locus_overlap_from_associations(
                     "tagVariantId": "A",
                 },
                 {
-                    "studyLocusId": 2,
+                    "studyLocusId": "2",
                     "studyId": "A",
                     "studyType": "gwas",
                     "chromosome": "1",
@@ -86,7 +86,7 @@ def test_study_locus_overlap_from_associations(
                     "tagVariantId": "A",
                 },
                 {
-                    "studyLocusId": 3,
+                    "studyLocusId": "3",
                     "studyId": "B",
                     "studyType": "gwas",
                     "chromosome": "1",
@@ -97,7 +97,7 @@ def test_study_locus_overlap_from_associations(
             # intrastudy - bool of whether or not to use inter-study or intra-study logic
             True,
             # expected - output DataFrame with overlapping signals
-            [{"leftStudyLocusId": 2, "rightStudyLocusId": 1, "chromosome": "1"}],
+            [{"leftStudyLocusId": "2", "rightStudyLocusId": "1", "chromosome": "1"}],
         ),
     ],
 )
@@ -110,7 +110,7 @@ def test_overlapping_peaks(
     """Test overlapping signals between GWAS-GWAS and GWAS-Molecular trait to make sure that mQTLs are always on the right."""
     mock_schema = t.StructType(
         [
-            t.StructField("studyLocusId", t.LongType()),
+            t.StructField("studyLocusId", t.StringType()),
             t.StructField("studyId", t.StringType()),
             t.StructField("studyType", t.StringType()),
             t.StructField("chromosome", t.StringType()),
@@ -120,8 +120,8 @@ def test_overlapping_peaks(
     )
     expected_schema = t.StructType(
         [
-            t.StructField("leftStudyLocusId", t.LongType()),
-            t.StructField("rightStudyLocusId", t.LongType()),
+            t.StructField("leftStudyLocusId", t.StringType()),
+            t.StructField("rightStudyLocusId", t.StringType()),
             t.StructField("chromosome", t.StringType()),
         ]
     )
diff --git a/tests/gentropy/datasource/gwas_catalog/test_gwas_catalog_associations.py b/tests/gentropy/datasource/gwas_catalog/test_gwas_catalog_associations.py
index 130097f25..fe9608bf0 100644
--- a/tests/gentropy/datasource/gwas_catalog/test_gwas_catalog_associations.py
+++ b/tests/gentropy/datasource/gwas_catalog/test_gwas_catalog_associations.py
@@ -4,7 +4,7 @@
 
 from pyspark.sql import DataFrame
 from pyspark.sql import functions as f
-from pyspark.sql.types import LongType
+from pyspark.sql.types import StringType
 
 from gentropy.dataset.variant_index import VariantIndex
 from gentropy.datasource.gwas_catalog.associations import (
@@ -71,7 +71,7 @@ def test_map_variants_to_variant_index(
     assert isinstance(
         GWASCatalogCuratedAssociationsParser._map_variants_to_gnomad_variants(
             sample_gwas_catalog_associations.withColumn(
-                "studyLocusId", f.monotonically_increasing_id().cast(LongType())
+                "studyLocusId", f.monotonically_increasing_id().cast(StringType())
             ),
             mock_variant_index,
         ),
diff --git a/tests/gentropy/datasource/open_targets/test_variants.py b/tests/gentropy/datasource/open_targets/test_variants.py
index 247a9d81e..6aa22e628 100644
--- a/tests/gentropy/datasource/open_targets/test_variants.py
+++ b/tests/gentropy/datasource/open_targets/test_variants.py
@@ -25,7 +25,7 @@ def test_as_vcf_df_credible_set(
         df_credible_set_df = spark.createDataFrame(
             [
                 {
-                    "studyLocusId": 1,
+                    "studyLocusId": "1",
                     "variantId": "1_2_C_G",
                     "studyId": "study1",
                     "locus": [
diff --git a/tests/gentropy/method/test_clump.py b/tests/gentropy/method/test_clump.py
index 1e754df3a..83a95e19f 100644
--- a/tests/gentropy/method/test_clump.py
+++ b/tests/gentropy/method/test_clump.py
@@ -88,7 +88,7 @@ class TestIsLeadLinked:
     SCHEMA = t.StructType(
         [
             t.StructField("studyId", t.StringType(), True),
-            t.StructField("studyLocusId", t.LongType(), True),
+            t.StructField("studyLocusId", t.StringType(), True),
             t.StructField("chromosome", t.StringType(), True),
             t.StructField("variantId", t.StringType(), True),
             t.StructField("pValueMantissa", t.FloatType(), True),
diff --git a/tests/gentropy/method/test_colocalisation_method.py b/tests/gentropy/method/test_colocalisation_method.py
index d6798d831..f7ecdb88b 100644
--- a/tests/gentropy/method/test_colocalisation_method.py
+++ b/tests/gentropy/method/test_colocalisation_method.py
@@ -7,7 +7,7 @@
 import pytest
 from pandas.testing import assert_frame_equal
 from pyspark.sql import SparkSession
-from pyspark.sql.types import DoubleType, LongType, StringType, StructField, StructType
+from pyspark.sql.types import DoubleType, StringType, StructField, StructType
 
 from gentropy.dataset.colocalisation import Colocalisation
 from gentropy.dataset.study_locus_overlap import StudyLocusOverlap
@@ -27,8 +27,8 @@ def test_coloc(mock_study_locus_overlap: StudyLocusOverlap) -> None:
             # observed overlap
             [
                 {
-                    "leftStudyLocusId": 1,
-                    "rightStudyLocusId": 2,
+                    "leftStudyLocusId": "1",
+                    "rightStudyLocusId": "2",
                     "chromosome": "1",
                     "tagVariantId": "snp",
                     "statistics": {"left_logBF": 10.3, "right_logBF": 10.5},
@@ -50,15 +50,15 @@ def test_coloc(mock_study_locus_overlap: StudyLocusOverlap) -> None:
             # observed overlap
             [
                 {
-                    "leftStudyLocusId": 1,
-                    "rightStudyLocusId": 2,
+                    "leftStudyLocusId": "1",
+                    "rightStudyLocusId": "2",
                     "chromosome": "1",
                     "tagVariantId": "snp1",
                     "statistics": {"left_logBF": 10.3, "right_logBF": 10.5},
                 },
                 {
-                    "leftStudyLocusId": 1,
-                    "rightStudyLocusId": 2,
+                    "leftStudyLocusId": "1",
+                    "rightStudyLocusId": "2",
                     "chromosome": "1",
                     "tagVariantId": "snp2",
                     "statistics": {"left_logBF": 10.3, "right_logBF": 10.5},
@@ -117,8 +117,8 @@ def test_coloc_no_logbf(
             spark.createDataFrame(
                 [
                     {
-                        "leftStudyLocusId": 1,
-                        "rightStudyLocusId": 2,
+                        "leftStudyLocusId": "1",
+                        "rightStudyLocusId": "2",
                         "chromosome": "1",
                         "tagVariantId": "snp",
                         "statistics": {
@@ -129,8 +129,8 @@ def test_coloc_no_logbf(
                 ],
                 schema=StructType(
                     [
-                        StructField("leftStudyLocusId", LongType(), False),
-                        StructField("rightStudyLocusId", LongType(), False),
+                        StructField("leftStudyLocusId", StringType(), False),
+                        StructField("rightStudyLocusId", StringType(), False),
                         StructField("chromosome", StringType(), False),
                         StructField("tagVariantId", StringType(), False),
                         StructField(

From 7e62efdcdd7fa8630e10bd201e07a4f337826aec Mon Sep 17 00:00:00 2001
From: vivienho <56025826+vivienho@users.noreply.github.com>
Date: Tue, 24 Sep 2024 15:45:23 +0100
Subject: [PATCH 03/11] feat: change hashing method to md5

---
 src/gentropy/dataset/study_locus.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/gentropy/dataset/study_locus.py b/src/gentropy/dataset/study_locus.py
index e8363aa4e..71388a6c7 100644
--- a/src/gentropy/dataset/study_locus.py
+++ b/src/gentropy/dataset/study_locus.py
@@ -449,9 +449,12 @@ def assign_study_locus_id(
             <BLANKLINE>
         """
         if finemapping_col is None:
-            finemapping_col = f.lit(None).cast(StringType())
-        variant_id_col = f.coalesce(variant_id_col, f.rand().cast("string"))
-        return f.xxhash64(study_id_col, variant_id_col, finemapping_col).alias(
+            finemapping_col = f.lit("None")
+        columns = [study_id_col, variant_id_col, finemapping_col]
+        hashable_columns = [f.when(column.cast("string").isNull(), f.lit("None"))
+                                 .otherwise(column.cast("string"))
+                                 for column in columns]
+        return f.md5(f.concat(*hashable_columns)).alias(
             "studyLocusId"
         )
 

From dd354b405e401e139b5a182ed26adc98b54fe658 Mon Sep 17 00:00:00 2001
From: vivienho <56025826+vivienho@users.noreply.github.com>
Date: Tue, 24 Sep 2024 15:51:06 +0100
Subject: [PATCH 04/11] test: remove
 test_assign_study_locus_id__null_variant_id as validation will have removed
 null ids

---
 tests/gentropy/dataset/test_study_locus.py | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/tests/gentropy/dataset/test_study_locus.py b/tests/gentropy/dataset/test_study_locus.py
index 45646c82c..f81b34ff1 100644
--- a/tests/gentropy/dataset/test_study_locus.py
+++ b/tests/gentropy/dataset/test_study_locus.py
@@ -236,20 +236,6 @@ def test_filter_credible_set(mock_study_locus: StudyLocus) -> None:
     )
 
 
-def test_assign_study_locus_id__null_variant_id(spark: SparkSession) -> None:
-    """Test assign study locus id when variant id is null for the same study."""
-    df = spark.createDataFrame(
-        [("GCST000001", None), ("GCST000001", None)],
-        schema="studyId: string, variantId: string",
-    ).withColumn(
-        "studyLocusId",
-        StudyLocus.assign_study_locus_id(f.col("studyId"), f.col("variantId")),
-    )
-    assert (
-        df.select("studyLocusId").distinct().count() == 2
-    ), "studyLocusId is not unique when variantId is null"
-
-
 @pytest.mark.parametrize(
     ("observed", "expected"),
     [

From 4c7e146a85d44470a330f36d297359ea69bd2906 Mon Sep 17 00:00:00 2001
From: vivienho <56025826+vivienho@users.noreply.github.com>
Date: Tue, 24 Sep 2024 15:57:23 +0100
Subject: [PATCH 05/11] fix: change studyLocusId to string in remaining files

---
 src/gentropy/datasource/gwas_catalog/associations.py | 4 ++--
 tests/gentropy/dataset/test_l2g_feature_matrix.py    | 3 +--
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/gentropy/datasource/gwas_catalog/associations.py b/src/gentropy/datasource/gwas_catalog/associations.py
index dd9aa3fe2..5e84079a1 100644
--- a/src/gentropy/datasource/gwas_catalog/associations.py
+++ b/src/gentropy/datasource/gwas_catalog/associations.py
@@ -9,7 +9,7 @@
 from typing import TYPE_CHECKING
 
 import pyspark.sql.functions as f
-from pyspark.sql.types import DoubleType, FloatType, IntegerType, LongType
+from pyspark.sql.types import DoubleType, FloatType, IntegerType, StringType
 from pyspark.sql.window import Window
 
 from gentropy.assets import data
@@ -1109,7 +1109,7 @@ def from_source(
         """
         return StudyLocusGWASCatalog(
             _df=gwas_associations.withColumn(
-                "studyLocusId", f.monotonically_increasing_id().cast(LongType())
+                "studyLocusId", f.monotonically_increasing_id().cast(StringType())
             )
             .transform(
                 # Map/harmonise variants to variant annotation dataset:
diff --git a/tests/gentropy/dataset/test_l2g_feature_matrix.py b/tests/gentropy/dataset/test_l2g_feature_matrix.py
index 6ab1d4ddb..1ad203848 100644
--- a/tests/gentropy/dataset/test_l2g_feature_matrix.py
+++ b/tests/gentropy/dataset/test_l2g_feature_matrix.py
@@ -8,7 +8,6 @@
 from pyspark.sql.types import (
     ArrayType,
     DoubleType,
-    LongType,
     StringType,
     StructField,
     StructType,
@@ -100,7 +99,7 @@ def _setup(self: TestFromFeaturesList, spark: SparkSession) -> None:
                 ],
                 schema=StructType(
                     [
-                        StructField("studyLocusId", LongType(), True),
+                        StructField("studyLocusId", StringType(), True),
                         StructField("variantId", StringType(), True),
                         StructField("studyId", StringType(), True),
                         StructField(

From 1dec962ed7b715f2fb0019f4d36592fca798d4bc Mon Sep 17 00:00:00 2001
From: vivienho <56025826+vivienho@users.noreply.github.com>
Date: Tue, 24 Sep 2024 16:41:14 +0100
Subject: [PATCH 06/11] fix: ensure inputs to assign_study_locus_id are columns
 and not strings

---
 src/gentropy/method/pics.py      | 2 +-
 src/gentropy/susie_finemapper.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/gentropy/method/pics.py b/src/gentropy/method/pics.py
index 2de06f512..6889aaa26 100644
--- a/src/gentropy/method/pics.py
+++ b/src/gentropy/method/pics.py
@@ -257,7 +257,7 @@ def finemap(
                 .withColumn(
                     "studyLocusId",
                     StudyLocus.assign_study_locus_id(
-                        "studyId", "variantId", "finemappingMethod"
+                        f.col("studyId"), f.col("variantId"), f.col("finemappingMethod")
                     ),
                 )
                 .drop("neglog_pvalue")
diff --git a/src/gentropy/susie_finemapper.py b/src/gentropy/susie_finemapper.py
index 587ea7963..a80591c60 100644
--- a/src/gentropy/susie_finemapper.py
+++ b/src/gentropy/susie_finemapper.py
@@ -95,7 +95,7 @@ def __init__(
             .df.withColumn(
                 "studyLocusId",
                 StudyLocus.assign_study_locus_id(
-                    "studyId", "variantId", "finemappingMethod"
+                    f.col("studyId"), f.col("variantId"), f.col("finemappingMethod")
                 ),
             )
             .collect()[0]

From bcae23d1ecee35c1b54a57a52b997d6174f5ef75 Mon Sep 17 00:00:00 2001
From: vivienho <56025826+vivienho@users.noreply.github.com>
Date: Tue, 24 Sep 2024 16:53:01 +0100
Subject: [PATCH 07/11] fix: change studyLocusId to string in remaining files

---
 tests/gentropy/dataset/test_study_locus.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/gentropy/dataset/test_study_locus.py b/tests/gentropy/dataset/test_study_locus.py
index f4f80fbda..0ae7ef2d5 100644
--- a/tests/gentropy/dataset/test_study_locus.py
+++ b/tests/gentropy/dataset/test_study_locus.py
@@ -764,7 +764,7 @@ def _setup(self: TestStudyLocusWindowClumping, spark: SparkSession) -> None:
                 ).withColumns(
                     {
                         "studyLocusId": f.monotonically_increasing_id().cast(
-                            t.LongType()
+                            t.StringType()
                         ),
                         "pValueMantissa": f.lit(1).cast(t.FloatType()),
                         "variantId": f.concat(

From 8057a5506137cd28472a0ae84f2709ec3670375d Mon Sep 17 00:00:00 2001
From: vivienho <56025826+vivienho@users.noreply.github.com>
Date: Tue, 24 Sep 2024 17:07:22 +0100
Subject: [PATCH 08/11] chore: update assign_study_locus_id docstring with
 updated output

---
 src/gentropy/dataset/study_locus.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/gentropy/dataset/study_locus.py b/src/gentropy/dataset/study_locus.py
index aa86267c6..6017f2d08 100644
--- a/src/gentropy/dataset/study_locus.py
+++ b/src/gentropy/dataset/study_locus.py
@@ -462,13 +462,13 @@ def assign_study_locus_id(
 
         Examples:
             >>> df = spark.createDataFrame([("GCST000001", "1_1000_A_C", "SuSiE-inf"), ("GCST000002", "1_1000_A_C", "pics")]).toDF("studyId", "variantId", "finemappingMethod")
-            >>> df.withColumn("study_locus_id", StudyLocus.assign_study_locus_id(f.col("studyId"), f.col("variantId"), f.col("finemappingMethod"))).show()
-            +----------+----------+-----------------+-------------------+
-            |   studyId| variantId|finemappingMethod|     study_locus_id|
-            +----------+----------+-----------------+-------------------+
-            |GCST000001|1_1000_A_C|        SuSiE-inf|3801266831619496075|
-            |GCST000002|1_1000_A_C|             pics|1581844826999194430|
-            +----------+----------+-----------------+-------------------+
+            >>> df.withColumn("study_locus_id", StudyLocus.assign_study_locus_id(f.col("studyId"), f.col("variantId"), f.col("finemappingMethod"))).show(truncate=False)
+            ++----------+----------+-----------------+--------------------------------+
+            +|studyId   |variantId |finemappingMethod|study_locus_id                  |
+            ++----------+----------+-----------------+--------------------------------+
+            +|GCST000001|1_1000_A_C|SuSiE-inf        |109804fe1e20c94231a31bafd71b566e|
+            +|GCST000002|1_1000_A_C|pics             |de310be4558e0482c9cc359c97d37773|
+            ++----------+----------+-----------------+--------------------------------+
             <BLANKLINE>
         """
         if finemapping_col is None:

From d8ab71999a0a112d14c1bb942f463b7ebf46968d Mon Sep 17 00:00:00 2001
From: vivienho <56025826+vivienho@users.noreply.github.com>
Date: Tue, 24 Sep 2024 22:06:45 +0100
Subject: [PATCH 09/11] chore: remove stray '+' characters from
 assign_study_locus_id doctest output

---
 src/gentropy/dataset/study_locus.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/gentropy/dataset/study_locus.py b/src/gentropy/dataset/study_locus.py
index 6017f2d08..01edd044e 100644
--- a/src/gentropy/dataset/study_locus.py
+++ b/src/gentropy/dataset/study_locus.py
@@ -463,12 +463,12 @@ def assign_study_locus_id(
         Examples:
             >>> df = spark.createDataFrame([("GCST000001", "1_1000_A_C", "SuSiE-inf"), ("GCST000002", "1_1000_A_C", "pics")]).toDF("studyId", "variantId", "finemappingMethod")
             >>> df.withColumn("study_locus_id", StudyLocus.assign_study_locus_id(f.col("studyId"), f.col("variantId"), f.col("finemappingMethod"))).show(truncate=False)
-            ++----------+----------+-----------------+--------------------------------+
-            +|studyId   |variantId |finemappingMethod|study_locus_id                  |
-            ++----------+----------+-----------------+--------------------------------+
-            +|GCST000001|1_1000_A_C|SuSiE-inf        |109804fe1e20c94231a31bafd71b566e|
-            +|GCST000002|1_1000_A_C|pics             |de310be4558e0482c9cc359c97d37773|
-            ++----------+----------+-----------------+--------------------------------+
+            +----------+----------+-----------------+--------------------------------+
+            |studyId   |variantId |finemappingMethod|study_locus_id                  |
+            +----------+----------+-----------------+--------------------------------+
+            |GCST000001|1_1000_A_C|SuSiE-inf        |109804fe1e20c94231a31bafd71b566e|
+            |GCST000002|1_1000_A_C|pics             |de310be4558e0482c9cc359c97d37773|
+            +----------+----------+-----------------+--------------------------------+
             <BLANKLINE>
         """
         if finemapping_col is None:

From e873353572979030e99c3439b07621d0c6d275bd Mon Sep 17 00:00:00 2001
From: vivienho <56025826+vivienho@users.noreply.github.com>
Date: Tue, 24 Sep 2024 22:59:06 +0100
Subject: [PATCH 10/11] fix: change studyLocusId to string in recently merged
 files

---
 tests/gentropy/dataset/test_study_locus.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/tests/gentropy/dataset/test_study_locus.py b/tests/gentropy/dataset/test_study_locus.py
index a2048311a..3240cdb02 100644
--- a/tests/gentropy/dataset/test_study_locus.py
+++ b/tests/gentropy/dataset/test_study_locus.py
@@ -897,7 +897,7 @@ class TestStudyLocusSuSiERedundancyFlagging:
     STUDY_LOCUS_DATA: Any = [
         # to be flagged due to v4
         (
-            1,
+            "1",
             "v1",
             "s1",
             "X",
@@ -913,7 +913,7 @@ class TestStudyLocusSuSiERedundancyFlagging:
         ),
         # to be flagged due to v4
         (
-            2,
+            "2",
             "v2",
             "s1",
             "X",
@@ -928,7 +928,7 @@ class TestStudyLocusSuSiERedundancyFlagging:
         ),
         # NOT to be flagged (outside regions)
         (
-            3,
+            "3",
             "v3",
             "s1",
             "X",
@@ -943,7 +943,7 @@ class TestStudyLocusSuSiERedundancyFlagging:
         ),
         # NOT to be flagged (SuSie-Inf credible set)
         (
-            4,
+            "4",
             "v4",
             "s1",
             "X",
@@ -955,7 +955,7 @@ class TestStudyLocusSuSiERedundancyFlagging:
         ),
         # NOT to be flagged (Unresolved LD)
         (
-            5,
+            "5",
             "v5",
             "s1",
             "X",
@@ -969,7 +969,7 @@ class TestStudyLocusSuSiERedundancyFlagging:
         ),
         # NOT to be flagged (different study)
         (
-            6,
+            "6",
             "v6",
             "s2",
             "X",
@@ -986,7 +986,7 @@ class TestStudyLocusSuSiERedundancyFlagging:
 
     STUDY_LOCUS_SCHEMA = t.StructType(
         [
-            t.StructField("studyLocusId", t.LongType(), False),
+            t.StructField("studyLocusId", t.StringType(), False),
             t.StructField("variantId", t.StringType(), False),
             t.StructField("studyId", t.StringType(), False),
             t.StructField("chromosome", t.StringType(), False),

From f1b0817214e06e2b4730d5cae4afb17fdda6313b Mon Sep 17 00:00:00 2001
From: vivienho <56025826+vivienho@users.noreply.github.com>
Date: Thu, 26 Sep 2024 12:34:03 +0100
Subject: [PATCH 11/11] feat: move hashing logic to generate_identifier
 function in Dataset class

---
 src/gentropy/dataset/dataset.py               | 15 +++++++++++
 src/gentropy/dataset/study_locus.py           | 25 +++++--------------
 .../datasource/eqtl_catalogue/finemapping.py  |  2 +-
 .../datasource/finngen/finemapping.py         |  2 +-
 .../datasource/gwas_catalog/associations.py   |  2 +-
 .../open_targets/l2g_gold_standard.py         |  2 +-
 src/gentropy/l2g.py                           | 17 ++++++++-----
 src/gentropy/method/locus_breaker_clumping.py |  4 +--
 src/gentropy/method/pics.py                   |  2 +-
 src/gentropy/method/window_based_clumping.py  |  2 +-
 src/gentropy/susie_finemapper.py              |  4 +--
 11 files changed, 42 insertions(+), 35 deletions(-)

diff --git a/src/gentropy/dataset/dataset.py b/src/gentropy/dataset/dataset.py
index e56ef2ecc..c822b592a 100644
--- a/src/gentropy/dataset/dataset.py
+++ b/src/gentropy/dataset/dataset.py
@@ -352,3 +352,18 @@ def flag_duplicates(test_column: Column) -> Column:
             )
             > 1
         )
+
+    @staticmethod
+    def generate_identifier(uniqueness_defining_columns: list[str]) -> Column:
+        """Hashes the provided columns to generate a unique identifier.
+
+        Args:
+            uniqueness_defining_columns (list[str]): names of the columns whose values define uniqueness
+
+        Returns:
+            Column: MD5 hex digest of the concatenated string-cast columns (nulls are replaced by "None")
+        """
+        hashable_columns = [f.when(f.col(column).cast("string").isNull(), f.lit("None"))
+                                 .otherwise(f.col(column).cast("string"))
+                                 for column in uniqueness_defining_columns]
+        return f.md5(f.concat(*hashable_columns))
diff --git a/src/gentropy/dataset/study_locus.py b/src/gentropy/dataset/study_locus.py
index 1b3473148..a4d35e7d5 100644
--- a/src/gentropy/dataset/study_locus.py
+++ b/src/gentropy/dataset/study_locus.py
@@ -447,24 +447,18 @@ def _align_overlapping_tags(
         )
 
     @staticmethod
-    def assign_study_locus_id(
-        study_id_col: Column,
-        variant_id_col: Column,
-        finemapping_col: Column = None,
-    ) -> Column:
-        """Hashes a column with a variant ID and a study ID to extract a consistent studyLocusId.
+    def assign_study_locus_id(uniqueness_defining_columns: list[str]) -> Column:
+        """Hashes the provided columns to extract a consistent studyLocusId.
 
         Args:
-            study_id_col (Column): column name with a study ID
-            variant_id_col (Column): column name with a variant ID
-            finemapping_col (Column, optional): column with fine mapping methodology
+            uniqueness_defining_columns (list[str]): names of the columns whose values define uniqueness
 
         Returns:
             Column: column with a study locus ID
 
         Examples:
             >>> df = spark.createDataFrame([("GCST000001", "1_1000_A_C", "SuSiE-inf"), ("GCST000002", "1_1000_A_C", "pics")]).toDF("studyId", "variantId", "finemappingMethod")
-            >>> df.withColumn("study_locus_id", StudyLocus.assign_study_locus_id(f.col("studyId"), f.col("variantId"), f.col("finemappingMethod"))).show(truncate=False)
+            >>> df.withColumn("study_locus_id", StudyLocus.assign_study_locus_id(["studyId", "variantId", "finemappingMethod"])).show(truncate=False)
             +----------+----------+-----------------+--------------------------------+
             |studyId   |variantId |finemappingMethod|study_locus_id                  |
             +----------+----------+-----------------+--------------------------------+
@@ -473,15 +467,8 @@ def assign_study_locus_id(
             +----------+----------+-----------------+--------------------------------+
             <BLANKLINE>
         """
-        if finemapping_col is None:
-            finemapping_col = f.lit("None")
-        columns = [study_id_col, variant_id_col, finemapping_col]
-        hashable_columns = [f.when(column.cast("string").isNull(), f.lit("None"))
-                                 .otherwise(column.cast("string"))
-                                 for column in columns]
-        return f.md5(f.concat(*hashable_columns)).alias(
-            "studyLocusId"
-        )
+        return Dataset.generate_identifier(uniqueness_defining_columns).alias("studyLocusId")
+
 
     @classmethod
     def calculate_credible_set_log10bf(cls: type[StudyLocus], logbfs: Column) -> Column:
diff --git a/src/gentropy/datasource/eqtl_catalogue/finemapping.py b/src/gentropy/datasource/eqtl_catalogue/finemapping.py
index 11ec5bef1..0808b7016 100644
--- a/src/gentropy/datasource/eqtl_catalogue/finemapping.py
+++ b/src/gentropy/datasource/eqtl_catalogue/finemapping.py
@@ -260,7 +260,7 @@ def from_susie_results(
                 .select(
                     *study_locus_cols,
                     StudyLocus.assign_study_locus_id(
-                        f.col("studyId"), f.col("variantId"), f.col("finemappingMethod")
+                        ["studyId", "variantId", "finemappingMethod"]
                     ),
                     StudyLocus.calculate_credible_set_log10bf(
                         f.col("locus.logBF")
diff --git a/src/gentropy/datasource/finngen/finemapping.py b/src/gentropy/datasource/finngen/finemapping.py
index 092a79372..3c83ba8ff 100644
--- a/src/gentropy/datasource/finngen/finemapping.py
+++ b/src/gentropy/datasource/finngen/finemapping.py
@@ -471,7 +471,7 @@ def from_finngen_susie_finemapping(
         ).withColumn(
             "studyLocusId",
             StudyLocus.assign_study_locus_id(
-                f.col("studyId"), f.col("variantId"), f.col("finemappingMethod")
+                ["studyId", "variantId", "finemappingMethod"]
             ),
         )
 
diff --git a/src/gentropy/datasource/gwas_catalog/associations.py b/src/gentropy/datasource/gwas_catalog/associations.py
index 5e84079a1..b34944b11 100644
--- a/src/gentropy/datasource/gwas_catalog/associations.py
+++ b/src/gentropy/datasource/gwas_catalog/associations.py
@@ -1188,7 +1188,7 @@ def update_study_id(
             .drop("subStudyDescription", "updatedStudyId")
         ).withColumn(
             "studyLocusId",
-            StudyLocus.assign_study_locus_id(f.col("studyId"), f.col("variantId")),
+            StudyLocus.assign_study_locus_id(["studyId", "variantId"]),
         )
         return self
 
diff --git a/src/gentropy/datasource/open_targets/l2g_gold_standard.py b/src/gentropy/datasource/open_targets/l2g_gold_standard.py
index 2cfcd62f8..26d5a0253 100644
--- a/src/gentropy/datasource/open_targets/l2g_gold_standard.py
+++ b/src/gentropy/datasource/open_targets/l2g_gold_standard.py
@@ -52,7 +52,7 @@ def parse_positive_curation(
             )
             .withColumn(
                 "studyLocusId",
-                StudyLocus.assign_study_locus_id(f.col("studyId"), f.col("variantId")),
+                StudyLocus.assign_study_locus_id(["studyId", "variantId"]),
             )
             .groupBy("studyLocusId", "studyId", "variantId", "geneId")
             .agg(f.collect_set("source").alias("sources"))
diff --git a/src/gentropy/l2g.py b/src/gentropy/l2g.py
index 6f80d826e..ff8c6c8ff 100644
--- a/src/gentropy/l2g.py
+++ b/src/gentropy/l2g.py
@@ -207,17 +207,22 @@ def _generate_feature_matrix(self, write_feature_matrix: bool) -> L2GFeatureMatr
             study_locus_overlap = StudyLocus(
                 _df=self.credible_set.df.join(
                     f.broadcast(
-                        self.gs_curation.select(
-                            StudyLocus.assign_study_locus_id(
-                                f.col("association_info.otg_id"),  # studyId
-                                f.concat_ws(  # variantId
+                        self.gs_curation
+                        .withColumn(
+                            "variantId",
+                            f.concat_ws(
                                     "_",
                                     f.col("sentinel_variant.locus_GRCh38.chromosome"),
                                     f.col("sentinel_variant.locus_GRCh38.position"),
                                     f.col("sentinel_variant.alleles.reference"),
                                     f.col("sentinel_variant.alleles.alternative"),
-                                ),
-                            ).alias("studyLocusId"),
+                            )
+                        )
+                        .select(
+                            StudyLocus.assign_study_locus_id(
+                                ["association_info.otg_id",  # studyId
+                                "variantId"]
+                            ),
                         )
                     ),
                     "studyLocusId",
diff --git a/src/gentropy/method/locus_breaker_clumping.py b/src/gentropy/method/locus_breaker_clumping.py
index 0ca7ae29b..fd7661a22 100644
--- a/src/gentropy/method/locus_breaker_clumping.py
+++ b/src/gentropy/method/locus_breaker_clumping.py
@@ -112,8 +112,8 @@ def locus_breaker(
                     .cast(t.ArrayType(t.StringType()))
                     .alias("qualityControls"),
                     StudyLocus.assign_study_locus_id(
-                        f.col("studyId"), f.col("variantId")
-                    ).alias("studyLocusId"),
+                        ["studyId", "variantId"]
+                    ),
                 )
             ),
             _schema=StudyLocus.get_schema(),
diff --git a/src/gentropy/method/pics.py b/src/gentropy/method/pics.py
index 6889aaa26..5fd084efd 100644
--- a/src/gentropy/method/pics.py
+++ b/src/gentropy/method/pics.py
@@ -257,7 +257,7 @@ def finemap(
                 .withColumn(
                     "studyLocusId",
                     StudyLocus.assign_study_locus_id(
-                        f.col("studyId"), f.col("variantId"), f.col("finemappingMethod")
+                        ["studyId", "variantId", "finemappingMethod"]
                     ),
                 )
                 .drop("neglog_pvalue")
diff --git a/src/gentropy/method/window_based_clumping.py b/src/gentropy/method/window_based_clumping.py
index 9ef747abf..3ab15d42f 100644
--- a/src/gentropy/method/window_based_clumping.py
+++ b/src/gentropy/method/window_based_clumping.py
@@ -247,7 +247,7 @@ def clump(
                 .withColumn(
                     "studyLocusId",
                     StudyLocus.assign_study_locus_id(
-                        f.col("studyId"), f.col("variantId")
+                        ["studyId", "variantId"]
                     ),
                 )
                 # Initialize QC column as array of strings:
diff --git a/src/gentropy/susie_finemapper.py b/src/gentropy/susie_finemapper.py
index a80591c60..26c73e20f 100644
--- a/src/gentropy/susie_finemapper.py
+++ b/src/gentropy/susie_finemapper.py
@@ -95,7 +95,7 @@ def __init__(
             .df.withColumn(
                 "studyLocusId",
                 StudyLocus.assign_study_locus_id(
-                    f.col("studyId"), f.col("variantId"), f.col("finemappingMethod")
+                    ["studyId", "variantId", "finemappingMethod"]
                 ),
             )
             .collect()[0]
@@ -247,7 +247,7 @@ def susie_inf_to_studylocus(
                 .withColumn(
                     "studyLocusId",
                     StudyLocus.assign_study_locus_id(
-                        f.col("studyId"), f.col("variantId"), f.col("finemappingMethod")
+                        ["studyId", "variantId", "finemappingMethod"]
                     ),
                 )
                 .select(