From cf2be5b7568926de8bcbdc2737a22d650e633a7c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Irene=20L=C3=B3pez?= <irene.lopezs@protonmail.com>
Date: Fri, 10 Nov 2023 14:39:12 +0000
Subject: [PATCH 01/25] chore: changes in config

---
 config/datasets/gcp.yaml          | 17 +++++++++++------
 config/step/locus_to_gene.yaml    |  2 +-
 src/airflow/dags/configs/dag.yaml |  3 ---
 3 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/config/datasets/gcp.yaml b/config/datasets/gcp.yaml
index fa5b10fad..ad80e4e65 100644
--- a/config/datasets/gcp.yaml
+++ b/config/datasets/gcp.yaml
@@ -27,19 +27,24 @@ gene_index: ${datasets.outputs}/gene_index
 variant_annotation: ${datasets.outputs}/variant_annotation
 variant_index: ${datasets.outputs}/variant_index
 study_locus: ${datasets.outputs}/study_locus
+credible_set: ${datasets.outputs}/credible_set
+study_index: ${datasets.outputs}/study_index
+summary_statistics: ${datasets.outputs}/summary_statistics
 study_locus_overlap: ${datasets.outputs}/study_locus_overlap
 colocalisation: ${datasets.outputs}/colocalisation
 v2g: ${datasets.outputs}/v2g
 ld_index: ${datasets.outputs}/ld_index
-catalog_study_index: ${datasets.outputs}/catalog_study_index
-catalog_study_locus: ${datasets.study_locus}/catalog_study_locus
-finngen_study_index: ${datasets.outputs}/finngen_study_index
-finngen_summary_stats: ${datasets.outputs}/finngen_summary_stats
-ukbiobank_study_index: ${datasets.outputs}/ukbiobank_study_index
+catalog_study_index: ${datasets.study_index}/catalog_curated
+catalog_study_locus: ${datasets.credible_set}/catalog_curated
+finngen_study_index: ${datasets.study_index}/finngen
+finngen_summary_stats: ${datasets.summary_statistics}/finngen
+from_sumstats_study_locus: ${datasets.study_locus}/from_sumstats
+from_sumstats_pics: ${datasets.credible_set}/from_sumstats
+ukbiobank_study_index: ${datasets.study_index}/ukbiobank
 l2g_model: ${datasets.outputs}/l2g_model
 l2g_predictions: ${datasets.outputs}/l2g_predictions
 
 # Constants
 finngen_release_prefix: FINNGEN_R9
-finngen_sumstat_url_prefix: https://storage.googleapis.com/finngen-public-data-r9/summary_stats/finngen_R9_
+finngen_sumstat_url_prefix: gs://finngen-public-data-r9/summary_stats/finngen_R9_
 finngen_sumstat_url_suffix: .gz
diff --git a/config/step/locus_to_gene.yaml b/config/step/locus_to_gene.yaml
index 47f014c55..9f7004e17 100644
--- a/config/step/locus_to_gene.yaml
+++ b/config/step/locus_to_gene.yaml
@@ -9,7 +9,7 @@ wandb_run_name: null
 perform_cross_validation: false
 model_path: ${datasets.l2g_model}
 predictions_path: ${datasets.l2g_predictions}
-study_locus_path: ${datasets.study_locus}
+study_locus_path: ${datasets.credible_set}
 variant_gene_path: ${datasets.v2g}
 colocalisation_path: ${datasets.colocalisation}
 study_index_path: ${datasets.catalog_study_index}
diff --git a/src/airflow/dags/configs/dag.yaml b/src/airflow/dags/configs/dag.yaml
index 30a7c3827..d7ffabc03 100644
--- a/src/airflow/dags/configs/dag.yaml
+++ b/src/airflow/dags/configs/dag.yaml
@@ -7,17 +7,14 @@
   prerequisites:
     - "variant_index"
     - "gene_index"
-- id: "finngen"
 - id: "ukbiobank"
 - id: "study_locus_overlap"
   prerequisites:
     - "gwas_catalog"
-    - "finngen"
     - "ukbiobank"
 - id: "locus_to_gene"
   prerequisites:
     - "gwas_catalog"
-    - "finngen"
     - "ukbiobank"
     - "variant_index"
     - "v2g"

From 44766d22febfb2ae2d2ed4224106c90128ffc5b0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Irene=20L=C3=B3pez?= <irene.lopezs@protonmail.com>
Date: Thu, 16 Nov 2023 17:33:41 +0000
Subject: [PATCH 02/25] fix: change definition of negative l2g evidence

---
 .../open_targets/l2g_gold_standard.py         | 147 +++++++++---------
 1 file changed, 76 insertions(+), 71 deletions(-)

diff --git a/src/otg/datasource/open_targets/l2g_gold_standard.py b/src/otg/datasource/open_targets/l2g_gold_standard.py
index 46ec21502..4bf89732b 100644
--- a/src/otg/datasource/open_targets/l2g_gold_standard.py
+++ b/src/otg/datasource/open_targets/l2g_gold_standard.py
@@ -62,78 +62,83 @@ def as_l2g_gold_standard(
             "leftStudyLocusId", "rightStudyLocusId"
         )
         interactions_df = cls.process_gene_interactions(interactions)
-        return L2GGoldStandard(
-            _df=(
-                gold_standard_curation.filter(
-                    f.col("gold_standard_info.highest_confidence").isin(
-                        ["High", "Medium"]
-                    )
-                )
-                .select(
-                    f.col("association_info.otg_id").alias("studyId"),
-                    f.col("gold_standard_info.gene_id").alias("geneId"),
-                    f.concat_ws(
-                        "_",
-                        f.col("sentinel_variant.locus_GRCh38.chromosome"),
-                        f.col("sentinel_variant.locus_GRCh38.position"),
-                        f.col("sentinel_variant.alleles.reference"),
-                        f.col("sentinel_variant.alleles.alternative"),
-                    ).alias("variantId"),
-                    f.col("metadata.set_label").alias("source"),
-                )
-                .withColumn(
-                    "studyLocusId",
-                    StudyLocus.assign_study_locus_id(
-                        f.col("studyId"), f.col("variantId")
-                    ),
-                )
-                .groupBy("studyLocusId", "studyId", "variantId", "geneId")
-                .agg(
-                    f.collect_set("source").alias("sources"),
-                )
-                # Assign Positive or Negative Status based on confidence
-                .join(
-                    v2g.df.filter(f.col("distance").isNotNull()).select(
-                        "variantId", "geneId", "distance"
-                    ),
-                    on=["variantId", "geneId"],
-                    how="inner",
-                )
-                .withColumn(
-                    "goldStandardSet",
-                    f.when(f.col("distance") <= 500_000, f.lit("positive")).otherwise(
-                        f.lit("negative")
-                    ),
-                )
-                # Remove redundant loci by testing they are truly independent
-                .alias("left")
-                .join(
-                    overlaps_df.alias("right"),
-                    (f.col("left.variantId") == f.col("right.leftStudyLocusId"))
-                    | (f.col("left.variantId") == f.col("right.rightStudyLocusId")),
-                    how="left",
-                )
-                .distinct()
-                # Remove redundant genes by testing they do not interact with a positive gene
-                .join(
-                    interactions_df.alias("interactions"),
-                    (f.col("left.geneId") == f.col("interactions.geneIdA"))
-                    | (f.col("left.geneId") == f.col("interactions.geneIdB")),
-                    how="left",
-                )
-                .withColumn("interacting", (f.col("score") > 0.7))
-                # filter out genes where geneIdA has goldStandardSet negative but geneIdA and gene IdB are interacting
-                .filter(
-                    ~(
-                        (f.col("goldStandardSet") == 0)
-                        & (f.col("interacting"))
-                        & (
-                            (f.col("left.geneId") == f.col("interactions.geneIdA"))
-                            | (f.col("left.geneId") == f.col("interactions.geneIdB"))
-                        )
+
+        positive_set = (
+            gold_standard_curation.filter(
+                f.col("gold_standard_info.highest_confidence").isin(["High", "Medium"])
+            )
+            .select(
+                f.col("association_info.otg_id").alias("studyId"),
+                f.col("gold_standard_info.gene_id").alias("geneId"),
+                f.concat_ws(
+                    "_",
+                    f.col("sentinel_variant.locus_GRCh38.chromosome"),
+                    f.col("sentinel_variant.locus_GRCh38.position"),
+                    f.col("sentinel_variant.alleles.reference"),
+                    f.col("sentinel_variant.alleles.alternative"),
+                ).alias("variantId"),
+                f.col("metadata.set_label").alias("source"),
+            )
+            .withColumn(
+                "studyLocusId",
+                StudyLocus.assign_study_locus_id(f.col("studyId"), f.col("variantId")),
+            )
+            .groupBy("studyLocusId", "studyId", "variantId", "geneId")
+            .agg(
+                f.collect_set("source").alias("sources"),
+            )
+        )
+
+        full_set = (
+            # Bring negative evidence based on genes that are in the vicinity of the locus but are not part of the positive set
+            positive_set.alias("positives")
+            .join(
+                v2g.df.filter(f.col("distance") <= 500_000)
+                .select("variantId", "geneId", "distance")
+                .alias("negatives"),
+                on="variantId",
+                how="left",
+            )
+            # Assign set label
+            .withColumn(
+                "goldStandardSet",
+                f.when(
+                    (f.col("positives.geneId") == f.col("negatives.geneId"))
+                    # to keep the positives that are outside the v2g dataset
+                    | (f.col("negatives.geneId").isNull()),
+                    f.lit("positive"),
+                ).otherwise("negative"),
+            )
+            # Remove redundant loci by testing they are truly independent
+            .alias("left")
+            .join(
+                overlaps_df.alias("right"),
+                (f.col("left.variantId") == f.col("right.leftStudyLocusId"))
+                | (f.col("left.variantId") == f.col("right.rightStudyLocusId")),
+                how="left",
+            )
+            .distinct()
+            # filter out genes where geneIdA has goldStandardSet negative but geneIdA and gene IdB are interacting
+            .join(
+                interactions_df.alias("interactions"),
+                (f.col("left.geneId") == f.col("interactions.geneIdA"))
+                | (f.col("left.geneId") == f.col("interactions.geneIdB")),
+                how="left",
+            )
+            .withColumn("interacting", (f.col("score") > 0.7))
+            .filter(
+                ~(
+                    (f.col("goldStandardSet") == 0)
+                    & (f.col("interacting"))
+                    & (
+                        (f.col("left.geneId") == f.col("interactions.geneIdA"))
+                        | (f.col("left.geneId") == f.col("interactions.geneIdB"))
                     )
                 )
-                .select("studyLocusId", "geneId", "goldStandardSet", "sources")
-            ),
+            )
+            .select("studyLocusId", "geneId", "goldStandardSet", "sources")
+        )
+        return L2GGoldStandard(
+            _df=full_set,
             _schema=L2GGoldStandard.get_schema(),
         )

From e17df5b84d705faec1c021a8691cf80ede7c4fe4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Irene=20L=C3=B3pez?= <irene.lopezs@protonmail.com>
Date: Fri, 17 Nov 2023 09:25:31 +0000
Subject: [PATCH 03/25] refactor: modularise logic for gold standards

---
 .../open_targets/l2g_gold_standard.py         | 165 ++++++++++++------
 1 file changed, 107 insertions(+), 58 deletions(-)

diff --git a/src/otg/datasource/open_targets/l2g_gold_standard.py b/src/otg/datasource/open_targets/l2g_gold_standard.py
index 4bf89732b..f5e2a0b25 100644
--- a/src/otg/datasource/open_targets/l2g_gold_standard.py
+++ b/src/otg/datasource/open_targets/l2g_gold_standard.py
@@ -15,8 +15,8 @@ class OpenTargetsL2GGoldStandard:
     """Parser for OTGenetics locus to gene gold standards curation.
 
     The curation is processed to generate a dataset with 2 labels:
-        - Gold Standard Positive (GSP): Variant is within 500kb of gene
-        - Gold Standard Negative (GSN): Variant is not within 500kb of gene
+        - Gold Standard Positive (GSP): When the lead variant is part of a curated list of GWAS loci with known gene-trait associations.
+        - Gold Standard Negative (GSN): When the lead variant is not part of a curated list of GWAS loci with known gene-trait associations but is in the vicinity of a gene's TSS.
     """
 
     @staticmethod
@@ -39,31 +39,17 @@ def process_gene_interactions(interactions: DataFrame) -> DataFrame:
             "scoring as score",
         )
 
-    @classmethod
-    def as_l2g_gold_standard(
-        cls: type[OpenTargetsL2GGoldStandard],
-        gold_standard_curation: DataFrame,
-        v2g: V2G,
-        study_locus_overlap: StudyLocusOverlap,
-        interactions: DataFrame,
-    ) -> L2GGoldStandard:
-        """Initialise L2GGoldStandard from source dataset.
+    @staticmethod
+    def create_positive_set(gold_standard_curation: DataFrame) -> DataFrame:
+        """Parse positive set from gold standard curation.
 
         Args:
-            gold_standard_curation (DataFrame): Gold standard curation dataframe, extracted from https://github.com/opentargets/genetics-gold-standards
-            v2g (V2G): Variant to gene dataset to bring distance between a variant and a gene's TSS
-            study_locus_overlap (StudyLocusOverlap): Study locus overlap dataset to remove duplicated loci
-            interactions (DataFrame): Gene-gene interactions dataset to remove negative cases where the gene interacts with a positive gene
+            gold_standard_curation (DataFrame): Gold standard curation dataframe
 
         Returns:
-            L2GGoldStandard: L2G Gold Standard dataset
+            DataFrame: Positive set
         """
-        overlaps_df = study_locus_overlap._df.select(
-            "leftStudyLocusId", "rightStudyLocusId"
-        )
-        interactions_df = cls.process_gene_interactions(interactions)
-
-        positive_set = (
+        return (
             gold_standard_curation.filter(
                 f.col("gold_standard_info.highest_confidence").isin(["High", "Medium"])
             )
@@ -84,48 +70,56 @@ def as_l2g_gold_standard(
                 StudyLocus.assign_study_locus_id(f.col("studyId"), f.col("variantId")),
             )
             .groupBy("studyLocusId", "studyId", "variantId", "geneId")
-            .agg(
-                f.collect_set("source").alias("sources"),
-            )
+            .agg(f.collect_set("source").alias("sources"))
         )
 
-        full_set = (
-            # Bring negative evidence based on genes that are in the vicinity of the locus but are not part of the positive set
-            positive_set.alias("positives")
-            .join(
-                v2g.df.filter(f.col("distance") <= 500_000)
-                .select("variantId", "geneId", "distance")
-                .alias("negatives"),
-                on="variantId",
-                how="left",
-            )
-            # Assign set label
-            .withColumn(
-                "goldStandardSet",
-                f.when(
-                    (f.col("positives.geneId") == f.col("negatives.geneId"))
-                    # to keep the positives that are outside the v2g dataset
-                    | (f.col("negatives.geneId").isNull()),
-                    f.lit("positive"),
-                ).otherwise("negative"),
-            )
-            # Remove redundant loci by testing they are truly independent
-            .alias("left")
-            .join(
-                overlaps_df.alias("right"),
-                (f.col("left.variantId") == f.col("right.leftStudyLocusId"))
-                | (f.col("left.variantId") == f.col("right.rightStudyLocusId")),
-                how="left",
-            )
-            .distinct()
-            # filter out genes where geneIdA has goldStandardSet negative but geneIdA and gene IdB are interacting
+    @staticmethod
+    def create_full_set(positive_set: DataFrame, v2g: V2G) -> DataFrame:
+        """Create full set of positive and negative evidence of locus to gene associations.
+
+        Args:
+            positive_set (DataFrame): Positive set
+            v2g (V2G): Variant to gene dataset to bring distance between a variant and a gene's TSS
+
+        Returns:
+            DataFrame: Full set of positive and negative evidence of locus to gene associations
+        """
+        return positive_set.join(
+            v2g.df.filter(f.col("distance") <= 500_000),
+            on="variantId",
+            how="left",
+        ).withColumn(
+            "goldStandardSet",
+            f.when(
+                (f.col("positives.geneId") == f.col("negatives.geneId"))
+                # to keep the positives that are outside the v2g dataset
+                | (f.col("negatives.geneId").isNull()),
+                f.lit("positive"),
+            ).otherwise("negative"),
+        )
+
+    @staticmethod
+    def remove_false_negatives(
+        full_set: DataFrame, interactions_df: DataFrame
+    ) -> DataFrame:
+        """Remove redundant loci by testing they are truly independent.
+
+        Args:
+            full_set (DataFrame): Full set of positive and negative evidence of locus to gene associations. These include false negatives.
+            interactions_df (DataFrame): Top scoring gene-gene interaction per pair of genes
+
+        Returns:
+            DataFrame: Full set of positive and negative evidence of locus to gene associations. False negatives are removed.
+        """
+        return (
+            full_set.alias("left")
             .join(
                 interactions_df.alias("interactions"),
                 (f.col("left.geneId") == f.col("interactions.geneIdA"))
                 | (f.col("left.geneId") == f.col("interactions.geneIdB")),
                 how="left",
             )
-            .withColumn("interacting", (f.col("score") > 0.7))
+            .withColumn("interacting", (f.col("score") > 0.7))  # remove hardcoded value
             .filter(
                 ~(
                     (f.col("goldStandardSet") == 0)
@@ -136,9 +130,64 @@ def as_l2g_gold_standard(
                     )
                 )
             )
-            .select("studyLocusId", "geneId", "goldStandardSet", "sources")
         )
+
+    @staticmethod
+    def remove_redundant_locus(
+        full_set: DataFrame, study_locus_overlap: StudyLocusOverlap
+    ) -> DataFrame:
+        """Remove redundant loci by testing they are truly independent.
+
+        Args:
+            full_set (DataFrame): Full set of positive and negative evidence of locus to gene associations. These include false negatives.
+            study_locus_overlap (StudyLocusOverlap): Study locus overlap dataset to remove duplicated loci
+
+        Returns:
+            DataFrame: Full set of positive and negative evidence of locus to gene associations. False negatives are removed. # TODO rename
+        """
+        return (
+            full_set.alias("left")
+            .join(
+                study_locus_overlap.df.select(
+                    "leftStudyLocusId", "rightStudyLocusId"
+                ).alias("right"),
+                (f.col("left.variantId") == f.col("right.leftStudyLocusId"))
+                | (f.col("left.variantId") == f.col("right.rightStudyLocusId")),
+                how="left",
+            )
+            .distinct()
+        )
+
+    @classmethod
+    def as_l2g_gold_standard(
+        cls: type[OpenTargetsL2GGoldStandard],
+        gold_standard_curation: DataFrame,
+        v2g: V2G,
+        study_locus_overlap: StudyLocusOverlap,
+        interactions: DataFrame,
+    ) -> L2GGoldStandard:
+        """Initialise L2GGoldStandard from source dataset.
+
+        Args:
+            gold_standard_curation (DataFrame): Gold standard curation dataframe, extracted from https://github.com/opentargets/genetics-gold-standards
+            v2g (V2G): Variant to gene dataset to bring distance between a variant and a gene's TSS
+            study_locus_overlap (StudyLocusOverlap): Study locus overlap dataset to remove duplicated loci
+            interactions (DataFrame): Gene-gene interactions dataset to remove negative cases where the gene interacts with a positive gene
+
+        Returns:
+            L2GGoldStandard: L2G Gold Standard dataset
+        """
+        interactions_df = cls.process_gene_interactions(interactions)
+
+        positive_set = cls.create_positive_set(gold_standard_curation)
+
+        full_set = cls.create_full_set(positive_set, v2g)
+
+        final_set = full_set.transform(
+            cls.remove_redundant_locus, study_locus_overlap
+        ).transform(cls.remove_false_negatives, interactions_df)
+
         return L2GGoldStandard(
-            _df=full_set,
+            _df=final_set,
             _schema=L2GGoldStandard.get_schema(),
         )

From f7eba79b6d466a962997f6e8306c469c7483cea8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Irene=20L=C3=B3pez?= <irene.lopezs@protonmail.com>
Date: Fri, 17 Nov 2023 09:43:31 +0000
Subject: [PATCH 04/25] refactor: move hardcoded values to constants

---
 .../open_targets/l2g_gold_standard.py         | 26 ++++++++++++++-----
 1 file changed, 20 insertions(+), 6 deletions(-)

diff --git a/src/otg/datasource/open_targets/l2g_gold_standard.py b/src/otg/datasource/open_targets/l2g_gold_standard.py
index f5e2a0b25..9986339b5 100644
--- a/src/otg/datasource/open_targets/l2g_gold_standard.py
+++ b/src/otg/datasource/open_targets/l2g_gold_standard.py
@@ -19,6 +19,11 @@ class OpenTargetsL2GGoldStandard:
         - Gold Standard Negative (GSN): When the lead variant is not part of a curated list of GWAS loci with known gene-trait associations but is in the vicinity of a gene's TSS.
     """
 
+    LOCUS_TO_GENE_WINDOW = 500_000
+    GS_POSITIVE_LABEL = "positive"
+    GS_NEGATIVE_LABEL = "negative"
+    INTERACTION_THRESHOLD = 0.7
+
     @staticmethod
     def process_gene_interactions(interactions: DataFrame) -> DataFrame:
         """Extract top scoring gene-gene interaction from the interactions dataset of the Platform.
@@ -85,7 +90,9 @@ def create_full_set(positive_set: DataFrame, v2g: V2G) -> DataFrame:
             DataFrame: Full set of positive and negative evidence of locus to gene associations
         """
         return positive_set.join(
-            v2g.df.filter(f.col("distance") <= 500_000),
+            v2g.df.filter(
+                f.col("distance") <= OpenTargetsL2GGoldStandard.LOCUS_TO_GENE_WINDOW
+            ),
             on="variantId",
             how="left",
         ).withColumn(
@@ -94,8 +101,8 @@ def create_full_set(positive_set: DataFrame, v2g: V2G) -> DataFrame:
                 (f.col("positives.geneId") == f.col("negatives.geneId"))
                 # to keep the positives that are outside the v2g dataset
                 | (f.col("negatives.geneId").isNull()),
-                f.lit("positive"),
-            ).otherwise("negative"),
+                f.lit(OpenTargetsL2GGoldStandard.GS_POSITIVE_LABEL),
+            ).otherwise(OpenTargetsL2GGoldStandard.GS_NEGATIVE_LABEL),
         )
 
     @staticmethod
@@ -119,10 +126,15 @@ def remove_false_negatives(
                 | (f.col("left.geneId") == f.col("interactions.geneIdB")),
                 how="left",
             )
-            .withColumn("interacting", (f.col("score") > 0.7))  # remove hardcoded value
+            .withColumn(
+                "interacting",
+                (f.col("score") > OpenTargetsL2GGoldStandard.INTERACTION_THRESHOLD),
+            )
             .filter(
                 ~(
-                    (f.col("goldStandardSet") == 0)
+                    (
+                        f.col("goldStandardSet") == 0
+                    )  # bugfix: goldStandardSet is a string, not an int
                     & (f.col("interacting"))
                     & (
                         (f.col("left.geneId") == f.col("interactions.geneIdA"))
@@ -184,7 +196,9 @@ def as_l2g_gold_standard(
         full_set = cls.create_full_set(positive_set, v2g)
 
         final_set = full_set.transform(
-            cls.remove_redundant_locus, study_locus_overlap
+            # TODO: move logic to L2GGoldStandard
+            cls.remove_redundant_locus,
+            study_locus_overlap,
         ).transform(cls.remove_false_negatives, interactions_df)
 
         return L2GGoldStandard(

From 65be4708f8b1dbfd24b6e5c6b34cd4098932b9f1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Irene=20L=C3=B3pez?= <irene.lopezs@protonmail.com>
Date: Fri, 17 Nov 2023 10:02:56 +0000
Subject: [PATCH 05/25] refactor: turn `OpenTargetsL2GGoldStandard` into class
 methods

---
 .../open_targets/l2g_gold_standard.py         | 42 ++++++++++++-------
 1 file changed, 26 insertions(+), 16 deletions(-)

diff --git a/src/otg/datasource/open_targets/l2g_gold_standard.py b/src/otg/datasource/open_targets/l2g_gold_standard.py
index 9986339b5..2e3a7fe8d 100644
--- a/src/otg/datasource/open_targets/l2g_gold_standard.py
+++ b/src/otg/datasource/open_targets/l2g_gold_standard.py
@@ -1,6 +1,8 @@
 """Parser for OTPlatform locus to gene gold standards curation."""
 from __future__ import annotations
 
+from typing import Type
+
 import pyspark.sql.functions as f
 from pyspark.sql import DataFrame
 
@@ -24,8 +26,10 @@ class OpenTargetsL2GGoldStandard:
     GS_NEGATIVE_LABEL = "negative"
     INTERACTION_THRESHOLD = 0.7
 
-    @staticmethod
-    def process_gene_interactions(interactions: DataFrame) -> DataFrame:
+    @classmethod
+    def process_gene_interactions(
+        cls: Type[OpenTargetsL2GGoldStandard], interactions: DataFrame
+    ) -> DataFrame:
         """Extract top scoring gene-gene interaction from the interactions dataset of the Platform.
 
         Args:
@@ -44,8 +48,10 @@ def process_gene_interactions(interactions: DataFrame) -> DataFrame:
             "scoring as score",
         )
 
-    @staticmethod
-    def create_positive_set(gold_standard_curation: DataFrame) -> DataFrame:
+    @classmethod
+    def create_positive_set(
+        cls: Type[OpenTargetsL2GGoldStandard], gold_standard_curation: DataFrame
+    ) -> DataFrame:
         """Parse positive set from gold standard curation.
 
         Args:
@@ -78,8 +84,10 @@ def create_positive_set(gold_standard_curation: DataFrame) -> DataFrame:
             .agg(f.collect_set("source").alias("sources"))
         )
 
-    @staticmethod
-    def create_full_set(positive_set: DataFrame, v2g: V2G) -> DataFrame:
+    @classmethod
+    def create_full_set(
+        cls: Type[OpenTargetsL2GGoldStandard], positive_set: DataFrame, v2g: V2G
+    ) -> DataFrame:
         """Create full set of positive and negative evidence of locus to gene associations.
 
         Args:
@@ -90,9 +98,7 @@ def create_full_set(positive_set: DataFrame, v2g: V2G) -> DataFrame:
             DataFrame: Full set of positive and negative evidence of locus to gene associations
         """
         return positive_set.join(
-            v2g.df.filter(
-                f.col("distance") <= OpenTargetsL2GGoldStandard.LOCUS_TO_GENE_WINDOW
-            ),
+            v2g.df.filter(f.col("distance") <= cls.LOCUS_TO_GENE_WINDOW),
             on="variantId",
             how="left",
         ).withColumn(
@@ -101,13 +107,15 @@ def create_full_set(positive_set: DataFrame, v2g: V2G) -> DataFrame:
                 (f.col("positives.geneId") == f.col("negatives.geneId"))
                 # to keep the positives that are outside the v2g dataset
                 | (f.col("negatives.geneId").isNull()),
-                f.lit(OpenTargetsL2GGoldStandard.GS_POSITIVE_LABEL),
-            ).otherwise(OpenTargetsL2GGoldStandard.GS_NEGATIVE_LABEL),
+                f.lit(cls.GS_POSITIVE_LABEL),
+            ).otherwise(cls.GS_NEGATIVE_LABEL),
         )
 
-    @staticmethod
+    @classmethod
     def remove_false_negatives(
-        full_set: DataFrame, interactions_df: DataFrame
+        cls: Type[OpenTargetsL2GGoldStandard],
+        full_set: DataFrame,
+        interactions_df: DataFrame,
     ) -> DataFrame:
         """Remove redundant loci by testing they are truly independent.
 
@@ -128,7 +136,7 @@ def remove_false_negatives(
             )
             .withColumn(
                 "interacting",
-                (f.col("score") > OpenTargetsL2GGoldStandard.INTERACTION_THRESHOLD),
+                (f.col("score") > cls.INTERACTION_THRESHOLD),
             )
             .filter(
                 ~(
@@ -144,9 +152,11 @@ def remove_false_negatives(
             )
         )
 
-    @staticmethod
+    @classmethod
     def remove_redundant_locus(
-        full_set: DataFrame, study_locus_overlap: StudyLocusOverlap
+        cls: Type[OpenTargetsL2GGoldStandard],
+        full_set: DataFrame,
+        study_locus_overlap: StudyLocusOverlap,
     ) -> DataFrame:
         """Remove redundant loci by testing they are truly independent.
 

From 1518156f4f44aa10d544e45ca9bb321ffc485d97 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Irene=20L=C3=B3pez?= <irene.lopezs@protonmail.com>
Date: Fri, 17 Nov 2023 10:58:32 +0000
Subject: [PATCH 06/25] refactor(gold_standard): move logic to refine gold
 standards to `L2GGoldStandard`

---
 src/otg/dataset/l2g_gold_standard.py          | 104 ++++++++++++++-
 .../open_targets/l2g_gold_standard.py         | 125 ++----------------
 2 files changed, 111 insertions(+), 118 deletions(-)

diff --git a/src/otg/dataset/l2g_gold_standard.py b/src/otg/dataset/l2g_gold_standard.py
index 44470581f..b5f1189a9 100644
--- a/src/otg/dataset/l2g_gold_standard.py
+++ b/src/otg/dataset/l2g_gold_standard.py
@@ -2,9 +2,12 @@
 from __future__ import annotations
 
 from dataclasses import dataclass
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Type
+
+import pyspark.sql.functions as f
 
 from otg.common.schemas import parse_spark_schema
+from otg.common.spark_helpers import get_record_with_maximum_value
 from otg.dataset.dataset import Dataset
 
 if TYPE_CHECKING:
@@ -19,6 +22,8 @@
 class L2GGoldStandard(Dataset):
     """L2G gold standard dataset."""
 
+    INTERACTION_THRESHOLD = 0.7
+
     @classmethod
     def from_otg_curation(
         cls: type[L2GGoldStandard],
@@ -42,8 +47,34 @@ def from_otg_curation(
             OpenTargetsL2GGoldStandard,
         )
 
-        return OpenTargetsL2GGoldStandard.as_l2g_gold_standard(
-            gold_standard_curation, v2g, study_locus_overlap, interactions
+        interactions_df = cls.process_gene_interactions(interactions)
+
+        return (
+            OpenTargetsL2GGoldStandard.as_l2g_gold_standard(gold_standard_curation, v2g)
+            .filter_unique_associations(study_locus_overlap)
+            .remove_false_negatives(interactions_df)
+        )
+
+    @classmethod
+    def process_gene_interactions(
+        cls: Type[L2GGoldStandard], interactions: DataFrame
+    ) -> DataFrame:
+        """Extract top scoring gene-gene interaction from the interactions dataset of the Platform.
+
+        Args:
+            interactions (DataFrame): Gene-gene interactions dataset
+
+        Returns:
+            DataFrame: Top scoring gene-gene interaction per pair of genes
+        """
+        return get_record_with_maximum_value(
+            interactions,
+            ["targetA", "targetB"],
+            "scoring",
+        ).selectExpr(
+            "targetA as geneIdA",
+            "targetB as geneIdB",
+            "scoring as score",
         )
 
     @classmethod
@@ -54,3 +85,70 @@ def get_schema(cls: type[L2GGoldStandard]) -> StructType:
             StructType: Spark schema for the L2GGoldStandard dataset
         """
         return parse_spark_schema("l2g_gold_standard.json")
+
+    def filter_unique_associations(
+        self: L2GGoldStandard,
+        study_locus_overlap: StudyLocusOverlap,
+    ) -> L2GGoldStandard:
+        """Refines the gold standard by filtering out redundant loci, i.e. loci that are not truly independent of each other.
+
+        Args:
+            study_locus_overlap (StudyLocusOverlap): A dataset detailing variants that overlap between StudyLocus.
+
+        Returns:
+            L2GGoldStandard: L2GGoldStandard updated to exclude false negatives and redundant positives.
+        """
+        # TODO: Test this logic
+        self.df = (
+            self.df.alias("left")
+            .join(
+                study_locus_overlap.df.select(
+                    "leftStudyLocusId", "rightStudyLocusId"
+                ).alias("right"),
+                (f.col("left.variantId") == f.col("right.leftStudyLocusId"))
+                | (f.col("left.variantId") == f.col("right.rightStudyLocusId")),
+                how="left",
+            )
+            .distinct()
+        )
+        return self
+
+    def remove_false_negatives(
+        self: L2GGoldStandard,
+        interactions_df: DataFrame,
+    ) -> L2GGoldStandard:
+        """Refines the gold standard to remove negative gold standard instances where the gene interacts with a positive gene.
+
+        Args:
+            interactions_df (DataFrame): Top scoring gene-gene interaction per pair of genes
+
+        Returns:
+            L2GGoldStandard: A refined set of locus-to-gene associations with increased reliability, having excluded loci that were likely false negatives due to gene-gene interaction confounding.
+        """
+        # TODO: Test this logic
+        self.df = (
+            self.df.alias("left")
+            .join(
+                interactions_df.alias("interactions"),
+                (f.col("left.geneId") == f.col("interactions.geneIdA"))
+                | (f.col("left.geneId") == f.col("interactions.geneIdB")),
+                how="left",
+            )
+            .withColumn(
+                "interacting",
+                (f.col("score") > self.INTERACTION_THRESHOLD),
+            )
+            .filter(
+                ~(
+                    (
+                        f.col("goldStandardSet") == 0
+                    )  # TODO: goldStandardSet is a string, not an int
+                    & (f.col("interacting"))
+                    & (
+                        (f.col("left.geneId") == f.col("interactions.geneIdA"))
+                        | (f.col("left.geneId") == f.col("interactions.geneIdB"))
+                    )
+                )
+            )
+        )
+        return self
diff --git a/src/otg/datasource/open_targets/l2g_gold_standard.py b/src/otg/datasource/open_targets/l2g_gold_standard.py
index 2e3a7fe8d..5b47468f7 100644
--- a/src/otg/datasource/open_targets/l2g_gold_standard.py
+++ b/src/otg/datasource/open_targets/l2g_gold_standard.py
@@ -6,10 +6,8 @@
 import pyspark.sql.functions as f
 from pyspark.sql import DataFrame
 
-from otg.common.spark_helpers import get_record_with_maximum_value
 from otg.dataset.l2g_gold_standard import L2GGoldStandard
 from otg.dataset.study_locus import StudyLocus
-from otg.dataset.study_locus_overlap import StudyLocusOverlap
 from otg.dataset.v2g import V2G
 
 
@@ -24,32 +22,9 @@ class OpenTargetsL2GGoldStandard:
     LOCUS_TO_GENE_WINDOW = 500_000
     GS_POSITIVE_LABEL = "positive"
     GS_NEGATIVE_LABEL = "negative"
-    INTERACTION_THRESHOLD = 0.7
 
     @classmethod
-    def process_gene_interactions(
-        cls: Type[OpenTargetsL2GGoldStandard], interactions: DataFrame
-    ) -> DataFrame:
-        """Extract top scoring gene-gene interaction from the interactions dataset of the Platform.
-
-        Args:
-            interactions (DataFrame): Gene-gene interactions dataset
-
-        Returns:
-            DataFrame: Top scoring gene-gene interaction per pair of genes
-        """
-        return get_record_with_maximum_value(
-            interactions,
-            ["targetA", "targetB"],
-            "scoring",
-        ).selectExpr(
-            "targetA as geneIdA",
-            "targetB as geneIdB",
-            "scoring as score",
-        )
-
-    @classmethod
-    def create_positive_set(
+    def parse_positive_curation(
         cls: Type[OpenTargetsL2GGoldStandard], gold_standard_curation: DataFrame
     ) -> DataFrame:
         """Parse positive set from gold standard curation.
@@ -85,18 +60,21 @@ def create_positive_set(
         )
 
     @classmethod
-    def create_full_set(
+    def expand_gold_standard_with_negatives(
         cls: Type[OpenTargetsL2GGoldStandard], positive_set: DataFrame, v2g: V2G
     ) -> DataFrame:
         """Create full set of positive and negative evidence of locus to gene associations.
 
+        Negative evidence consists of all genes within a window of 500kb of the lead variant that are not in the positive set.
+
         Args:
-            positive_set (DataFrame): Positive set
+            positive_set (DataFrame): Positive set from curation
             v2g (V2G): Variant to gene dataset to bring distance between a variant and a gene's TSS
 
         Returns:
             DataFrame: Full set of positive and negative evidence of locus to gene associations
         """
+        # TODO: test function
         return positive_set.join(
             v2g.df.filter(f.col("distance") <= cls.LOCUS_TO_GENE_WINDOW),
             on="variantId",
@@ -111,107 +89,24 @@ def create_full_set(
             ).otherwise(cls.GS_NEGATIVE_LABEL),
         )
 
-    @classmethod
-    def remove_false_negatives(
-        cls: Type[OpenTargetsL2GGoldStandard],
-        full_set: DataFrame,
-        interactions_df: DataFrame,
-    ) -> DataFrame:
-        """Remove redundant loci by testing they are truly independent.
-
-        Args:
-            full_set (DataFrame): Full set of positive and negative evidence of locus to gene associations. These include false negatives.
-            interactions_df (DataFrame): Top scoring gene-gene interaction per pair of genes
-
-        Returns:
-            DataFrame: Full set of positive and negative evidence of locus to gene associations. False negatives are removed.
-        """
-        return (
-            full_set.alias("left")
-            .join(
-                interactions_df.alias("interactions"),
-                (f.col("left.geneId") == f.col("interactions.geneIdA"))
-                | (f.col("left.geneId") == f.col("interactions.geneIdB")),
-                how="left",
-            )
-            .withColumn(
-                "interacting",
-                (f.col("score") > cls.INTERACTION_THRESHOLD),
-            )
-            .filter(
-                ~(
-                    (
-                        f.col("goldStandardSet") == 0
-                    )  # bugfix: goldStandardSet is a string, not an int
-                    & (f.col("interacting"))
-                    & (
-                        (f.col("left.geneId") == f.col("interactions.geneIdA"))
-                        | (f.col("left.geneId") == f.col("interactions.geneIdB"))
-                    )
-                )
-            )
-        )
-
-    @classmethod
-    def remove_redundant_locus(
-        cls: Type[OpenTargetsL2GGoldStandard],
-        full_set: DataFrame,
-        study_locus_overlap: StudyLocusOverlap,
-    ) -> DataFrame:
-        """Remove redundant loci by testing they are truly independent.
-
-        Args:
-            full_set (DataFrame): Full set of positive and negative evidence of locus to gene associations. These include false negatives.
-            study_locus_overlap (StudyLocusOverlap): Study locus overlap dataset to remove duplicated loci
-
-        Returns:
-            DataFrame: Full set of positive and negative evidence of locus to gene associations. False negatives are removed. # TODO rename
-        """
-        return (
-            full_set.alias("left")
-            .join(
-                study_locus_overlap.df.select(
-                    "leftStudyLocusId", "rightStudyLocusId"
-                ).alias("right"),
-                (f.col("left.variantId") == f.col("right.leftStudyLocusId"))
-                | (f.col("left.variantId") == f.col("right.rightStudyLocusId")),
-                how="left",
-            )
-            .distinct()
-        )
-
     @classmethod
     def as_l2g_gold_standard(
         cls: type[OpenTargetsL2GGoldStandard],
         gold_standard_curation: DataFrame,
         v2g: V2G,
-        study_locus_overlap: StudyLocusOverlap,
-        interactions: DataFrame,
     ) -> L2GGoldStandard:
         """Initialise L2GGoldStandard from source dataset.
 
         Args:
             gold_standard_curation (DataFrame): Gold standard curation dataframe, extracted from https://github.com/opentargets/genetics-gold-standards
             v2g (V2G): Variant to gene dataset to bring distance between a variant and a gene's TSS
-            study_locus_overlap (StudyLocusOverlap): Study locus overlap dataset to remove duplicated loci
-            interactions (DataFrame): Gene-gene interactions dataset to remove negative cases where the gene interacts with a positive gene
 
         Returns:
-            L2GGoldStandard: L2G Gold Standard dataset
+            L2GGoldStandard: L2G Gold Standard dataset. False negatives have not yet been removed.
         """
-        interactions_df = cls.process_gene_interactions(interactions)
-
-        positive_set = cls.create_positive_set(gold_standard_curation)
-
-        full_set = cls.create_full_set(positive_set, v2g)
-
-        final_set = full_set.transform(
-            # TODO: move logic to L2GGoldStandard
-            cls.remove_redundant_locus,
-            study_locus_overlap,
-        ).transform(cls.remove_false_negatives, interactions_df)
-
         return L2GGoldStandard(
-            _df=final_set,
+            _df=cls.parse_positive_curation(gold_standard_curation).transform(
+                cls.expand_gold_standard_with_negatives, v2g
+            ),
             _schema=L2GGoldStandard.get_schema(),
         )

From ab29c9a70281186c947f5ce93b4c8bdd4ea4bc96 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Irene=20L=C3=B3pez?= <irene.lopezs@protonmail.com>
Date: Fri, 17 Nov 2023 12:18:12 +0000
Subject: [PATCH 07/25] test: add `test_parse_positive_curation`

---
 .../open_targets/test_l2g_gold_standard.py         | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/tests/datasource/open_targets/test_l2g_gold_standard.py b/tests/datasource/open_targets/test_l2g_gold_standard.py
index d3435e624..bced217e0 100644
--- a/tests/datasource/open_targets/test_l2g_gold_standard.py
+++ b/tests/datasource/open_targets/test_l2g_gold_standard.py
@@ -9,23 +9,27 @@
 from otg.datasource.open_targets.l2g_gold_standard import OpenTargetsL2GGoldStandard
 
 if TYPE_CHECKING:
-    from otg.dataset.study_locus_overlap import StudyLocusOverlap
     from otg.dataset.v2g import V2G
 
 
 def test_open_targets_as_l2g_gold_standard(
     sample_l2g_gold_standard: DataFrame,
     mock_v2g: V2G,
-    mock_study_locus_overlap: StudyLocusOverlap,
-    sample_otp_interactions: DataFrame,
 ) -> None:
     """Test L2G gold standard from OTG curation."""
     assert isinstance(
         OpenTargetsL2GGoldStandard.as_l2g_gold_standard(
             sample_l2g_gold_standard,
             mock_v2g,
-            mock_study_locus_overlap,
-            sample_otp_interactions,
         ),
         L2GGoldStandard,
     )
+
+
+def test_parse_positive_curation(
+    sample_l2g_gold_standard: DataFrame,
+) -> None:
+    """Test parsing curation as the positive set."""
+    expected_cols = ["studyLocusId", "studyId", "variantId", "geneId", "sources"]
+    df = OpenTargetsL2GGoldStandard.parse_positive_curation(sample_l2g_gold_standard)
+    assert df.columns == expected_cols, "GS parsing has a different schema."

From dd95d9c51fec65f754d702a67301925fa7d97827 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Irene=20L=C3=B3pez?= <irene.lopezs@protonmail.com>
Date: Fri, 17 Nov 2023 13:03:32 +0000
Subject: [PATCH 08/25] test: fix and test logic in
 `expand_gold_standard_with_negatives`

---
 .../open_targets/l2g_gold_standard.py         | 45 +++++++++++++------
 .../open_targets/test_l2g_gold_standard.py    | 36 ++++++++++++++-
 2 files changed, 66 insertions(+), 15 deletions(-)

diff --git a/src/otg/datasource/open_targets/l2g_gold_standard.py b/src/otg/datasource/open_targets/l2g_gold_standard.py
index 5b47468f7..0611023e1 100644
--- a/src/otg/datasource/open_targets/l2g_gold_standard.py
+++ b/src/otg/datasource/open_targets/l2g_gold_standard.py
@@ -75,18 +75,32 @@ def expand_gold_standard_with_negatives(
             DataFrame: Full set of positive and negative evidence of locus to gene associations
         """
         # TODO: test function
-        return positive_set.join(
-            v2g.df.filter(f.col("distance") <= cls.LOCUS_TO_GENE_WINDOW),
-            on="variantId",
-            how="left",
-        ).withColumn(
-            "goldStandardSet",
-            f.when(
-                (f.col("positives.geneId") == f.col("negatives.geneId"))
-                # to keep the positives that are outside the v2g dataset
-                | (f.col("negatives.geneId").isNull()),
-                f.lit(cls.GS_POSITIVE_LABEL),
-            ).otherwise(cls.GS_NEGATIVE_LABEL),
+        return (
+            positive_set.withColumnRenamed("geneId", "curated_geneId")
+            .join(
+                v2g.df.selectExpr(
+                    "variantId", "geneId as non_curated_geneId", "distance"
+                ).filter(f.col("distance") <= cls.LOCUS_TO_GENE_WINDOW),
+                on="variantId",
+                how="left",
+            )
+            .withColumn(
+                "goldStandardSet",
+                f.when(
+                    (f.col("curated_geneId") == f.col("non_curated_geneId"))
+                    # to keep the positives that are outside the v2g dataset
+                    | (f.col("non_curated_geneId").isNull()),
+                    f.lit(cls.GS_POSITIVE_LABEL),
+                ).otherwise(cls.GS_NEGATIVE_LABEL),
+            )
+            .withColumn(
+                "geneId",
+                f.when(
+                    f.col("goldStandardSet") == cls.GS_POSITIVE_LABEL,
+                    f.col("curated_geneId"),
+                ).otherwise(f.col("non_curated_geneId")),
+            )
+            .drop("distance", "curated_geneId", "non_curated_geneId")
         )
 
     @classmethod
@@ -105,8 +119,11 @@ def as_l2g_gold_standard(
             L2GGoldStandard: L2G Gold Standard dataset. False negatives have not yet been removed.
         """
         return L2GGoldStandard(
-            _df=cls.parse_positive_curation(gold_standard_curation).transform(
-                cls.expand_gold_standard_with_negatives, v2g
+            _df=cls.parse_positive_curation(gold_standard_curation)
+            .transform(cls.expand_gold_standard_with_negatives, v2g)
+            .drop(
+                "variantId",
+                "studyId",
             ),
             _schema=L2GGoldStandard.get_schema(),
         )
diff --git a/tests/datasource/open_targets/test_l2g_gold_standard.py b/tests/datasource/open_targets/test_l2g_gold_standard.py
index bced217e0..7f075f865 100644
--- a/tests/datasource/open_targets/test_l2g_gold_standard.py
+++ b/tests/datasource/open_targets/test_l2g_gold_standard.py
@@ -6,10 +6,11 @@
 from pyspark.sql import DataFrame
 
 from otg.dataset.l2g_gold_standard import L2GGoldStandard
+from otg.dataset.v2g import V2G
 from otg.datasource.open_targets.l2g_gold_standard import OpenTargetsL2GGoldStandard
 
 if TYPE_CHECKING:
-    from otg.dataset.v2g import V2G
+    from pyspark.sql.session import SparkSession
 
 
 def test_open_targets_as_l2g_gold_standard(
@@ -33,3 +34,36 @@ def test_parse_positive_curation(
     expected_cols = ["studyLocusId", "studyId", "variantId", "geneId", "sources"]
     df = OpenTargetsL2GGoldStandard.parse_positive_curation(sample_l2g_gold_standard)
     assert df.columns == expected_cols, "GS parsing has a different schema."
+
+
+def test_expand_gold_standard_with_negatives(spark: SparkSession) -> None:
+    """Test expanding positive set with negative set."""
+    sample_positive_set = spark.createDataFrame(
+        [
+            ("variant1", "gene1", "study1"),
+            ("variant2", "gene2", "study1"),
+        ],
+        ["variantId", "geneId", "studyId"],
+    )
+    sample_v2g_df = spark.createDataFrame(
+        [
+            ("variant1", "gene1", 5, "X", "X", "X"),
+            ("variant1", "gene3", 10, "X", "X", "X"),
+        ],
+        ["variantId", "geneId", "distance", "chromosome", "datatypeId", "datasourceId"],
+    )
+
+    expected_expanded_gs = spark.createDataFrame(
+        [
+            ("variant1", "study1", "negative", "gene3"),
+            ("variant1", "study1", "positive", "gene1"),
+            ("variant2", "study1", "positive", "gene2"),
+        ],
+        ["variantId", "geneId", "goldStandardSet", "studyId"],
+    )
+    observed_df = OpenTargetsL2GGoldStandard.expand_gold_standard_with_negatives(
+        sample_positive_set, V2G(_df=sample_v2g_df, _schema=V2G.get_schema())
+    )
+    assert (
+        observed_df.collect() == expected_expanded_gs.collect()
+    ), "GS expansion is not as expected."

From 8347b2f896d5a808a57d924050cd7f498eb30a2a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Irene=20L=C3=B3pez?= <irene.lopezs@protonmail.com>
Date: Fri, 17 Nov 2023 15:09:10 +0000
Subject: [PATCH 09/25] test: add
 `test_expand_gold_standard_with_negatives_same_positives`

---
 .../open_targets/test_l2g_gold_standard.py    | 92 +++++++++++++------
 1 file changed, 63 insertions(+), 29 deletions(-)

diff --git a/tests/datasource/open_targets/test_l2g_gold_standard.py b/tests/datasource/open_targets/test_l2g_gold_standard.py
index 7f075f865..0ae3e0a08 100644
--- a/tests/datasource/open_targets/test_l2g_gold_standard.py
+++ b/tests/datasource/open_targets/test_l2g_gold_standard.py
@@ -3,6 +3,7 @@
 
 from typing import TYPE_CHECKING
 
+import pytest
 from pyspark.sql import DataFrame
 
 from otg.dataset.l2g_gold_standard import L2GGoldStandard
@@ -36,34 +37,67 @@ def test_parse_positive_curation(
     assert df.columns == expected_cols, "GS parsing has a different schema."
 
 
-def test_expand_gold_standard_with_negatives(spark: SparkSession) -> None:
+class TestExpandGoldStandardWithNegatives:
     """Test expanding positive set with negative set."""
-    sample_positive_set = spark.createDataFrame(
-        [
-            ("variant1", "gene1", "study1"),
-            ("variant2", "gene2", "study1"),
-        ],
-        ["variantId", "geneId", "studyId"],
-    )
-    sample_v2g_df = spark.createDataFrame(
-        [
-            ("variant1", "gene1", 5, "X", "X", "X"),
-            ("variant1", "gene3", 10, "X", "X", "X"),
-        ],
-        ["variantId", "geneId", "distance", "chromosome", "datatypeId", "datasourceId"],
-    )
 
-    expected_expanded_gs = spark.createDataFrame(
-        [
-            ("variant1", "study1", "negative", "gene3"),
-            ("variant1", "study1", "positive", "gene1"),
-            ("variant2", "study1", "positive", "gene2"),
-        ],
-        ["variantId", "geneId", "goldStandardSet", "studyId"],
-    )
-    observed_df = OpenTargetsL2GGoldStandard.expand_gold_standard_with_negatives(
-        sample_positive_set, V2G(_df=sample_v2g_df, _schema=V2G.get_schema())
-    )
-    assert (
-        observed_df.collect() == expected_expanded_gs.collect()
-    ), "GS expansion is not as expected."
+    observed_df: DataFrame
+    expected_expanded_gs: DataFrame
+    sample_positive_set: DataFrame
+
+    def test_expand_gold_standard_with_negatives_logic(
+        self: TestExpandGoldStandardWithNegatives, spark: SparkSession
+    ) -> None:
+        """Test that expanding the positive set with the negative set coincides with the expected results."""
+        assert (
+            self.observed_df.collect() == self.expected_expanded_gs.collect()
+        ), "GS expansion is not as expected."
+
+    def test_expand_gold_standard_with_negatives_same_positives(
+        self: TestExpandGoldStandardWithNegatives, spark: SparkSession
+    ) -> None:
+        """Test that expanding the positive set with the negative set doesn't remove any positives."""
+        assert (
+            self.observed_df.filter("goldStandardSet == 'positive'").count()
+            == self.sample_positive_set.count()
+        ), "GS expansion has removed positives."
+
+    @pytest.fixture(autouse=True)
+    def _setup(self: TestExpandGoldStandardWithNegatives, spark: SparkSession) -> None:
+        """Prepare fixtures for TestExpandGoldStandardWithNegatives."""
+        self.sample_positive_set = spark.createDataFrame(
+            [
+                ("variant1", "gene1", "study1"),
+                ("variant2", "gene2", "study1"),
+            ],
+            ["variantId", "geneId", "studyId"],
+        )
+
+        sample_v2g_df = spark.createDataFrame(
+            [
+                ("variant1", "gene1", 5, "X", "X", "X"),
+                ("variant1", "gene3", 10, "X", "X", "X"),
+            ],
+            [
+                "variantId",
+                "geneId",
+                "distance",
+                "chromosome",
+                "datatypeId",
+                "datasourceId",
+            ],
+        )
+
+        self.expected_expanded_gs = spark.createDataFrame(
+            [
+                ("variant1", "study1", "negative", "gene3"),
+                ("variant1", "study1", "positive", "gene1"),
+                ("variant2", "study1", "positive", "gene2"),
+            ],
+            ["variantId", "geneId", "goldStandardSet", "studyId"],
+        )
+        self.observed_df = (
+            OpenTargetsL2GGoldStandard.expand_gold_standard_with_negatives(
+                self.sample_positive_set,
+                V2G(_df=sample_v2g_df, _schema=V2G.get_schema()),
+            )
+        )

From ca94412df9a0ebc950b3b10cb0b32c403eacc0d0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Irene=20L=C3=B3pez?= <irene.lopezs@protonmail.com>
Date: Fri, 17 Nov 2023 15:40:56 +0000
Subject: [PATCH 10/25] test: testing for `process_gene_interactions`

---
 src/otg/dataset/l2g_gold_standard.py          | 31 +++++++++++++------
 .../open_targets/l2g_gold_standard.py         |  1 -
 tests/dataset/test_l2g.py                     | 16 +++++++++-
 3 files changed, 36 insertions(+), 12 deletions(-)

diff --git a/src/otg/dataset/l2g_gold_standard.py b/src/otg/dataset/l2g_gold_standard.py
index b5f1189a9..2a68762a8 100644
--- a/src/otg/dataset/l2g_gold_standard.py
+++ b/src/otg/dataset/l2g_gold_standard.py
@@ -55,6 +55,15 @@ def from_otg_curation(
             .remove_false_negatives(interactions_df)
         )
 
+    @classmethod
+    def get_schema(cls: type[L2GGoldStandard]) -> StructType:
+        """Provides the schema for the L2GGoldStandard dataset.
+
+        Returns:
+            StructType: Spark schema for the L2GGoldStandard dataset
+        """
+        return parse_spark_schema("l2g_gold_standard.json")
+
     @classmethod
     def process_gene_interactions(
         cls: Type[L2GGoldStandard], interactions: DataFrame
@@ -62,10 +71,21 @@ def process_gene_interactions(
         """Extract top scoring gene-gene interaction from the interactions dataset of the Platform.
 
         Args:
-            interactions (DataFrame): Gene-gene interactions dataset
+            interactions (DataFrame): Gene-gene interactions dataset from the Open Targets Platform
 
         Returns:
             DataFrame: Top scoring gene-gene interaction per pair of genes
+
+        Examples:
+            >>> interactions = spark.createDataFrame([("gene1", "gene2", 0.8), ("gene1", "gene2", 0.5), ("gene2", "gene3", 0.7)], ["targetA", "targetB", "scoring"])
+            >>> L2GGoldStandard.process_gene_interactions(interactions).show()
+            +-------+-------+-----+
+            |geneIdA|geneIdB|score|
+            +-------+-------+-----+
+            |  gene1|  gene2|  0.8|
+            |  gene2|  gene3|  0.7|
+            +-------+-------+-----+
+            <BLANKLINE>
         """
         return get_record_with_maximum_value(
             interactions,
@@ -77,15 +97,6 @@ def process_gene_interactions(
             "scoring as score",
         )
 
-    @classmethod
-    def get_schema(cls: type[L2GGoldStandard]) -> StructType:
-        """Provides the schema for the L2GGoldStandard dataset.
-
-        Returns:
-            StructType: Spark schema for the L2GGoldStandard dataset
-        """
-        return parse_spark_schema("l2g_gold_standard.json")
-
     def filter_unique_associations(
         self: L2GGoldStandard,
         study_locus_overlap: StudyLocusOverlap,
diff --git a/src/otg/datasource/open_targets/l2g_gold_standard.py b/src/otg/datasource/open_targets/l2g_gold_standard.py
index 0611023e1..cc8f4b710 100644
--- a/src/otg/datasource/open_targets/l2g_gold_standard.py
+++ b/src/otg/datasource/open_targets/l2g_gold_standard.py
@@ -74,7 +74,6 @@ def expand_gold_standard_with_negatives(
         Returns:
             DataFrame: Full set of positive and negative evidence of locus to gene associations
         """
-        # TODO: test function
         return (
             positive_set.withColumnRenamed("geneId", "curated_geneId")
             .join(
diff --git a/tests/dataset/test_l2g.py b/tests/dataset/test_l2g.py
index 140a26c01..2ed278e07 100644
--- a/tests/dataset/test_l2g.py
+++ b/tests/dataset/test_l2g.py
@@ -1,10 +1,15 @@
-"""Tests on LD index."""
+"""Tests on L2G datasets."""
 from __future__ import annotations
 
+from typing import TYPE_CHECKING
+
 from otg.dataset.l2g_feature_matrix import L2GFeatureMatrix
 from otg.dataset.l2g_gold_standard import L2GGoldStandard
 from otg.dataset.l2g_prediction import L2GPrediction
 
+if TYPE_CHECKING:
+    from pyspark.sql import DataFrame
+
 
 def test_feature_matrix(mock_l2g_feature_matrix: L2GFeatureMatrix) -> None:
     """Test L2G Feature Matrix creation with mock data."""
@@ -16,6 +21,15 @@ def test_gold_standard(mock_l2g_gold_standard: L2GFeatureMatrix) -> None:
     assert isinstance(mock_l2g_gold_standard, L2GGoldStandard)
 
 
+def test_process_gene_interactions(sample_otp_interactions: DataFrame) -> None:
+    """Tests processing of gene interactions from OTP."""
+    expected_cols = ["geneIdA", "geneIdB", "score"]
+    observed_df = L2GGoldStandard.process_gene_interactions(sample_otp_interactions)
+    assert (
+        observed_df.columns == expected_cols
+    ), "Gene interactions has a different schema."
+
+
 def test_predictions(mock_l2g_predictions: L2GFeatureMatrix) -> None:
     """Test L2G predictions creation with mock data."""
     assert isinstance(mock_l2g_predictions, L2GPrediction)

From 6a339761e80361ee3aa5a8ae1fb40bf785b665cf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Irene=20L=C3=B3pez?= <irene.lopezs@protonmail.com>
Date: Mon, 20 Nov 2023 12:32:43 +0000
Subject: [PATCH 11/25] chore: add `variantId` to gold standards schema

---
 src/otg/assets/schemas/l2g_gold_standard.json | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/otg/assets/schemas/l2g_gold_standard.json b/src/otg/assets/schemas/l2g_gold_standard.json
index 98e3e906b..d9d79ce09 100644
--- a/src/otg/assets/schemas/l2g_gold_standard.json
+++ b/src/otg/assets/schemas/l2g_gold_standard.json
@@ -7,6 +7,12 @@
       "nullable": false,
       "metadata": {}
     },
+    {
+      "name": "variantId",
+      "type": "string",
+      "nullable": false,
+      "metadata": {}
+    },
     {
       "name": "geneId",
       "type": "string",

From c75a6634bf55113c3bff15fbab2d169778c07eb7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Irene=20L=C3=B3pez?= <irene.lopezs@protonmail.com>
Date: Mon, 20 Nov 2023 12:34:37 +0000
Subject: [PATCH 12/25] chore: change `sources` in gold standards schema to a
 nullable

---
 src/otg/assets/schemas/l2g_gold_standard.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/otg/assets/schemas/l2g_gold_standard.json b/src/otg/assets/schemas/l2g_gold_standard.json
index d9d79ce09..ba494b9aa 100644
--- a/src/otg/assets/schemas/l2g_gold_standard.json
+++ b/src/otg/assets/schemas/l2g_gold_standard.json
@@ -28,7 +28,7 @@
     {
       "metadata": {},
       "name": "sources",
-      "nullable": false,
+      "nullable": true,
       "type": {
         "containsNull": true,
         "elementType": "string",

From 80077267bac0b948e8618581bc3dfa10b39388cb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Irene=20L=C3=B3pez?= <irene.lopezs@protonmail.com>
Date: Mon, 20 Nov 2023 13:01:02 +0000
Subject: [PATCH 13/25] test: add `test_filter_unique_associations`

---
 src/otg/dataset/l2g_gold_standard.py |  3 +-
 tests/dataset/test_l2g.py            | 56 +++++++++++++++++++++++++++-
 2 files changed, 57 insertions(+), 2 deletions(-)

diff --git a/src/otg/dataset/l2g_gold_standard.py b/src/otg/dataset/l2g_gold_standard.py
index 2a68762a8..59f437606 100644
--- a/src/otg/dataset/l2g_gold_standard.py
+++ b/src/otg/dataset/l2g_gold_standard.py
@@ -109,7 +109,7 @@ def filter_unique_associations(
         Returns:
             L2GGoldStandard: L2GGoldStandard updated to exclude false negatives and redundant positives.
         """
-        # TODO: Test this logic
+        cols_to_keep = self.df.columns
         self.df = (
             self.df.alias("left")
             .join(
@@ -120,6 +120,7 @@ def filter_unique_associations(
                 | (f.col("left.variantId") == f.col("right.rightStudyLocusId")),
                 how="left",
             )
+            .select(*cols_to_keep)
             .distinct()
         )
         return self
diff --git a/tests/dataset/test_l2g.py b/tests/dataset/test_l2g.py
index 2ed278e07..fc93f6cad 100644
--- a/tests/dataset/test_l2g.py
+++ b/tests/dataset/test_l2g.py
@@ -6,9 +6,10 @@
 from otg.dataset.l2g_feature_matrix import L2GFeatureMatrix
 from otg.dataset.l2g_gold_standard import L2GGoldStandard
 from otg.dataset.l2g_prediction import L2GPrediction
+from otg.dataset.study_locus_overlap import StudyLocusOverlap
 
 if TYPE_CHECKING:
-    from pyspark.sql import DataFrame
+    from pyspark.sql import DataFrame, SparkSession
 
 
 def test_feature_matrix(mock_l2g_feature_matrix: L2GFeatureMatrix) -> None:
@@ -33,3 +34,56 @@ def test_process_gene_interactions(sample_otp_interactions: DataFrame) -> None:
 def test_predictions(mock_l2g_predictions: L2GFeatureMatrix) -> None:
     """Test L2G predictions creation with mock data."""
     assert isinstance(mock_l2g_predictions, L2GPrediction)
+
+
+def test_filter_unique_associations(spark: SparkSession) -> None:
+    """Test that filter_unique_associations drops an overlapping locus pointing to the same gene."""
+    mock_l2g_gs_df = spark.createDataFrame(
+        [
+            (1, "variant1", "gene1", "positive"),
+            (
+                2,
+                "variant2",
+                "gene1",
+                "negative",
+            ),  # overlaps study locus 1 and points to the same gene: has to be dropped
+            (
+                3,
+                "variant3",
+                "gene1",
+                "positive",
+            ),  # does not overlap study locus 1 and points to the same gene: has to be kept
+            (
+                4,
+                "variant4",
+                "gene2",
+                "positive",
+            ),  # overlaps study locus 1 but points to a different gene: has to be kept
+        ],
+        "studyLocusId LONG, variantId STRING, geneId STRING, goldStandardSet STRING",
+    )
+
+    mock_sl_overlap_df = spark.createDataFrame(
+        [(1, 2, "variant2"), (1, 4, "variant4")],
+        "leftStudyLocusId LONG, rightStudyLocusId LONG, tagVariantId STRING",
+    )
+
+    expected_df = spark.createDataFrame(
+        [
+            (1, "variant1", "gene1", "positive"),
+            (3, "variant3", "gene1", "positive"),
+            (4, "variant4", "gene2", "positive"),
+        ],
+        "studyLocusId LONG, variantId STRING, geneId STRING, goldStandardSet STRING",
+    )
+
+    mock_l2g_gs = L2GGoldStandard(
+        _df=mock_l2g_gs_df, _schema=L2GGoldStandard.get_schema()
+    )
+    mock_sl_overlap = StudyLocusOverlap(
+        _df=mock_sl_overlap_df, _schema=StudyLocusOverlap.get_schema()
+    )
+
+    observed_df = mock_l2g_gs.filter_unique_associations(mock_sl_overlap).df
+
+    assert observed_df.collect() == expected_df.collect()

From 9c0a042306f0e56ac42d1c67f354e65c6c252500 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Irene=20L=C3=B3pez?= <irene.lopezs@protonmail.com>
Date: Mon, 20 Nov 2023 17:01:19 +0000
Subject: [PATCH 14/25] feat(overlaps): add and test method to transform the
 overlaps into a square matrix

---
 src/otg/dataset/study_locus_overlap.py    | 17 +++++++++++++++
 tests/dataset/test_study_locus_overlap.py | 26 +++++++++++++++++++++++
 2 files changed, 43 insertions(+)

diff --git a/src/otg/dataset/study_locus_overlap.py b/src/otg/dataset/study_locus_overlap.py
index 5902f613f..d0730d723 100644
--- a/src/otg/dataset/study_locus_overlap.py
+++ b/src/otg/dataset/study_locus_overlap.py
@@ -47,3 +47,20 @@ def from_associations(
             StudyLocusOverlap: Study-locus overlap dataset
         """
         return study_locus.find_overlaps(study_index)
+
+    def _convert_to_square_matrix(self: StudyLocusOverlap) -> StudyLocusOverlap:
+        """Make the overlaps symmetric (square matrix) by also emitting each row with left/right swapped.
+
+        Returns:
+            StudyLocusOverlap: Original and swapped rows, deduplicated (swapped rows carry only 3 columns)
+        """
+        return StudyLocusOverlap(
+            _df=self.df.unionByName(
+                self.df.selectExpr(
+                    "leftStudyLocusId as rightStudyLocusId",
+                    "rightStudyLocusId as leftStudyLocusId",
+                    "tagVariantId",
+                )
+            ).distinct(),
+            _schema=self.get_schema(),
+        )
diff --git a/tests/dataset/test_study_locus_overlap.py b/tests/dataset/test_study_locus_overlap.py
index b16311d6c..23c3a4e65 100644
--- a/tests/dataset/test_study_locus_overlap.py
+++ b/tests/dataset/test_study_locus_overlap.py
@@ -1,6 +1,8 @@
 """Test study locus overlap dataset."""
 from __future__ import annotations
 
+from pyspark.sql import SparkSession
+
 from otg.dataset.study_locus_overlap import StudyLocusOverlap
 
 
@@ -9,3 +11,27 @@ def test_study_locus_overlap_creation(
 ) -> None:
     """Test study locus overlap creation with mock data."""
     assert isinstance(mock_study_locus_overlap, StudyLocusOverlap)
+
+
+def test_convert_to_square_matrix(spark: SparkSession) -> None:
+    """Test that _convert_to_square_matrix adds the mirrored row of each overlap."""
+    mock_sl_overlap = StudyLocusOverlap(
+        _df=spark.createDataFrame(
+            [
+                (1, 2, "variant2"),
+            ],
+            "leftStudyLocusId LONG, rightStudyLocusId LONG, tagVariantId STRING",
+        ),
+        _schema=StudyLocusOverlap.get_schema(),
+    )
+
+    expected_df = spark.createDataFrame(
+        [
+            (1, 2, "variant2"),
+            (2, 1, "variant2"),
+        ],
+        "leftStudyLocusId LONG, rightStudyLocusId LONG, tagVariantId STRING",
+    )
+    observed_df = mock_sl_overlap._convert_to_square_matrix().df
+    # distinct() guarantees no row order, so sort before comparing
+    assert observed_df.orderBy("leftStudyLocusId").collect() == expected_df.collect()

From dc7c423d6f668f7c8b8d4a072a85058beaf21a76 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Irene=20L=C3=B3pez?= <irene.lopezs@protonmail.com>
Date: Mon, 20 Nov 2023 17:02:08 +0000
Subject: [PATCH 15/25] chore(overlaps): chromosome and statistics are not
 mandatory fields in the schema

---
 src/otg/assets/schemas/study_locus_overlap.json | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/otg/assets/schemas/study_locus_overlap.json b/src/otg/assets/schemas/study_locus_overlap.json
index 962a1186d..103321f79 100644
--- a/src/otg/assets/schemas/study_locus_overlap.json
+++ b/src/otg/assets/schemas/study_locus_overlap.json
@@ -15,7 +15,7 @@
     {
       "metadata": {},
       "name": "chromosome",
-      "nullable": false,
+      "nullable": true,
       "type": "string"
     },
     {
@@ -27,7 +27,7 @@
     {
       "metadata": {},
       "name": "statistics",
-      "nullable": false,
+      "nullable": true,
       "type": {
         "fields": [
           {

From 28031b8a14dfad0ef91d7160d138567a7ead0e6d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Irene=20L=C3=B3pez?= <irene.lopezs@protonmail.com>
Date: Mon, 20 Nov 2023 17:05:35 +0000
Subject: [PATCH 16/25] feat(l2g_gold_standard): change
 `filter_unique_associations` logic

---
 src/otg/dataset/l2g_gold_standard.py | 38 ++++++++++++++++++++--------
 tests/dataset/test_l2g.py            |  2 +-
 2 files changed, 29 insertions(+), 11 deletions(-)

diff --git a/src/otg/dataset/l2g_gold_standard.py b/src/otg/dataset/l2g_gold_standard.py
index 59f437606..f838f0173 100644
--- a/src/otg/dataset/l2g_gold_standard.py
+++ b/src/otg/dataset/l2g_gold_standard.py
@@ -5,6 +5,7 @@
 from typing import TYPE_CHECKING, Type
 
 import pyspark.sql.functions as f
+from pyspark.sql import Window
 
 from otg.common.schemas import parse_spark_schema
 from otg.common.spark_helpers import get_record_with_maximum_value
@@ -101,7 +102,12 @@ def filter_unique_associations(
         self: L2GGoldStandard,
         study_locus_overlap: StudyLocusOverlap,
     ) -> L2GGoldStandard:
-        """Refines the gold standard to filter out loci that are not independent. redundant loci by testing they are truly independent.
+        """Refines the gold standard to filter out loci that are not independent.
+
+        Rules:
+        - If two loci point to the same gene, one positive and one negative, and have overlapping variants, we keep the positive one.
+        - If two loci point to the same gene, both positive or negative, and have overlapping variants, we drop one.
+        - If two loci point to different genes, and have overlapping variants, we keep both.
 
         Args:
             study_locus_overlap (StudyLocusOverlap): A dataset detailing variants that overlap between StudyLocus.
@@ -109,21 +115,33 @@ def filter_unique_associations(
         Returns:
             L2GGoldStandard: L2GGoldStandard updated to exclude false negatives and redundant positives.
         """
+        squared_overlaps = study_locus_overlap._convert_to_square_matrix()
         cols_to_keep = self.df.columns
-        self.df = (
+        unique_associations = (
             self.df.alias("left")
+            # identify all the study loci that point to the same gene
+            .withColumn(
+                "sl_same_gene",
+                f.collect_set("studyLocusId").over(Window.partitionBy("geneId")),
+            )
+            # identify all the study loci that have an overlapping variant
             .join(
-                study_locus_overlap.df.select(
-                    "leftStudyLocusId", "rightStudyLocusId"
-                ).alias("right"),
-                (f.col("left.variantId") == f.col("right.leftStudyLocusId"))
-                | (f.col("left.variantId") == f.col("right.rightStudyLocusId")),
-                how="left",
+                squared_overlaps.df.alias("right"),
+                (f.col("left.studyLocusId") == f.col("right.leftStudyLocusId"))
+                & (f.col("left.variantId") == f.col("right.tagVariantId")),
+                "left",
+            )
+            .withColumn(
+                "overlaps",
+                f.when(f.col("right.tagVariantId").isNotNull(), f.lit(True)).otherwise(
+                    f.lit(False)
+                ),
             )
+            # drop redundant rows: where the variantid overlaps and the gene is "explained" by more than one study locus
+            .filter(~((f.size("sl_same_gene") > 1) & (f.col("overlaps") == 1)))
             .select(*cols_to_keep)
-            .distinct()
         )
-        return self
+        return L2GGoldStandard(_df=unique_associations, _schema=self.get_schema())
 
     def remove_false_negatives(
         self: L2GGoldStandard,
diff --git a/tests/dataset/test_l2g.py b/tests/dataset/test_l2g.py
index fc93f6cad..3ca74ecab 100644
--- a/tests/dataset/test_l2g.py
+++ b/tests/dataset/test_l2g.py
@@ -82,7 +82,7 @@ def test_filter_unique_associations(spark: SparkSession) -> None:
     )
     mock_sl_overlap = StudyLocusOverlap(
         _df=mock_sl_overlap_df, _schema=StudyLocusOverlap.get_schema()
-    )
+    )._convert_to_square_matrix()
 
     observed_df = mock_l2g_gs.filter_unique_associations(mock_sl_overlap).df
 

From aa4246ccea29023e7022663eaec338eeb7a57f18 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Irene=20L=C3=B3pez?= <irene.lopezs@protonmail.com>
Date: Mon, 20 Nov 2023 17:23:32 +0000
Subject: [PATCH 17/25] test(l2g_gold_standard): add
 `test_remove_false_negatives`

---
 tests/dataset/test_l2g.py | 54 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)

diff --git a/tests/dataset/test_l2g.py b/tests/dataset/test_l2g.py
index 3ca74ecab..12c80d874 100644
--- a/tests/dataset/test_l2g.py
+++ b/tests/dataset/test_l2g.py
@@ -87,3 +87,57 @@ def test_filter_unique_associations(spark: SparkSession) -> None:
     observed_df = mock_l2g_gs.filter_unique_associations(mock_sl_overlap).df
 
     assert observed_df.collect() == expected_df.collect()
+
+
+def test_remove_false_negatives(spark: SparkSession) -> None:
+    """Test that `remove_false_negatives` drops negatives interacting with a positive gene."""
+    mock_l2g_gs_df = spark.createDataFrame(
+        [
+            (1, "variant1", "gene1", "positive"),
+            (
+                2,
+                "variant2",
+                "gene2",
+                "negative",
+            ),  # negative whose gene interacts (0.8) with positive gene1: has to be dropped
+            (
+                3,
+                "variant3",
+                "gene3",
+                "negative",
+            ),  # negative whose interaction with gene1 (0.5) is below INTERACTION_THRESHOLD: has to be kept
+            (
+                4,
+                "variant4",
+                "gene4",
+                "positive",
+            ),  # interacts (0.8) with gene1 but is itself positive: has to be kept
+        ],
+        "studyLocusId LONG, variantId STRING, geneId STRING, goldStandardSet STRING",
+    )
+
+    mock_interactions_df = spark.createDataFrame(
+        [
+            ("gene1", "gene2", 0.8),
+            ("gene1", "gene3", 0.5),
+            ("gene1", "gene4", 0.8),
+        ],
+        "geneIdA STRING, geneIdB STRING, score DOUBLE",
+    )
+
+    expected_df = spark.createDataFrame(
+        [
+            (1, "variant1", "gene1", "positive"),
+            (3, "variant3", "gene3", "negative"),
+            (4, "variant4", "gene4", "positive"),
+        ],
+        "studyLocusId LONG, variantId STRING, geneId STRING, goldStandardSet STRING",
+    )
+
+    mock_l2g_gs = L2GGoldStandard(
+        _df=mock_l2g_gs_df, _schema=L2GGoldStandard.get_schema()
+    )
+
+    observed_df = mock_l2g_gs.remove_false_negatives(mock_interactions_df).df
+
+    assert observed_df.collect() == expected_df.collect()

From 0a1ffa0b75f9034ed02a6a333969d037e3e171c0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Irene=20L=C3=B3pez?= <irene.lopezs@protonmail.com>
Date: Tue, 21 Nov 2023 00:28:25 +0000
Subject: [PATCH 18/25] fix(l2g_gold_standard): fix logic in
 `remove_false_negatives`

---
 src/otg/dataset/l2g_gold_standard.py          | 57 ++++++++++++-------
 .../open_targets/l2g_gold_standard.py         |  1 -
 tests/dataset/test_l2g.py                     |  4 +-
 3 files changed, 38 insertions(+), 24 deletions(-)

diff --git a/src/otg/dataset/l2g_gold_standard.py b/src/otg/dataset/l2g_gold_standard.py
index f838f0173..65ba5859b 100644
--- a/src/otg/dataset/l2g_gold_standard.py
+++ b/src/otg/dataset/l2g_gold_standard.py
@@ -116,7 +116,6 @@ def filter_unique_associations(
             L2GGoldStandard: L2GGoldStandard updated to exclude false negatives and redundant positives.
         """
         squared_overlaps = study_locus_overlap._convert_to_square_matrix()
-        cols_to_keep = self.df.columns
         unique_associations = (
             self.df.alias("left")
             # identify all the study loci that point to the same gene
@@ -139,7 +138,7 @@ def filter_unique_associations(
             )
             # drop redundant rows: where the variantid overlaps and the gene is "explained" by more than one study locus
             .filter(~((f.size("sl_same_gene") > 1) & (f.col("overlaps") == 1)))
-            .select(*cols_to_keep)
+            .select(*self.df.columns)
         )
         return L2GGoldStandard(_df=unique_associations, _schema=self.get_schema())
 
@@ -155,30 +154,44 @@ def remove_false_negatives(
         Returns:
             L2GGoldStandard: A refined set of locus-to-gene associations with increased reliability, having excluded loci that were likely false negatives due to gene-gene interaction confounding.
         """
-        # TODO: Test this logic
-        self.df = (
+        squared_interactions = interactions_df.unionByName(
+            interactions_df.selectExpr(
+                "geneIdB as geneIdA", "geneIdA as geneIdB", "score"
+            )
+        ).filter(f.col("score") > self.INTERACTION_THRESHOLD)
+        df = (
             self.df.alias("left")
             .join(
-                interactions_df.alias("interactions"),
-                (f.col("left.geneId") == f.col("interactions.geneIdA"))
-                | (f.col("left.geneId") == f.col("interactions.geneIdB")),
-                how="left",
+                # bring gene partners
+                squared_interactions.alias("right"),
+                f.col("left.geneId") == f.col("right.geneIdA"),
+                "left",
             )
-            .withColumn(
-                "interacting",
-                (f.col("score") > self.INTERACTION_THRESHOLD),
+            .withColumnRenamed("geneIdB", "interactorGeneId")
+            .join(
+                # bring gold standard status for gene partners
+                self.df.selectExpr(
+                    "geneId as interactorGeneId",
+                    "goldStandardSet as interactorGeneIdGoldStandardSet",
+                ),
+                "interactorGeneId",
+                "left",
+            )
+            # remove self-interactions
+            .filter(
+                (f.col("geneId") != f.col("interactorGeneId"))
+                | (f.col("interactorGeneId").isNull())
             )
+            # remove false negatives
             .filter(
-                ~(
-                    (
-                        f.col("goldStandardSet") == 0
-                    )  # TODO: goldStandardSet is a string, not an int
-                    & (f.col("interacting"))
-                    & (
-                        (f.col("left.geneId") == f.col("interactions.geneIdA"))
-                        | (f.col("left.geneId") == f.col("interactions.geneIdB"))
-                    )
-                )
+                # drop rows where the GS gene is negative but the interactor is a GS positive
+                ~(f.col("goldStandardSet") == "negative")
+                & (f.col("interactorGeneIdGoldStandardSet") == "positive")
+                |
+                # keep rows where the gene does not interact
+                (f.col("interactorGeneId").isNull())
             )
+            .select(*self.df.columns)
+            .distinct()
         )
-        return self
+        return L2GGoldStandard(_df=df, _schema=self.get_schema())
diff --git a/src/otg/datasource/open_targets/l2g_gold_standard.py b/src/otg/datasource/open_targets/l2g_gold_standard.py
index cc8f4b710..0d95ecbe8 100644
--- a/src/otg/datasource/open_targets/l2g_gold_standard.py
+++ b/src/otg/datasource/open_targets/l2g_gold_standard.py
@@ -121,7 +121,6 @@ def as_l2g_gold_standard(
             _df=cls.parse_positive_curation(gold_standard_curation)
             .transform(cls.expand_gold_standard_with_negatives, v2g)
             .drop(
-                "variantId",
                 "studyId",
             ),
             _schema=L2GGoldStandard.get_schema(),
diff --git a/tests/dataset/test_l2g.py b/tests/dataset/test_l2g.py
index 12c80d874..eb42d01c9 100644
--- a/tests/dataset/test_l2g.py
+++ b/tests/dataset/test_l2g.py
@@ -138,6 +138,8 @@ def test_remove_false_negatives(spark: SparkSession) -> None:
         _df=mock_l2g_gs_df, _schema=L2GGoldStandard.get_schema()
     )
 
-    observed_df = mock_l2g_gs.remove_false_negatives(mock_interactions_df).df
+    observed_df = mock_l2g_gs.remove_false_negatives(mock_interactions_df).df.orderBy(
+        "studyLocusId"
+    )
 
     assert observed_df.collect() == expected_df.collect()

From 2f13b3b76015f610a2a6d2b6cf5b5bf715da6dfd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Irene=20L=C3=B3pez?= <irene.lopezs@protonmail.com>
Date: Tue, 21 Nov 2023 00:39:28 +0000
Subject: [PATCH 19/25] chore(gold_standards): define gs labels as
 `L2GGoldStandard` attributes

---
 src/otg/dataset/l2g_gold_standard.py                 | 2 ++
 src/otg/datasource/open_targets/l2g_gold_standard.py | 8 +++-----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/otg/dataset/l2g_gold_standard.py b/src/otg/dataset/l2g_gold_standard.py
index 65ba5859b..c6c0b89b9 100644
--- a/src/otg/dataset/l2g_gold_standard.py
+++ b/src/otg/dataset/l2g_gold_standard.py
@@ -24,6 +24,8 @@ class L2GGoldStandard(Dataset):
     """L2G gold standard dataset."""
 
     INTERACTION_THRESHOLD = 0.7
+    GS_POSITIVE_LABEL = "positive"
+    GS_NEGATIVE_LABEL = "negative"
 
     @classmethod
     def from_otg_curation(
diff --git a/src/otg/datasource/open_targets/l2g_gold_standard.py b/src/otg/datasource/open_targets/l2g_gold_standard.py
index 0d95ecbe8..532e382fe 100644
--- a/src/otg/datasource/open_targets/l2g_gold_standard.py
+++ b/src/otg/datasource/open_targets/l2g_gold_standard.py
@@ -20,8 +20,6 @@ class OpenTargetsL2GGoldStandard:
     """
 
     LOCUS_TO_GENE_WINDOW = 500_000
-    GS_POSITIVE_LABEL = "positive"
-    GS_NEGATIVE_LABEL = "negative"
 
     @classmethod
     def parse_positive_curation(
@@ -89,13 +87,13 @@ def expand_gold_standard_with_negatives(
                     (f.col("curated_geneId") == f.col("non_curated_geneId"))
                     # to keep the positives that are outside the v2g dataset
                     | (f.col("non_curated_geneId").isNull()),
-                    f.lit(cls.GS_POSITIVE_LABEL),
-                ).otherwise(cls.GS_NEGATIVE_LABEL),
+                    f.lit(L2GGoldStandard.GS_POSITIVE_LABEL),
+                ).otherwise(L2GGoldStandard.GS_NEGATIVE_LABEL),
             )
             .withColumn(
                 "geneId",
                 f.when(
-                    f.col("goldStandardSet") == cls.GS_POSITIVE_LABEL,
+                    f.col("goldStandardSet") == L2GGoldStandard.GS_POSITIVE_LABEL,
                     f.col("curated_geneId"),
                 ).otherwise(f.col("non_curated_geneId")),
             )

From 5f7d928451beaf8734442ac882df752fa40e8749 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Mon, 27 Nov 2023 09:42:56 +0000
Subject: [PATCH 20/25] build(deps): bump pyarrow from 11.0.0 to 14.0.1

Bumps [pyarrow](https://github.com/apache/arrow) from 11.0.0 to 14.0.1.
- [Commits](https://github.com/apache/arrow/compare/go/v11.0.0...go/v14.0.1)

---
updated-dependencies:
- dependency-name: pyarrow
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
---
 poetry.lock    | 67 +++++++++++++++++++++++++++++---------------------
 pyproject.toml |  2 +-
 2 files changed, 40 insertions(+), 29 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index e11bdf074..32f676f53 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -6010,36 +6010,47 @@ files = [
 
 [[package]]
 name = "pyarrow"
-version = "11.0.0"
+version = "14.0.1"
 description = "Python library for Apache Arrow"
 optional = false
-python-versions = ">=3.7"
+python-versions = ">=3.8"
 files = [
-    {file = "pyarrow-11.0.0-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:40bb42afa1053c35c749befbe72f6429b7b5f45710e85059cdd534553ebcf4f2"},
-    {file = "pyarrow-11.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:7c28b5f248e08dea3b3e0c828b91945f431f4202f1a9fe84d1012a761324e1ba"},
-    {file = "pyarrow-11.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a37bc81f6c9435da3c9c1e767324ac3064ffbe110c4e460660c43e144be4ed85"},
-    {file = "pyarrow-11.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ad7c53def8dbbc810282ad308cc46a523ec81e653e60a91c609c2233ae407689"},
-    {file = "pyarrow-11.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:25aa11c443b934078bfd60ed63e4e2d42461682b5ac10f67275ea21e60e6042c"},
-    {file = "pyarrow-11.0.0-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:e217d001e6389b20a6759392a5ec49d670757af80101ee6b5f2c8ff0172e02ca"},
-    {file = "pyarrow-11.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ad42bb24fc44c48f74f0d8c72a9af16ba9a01a2ccda5739a517aa860fa7e3d56"},
-    {file = "pyarrow-11.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2d942c690ff24a08b07cb3df818f542a90e4d359381fbff71b8f2aea5bf58841"},
-    {file = "pyarrow-11.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f010ce497ca1b0f17a8243df3048055c0d18dcadbcc70895d5baf8921f753de5"},
-    {file = "pyarrow-11.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:2f51dc7ca940fdf17893227edb46b6784d37522ce08d21afc56466898cb213b2"},
-    {file = "pyarrow-11.0.0-cp37-cp37m-macosx_10_14_x86_64.whl", hash = "sha256:1cbcfcbb0e74b4d94f0b7dde447b835a01bc1d16510edb8bb7d6224b9bf5bafc"},
-    {file = "pyarrow-11.0.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aaee8f79d2a120bf3e032d6d64ad20b3af6f56241b0ffc38d201aebfee879d00"},
-    {file = "pyarrow-11.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:410624da0708c37e6a27eba321a72f29d277091c8f8d23f72c92bada4092eb5e"},
-    {file = "pyarrow-11.0.0-cp37-cp37m-win_amd64.whl", hash = "sha256:2d53ba72917fdb71e3584ffc23ee4fcc487218f8ff29dd6df3a34c5c48fe8c06"},
-    {file = "pyarrow-11.0.0-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:f12932e5a6feb5c58192209af1d2607d488cb1d404fbc038ac12ada60327fa34"},
-    {file = "pyarrow-11.0.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:41a1451dd895c0b2964b83d91019e46f15b5564c7ecd5dcb812dadd3f05acc97"},
-    {file = "pyarrow-11.0.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:becc2344be80e5dce4e1b80b7c650d2fc2061b9eb339045035a1baa34d5b8f1c"},
-    {file = "pyarrow-11.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f40be0d7381112a398b93c45a7e69f60261e7b0269cc324e9f739ce272f4f70"},
-    {file = "pyarrow-11.0.0-cp38-cp38-win_amd64.whl", hash = "sha256:362a7c881b32dc6b0eccf83411a97acba2774c10edcec715ccaab5ebf3bb0835"},
-    {file = "pyarrow-11.0.0-cp39-cp39-macosx_10_14_x86_64.whl", hash = "sha256:ccbf29a0dadfcdd97632b4f7cca20a966bb552853ba254e874c66934931b9841"},
-    {file = "pyarrow-11.0.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3e99be85973592051e46412accea31828da324531a060bd4585046a74ba45854"},
-    {file = "pyarrow-11.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69309be84dcc36422574d19c7d3a30a7ea43804f12552356d1ab2a82a713c418"},
-    {file = "pyarrow-11.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:da93340fbf6f4e2a62815064383605b7ffa3e9eeb320ec839995b1660d69f89b"},
-    {file = "pyarrow-11.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:caad867121f182d0d3e1a0d36f197df604655d0b466f1bc9bafa903aa95083e4"},
-    {file = "pyarrow-11.0.0.tar.gz", hash = "sha256:5461c57dbdb211a632a48facb9b39bbeb8a7905ec95d768078525283caef5f6d"},
+    {file = "pyarrow-14.0.1-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:96d64e5ba7dceb519a955e5eeb5c9adcfd63f73a56aea4722e2cc81364fc567a"},
+    {file = "pyarrow-14.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:1a8ae88c0038d1bc362a682320112ee6774f006134cd5afc291591ee4bc06505"},
+    {file = "pyarrow-14.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0f6f053cb66dc24091f5511e5920e45c83107f954a21032feadc7b9e3a8e7851"},
+    {file = "pyarrow-14.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:906b0dc25f2be12e95975722f1e60e162437023f490dbd80d0deb7375baf3171"},
+    {file = "pyarrow-14.0.1-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:78d4a77a46a7de9388b653af1c4ce539350726cd9af62e0831e4f2bd0c95a2f4"},
+    {file = "pyarrow-14.0.1-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:06ca79080ef89d6529bb8e5074d4b4f6086143b2520494fcb7cf8a99079cde93"},
+    {file = "pyarrow-14.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:32542164d905002c42dff896efdac79b3bdd7291b1b74aa292fac8450d0e4dcd"},
+    {file = "pyarrow-14.0.1-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:c7331b4ed3401b7ee56f22c980608cf273f0380f77d0f73dd3c185f78f5a6220"},
+    {file = "pyarrow-14.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:922e8b49b88da8633d6cac0e1b5a690311b6758d6f5d7c2be71acb0f1e14cd61"},
+    {file = "pyarrow-14.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:58c889851ca33f992ea916b48b8540735055201b177cb0dcf0596a495a667b00"},
+    {file = "pyarrow-14.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:30d8494870d9916bb53b2a4384948491444741cb9a38253c590e21f836b01222"},
+    {file = "pyarrow-14.0.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:be28e1a07f20391bb0b15ea03dcac3aade29fc773c5eb4bee2838e9b2cdde0cb"},
+    {file = "pyarrow-14.0.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:981670b4ce0110d8dcb3246410a4aabf5714db5d8ea63b15686bce1c914b1f83"},
+    {file = "pyarrow-14.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:4756a2b373a28f6166c42711240643fb8bd6322467e9aacabd26b488fa41ec23"},
+    {file = "pyarrow-14.0.1-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:cf87e2cec65dd5cf1aa4aba918d523ef56ef95597b545bbaad01e6433851aa10"},
+    {file = "pyarrow-14.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:470ae0194fbfdfbf4a6b65b4f9e0f6e1fa0ea5b90c1ee6b65b38aecee53508c8"},
+    {file = "pyarrow-14.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6263cffd0c3721c1e348062997babdf0151301f7353010c9c9a8ed47448f82ab"},
+    {file = "pyarrow-14.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7a8089d7e77d1455d529dbd7cff08898bbb2666ee48bc4085203af1d826a33cc"},
+    {file = "pyarrow-14.0.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:fada8396bc739d958d0b81d291cfd201126ed5e7913cb73de6bc606befc30226"},
+    {file = "pyarrow-14.0.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:2a145dab9ed7849fc1101bf03bcdc69913547f10513fdf70fc3ab6c0a50c7eee"},
+    {file = "pyarrow-14.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:05fe7994745b634c5fb16ce5717e39a1ac1fac3e2b0795232841660aa76647cd"},
+    {file = "pyarrow-14.0.1-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:a8eeef015ae69d104c4c3117a6011e7e3ecd1abec79dc87fd2fac6e442f666ee"},
+    {file = "pyarrow-14.0.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:3c76807540989fe8fcd02285dd15e4f2a3da0b09d27781abec3adc265ddbeba1"},
+    {file = "pyarrow-14.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:450e4605e3c20e558485f9161a79280a61c55efe585d51513c014de9ae8d393f"},
+    {file = "pyarrow-14.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:323cbe60210173ffd7db78bfd50b80bdd792c4c9daca8843ef3cd70b186649db"},
+    {file = "pyarrow-14.0.1-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:0140c7e2b740e08c5a459439d87acd26b747fc408bde0a8806096ee0baaa0c15"},
+    {file = "pyarrow-14.0.1-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:e592e482edd9f1ab32f18cd6a716c45b2c0f2403dc2af782f4e9674952e6dd27"},
+    {file = "pyarrow-14.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:d264ad13605b61959f2ae7c1d25b1a5b8505b112715c961418c8396433f213ad"},
+    {file = "pyarrow-14.0.1-cp39-cp39-macosx_10_14_x86_64.whl", hash = "sha256:01e44de9749cddc486169cb632f3c99962318e9dacac7778315a110f4bf8a450"},
+    {file = "pyarrow-14.0.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:d0351fecf0e26e152542bc164c22ea2a8e8c682726fce160ce4d459ea802d69c"},
+    {file = "pyarrow-14.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:33c1f6110c386464fd2e5e4ea3624466055bbe681ff185fd6c9daa98f30a3f9a"},
+    {file = "pyarrow-14.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:11e045dfa09855b6d3e7705a37c42e2dc2c71d608fab34d3c23df2e02df9aec3"},
+    {file = "pyarrow-14.0.1-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:097828b55321897db0e1dbfc606e3ff8101ae5725673498cbfa7754ee0da80e4"},
+    {file = "pyarrow-14.0.1-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:1daab52050a1c48506c029e6fa0944a7b2436334d7e44221c16f6f1b2cc9c510"},
+    {file = "pyarrow-14.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:3f6d5faf4f1b0d5a7f97be987cf9e9f8cd39902611e818fe134588ee99bf0283"},
+    {file = "pyarrow-14.0.1.tar.gz", hash = "sha256:b8b3f4fe8d4ec15e1ef9b599b94683c5216adaed78d5cb4c606180546d1e2ee1"},
 ]
 
 [package.dependencies]
@@ -8391,4 +8402,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p
 [metadata]
 lock-version = "2.0"
 python-versions = "3.10.8"
-content-hash = "55087f647f35a1d78c6384fe55eaa084aad18debff119f164f4f14707e7465fc"
+content-hash = "8197d06ec721972d642dd466b4de4fed93c9b14a279749ee7eb4f91857f5fce3"
diff --git a/pyproject.toml b/pyproject.toml
index 38e91874a..6ca883855 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -21,7 +21,7 @@ xgboost = "^1.7.3"
 scikit-learn = "^1.2.1"
 numpy = "^1.26.1"
 hail = "0.2.126"
-pyarrow = "^11.0.0"
+pyarrow = "^14.0.1"
 wandb = "^0.16.0"
 
 [tool.poetry.dev-dependencies]

From 4e4e4f5d45af2574d4e4af38d0924896166182da Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Irene=20L=C3=B3pez?= <irene.lopezs@protonmail.com>
Date: Mon, 27 Nov 2023 10:46:29 +0100
Subject: [PATCH 21/25] chore: rename study_locus to credible_set for l2g

---
 config/step/locus_to_gene.yaml    |  2 +-
 src/otg/dataset/l2g_prediction.py |  2 +-
 src/otg/l2g.py                    | 14 +++++++-------
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/config/step/locus_to_gene.yaml b/config/step/locus_to_gene.yaml
index c05301049..dd4e018fb 100644
--- a/config/step/locus_to_gene.yaml
+++ b/config/step/locus_to_gene.yaml
@@ -8,7 +8,7 @@ wandb_run_name: null
 perform_cross_validation: false
 model_path: ${datasets.l2g_model}
 predictions_path: ${datasets.l2g_predictions}
-study_locus_path: ${datasets.credible_set}
+credible_set_path: ${datasets.credible_set}
 variant_gene_path: ${datasets.v2g}
 colocalisation_path: ${datasets.colocalisation}
 study_index_path: ${datasets.catalog_study_index}
diff --git a/src/otg/dataset/l2g_prediction.py b/src/otg/dataset/l2g_prediction.py
index ce4e34144..a588818cd 100644
--- a/src/otg/dataset/l2g_prediction.py
+++ b/src/otg/dataset/l2g_prediction.py
@@ -41,7 +41,7 @@ def get_schema(cls: type[L2GPrediction]) -> StructType:
         return parse_spark_schema("l2g_predictions.json")
 
     @classmethod
-    def from_study_locus(
+    def from_credible_set(
         cls: Type[L2GPrediction],
         model_path: str,
         study_locus: StudyLocus,
diff --git a/src/otg/l2g.py b/src/otg/l2g.py
index df0f886e9..95d477dc6 100644
--- a/src/otg/l2g.py
+++ b/src/otg/l2g.py
@@ -33,7 +33,7 @@ class LocusToGeneStep:
         perform_cross_validation (bool): Whether to perform cross validation.
         model_path (str | None): Path to save the model.
         predictions_path (str | None): Path to save the predictions.
-        study_locus_path (str): Path to study locus Parquet files.
+        credible_set_path (str): Path to credible set Parquet files.
         variant_gene_path (str): Path to variant to gene Parquet files.
         colocalisation_path (str): Path to colocalisation Parquet files.
         study_index_path (str): Path to study index Parquet files.
@@ -52,7 +52,7 @@ class LocusToGeneStep:
     perform_cross_validation: bool = False
     model_path: str = MISSING
     predictions_path: str = MISSING
-    study_locus_path: str = MISSING
+    credible_set_path: str = MISSING
     variant_gene_path: str = MISSING
     colocalisation_path: str = MISSING
     study_index_path: str = MISSING
@@ -109,8 +109,8 @@ def __post_init__(self: LocusToGeneStep) -> None:
                 f"run_mode must be one of 'train' or 'predict', got {self.run_mode}"
             )
         # Load common inputs
-        study_locus = StudyLocus.from_parquet(
-            self.session, self.study_locus_path, recursiveFileLookup=True
+        credible_set = StudyLocus.from_parquet(
+            self.session, self.credible_set_path, recursiveFileLookup=True
         )
         studies = StudyIndex.from_parquet(self.session, self.study_index_path)
         v2g = V2G.from_parquet(self.session, self.variant_gene_path)
@@ -132,7 +132,7 @@ def __post_init__(self: LocusToGeneStep) -> None:
             )
 
             fm = L2GFeatureMatrix.generate_features(
-                study_locus=study_locus,
+                study_locus=credible_set,
                 study_index=studies,
                 variant_gene=v2g,
                 # colocalisation=coloc,
@@ -185,9 +185,9 @@ def __post_init__(self: LocusToGeneStep) -> None:
                 raise ValueError(
                     "model_path and predictions_path must be set for predict mode."
                 )
-            predictions = L2GPrediction.from_study_locus(
+            predictions = L2GPrediction.from_credible_set(
                 self.model_path,
-                study_locus,
+                credible_set,
                 studies,
                 v2g,
                 # coloc

From aa05aa51375fe1c22842e07da0a7565b2cc3a1c7 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Mon, 27 Nov 2023 11:54:46 +0000
Subject: [PATCH 22/25] build(deps-dev): bump ipython from 8.17.2 to 8.18.1
 (#280)

Bumps [ipython](https://github.com/ipython/ipython) from 8.17.2 to 8.18.1.
- [Release notes](https://github.com/ipython/ipython/releases)
- [Commits](https://github.com/ipython/ipython/compare/8.17.2...8.18.1)

---
updated-dependencies:
- dependency-name: ipython
  dependency-type: direct:development
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 poetry.lock    | 17 ++++++++---------
 pyproject.toml |  2 +-
 2 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index 32f676f53..465cbc5e4 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -3998,24 +3998,23 @@ test = ["flaky", "ipyparallel", "pre-commit", "pytest (>=7.0)", "pytest-asyncio"
 
 [[package]]
 name = "ipython"
-version = "8.17.2"
+version = "8.18.1"
 description = "IPython: Productive Interactive Computing"
 optional = false
 python-versions = ">=3.9"
 files = [
-    {file = "ipython-8.17.2-py3-none-any.whl", hash = "sha256:1e4d1d666a023e3c93585ba0d8e962867f7a111af322efff6b9c58062b3e5444"},
-    {file = "ipython-8.17.2.tar.gz", hash = "sha256:126bb57e1895594bb0d91ea3090bbd39384f6fe87c3d57fd558d0670f50339bb"},
+    {file = "ipython-8.18.1-py3-none-any.whl", hash = "sha256:e8267419d72d81955ec1177f8a29aaa90ac80ad647499201119e2f05e99aa397"},
+    {file = "ipython-8.18.1.tar.gz", hash = "sha256:ca6f079bb33457c66e233e4580ebfc4128855b4cf6370dddd73842a9563e8a27"},
 ]
 
 [package.dependencies]
-appnope = {version = "*", markers = "sys_platform == \"darwin\""}
 colorama = {version = "*", markers = "sys_platform == \"win32\""}
 decorator = "*"
 exceptiongroup = {version = "*", markers = "python_version < \"3.11\""}
 jedi = ">=0.16"
 matplotlib-inline = "*"
 pexpect = {version = ">4.3", markers = "sys_platform != \"win32\""}
-prompt-toolkit = ">=3.0.30,<3.0.37 || >3.0.37,<3.1.0"
+prompt-toolkit = ">=3.0.41,<3.1.0"
 pygments = ">=2.4.0"
 stack-data = "*"
 traitlets = ">=5"
@@ -5873,13 +5872,13 @@ dev = ["nose", "pipreqs", "twine"]
 
 [[package]]
 name = "prompt-toolkit"
-version = "3.0.39"
+version = "3.0.41"
 description = "Library for building powerful interactive command lines in Python"
 optional = false
 python-versions = ">=3.7.0"
 files = [
-    {file = "prompt_toolkit-3.0.39-py3-none-any.whl", hash = "sha256:9dffbe1d8acf91e3de75f3b544e4842382fc06c6babe903ac9acb74dc6e08d88"},
-    {file = "prompt_toolkit-3.0.39.tar.gz", hash = "sha256:04505ade687dc26dc4284b1ad19a83be2f2afe83e7a828ace0c72f3a1df72aac"},
+    {file = "prompt_toolkit-3.0.41-py3-none-any.whl", hash = "sha256:f36fe301fafb7470e86aaf90f036eef600a3210be4decf461a5b1ca8403d3cb2"},
+    {file = "prompt_toolkit-3.0.41.tar.gz", hash = "sha256:941367d97fc815548822aa26c2a269fdc4eb21e9ec05fc5d447cf09bad5d75f0"},
 ]
 
 [package.dependencies]
@@ -8402,4 +8401,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p
 [metadata]
 lock-version = "2.0"
 python-versions = "3.10.8"
-content-hash = "8197d06ec721972d642dd466b4de4fed93c9b14a279749ee7eb4f91857f5fce3"
+content-hash = "fc7dce2fb06e39a21e470de2771bf5487bfd660ef3c4ec4e5dd7385ddeaa16d8"
diff --git a/pyproject.toml b/pyproject.toml
index 6ca883855..c5beaf455 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -59,7 +59,7 @@ pytest-xdist = "^3.4.0"
 
 
 [tool.poetry.group.dev.dependencies]
-ipython = "^8.5.0"
+ipython = "^8.18.1"
 ipykernel = "^6.19.0"
 google-cloud-dataproc = "^5.7.0"
 apache-airflow = "^2.7.3"

From feaf1afd1604f6bbd7d76d9187fb3a36fc07c928 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Mon, 27 Nov 2023 12:18:25 +0000
Subject: [PATCH 23/25] build(deps-dev): bump mkdocstrings-python from 1.7.4 to
 1.7.5 (#279)

Bumps [mkdocstrings-python](https://github.com/mkdocstrings/python) from 1.7.4 to 1.7.5.
- [Release notes](https://github.com/mkdocstrings/python/releases)
- [Changelog](https://github.com/mkdocstrings/python/blob/main/CHANGELOG.md)
- [Commits](https://github.com/mkdocstrings/python/compare/1.7.4...1.7.5)

---
updated-dependencies:
- dependency-name: mkdocstrings-python
  dependency-type: direct:development
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 poetry.lock    | 8 ++++----
 pyproject.toml | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index 465cbc5e4..6eba09d58 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -4959,13 +4959,13 @@ python-legacy = ["mkdocstrings-python-legacy (>=0.2.1)"]
 
 [[package]]
 name = "mkdocstrings-python"
-version = "1.7.4"
+version = "1.7.5"
 description = "A Python handler for mkdocstrings."
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "mkdocstrings_python-1.7.4-py3-none-any.whl", hash = "sha256:70eacbe5f2d5071f2e525ba0b35bc447d398437dfbcd90c63fe6e977551cfe26"},
-    {file = "mkdocstrings_python-1.7.4.tar.gz", hash = "sha256:c2fc34efd70000ec31aee247910006e8dd9d1b9f3957bf46880c3f6e51a8f0d5"},
+    {file = "mkdocstrings_python-1.7.5-py3-none-any.whl", hash = "sha256:5f6246026353f0c0785135db70c3fe9a5d9318990fc7ceb11d62097b8ffdd704"},
+    {file = "mkdocstrings_python-1.7.5.tar.gz", hash = "sha256:c7d143728257dbf1aa550446555a554b760dcd40a763f077189d298502b800be"},
 ]
 
 [package.dependencies]
@@ -8401,4 +8401,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p
 [metadata]
 lock-version = "2.0"
 python-versions = "3.10.8"
-content-hash = "fc7dce2fb06e39a21e470de2771bf5487bfd660ef3c4ec4e5dd7385ddeaa16d8"
+content-hash = "90889ae9da76eb541d14880319735717ebd5c30bd22af7c3af48d3f9206d3784"
diff --git a/pyproject.toml b/pyproject.toml
index c5beaf455..f3db534ea 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -36,7 +36,7 @@ ruff = "^0.1.3"
 
 [tool.poetry.group.docs.dependencies]
 mkdocs = "^1.5.3"
-mkdocstrings-python = "^1.7.4"
+mkdocstrings-python = "^1.7.5"
 mkdocs-material = "*"
 mkdocs-section-index = "^0.3.4"
 mkdocs-git-revision-date-localized-plugin = "^1.2.1"

From 90c9ad3b6b100681a5840695655f351da1b5e5a2 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Mon, 27 Nov 2023 13:43:06 +0100
Subject: [PATCH 24/25] build(deps-dev): bump ruff from 0.1.3 to 0.1.6 (#276)

Bumps [ruff](https://github.com/astral-sh/ruff) from 0.1.3 to 0.1.6.
- [Release notes](https://github.com/astral-sh/ruff/releases)
- [Changelog](https://github.com/astral-sh/ruff/blob/main/CHANGELOG.md)
- [Commits](https://github.com/astral-sh/ruff/compare/v0.1.3...v0.1.6)

---
updated-dependencies:
- dependency-name: ruff
  dependency-type: direct:development
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 poetry.lock    | 40 ++++++++++++++++++++--------------------
 pyproject.toml |  2 +-
 2 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index 6eba09d58..d33f277de 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -7177,28 +7177,28 @@ pyasn1 = ">=0.1.3"
 
 [[package]]
 name = "ruff"
-version = "0.1.3"
-description = "An extremely fast Python linter, written in Rust."
+version = "0.1.6"
+description = "An extremely fast Python linter and code formatter, written in Rust."
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "ruff-0.1.3-py3-none-macosx_10_7_x86_64.whl", hash = "sha256:b46d43d51f7061652eeadb426a9e3caa1e0002470229ab2fc19de8a7b0766901"},
-    {file = "ruff-0.1.3-py3-none-macosx_10_9_x86_64.macosx_11_0_arm64.macosx_10_9_universal2.whl", hash = "sha256:b8afeb9abd26b4029c72adc9921b8363374f4e7edb78385ffaa80278313a15f9"},
-    {file = "ruff-0.1.3-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ca3cf365bf32e9ba7e6db3f48a4d3e2c446cd19ebee04f05338bc3910114528b"},
-    {file = "ruff-0.1.3-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:4874c165f96c14a00590dcc727a04dca0cfd110334c24b039458c06cf78a672e"},
-    {file = "ruff-0.1.3-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:eec2dd31eed114e48ea42dbffc443e9b7221976554a504767ceaee3dd38edeb8"},
-    {file = "ruff-0.1.3-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:dc3ec4edb3b73f21b4aa51337e16674c752f1d76a4a543af56d7d04e97769613"},
-    {file = "ruff-0.1.3-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2e3de9ed2e39160800281848ff4670e1698037ca039bda7b9274f849258d26ce"},
-    {file = "ruff-0.1.3-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1c595193881922cc0556a90f3af99b1c5681f0c552e7a2a189956141d8666fe8"},
-    {file = "ruff-0.1.3-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0f75e670d529aa2288cd00fc0e9b9287603d95e1536d7a7e0cafe00f75e0dd9d"},
-    {file = "ruff-0.1.3-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:76dd49f6cd945d82d9d4a9a6622c54a994689d8d7b22fa1322983389b4892e20"},
-    {file = "ruff-0.1.3-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:918b454bc4f8874a616f0d725590277c42949431ceb303950e87fef7a7d94cb3"},
-    {file = "ruff-0.1.3-py3-none-musllinux_1_2_i686.whl", hash = "sha256:d8859605e729cd5e53aa38275568dbbdb4fe882d2ea2714c5453b678dca83784"},
-    {file = "ruff-0.1.3-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:0b6c55f5ef8d9dd05b230bb6ab80bc4381ecb60ae56db0330f660ea240cb0d4a"},
-    {file = "ruff-0.1.3-py3-none-win32.whl", hash = "sha256:3e7afcbdcfbe3399c34e0f6370c30f6e529193c731b885316c5a09c9e4317eef"},
-    {file = "ruff-0.1.3-py3-none-win_amd64.whl", hash = "sha256:7a18df6638cec4a5bd75350639b2bb2a2366e01222825562c7346674bdceb7ea"},
-    {file = "ruff-0.1.3-py3-none-win_arm64.whl", hash = "sha256:12fd53696c83a194a2db7f9a46337ce06445fb9aa7d25ea6f293cf75b21aca9f"},
-    {file = "ruff-0.1.3.tar.gz", hash = "sha256:3ba6145369a151401d5db79f0a47d50e470384d0d89d0d6f7fab0b589ad07c34"},
+    {file = "ruff-0.1.6-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:88b8cdf6abf98130991cbc9f6438f35f6e8d41a02622cc5ee130a02a0ed28703"},
+    {file = "ruff-0.1.6-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:5c549ed437680b6105a1299d2cd30e4964211606eeb48a0ff7a93ef70b902248"},
+    {file = "ruff-0.1.6-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1cf5f701062e294f2167e66d11b092bba7af6a057668ed618a9253e1e90cfd76"},
+    {file = "ruff-0.1.6-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:05991ee20d4ac4bb78385360c684e4b417edd971030ab12a4fbd075ff535050e"},
+    {file = "ruff-0.1.6-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:87455a0c1f739b3c069e2f4c43b66479a54dea0276dd5d4d67b091265f6fd1dc"},
+    {file = "ruff-0.1.6-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:683aa5bdda5a48cb8266fcde8eea2a6af4e5700a392c56ea5fb5f0d4bfdc0240"},
+    {file = "ruff-0.1.6-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:137852105586dcbf80c1717facb6781555c4e99f520c9c827bd414fac67ddfb6"},
+    {file = "ruff-0.1.6-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:bd98138a98d48a1c36c394fd6b84cd943ac92a08278aa8ac8c0fdefcf7138f35"},
+    {file = "ruff-0.1.6-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a0cd909d25f227ac5c36d4e7e681577275fb74ba3b11d288aff7ec47e3ae745"},
+    {file = "ruff-0.1.6-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:e8fd1c62a47aa88a02707b5dd20c5ff20d035d634aa74826b42a1da77861b5ff"},
+    {file = "ruff-0.1.6-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:fd89b45d374935829134a082617954120d7a1470a9f0ec0e7f3ead983edc48cc"},
+    {file = "ruff-0.1.6-py3-none-musllinux_1_2_i686.whl", hash = "sha256:491262006e92f825b145cd1e52948073c56560243b55fb3b4ecb142f6f0e9543"},
+    {file = "ruff-0.1.6-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:ea284789861b8b5ca9d5443591a92a397ac183d4351882ab52f6296b4fdd5462"},
+    {file = "ruff-0.1.6-py3-none-win32.whl", hash = "sha256:1610e14750826dfc207ccbcdd7331b6bd285607d4181df9c1c6ae26646d6848a"},
+    {file = "ruff-0.1.6-py3-none-win_amd64.whl", hash = "sha256:4558b3e178145491e9bc3b2ee3c4b42f19d19384eaa5c59d10acf6e8f8b57e33"},
+    {file = "ruff-0.1.6-py3-none-win_arm64.whl", hash = "sha256:03910e81df0d8db0e30050725a5802441c2022ea3ae4fe0609b76081731accbc"},
+    {file = "ruff-0.1.6.tar.gz", hash = "sha256:1b09f29b16c6ead5ea6b097ef2764b42372aebe363722f1605ecbcd2b9207184"},
 ]
 
 [[package]]
@@ -8401,4 +8401,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p
 [metadata]
 lock-version = "2.0"
 python-versions = "3.10.8"
-content-hash = "90889ae9da76eb541d14880319735717ebd5c30bd22af7c3af48d3f9206d3784"
+content-hash = "2d34459308397c1956a5ff15a84d61fc7654959638e63017acc2eeaa92044b32"
diff --git a/pyproject.toml b/pyproject.toml
index f3db534ea..15a2d609e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -32,7 +32,7 @@ pep8-naming = "^0.13.2"
 interrogate = "^1.5.0"
 isort = "^5.12.0"
 darglint = "^1.8.1"
-ruff = "^0.1.3"
+ruff = "^0.1.6"
 
 [tool.poetry.group.docs.dependencies]
 mkdocs = "^1.5.3"

From a4a44da2be20e907e5f6f4bc4967afb01edc2e90 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Mon, 27 Nov 2023 13:19:28 +0000
Subject: [PATCH 25/25] build(deps-dev): bump mypy from 1.7.0 to 1.7.1 (#278)

Bumps [mypy](https://github.com/python/mypy) from 1.7.0 to 1.7.1.
- [Changelog](https://github.com/python/mypy/blob/master/CHANGELOG.md)
- [Commits](https://github.com/python/mypy/compare/v1.7.0...v1.7.1)

---
updated-dependencies:
- dependency-name: mypy
  dependency-type: direct:development
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 poetry.lock | 56 ++++++++++++++++++++++++++---------------------------
 1 file changed, 28 insertions(+), 28 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index d33f277de..2ba9c72e6 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -5115,38 +5115,38 @@ files = [
 
 [[package]]
 name = "mypy"
-version = "1.7.0"
+version = "1.7.1"
 description = "Optional static typing for Python"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "mypy-1.7.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5da84d7bf257fd8f66b4f759a904fd2c5a765f70d8b52dde62b521972a0a2357"},
-    {file = "mypy-1.7.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a3637c03f4025f6405737570d6cbfa4f1400eb3c649317634d273687a09ffc2f"},
-    {file = "mypy-1.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b633f188fc5ae1b6edca39dae566974d7ef4e9aaaae00bc36efe1f855e5173ac"},
-    {file = "mypy-1.7.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:d6ed9a3997b90c6f891138e3f83fb8f475c74db4ccaa942a1c7bf99e83a989a1"},
-    {file = "mypy-1.7.0-cp310-cp310-win_amd64.whl", hash = "sha256:1fe46e96ae319df21359c8db77e1aecac8e5949da4773c0274c0ef3d8d1268a9"},
-    {file = "mypy-1.7.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:df67fbeb666ee8828f675fee724cc2cbd2e4828cc3df56703e02fe6a421b7401"},
-    {file = "mypy-1.7.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a79cdc12a02eb526d808a32a934c6fe6df07b05f3573d210e41808020aed8b5d"},
-    {file = "mypy-1.7.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f65f385a6f43211effe8c682e8ec3f55d79391f70a201575def73d08db68ead1"},
-    {file = "mypy-1.7.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:0e81ffd120ee24959b449b647c4b2fbfcf8acf3465e082b8d58fd6c4c2b27e46"},
-    {file = "mypy-1.7.0-cp311-cp311-win_amd64.whl", hash = "sha256:f29386804c3577c83d76520abf18cfcd7d68264c7e431c5907d250ab502658ee"},
-    {file = "mypy-1.7.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:87c076c174e2c7ef8ab416c4e252d94c08cd4980a10967754f91571070bf5fbe"},
-    {file = "mypy-1.7.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6cb8d5f6d0fcd9e708bb190b224089e45902cacef6f6915481806b0c77f7786d"},
-    {file = "mypy-1.7.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d93e76c2256aa50d9c82a88e2f569232e9862c9982095f6d54e13509f01222fc"},
-    {file = "mypy-1.7.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:cddee95dea7990e2215576fae95f6b78a8c12f4c089d7e4367564704e99118d3"},
-    {file = "mypy-1.7.0-cp312-cp312-win_amd64.whl", hash = "sha256:d01921dbd691c4061a3e2ecdbfbfad029410c5c2b1ee88946bf45c62c6c91210"},
-    {file = "mypy-1.7.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:185cff9b9a7fec1f9f7d8352dff8a4c713b2e3eea9c6c4b5ff7f0edf46b91e41"},
-    {file = "mypy-1.7.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7a7b1e399c47b18feb6f8ad4a3eef3813e28c1e871ea7d4ea5d444b2ac03c418"},
-    {file = "mypy-1.7.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fc9fe455ad58a20ec68599139ed1113b21f977b536a91b42bef3ffed5cce7391"},
-    {file = "mypy-1.7.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:d0fa29919d2e720c8dbaf07d5578f93d7b313c3e9954c8ec05b6d83da592e5d9"},
-    {file = "mypy-1.7.0-cp38-cp38-win_amd64.whl", hash = "sha256:2b53655a295c1ed1af9e96b462a736bf083adba7b314ae775563e3fb4e6795f5"},
-    {file = "mypy-1.7.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:c1b06b4b109e342f7dccc9efda965fc3970a604db70f8560ddfdee7ef19afb05"},
-    {file = "mypy-1.7.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:bf7a2f0a6907f231d5e41adba1a82d7d88cf1f61a70335889412dec99feeb0f8"},
-    {file = "mypy-1.7.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:551d4a0cdcbd1d2cccdcc7cb516bb4ae888794929f5b040bb51aae1846062901"},
-    {file = "mypy-1.7.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:55d28d7963bef00c330cb6461db80b0b72afe2f3c4e2963c99517cf06454e665"},
-    {file = "mypy-1.7.0-cp39-cp39-win_amd64.whl", hash = "sha256:870bd1ffc8a5862e593185a4c169804f2744112b4a7c55b93eb50f48e7a77010"},
-    {file = "mypy-1.7.0-py3-none-any.whl", hash = "sha256:96650d9a4c651bc2a4991cf46f100973f656d69edc7faf91844e87fe627f7e96"},
-    {file = "mypy-1.7.0.tar.gz", hash = "sha256:1e280b5697202efa698372d2f39e9a6713a0395a756b1c6bd48995f8d72690dc"},
+    {file = "mypy-1.7.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:12cce78e329838d70a204293e7b29af9faa3ab14899aec397798a4b41be7f340"},
+    {file = "mypy-1.7.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:1484b8fa2c10adf4474f016e09d7a159602f3239075c7bf9f1627f5acf40ad49"},
+    {file = "mypy-1.7.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:31902408f4bf54108bbfb2e35369877c01c95adc6192958684473658c322c8a5"},
+    {file = "mypy-1.7.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:f2c2521a8e4d6d769e3234350ba7b65ff5d527137cdcde13ff4d99114b0c8e7d"},
+    {file = "mypy-1.7.1-cp310-cp310-win_amd64.whl", hash = "sha256:fcd2572dd4519e8a6642b733cd3a8cfc1ef94bafd0c1ceed9c94fe736cb65b6a"},
+    {file = "mypy-1.7.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4b901927f16224d0d143b925ce9a4e6b3a758010673eeded9b748f250cf4e8f7"},
+    {file = "mypy-1.7.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2f7f6985d05a4e3ce8255396df363046c28bea790e40617654e91ed580ca7c51"},
+    {file = "mypy-1.7.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:944bdc21ebd620eafefc090cdf83158393ec2b1391578359776c00de00e8907a"},
+    {file = "mypy-1.7.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:9c7ac372232c928fff0645d85f273a726970c014749b924ce5710d7d89763a28"},
+    {file = "mypy-1.7.1-cp311-cp311-win_amd64.whl", hash = "sha256:f6efc9bd72258f89a3816e3a98c09d36f079c223aa345c659622f056b760ab42"},
+    {file = "mypy-1.7.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:6dbdec441c60699288adf051f51a5d512b0d818526d1dcfff5a41f8cd8b4aaf1"},
+    {file = "mypy-1.7.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4fc3d14ee80cd22367caaaf6e014494415bf440980a3045bf5045b525680ac33"},
+    {file = "mypy-1.7.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2c6e4464ed5f01dc44dc9821caf67b60a4e5c3b04278286a85c067010653a0eb"},
+    {file = "mypy-1.7.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:d9b338c19fa2412f76e17525c1b4f2c687a55b156320acb588df79f2e6fa9fea"},
+    {file = "mypy-1.7.1-cp312-cp312-win_amd64.whl", hash = "sha256:204e0d6de5fd2317394a4eff62065614c4892d5a4d1a7ee55b765d7a3d9e3f82"},
+    {file = "mypy-1.7.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:84860e06ba363d9c0eeabd45ac0fde4b903ad7aa4f93cd8b648385a888e23200"},
+    {file = "mypy-1.7.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:8c5091ebd294f7628eb25ea554852a52058ac81472c921150e3a61cdd68f75a7"},
+    {file = "mypy-1.7.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:40716d1f821b89838589e5b3106ebbc23636ffdef5abc31f7cd0266db936067e"},
+    {file = "mypy-1.7.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:5cf3f0c5ac72139797953bd50bc6c95ac13075e62dbfcc923571180bebb662e9"},
+    {file = "mypy-1.7.1-cp38-cp38-win_amd64.whl", hash = "sha256:78e25b2fd6cbb55ddfb8058417df193f0129cad5f4ee75d1502248e588d9e0d7"},
+    {file = "mypy-1.7.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:75c4d2a6effd015786c87774e04331b6da863fc3fc4e8adfc3b40aa55ab516fe"},
+    {file = "mypy-1.7.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:2643d145af5292ee956aa0a83c2ce1038a3bdb26e033dadeb2f7066fb0c9abce"},
+    {file = "mypy-1.7.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:75aa828610b67462ffe3057d4d8a4112105ed211596b750b53cbfe182f44777a"},
+    {file = "mypy-1.7.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ee5d62d28b854eb61889cde4e1dbc10fbaa5560cb39780c3995f6737f7e82120"},
+    {file = "mypy-1.7.1-cp39-cp39-win_amd64.whl", hash = "sha256:72cf32ce7dd3562373f78bd751f73c96cfb441de147cc2448a92c1a308bd0ca6"},
+    {file = "mypy-1.7.1-py3-none-any.whl", hash = "sha256:f7c5d642db47376a0cc130f0de6d055056e010debdaf0707cd2b0fc7e7ef30ea"},
+    {file = "mypy-1.7.1.tar.gz", hash = "sha256:fcb6d9afb1b6208b4c712af0dafdc650f518836065df0d4fb1d800f5d6773db2"},
 ]
 
 [package.dependencies]