feat: flag and filter credible sets (#879)

* feat(flag_and_filter_credible_sets): add code for identifying abnormal credible sets * feat(flag_and_filter_credible_sets): restructure tests * chore(flag_and_filter_credible_sets): update schema for spark loading * feat(flag_and_filter_credible_sets): first complete draft filtering abnormal pips code * fix(flag_and_filter_credible_sets): tweak broken code and unresolved merge * fix(flag_and_filter_credible_sets): amend test logic * fix(flag_and_filter_credible_sets): modify logic to simplify and account for floating point errors * chore(flag_and_filter_credible_sets): simplify logic to boolean
opentargets · Oct 28, 2024 · d12d65d · d12d65d
1 parent cbbf3c5
commit d12d65d
Show file tree

Hide file tree

Showing 3 changed files with 322 additions and 221 deletions.
diff --git a/src/gentropy/dataset/study_locus.py b/src/gentropy/dataset/study_locus.py
@@ -82,6 +82,7 @@ class StudyLocusQualityCheck(Enum):
         IN_MHC (str): Flagging study loci in the MHC region
         REDUNDANT_PICS_TOP_HIT (str): Flagging study loci in studies with PICS results from summary statistics
         EXPLAINED_BY_SUSIE (str): Study locus in region explained by a SuSiE credible set
+        ABNORMAL_PIPS (str): Flagging study loci with a sum of PIPs that is not in [0.99,1]
         OUT_OF_SAMPLE_LD (str): Study locus finemapped without in-sample LD reference
         INVALID_CHROMOSOME (str): Chromosome not in 1:22, X, Y, XY or MT
     """
@@ -113,6 +114,7 @@ class StudyLocusQualityCheck(Enum):
     TOP_HIT = "Study locus from curated top hit"
     EXPLAINED_BY_SUSIE = "Study locus in region explained by a SuSiE credible set"
     OUT_OF_SAMPLE_LD = "Study locus finemapped without in-sample LD reference"
+    ABNORMAL_PIPS = "Study locus with a sum of PIPs that not in the expected range [0.99,1]"
     INVALID_CHROMOSOME = "Chromosome not in 1:22, X, Y, XY or MT"
 
 
@@ -391,6 +393,55 @@ def _qc_subsignificant_associations(
             StudyLocusQualityCheck.SUBSIGNIFICANT_FLAG,
         )
 
+    def qc_abnormal_pips(
+        self: StudyLocus,
+        sum_pips_lower_threshold: float = 0.99,
+        sum_pips_upper_threshold: float = 1.0001, # Set slightly above 1 to account for floating point errors
+    ) -> StudyLocus:
+        """Flag study loci whose summed posterior inclusion probabilities (PIPs) fall outside a given range.
+
+        Args:
+            sum_pips_lower_threshold (float): Lower threshold for the sum of PIPs; sums strictly below it are flagged.
+            sum_pips_upper_threshold (float): Upper threshold for the sum of PIPs; sums strictly above it are flagged.
+
+        Returns:
+            StudyLocus: Dataset with `qualityControls` updated via `update_quality_flag` (ABNORMAL_PIPS) for out-of-range loci; no rows are removed.
+        """
+        # `qualityControls` might not be present, so fall back to a null array<string> column:
+        qc_select_expression = (
+            f.col("qualityControls")
+            if "qualityControls" in self.df.columns
+            else f.lit(None).cast(ArrayType(StringType()))
+        )
+
+        flag = (self.df.withColumn(
+            "sumPosteriorProbability",
+            f.aggregate(
+                f.col("locus"),
+                f.lit(0.0),
+                lambda acc, x: acc + x["posteriorProbability"]
+            )).withColumn(
+                "pipOutOfRange",
+                f.when(
+                    (f.col("sumPosteriorProbability") < sum_pips_lower_threshold) |
+                    (f.col("sumPosteriorProbability") > sum_pips_upper_threshold),
+                    True
+                ).otherwise(False)))  # NOTE(review): a NULL sum presumably falls through to False (not flagged) - confirm
+
+        return StudyLocus(
+            _df=(flag
+                # Flagging loci whose sum of PIPs is out of range, then dropping the helper columns:
+                .withColumn(
+                    "qualityControls",
+                    self.update_quality_flag(
+                        qc_select_expression,
+                        f.col("pipOutOfRange"),
+                        StudyLocusQualityCheck.ABNORMAL_PIPS
+                    ),
+                ).drop("sumPosteriorProbability", "pipOutOfRange")),
+            _schema=self.get_schema()
+        )
+
     @staticmethod
     def _overlapping_peaks(
         credset_to_overlap: DataFrame, intra_study_overlap: bool = False

diff --git a/src/gentropy/study_locus_validation.py b/src/gentropy/study_locus_validation.py
@@ -45,6 +45,8 @@ def __init__(
             .annotate_study_type(study_index)  # Add study type to study locus
             .qc_redundant_top_hits_from_PICS()  # Flagging top hits from studies with PICS summary statistics
             .qc_explained_by_SuSiE()  # Flagging credible sets in regions explained by SuSiE
+            # Flagging credible sets whose sum of PIPs is < 0.99 or > 1.0001 (tolerance for floating point error)
+            .qc_abnormal_pips(sum_pips_lower_threshold=0.99,sum_pips_upper_threshold=1.0001)
             # Annotates credible intervals and filter to only keep 99% credible sets
             .filter_credible_set(credible_interval=CredibleInterval.IS99)
             # Annotate credible set confidence: