Skip to content

Commit

Permalink
feat: flag and filter credible sets (#879)
Browse files Browse the repository at this point in the history
* feat(flag_and_filter_credible_sets): add code for identifying abnormal credible sets

* feat(flag_and_filter_credible_sets): restructure tests

* chore(flag_and_filter_credible_sets): update schema for spark loading

* feat(flag_and_filter_credible_sets): first complete draft filtering abnormal pips code

* fix(flag_and_filter_credible_sets): tweak broken code and unresolved merge

* fix(flag_and_filter_credible_sets): amend test logic

* fix(flag_and_filter_credible_sets): modify logic to simplify and account for floating point errors

* chore(flag_and_filter_credible_sets): simplify logic to boolean
  • Loading branch information
Tobi1kenobi authored Oct 28, 2024
1 parent cbbf3c5 commit d12d65d
Show file tree
Hide file tree
Showing 3 changed files with 322 additions and 221 deletions.
51 changes: 51 additions & 0 deletions src/gentropy/dataset/study_locus.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ class StudyLocusQualityCheck(Enum):
IN_MHC (str): Flagging study loci in the MHC region
REDUNDANT_PICS_TOP_HIT (str): Flagging study loci in studies with PICS results from summary statistics
EXPLAINED_BY_SUSIE (str): Study locus in region explained by a SuSiE credible set
ABNORMAL_PIPS (str): Flagging study loci with a sum of PIPs that are not in [0.99,1]
OUT_OF_SAMPLE_LD (str): Study locus finemapped without in-sample LD reference
INVALID_CHROMOSOME (str): Chromosome not in 1:22, X, Y, XY or MT
"""
Expand Down Expand Up @@ -113,6 +114,7 @@ class StudyLocusQualityCheck(Enum):
TOP_HIT = "Study locus from curated top hit"
EXPLAINED_BY_SUSIE = "Study locus in region explained by a SuSiE credible set"
OUT_OF_SAMPLE_LD = "Study locus finemapped without in-sample LD reference"
ABNORMAL_PIPS = "Study locus with a sum of PIPs that not in the expected range [0.99,1]"
INVALID_CHROMOSOME = "Chromosome not in 1:22, X, Y, XY or MT"


Expand Down Expand Up @@ -391,6 +393,55 @@ def _qc_subsignificant_associations(
StudyLocusQualityCheck.SUBSIGNIFICANT_FLAG,
)

def qc_abnormal_pips(
    self: StudyLocus,
    sum_pips_lower_threshold: float = 0.99,
    sum_pips_upper_threshold: float = 1.0001,  # Set slightly above 1 to account for floating point errors
) -> StudyLocus:
    """Flag study loci whose summed posterior inclusion probabilities (PIPs) are out of range.

    For every row, the `posteriorProbability` values of the `locus` array are added up.
    Rows whose total is strictly below the lower threshold or strictly above the upper
    threshold are passed to `update_quality_flag` together with
    `StudyLocusQualityCheck.ABNORMAL_PIPS`. No rows are removed, and rows for which the
    range comparison does not evaluate to true (e.g. a null sum) are treated as in range.

    Args:
        sum_pips_lower_threshold (float): Lower threshold for the sum of PIPs.
        sum_pips_upper_threshold (float): Upper threshold for the sum of PIPs.

    Returns:
        StudyLocus: Study-locus dataset with the `qualityControls` column updated.
    """
    # The QC column is optional on the input; fall back to a typed null array when absent.
    if "qualityControls" in self.df.columns:
        existing_qc = f.col("qualityControls")
    else:
        existing_qc = f.lit(None).cast(ArrayType(StringType()))

    # Fold the locus array into a single total PIP per row.
    pip_total = f.aggregate(
        f.col("locus"),
        f.lit(0.0),
        lambda running, variant: running + variant["posteriorProbability"],
    )

    # Boolean marker: true only when the total is outside [lower, upper].
    total = f.col("sumPosteriorProbability")
    too_low = total < sum_pips_lower_threshold
    too_high = total > sum_pips_upper_threshold
    out_of_range = f.when(too_low | too_high, True).otherwise(False)

    annotated = self.df.withColumn(
        "sumPosteriorProbability", pip_total
    ).withColumn("pipOutOfRange", out_of_range)

    # Record the ABNORMAL_PIPS check on the rows marked as out of range.
    flagged = annotated.withColumn(
        "qualityControls",
        self.update_quality_flag(
            existing_qc,
            f.col("pipOutOfRange"),
            StudyLocusQualityCheck.ABNORMAL_PIPS,
        ),
    )

    # The helper columns are internal to this check and must not leak into the dataset.
    return StudyLocus(
        _df=flagged.drop("sumPosteriorProbability", "pipOutOfRange"),
        _schema=self.get_schema(),
    )

@staticmethod
def _overlapping_peaks(
credset_to_overlap: DataFrame, intra_study_overlap: bool = False
Expand Down
2 changes: 2 additions & 0 deletions src/gentropy/study_locus_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ def __init__(
.annotate_study_type(study_index) # Add study type to study locus
.qc_redundant_top_hits_from_PICS() # Flagging top hits from studies with PICS summary statistics
.qc_explained_by_SuSiE() # Flagging credible sets in regions explained by SuSiE
# Flagging credible sets whose sum of PIPs is > 1.0001 or < 0.99
.qc_abnormal_pips(sum_pips_lower_threshold=0.99,sum_pips_upper_threshold=1.0001)
# Annotates credible intervals and filter to only keep 99% credible sets
.filter_credible_set(credible_interval=CredibleInterval.IS99)
# Annotate credible set confidence:
Expand Down
Loading

0 comments on commit d12d65d

Please sign in to comment.