feat: changing studylocus validation to 95 percent credible sets (#921)

* feat: changing studylocus validation to 95 percent credible sets
* fix: updating comment in code to reflect 95% credset
* fix: removing credset number of partitions
* fix: flag name

---------

Co-authored-by: Yakov Tsepilov <yt4@sanger.ac.uk>
opentargets · Nov 21, 2024 · 05e47a3 · 05e47a3
1 parent b6303d5
commit 05e47a3
Show file tree

Hide file tree

Showing 2 changed files with 12 additions and 12 deletions.
diff --git a/src/gentropy/dataset/study_locus.py b/src/gentropy/dataset/study_locus.py
@@ -113,7 +113,7 @@ class StudyLocusQualityCheck(Enum):
     EXPLAINED_BY_SUSIE = "Study locus in region explained by a SuSiE credible set"
     OUT_OF_SAMPLE_LD = "Study locus finemapped without in-sample LD reference"
     ABNORMAL_PIPS = (
-        "Study locus with a sum of PIPs that not in the expected range [0.99,1]"
+        "Study locus with a sum of PIPs that not in the expected range [0.95,1]"
     )
     INVALID_CHROMOSOME = "Chromosome not in 1:22, X, Y, XY or MT"
     TOP_HIT_AND_SUMMARY_STATS = (

diff --git a/src/gentropy/study_locus_validation.py b/src/gentropy/study_locus_validation.py
@@ -46,24 +46,24 @@ def __init__(
             .annotate_study_type(study_index)  # Add study type to study locus
             .qc_redundant_top_hits_from_PICS()  # Flagging top hits from studies with PICS summary statistics
             .qc_explained_by_SuSiE()  # Flagging credible sets in regions explained by SuSiE
-            # Flagging credible sets with PIP > 1 or PIP < 0.99
+            # Annotates credible intervals and filter to only keep 95% credible sets
+            .filter_credible_set(credible_interval=CredibleInterval.IS95)
+            # Flagging credible sets with PIP > 1 or PIP < 0.95
             .qc_abnormal_pips(
-                sum_pips_lower_threshold=0.99, sum_pips_upper_threshold=1.0001
+                sum_pips_lower_threshold=0.95, sum_pips_upper_threshold=1.0001
             )
-            # Annotates credible intervals and filter to only keep 99% credible sets
-            .filter_credible_set(credible_interval=CredibleInterval.IS99)
             # Annotate credible set confidence:
             .assign_confidence()
         ).persist()  # we will need this for 2 types of outputs
 
         # Valid study locus partitioned to simplify the finding of overlaps
-        study_locus_with_qc.valid_rows(
-            invalid_qc_reasons, invalid=True
-        ).df.repartitionByRange("chromosome", "position").sortWithinPartitions(
+        study_locus_with_qc.valid_rows(invalid_qc_reasons).df.repartitionByRange(
             "chromosome", "position"
-        ).write.mode(session.write_mode).parquet(invalid_study_locus_path)
-
-        # Infalid study locus
-        study_locus_with_qc.valid_rows(invalid_qc_reasons).df.write.mode(
+        ).sortWithinPartitions("chromosome", "position").write.mode(
             session.write_mode
         ).parquet(valid_study_locus_path)
+
+        # Invalid study locus
+        study_locus_with_qc.valid_rows(invalid_qc_reasons, invalid=True).df.write.mode(
+            session.write_mode
+        ).parquet(invalid_study_locus_path)