From 04b30815ea2a1e19157ee99e4a56db36060d9b05 Mon Sep 17 00:00:00 2001 From: Daniel Considine Date: Mon, 16 Dec 2024 21:34:25 +0000 Subject: [PATCH 1/2] fix: repair SusieFinemapperStep to work with new SL schema and fix locus missing the lead variantID --- src/gentropy/susie_finemapper.py | 62 ++++++++++++++++++++++++++++++-- 1 file changed, 60 insertions(+), 2 deletions(-) diff --git a/src/gentropy/susie_finemapper.py b/src/gentropy/susie_finemapper.py index 94ad918a5..30b416f77 100644 --- a/src/gentropy/susie_finemapper.py +++ b/src/gentropy/susie_finemapper.py @@ -13,7 +13,9 @@ from pyspark.sql import DataFrame, Row, Window from pyspark.sql.functions import desc, row_number from pyspark.sql.types import ( + ArrayType, DoubleType, + FloatType, IntegerType, StringType, StructField, @@ -273,7 +275,7 @@ def susie_inf_to_studylocus( # noqa: C901 ), "variantId", ) - .sort(f.desc("posteriorProbability")) + .sort(f.desc(f.col("posteriorProbability").cast("double"))) .withColumn( "locus", f.collect_list( @@ -896,7 +898,63 @@ def susie_finemapper_one_sl_row_gathered_boundaries( # noqa: C901 logging.warning("Analysis Flags check failed for this study") return None - schema = StudyLocus.get_schema() + schema = StructType( + [ + StructField("studyLocusId", StringType(), True), + StructField("studyType", StringType(), True), + StructField("variantId", StringType(), True), + StructField("chromosome", StringType(), True), + StructField("position", IntegerType(), True), + StructField("region", StringType(), True), + StructField("studyId", StringType(), True), + StructField("beta", DoubleType(), True), + StructField("zScore", DoubleType(), True), + StructField("pValueMantissa", FloatType(), True), + StructField("pValueExponent", IntegerType(), True), + StructField("effectAlleleFrequencyFromSource", FloatType(), True), + StructField("standardError", DoubleType(), True), + StructField("subStudyDescription", StringType(), True), + StructField("qualityControls", ArrayType(StringType(), True), True), + StructField("finemappingMethod", StringType(), True), + StructField("credibleSetIndex", IntegerType(), True), + StructField("credibleSetlog10BF", DoubleType(), True), + StructField("purityMeanR2", DoubleType(), True), + StructField("purityMinR2", DoubleType(), True), + StructField("locusStart", IntegerType(), True), + StructField("locusEnd", IntegerType(), True), + StructField("sampleSize", IntegerType(), True), + StructField( + "ldSet", + ArrayType( + StructType( + [ + StructField("tagVariantId", StringType(), True), + StructField("r2Overall", DoubleType(), True), + ] + ), + True, + ), + True, + ), + StructField( + "locus", + ArrayType( + StructType( + [ + StructField("variantId", StringType(), True), + StructField("beta", DoubleType(), True), + StructField("pValueMantissa", FloatType(), True), + StructField("pValueExponent", IntegerType(), True), + StructField("standardError", DoubleType(), True), + ] + ), + False, + ), + True, + ), + StructField("confidence", StringType(), True), + ] + ) gwas_df = session.spark.createDataFrame([study_locus_row], schema=schema) exploded_df = gwas_df.select(f.explode("locus").alias("locus")) From 4f8afaa6d9f5c93a5f43ffa47a6e4f8cdebbeaae Mon Sep 17 00:00:00 2001 From: Daniel Considine Date: Wed, 18 Dec 2024 15:25:46 +0000 Subject: [PATCH 2/2] fix: changing schema --- src/gentropy/susie_finemapper.py | 61 ++------------------------------ 1 file changed, 2 insertions(+), 59 deletions(-) diff --git a/src/gentropy/susie_finemapper.py b/src/gentropy/susie_finemapper.py index 30b416f77..dd8554bef 100644 --- a/src/gentropy/susie_finemapper.py +++ b/src/gentropy/susie_finemapper.py @@ -13,9 +13,7 @@ from pyspark.sql import DataFrame, Row, Window from pyspark.sql.functions import desc, row_number from pyspark.sql.types import ( - ArrayType, DoubleType, - FloatType, IntegerType, StringType, StructField, @@ -898,64 +896,9 @@ def susie_finemapper_one_sl_row_gathered_boundaries( # noqa: C901 logging.warning("Analysis Flags check failed for this study") return None - schema = StructType( - [ - StructField("studyLocusId", StringType(), True), - StructField("studyType", StringType(), True), - StructField("variantId", StringType(), True), - StructField("chromosome", StringType(), True), - StructField("position", IntegerType(), True), - StructField("region", StringType(), True), - StructField("studyId", StringType(), True), - StructField("beta", DoubleType(), True), - StructField("zScore", DoubleType(), True), - StructField("pValueMantissa", FloatType(), True), - StructField("pValueExponent", IntegerType(), True), - StructField("effectAlleleFrequencyFromSource", FloatType(), True), - StructField("standardError", DoubleType(), True), - StructField("subStudyDescription", StringType(), True), - StructField("qualityControls", ArrayType(StringType(), True), True), - StructField("finemappingMethod", StringType(), True), - StructField("credibleSetIndex", IntegerType(), True), - StructField("credibleSetlog10BF", DoubleType(), True), - StructField("purityMeanR2", DoubleType(), True), - StructField("purityMinR2", DoubleType(), True), - StructField("locusStart", IntegerType(), True), - StructField("locusEnd", IntegerType(), True), - StructField("sampleSize", IntegerType(), True), - StructField( - "ldSet", - ArrayType( - StructType( - [ - StructField("tagVariantId", StringType(), True), - StructField("r2Overall", DoubleType(), True), - ] - ), - True, - ), - True, - ), - StructField( - "locus", - ArrayType( - StructType( - [ - StructField("variantId", StringType(), True), - StructField("beta", DoubleType(), True), - StructField("pValueMantissa", FloatType(), True), - StructField("pValueExponent", IntegerType(), True), - StructField("standardError", DoubleType(), True), - ] - ), - False, - ), - True, - ), - StructField("confidence", StringType(), True), - ] + gwas_df = session.spark.createDataFrame( + [study_locus_row], StudyLocus.get_schema() ) - gwas_df = session.spark.createDataFrame([study_locus_row], schema=schema) exploded_df = gwas_df.select(f.explode("locus").alias("locus")) result_df = exploded_df.select(