From ce9ae033bc11af0aa76d0f27ffa9d31a7509a869 Mon Sep 17 00:00:00 2001 From: Szymon Szyszkowski Date: Tue, 1 Oct 2024 14:27:58 +0100 Subject: [PATCH] fix(ukb_ppp_study_index): update column name to match schema --- notebooks/Release_QC_metrics.ipynb | 2 +- .../datasource/ukb_ppp_eur/study_index.py | 31 ++++++++----------- 2 files changed, 14 insertions(+), 19 deletions(-) diff --git a/notebooks/Release_QC_metrics.ipynb b/notebooks/Release_QC_metrics.ipynb index 4eb27015b..5f9bf77c0 100644 --- a/notebooks/Release_QC_metrics.ipynb +++ b/notebooks/Release_QC_metrics.ipynb @@ -419,7 +419,7 @@ "# Number of studies\n", "eqtl_index=session.spark.read.parquet(eqtl_index_path, recursiveFileLookup=True)\n", "# Number of tissues, list of tissues\n", - "#eqtl_index.select(f.col(\"tissueFromSourceId\")).distinct().show(truncate=False)\n", + "#eqtl_index.select(f.col(\"biosampleFromSourceId\")).distinct().show(truncate=False)\n", "\n", "# Credible_set. Please use Daniels’ notebook as a reference. For each subfolder:\n", "# eqtl catalog susie:\n", diff --git a/src/gentropy/datasource/ukb_ppp_eur/study_index.py b/src/gentropy/datasource/ukb_ppp_eur/study_index.py index f694b9a47..8a3105f5d 100644 --- a/src/gentropy/datasource/ukb_ppp_eur/study_index.py +++ b/src/gentropy/datasource/ukb_ppp_eur/study_index.py @@ -1,4 +1,5 @@ """Study Index for Finngen data source.""" + from __future__ import annotations import pyspark.sql.functions as f @@ -29,9 +30,7 @@ def from_source( """ # In order to populate the nSamples column, we need to peek inside the summary stats dataframe. num_of_samples = ( - spark - .read - .parquet(raw_summary_stats_path) + spark.read.parquet(raw_summary_stats_path) .filter(f.col("chromosome") == "22") .groupBy("studyId") .agg(f.first("N").cast("integer").alias("nSamples")) @@ -45,7 +44,7 @@ def from_source( f.lit("UKB_PPP_EUR").alias("projectId"), f.col("_gentropy_study_id").alias("studyId"), f.col("UKBPPP_ProteinID").alias("traitFromSource"), - f.lit("UBERON_0001969").alias("tissueFromSourceId"), + f.lit("UBERON_0001969").alias("biosampleFromSourceId"), f.col("ensembl_id").alias("geneId"), f.lit(True).alias("hasSumstats"), f.col("_gentropy_summary_stats_link").alias("summarystatsLocation"), @@ -53,21 +52,17 @@ def from_source( .join(num_of_samples, "studyId", "inner") ) # Add population structure. - study_index_df = ( - study_index_df - .withColumn( - "discoverySamples", - f.array( - f.struct( - f.col("nSamples").cast("integer").alias("sampleSize"), - f.lit("European").alias("ancestry"), - ) + study_index_df = study_index_df.withColumn( + "discoverySamples", + f.array( + f.struct( + f.col("nSamples").cast("integer").alias("sampleSize"), + f.lit("European").alias("ancestry"), ) - ) - .withColumn( - "ldPopulationStructure", - cls.aggregate_and_map_ancestries(f.col("discoverySamples")), - ) + ), + ).withColumn( + "ldPopulationStructure", + cls.aggregate_and_map_ancestries(f.col("discoverySamples")), ) return StudyIndex(