Skip to content

Commit f353ed1

Browse files
author
Szymon Szyszkowski
committed
fix(ukb_ppp_study_index): update column name to match schema
1 parent a5588ae commit f353ed1

File tree

2 files changed

+14
-19
lines changed

2 files changed

+14
-19
lines changed

notebooks/Release_QC_metrics.ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -419,7 +419,7 @@
419419
"# Number of studies\n",
420420
"eqtl_index=session.spark.read.parquet(eqtl_index_path, recursiveFileLookup=True)\n",
421421
"# Number of tissues, list of tissues\n",
422-
"#eqtl_index.select(f.col(\"tissueFromSourceId\")).distinct().show(truncate=False)\n",
422+
"#eqtl_index.select(f.col(\"biosampleFromSourceId\")).distinct().show(truncate=False)\n",
423423
"\n",
424424
"# Credible_set. Please use Daniels’ notebook as a reference. For each subfolder:\n",
425425
"# eqtl catalog susie:\n",

src/gentropy/datasource/ukb_ppp_eur/study_index.py

Lines changed: 13 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
"""Study Index for UKB PPP (EUR) data source."""
2+
23
from __future__ import annotations
34

45
import pyspark.sql.functions as f
@@ -29,9 +30,7 @@ def from_source(
2930
"""
3031
# In order to populate the nSamples column, we need to peek inside the summary stats dataframe.
3132
num_of_samples = (
32-
spark
33-
.read
34-
.parquet(raw_summary_stats_path)
33+
spark.read.parquet(raw_summary_stats_path)
3534
.filter(f.col("chromosome") == "22")
3635
.groupBy("studyId")
3736
.agg(f.first("N").cast("integer").alias("nSamples"))
@@ -45,29 +44,25 @@ def from_source(
4544
f.lit("UKB_PPP_EUR").alias("projectId"),
4645
f.col("_gentropy_study_id").alias("studyId"),
4746
f.col("UKBPPP_ProteinID").alias("traitFromSource"),
48-
f.lit("UBERON_0001969").alias("tissueFromSourceId"),
47+
f.lit("UBERON_0001969").alias("biosampleFromSourceId"),
4948
f.col("ensembl_id").alias("geneId"),
5049
f.lit(True).alias("hasSumstats"),
5150
f.col("_gentropy_summary_stats_link").alias("summarystatsLocation"),
5251
)
5352
.join(num_of_samples, "studyId", "inner")
5453
)
5554
# Add population structure.
56-
study_index_df = (
57-
study_index_df
58-
.withColumn(
59-
"discoverySamples",
60-
f.array(
61-
f.struct(
62-
f.col("nSamples").cast("integer").alias("sampleSize"),
63-
f.lit("European").alias("ancestry"),
64-
)
55+
study_index_df = study_index_df.withColumn(
56+
"discoverySamples",
57+
f.array(
58+
f.struct(
59+
f.col("nSamples").cast("integer").alias("sampleSize"),
60+
f.lit("European").alias("ancestry"),
6561
)
66-
)
67-
.withColumn(
68-
"ldPopulationStructure",
69-
cls.aggregate_and_map_ancestries(f.col("discoverySamples")),
70-
)
62+
),
63+
).withColumn(
64+
"ldPopulationStructure",
65+
cls.aggregate_and_map_ancestries(f.col("discoverySamples")),
7166
)
7267

7368
return StudyIndex(

0 commit comments

Comments
 (0)