Skip to content

Commit

Permalink
fix(ukb_ppp_study_index): update column name to match schema
Browse files Browse the repository at this point in the history
  • Loading branch information
Szymon Szyszkowski committed Oct 1, 2024
1 parent c3b8c2c commit ce9ae03
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 19 deletions.
2 changes: 1 addition & 1 deletion notebooks/Release_QC_metrics.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -419,7 +419,7 @@
"# Number of studies\n",
"eqtl_index=session.spark.read.parquet(eqtl_index_path, recursiveFileLookup=True)\n",
"# Number of tissues, list of tissues\n",
"#eqtl_index.select(f.col(\"tissueFromSourceId\")).distinct().show(truncate=False)\n",
"#eqtl_index.select(f.col(\"biosampleFromSourceId\")).distinct().show(truncate=False)\n",
"\n",
"# Credible_set. Please use Daniels’ notebook as a reference. For each subfolder:\n",
"# eqtl catalog susie:\n",
Expand Down
31 changes: 13 additions & 18 deletions src/gentropy/datasource/ukb_ppp_eur/study_index.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Study Index for Finngen data source."""

from __future__ import annotations

import pyspark.sql.functions as f
Expand Down Expand Up @@ -29,9 +30,7 @@ def from_source(
"""
# In order to populate the nSamples column, we need to peek inside the summary stats dataframe.
num_of_samples = (
spark
.read
.parquet(raw_summary_stats_path)
spark.read.parquet(raw_summary_stats_path)
.filter(f.col("chromosome") == "22")
.groupBy("studyId")
.agg(f.first("N").cast("integer").alias("nSamples"))
Expand All @@ -45,29 +44,25 @@ def from_source(
f.lit("UKB_PPP_EUR").alias("projectId"),
f.col("_gentropy_study_id").alias("studyId"),
f.col("UKBPPP_ProteinID").alias("traitFromSource"),
f.lit("UBERON_0001969").alias("tissueFromSourceId"),
f.lit("UBERON_0001969").alias("biosampleFromSourceId"),
f.col("ensembl_id").alias("geneId"),
f.lit(True).alias("hasSumstats"),
f.col("_gentropy_summary_stats_link").alias("summarystatsLocation"),
)
.join(num_of_samples, "studyId", "inner")
)
# Add population structure.
study_index_df = (
study_index_df
.withColumn(
"discoverySamples",
f.array(
f.struct(
f.col("nSamples").cast("integer").alias("sampleSize"),
f.lit("European").alias("ancestry"),
)
study_index_df = study_index_df.withColumn(
"discoverySamples",
f.array(
f.struct(
f.col("nSamples").cast("integer").alias("sampleSize"),
f.lit("European").alias("ancestry"),
)
)
.withColumn(
"ldPopulationStructure",
cls.aggregate_and_map_ancestries(f.col("discoverySamples")),
)
),
).withColumn(
"ldPopulationStructure",
cls.aggregate_and_map_ancestries(f.col("discoverySamples")),
)

return StudyIndex(
Expand Down

0 comments on commit ce9ae03

Please sign in to comment.