-
Notifications
You must be signed in to change notification settings - Fork 9
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #150 from opentargets/tskir-3095-finngen-sumstat
[Preprocess #1] Business logic for FinnGen summary stats ingestion
- Loading branch information
Showing
9 changed files
with
146 additions
and
17 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
"""Summary statistics ingestion for FinnGen.""" | ||
|
||
from __future__ import annotations | ||
|
||
from dataclasses import dataclass | ||
from typing import TYPE_CHECKING | ||
|
||
import pyspark.sql.functions as f | ||
import pyspark.sql.types as t | ||
|
||
from otg.common.utils import calculate_confidence_interval, parse_pvalue | ||
from otg.dataset.summary_statistics import SummaryStatistics | ||
|
||
if TYPE_CHECKING: | ||
from pyspark.sql import DataFrame | ||
|
||
|
||
@dataclass
class FinnGenSummaryStats(SummaryStatistics):
    """FinnGen-specific summary statistics dataset."""

    @classmethod
    def from_finngen_harmonized_summary_stats(
        cls: type[FinnGenSummaryStats],
        summary_stats_df: DataFrame,
    ) -> FinnGenSummaryStats:
        """Ingest summary statistics for all FinnGen studies.

        Args:
            summary_stats_df (DataFrame): FinnGen summary statistics as read from
                the source files. The columns `#chrom`, `pos`, `ref`, `alt`,
                `pval`, `beta`, `sebeta` and `af_alt` are read from it.

        Returns:
            FinnGenSummaryStats: Dataset wrapping the processed dataframe together
                with the schema returned by `cls.get_schema()`.
        """
        # Integer view of the position; reused for filtering and for the output column.
        position = f.col("pos").cast(t.IntegerType())

        # The study ID is the upper-cased file name (without the `.gz` suffix)
        # taken from the full path of the file each row was read from.
        study_id = f.upper(
            f.regexp_extract(f.input_file_name(), r"([^/]+)\.gz", 1)
        )

        # Variant identifier assembled from the raw source columns.
        variant_id = f.concat_ws(
            "_",
            f.col("#chrom"),
            f.col("pos"),
            f.col("ref"),
            f.col("alt"),
        )

        selected_columns = [
            study_id.alias("studyId"),
            variant_id.alias("variantId"),
            f.col("#chrom").alias("chromosome"),
            position.alias("position"),
            # NOTE(review): presumably yields the pValueMantissa / pValueExponent
            # columns referenced further down — confirm against parse_pvalue.
            *parse_pvalue(f.col("pval")),
            # Beta, standard error and allele frequency.
            f.col("beta").cast("double"),
            f.col("sebeta").cast("double").alias("standardError"),
            f.col("af_alt").cast("float").alias("effectAlleleFrequencyFromSource"),
        ]

        # Rows whose position cannot be cast to an integer are dropped first.
        with_valid_position = summary_stats_df.filter(position.isNotNull())
        harmonised_df = with_valid_position.select(*selected_columns)

        # Append the confidence interval columns to everything selected so far.
        confidence_interval_columns = calculate_confidence_interval(
            f.col("pValueMantissa"),
            f.col("pValueExponent"),
            f.col("beta"),
            f.col("standardError"),
        )
        processed_summary_stats_df = harmonised_df.select(
            "*", *confidence_interval_columns
        )

        # Wrap the dataframe in the summary statistics object.
        return cls(
            _df=processed_summary_stats_df,
            _schema=cls.get_schema(),
        )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
"""Tests for summary statistics dataset from FinnGen.""" | ||
|
||
from __future__ import annotations | ||
|
||
from pyspark.sql import DataFrame | ||
|
||
from otg.dataset.summary_statistics import SummaryStatistics | ||
from otg.datasource.finngen.summary_stats import FinnGenSummaryStats | ||
|
||
|
||
def test_finngen_summary_stats_from_source(
    sample_finngen_summary_stats: DataFrame,
) -> None:
    """Check that FinnGen ingestion produces a SummaryStatistics instance."""
    ingested = FinnGenSummaryStats.from_finngen_harmonized_summary_stats(
        sample_finngen_summary_stats
    )
    assert isinstance(ingested, SummaryStatistics)