Skip to content

Commit

Permalink
fix: resolving diverged remote
Browse files Browse the repository at this point in the history
  • Loading branch information
DSuveges committed Dec 1, 2023
2 parents 18fa777 + 8a6699d commit c95bbec
Show file tree
Hide file tree
Showing 3 changed files with 77 additions and 3 deletions.
8 changes: 6 additions & 2 deletions src/otg/clump.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,9 @@ def __post_init__(self: ClumpStep) -> None:
Raises:
ValueError: If study index and LD index paths are not provided for study locus.
"""
input_cols = self.session.spark.read.parquet(self.input_path).columns
input_cols = self.session.spark.read.parquet(
self.input_path, recursiveFileLookup=True
).columns
if "studyLocusId" in input_cols:
if self.study_index_path is None or self.ld_index_path is None:
raise ValueError(
Expand All @@ -59,7 +61,9 @@ def __post_init__(self: ClumpStep) -> None:
study_index=study_index, ld_index=ld_index
).clump()
else:
sumstats = SummaryStatistics.from_parquet(self.session, self.input_path)
sumstats = SummaryStatistics.from_parquet(
self.session, self.input_path, recursiveFileLookup=True
)
clumped_study_locus = sumstats.window_based_clumping(
locus_collect_distance=self.locus_collect_distance
)
Expand Down
2 changes: 1 addition & 1 deletion src/otg/datasource/gwas_catalog/study_splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def _resolve_study_id(study_id: Column, sub_study_description: Column) -> Column
"""
split_w = Window.partitionBy(study_id).orderBy(sub_study_description)
row_number = f.dense_rank().over(split_w)
substudy_count = f.count(row_number).over(split_w)
substudy_count = f.approx_count_distinct(row_number).over(split_w)
return f.when(substudy_count == 1, study_id).otherwise(
f.concat_ws("_", study_id, row_number)
)
Expand Down
70 changes: 70 additions & 0 deletions tests/datasource/gwas_catalog/test_gwas_catalog_study_splitter.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,18 @@
"""Tests GWAS Catalog study splitter."""
from __future__ import annotations

from typing import TYPE_CHECKING, Any

import pyspark.sql.functions as f
import pytest

from otg.datasource.gwas_catalog.associations import GWASCatalogAssociations
from otg.datasource.gwas_catalog.study_index import GWASCatalogStudyIndex
from otg.datasource.gwas_catalog.study_splitter import GWASCatalogStudySplitter

if TYPE_CHECKING:
from pyspark.sql import SparkSession


def test_gwas_catalog_splitter_split(
mock_study_index_gwas_catalog: GWASCatalogStudyIndex,
Expand All @@ -17,3 +25,65 @@ def test_gwas_catalog_splitter_split(

assert isinstance(d1, GWASCatalogStudyIndex)
assert isinstance(d2, GWASCatalogAssociations)


@pytest.mark.parametrize(
"observed, expected",
[
# Test 1 - it shouldn't split
(
# observed - 2 associations with the same subStudy annotation
[
(
"varA",
"GCST003436",
"Endometrial cancer|no_pvalue_text|EFO_1001512",
),
(
"varB",
"GCST003436",
"Endometrial cancer|no_pvalue_text|EFO_1001512",
),
],
# expected - 2 associations with the same unsplit updatedStudyId
[
("GCST003436",),
("GCST003436",),
],
),
# Test 2 - it should split
(
# observed - 2 associations with the different subStudy annotation
[
(
"varA",
"GCST003436",
"Endometrial cancer|no_pvalue_text|EFO_1001512",
),
(
"varB",
"GCST003436",
"Uterine carcinoma|no_pvalue_text|EFO_0002919",
),
],
# expected - 2 associations with the same unsplit updatedStudyId
[
("GCST003436",),
("GCST003436_2",),
],
),
],
)
def test__resolve_study_id(
spark: SparkSession, observed: list[Any], expected: list[Any]
) -> None:
"""Test _resolve_study_id."""
observed_df = spark.createDataFrame(
observed, schema=["variantId", "studyId", "subStudyDescription"]
).select(
GWASCatalogStudySplitter._resolve_study_id(
f.col("studyId"), f.col("subStudyDescription").alias("updatedStudyId")
)
)
expected_df = spark.createDataFrame(expected, schema=["updatedStudyId"])
assert observed_df.collect() == expected_df.collect()

0 comments on commit c95bbec

Please sign in to comment.