Skip to content

Commit 5d47819

Browse files
authored
Merge branch 'main' into il-l2g-benchmark
2 parents 5755dbe + 6d092ae commit 5d47819

File tree

9 files changed

+157
-69
lines changed

9 files changed

+157
-69
lines changed

config/datasets/gcp.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ colocalisation: ${datasets.outputs}/colocalisation
3737
v2g: ${datasets.outputs}/v2g
3838
ld_index: ${datasets.outputs}/ld_index
3939
catalog_study_index: ${datasets.outputs}/catalog_study_index
40-
catalog_study_locus: ${datasets.study_locus}/catalog_study_locus
40+
catalog_study_locus: ${datasets.study_locus}/catalog_curated
4141
finngen_study_index: ${datasets.outputs}/finngen_study_index
4242
finngen_summary_stats: ${datasets.outputs}/finngen_summary_stats
4343
from_sumstats_study_locus: ${datasets.study_locus}/from_sumstats

config/step/gwas_catalog.yaml renamed to config/step/gwas_catalog_ingestion.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,5 @@ catalog_ancestry_files: ${datasets.catalog_ancestries}
44
catalog_associations_file: ${datasets.catalog_associations}
55
catalog_sumstats_lut: ${datasets.catalog_sumstats_lut}
66
variant_annotation_path: ${datasets.variant_annotation}
7-
ld_index_path: ${datasets.ld_index}
87
catalog_studies_out: ${datasets.catalog_study_index}
98
catalog_associations_out: ${datasets.catalog_study_locus}

docs/python_api/step/gwas_catalog.md

Lines changed: 0 additions & 5 deletions
This file was deleted.
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
title: GWAS Catalog
3+
---
4+
5+
::: otg.gwas_catalog_ingestion.GWASCatalogIngestionStep

poetry.lock

Lines changed: 47 additions & 43 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,16 +18,16 @@ scipy = "^1.11.4"
1818
hydra-core = "^1.3.2"
1919
pyliftover = "^0.4"
2020
xgboost = "^1.7.3"
21-
numpy = "^1.26.1"
21+
numpy = "^1.26.2"
2222
hail = "0.2.126"
2323
wandb = "^0.16.1"
2424
google = "^3.0.0"
2525
omegaconf = "^2.3.0"
26-
typing-extensions = "^4.8.0"
26+
typing-extensions = "^4.9.0"
2727
scikit-learn = "^1.3.2"
2828

2929
[tool.poetry.dev-dependencies]
30-
pre-commit = "^3.5.0"
30+
pre-commit = "^3.6.0"
3131
black = {version = "^22.12.0", allow-prereleases = true}
3232
mypy = "^1.7"
3333
pep8-naming = "^0.13.2"

src/airflow/dags/common_airflow.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -219,7 +219,7 @@ def submit_step(
219219
task_id = step_id
220220
return submit_pyspark_job(
221221
cluster_name=cluster_name,
222-
task_id=step_id,
222+
task_id=task_id,
223223
python_module_path=f"{INITIALISATION_BASE_PATH}/{PYTHON_CLI}",
224224
trigger_rule=trigger_rule,
225225
args=[f"step={step_id}"]
Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
"""Airflow DAG for the preprocessing of GWAS Catalog's harmonised summary statistics and curated associations."""
2+
from __future__ import annotations
3+
4+
from pathlib import Path
5+
6+
import common_airflow as common
7+
from airflow.models.dag import DAG
8+
from airflow.utils.task_group import TaskGroup
9+
from airflow.utils.trigger_rule import TriggerRule
10+
11+
CLUSTER_NAME = "otg-preprocess-gwascatalog"
12+
AUTOSCALING = "otg-preprocess-gwascatalog"
13+
14+
SUMSTATS = "gs://open-targets-gwas-summary-stats/harmonised"
15+
RELEASEBUCKET = "gs://genetics_etl_python_playground/output/python_etl/parquet/XX.XX"
16+
17+
with DAG(
18+
dag_id=Path(__file__).stem,
19+
description="Open Targets Genetics — GWAS Catalog preprocess",
20+
default_args=common.shared_dag_args,
21+
**common.shared_dag_kwargs,
22+
):
23+
with TaskGroup(group_id="summary_stats_preprocessing") as summary_stats_group:
24+
summary_stats_window_clumping = common.submit_step(
25+
cluster_name=CLUSTER_NAME,
26+
step_id="clump",
27+
task_id="catalog_sumstats_window_clumping",
28+
other_args=[
29+
f"step.input_path={SUMSTATS}",
30+
f"step.clumped_study_locus_path={RELEASEBUCKET}/study_locus/window_clumped/from_sumstats/catalog",
31+
],
32+
)
33+
summary_stats_ld_clumping = common.submit_step(
34+
cluster_name=CLUSTER_NAME,
35+
step_id="clump",
36+
task_id="catalog_sumstats_ld_clumping",
37+
other_args=[
38+
f"step.input_path={RELEASEBUCKET}/study_locus/window_clumped/from_sumstats/catalog",
39+
"step.ld_index_path={RELEASEBUCKET}/ld_index",
40+
"step.study_index_path={RELEASEBUCKET}/study_index/catalog",
41+
"step.clumped_study_locus_path={RELEASEBUCKET}/study_locus/ld_clumped/from_sumstats/catalog",
42+
],
43+
trigger_rule=TriggerRule.ALL_DONE,
44+
)
45+
summary_stats_pics = common.submit_step(
46+
cluster_name=CLUSTER_NAME,
47+
step_id="pics",
48+
task_id="catalog_sumstats_pics",
49+
other_args=[
50+
"step.study_locus_ld_annotated_in={RELEASEBUCKET}/study_locus/ld_clumped/from_sumstats/catalog",
51+
"step.picsed_study_locus_out={RELEASEBUCKET}/credible_set/from_sumstats/catalog",
52+
],
53+
trigger_rule=TriggerRule.ALL_DONE,
54+
)
55+
summary_stats_window_clumping >> summary_stats_ld_clumping >> summary_stats_pics
56+
57+
with TaskGroup(group_id="curation_preprocessing") as curation_group:
58+
parse_study_and_curated_assocs = common.submit_step(
59+
cluster_name=CLUSTER_NAME,
60+
step_id="gwas_catalog_ingestion",
61+
task_id="catalog_ingestion",
62+
)
63+
64+
curation_ld_clumping = common.submit_step(
65+
cluster_name=CLUSTER_NAME,
66+
step_id="clump",
67+
task_id="catalog_curation_ld_clumping",
68+
other_args=[
69+
"step.input_path={RELEASEBUCKET}/study_locus/catalog_curated",
70+
"step.ld_index_path={RELEASEBUCKET}/ld_index",
71+
"step.study_index_path={RELEASEBUCKET}/study_index/catalog",
72+
"step.clumped_study_locus_path={RELEASEBUCKET}/study_locus/ld_clumped/catalog_curated",
73+
],
74+
trigger_rule=TriggerRule.ALL_DONE,
75+
)
76+
77+
curation_pics = common.submit_step(
78+
cluster_name=CLUSTER_NAME,
79+
step_id="pics",
80+
task_id="catalog_curation_pics",
81+
other_args=[
82+
"step.study_locus_ld_annotated_in={RELEASEBUCKET}/study_locus/ld_clumped/catalog_curated",
83+
"step.picsed_study_locus_out={RELEASEBUCKET}/credible_set/catalog_curated",
84+
],
85+
trigger_rule=TriggerRule.ALL_DONE,
86+
)
87+
parse_study_and_curated_assocs >> curation_ld_clumping >> curation_pics
88+
89+
(
90+
common.create_cluster(
91+
CLUSTER_NAME, autoscaling_policy=AUTOSCALING, num_workers=5
92+
)
93+
>> common.install_dependencies(CLUSTER_NAME)
94+
>> [summary_stats_group, curation_group]
95+
>> common.delete_cluster(CLUSTER_NAME)
96+
)

0 commit comments

Comments
 (0)