
Commit de886c2

Merge branch 'dev' into spark-image
2 parents fdcc26a + d3eabd8 commit de886c2

21 files changed: +897 −56 lines

.github/workflows/artifact.yaml

Lines changed: 55 additions & 0 deletions
@@ -0,0 +1,55 @@
---
name: Build and Push to Artifact Registry

"on":
  push:
    branches: ["*"]
    tags: ["*"]
    paths:
      - images/**

env:
  PROJECT_ID: open-targets-genetics-dev
  REGION: europe-west1
  GAR_LOCATION: europe-west1-docker.pkg.dev/open-targets-genetics-dev
  REPOSITORY: gentropy-app
  PYTHON_VERSION_DEFAULT: "3.10.8"

jobs:
  build-push-artifact:
    runs-on: ubuntu-latest
    steps:
      - name: "Checkout"
        uses: "actions/checkout@v3"

      - name: "auth"
        uses: "google-github-actions/auth@v2"
        with:
          credentials_json: "${{ secrets.GC_SERVICE_ACCOUNT_KEY }}"

      - name: "Set up Cloud SDK"
        uses: "google-github-actions/setup-gcloud@v2"

      - name: "Use gcloud CLI"
        run: "gcloud info"

      - name: Set up QEMU
        uses: docker/setup-qemu-action@v3

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: "Docker auth"
        run: |-
          gcloud auth configure-docker ${{ env.REGION }}-docker.pkg.dev --quiet

      - name: Quick Docker build (gentropy image overloaded with orchestration logic, AMD64 only, with layer cache)
        uses: docker/build-push-action@v6
        with:
          platforms: linux/amd64
          push: true
          tags: "${{ env.GAR_LOCATION }}/${{ env.REPOSITORY }}/ot_gentropy:${{ github.ref_name }}"
          context: .
          cache-from: type=gha
          cache-to: type=gha,mode=max
          file: images/gentropy/Dockerfile
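
Images are tagged with `${{ github.ref_name }}`, so a branch or tag push that touches `images/**` maps directly to an image tag in Artifact Registry. A minimal sketch of the effect, assuming a hypothetical tag name:

```bash
# Hypothetical example: pushing a tag "v2.0.2" whose push includes changes under images/**
# should result in the workflow publishing:
#   europe-west1-docker.pkg.dev/open-targets-genetics-dev/gentropy-app/ot_gentropy:v2.0.2
git tag v2.0.2
git push origin v2.0.2
```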

.github/workflows/push_config.yaml

Lines changed: 0 additions & 17 deletions
This file was deleted.

Makefile

Lines changed: 15 additions & 0 deletions
@@ -55,3 +55,18 @@ upload-gwas-catalog-buckets-readme: ## Upload gwas_catalog readme to the bucket(
	@gsutil rsync docs/datasources/gwas_catalog_data gs://gwas_catalog_top_hits/docs

update-bucket-docs: upload-eqtl-catalogue-bucket-readme upload-ukb-ppp-bucket-readme upload-finngen-bucket-readme upload-gwas-catalog-buckets-readme ## Upload readmes to the datasource buckets
+
+
+build-gentropy-gcs-image: ## build image that overwrites gentropy with tools specific for orchestration and google cloud
+	@docker buildx build \
+		--platform=linux/amd64,linux/arm64 \
+		-t europe-west1-docker.pkg.dev/open-targets-genetics-dev/gentropy-app/ot_gentropy:dev \
+		--push \
+		-f images/gentropy/Dockerfile \
+		--no-cache .
+
+setup-harmonisation-test: ## Prepare the test bucket with raw summary statistics for the harmonisation test.
+	@gsutil rm gs://ot_orchestration/test/gwas_catalog_inputs/harmonisation_manifest.csv
+	@gsutil -m rm -r gs://ot_orchestration/test/gwas_catalog_inputs/harmonisation_summary
+	@gsutil -m rm -r gs://ot_orchestration/test/gwas_catalog_inputs/harmonised_summary_statistics
+	@gsutil -m rm -r gs://ot_orchestration/test/gwas_catalog_inputs/summary_statistics_qc
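
Both new targets are run from the repository root; a short usage sketch (the first target pushes the hard-coded `:dev` tag shown above):

```bash
# Build and push the multi-arch overwritten gentropy image, then reset the
# test bucket before exercising the harmonisation DAG.
make build-gentropy-gcs-image
make setup-harmonisation-test
```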

docs/README.md

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@

This catalog describes how the orchestration works in the current state

-### How to generate dag svg files
+## How to generate dag svg files

1. Locate your global `airflow.cfg` file and update the [core] dag_folder in `airflow.cfg` to point to the `src` directory of the orchestration repository or set the `AIRFLOW__CORE__DAGS_FOLDER` environment variable.
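
For reference, a minimal sketch of the two options from step 1 (the checkout path is illustrative):

```bash
# Option A: environment variable (path is illustrative)
export AIRFLOW__CORE__DAGS_FOLDER="$HOME/ot_orchestration/src"

# Option B: edit airflow.cfg instead
#   [core]
#   dags_folder = /home/<user>/ot_orchestration/src
```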

docs/datasources/gwas_catalog_data/README.md

Lines changed: 37 additions & 8 deletions
@@ -13,7 +13,7 @@ Data stored under 4 buckets:

Bucket `gs://gwas_catalog_inputs` contains:

-```
+```bash
gs://gwas_catalog_inputs/gwas_catalog_associations_ontology_annotated.tsv
gs://gwas_catalog_inputs/gwas_catalog_download_ancestries.tsv
gs://gwas_catalog_inputs/gwas_catalog_download_studies.tsv
@@ -61,7 +61,7 @@ as failing.
<details>
<summary>Expand to see the example of manifest file</summary>

-```
+```bash
rawSumstatPath,study,harmonisedSumstatPath,isHarmonised,qcPath,qcPerformed
gs://gwas_catalog_inputs/raw_summary_statistics/GCST000001-GCST001000/GCST000028/harmonised/17463246-GCST000028-EFO_0001360.h.tsv.gz,GCST000028,gs://gwas_catalog_inputs/harmonised_summary_statistics/GCST000028/,True,gs://gwas_catalog_inputs/summary_statistics_qc/GCST000028/,True
```
@@ -77,7 +77,7 @@ This is the dataset containing meta information about the status of finemapping.

The files are stored under the per study directory in the form like below:

-```
+```bash
gs://gwas_catalog_inputs/harmonisation_summary/GCST90077749/202410141529/harmonisation.csv
gs://gwas_catalog_inputs/harmonisation_summary/GCST90077749/202410141529/harmonisation.log
gs://gwas_catalog_inputs/harmonisation_summary/GCST90077749/latest/harmonisation.csv
@@ -102,7 +102,7 @@ The file reports following metrics:
<details>
<summary>Expand to see the example</summary>

-```
+```bash
study,harmonisationExitCode,qcExitCode,rawSumstatFile,rawSumstatFileSize,rawUnzippedSumstatFileSize
GCST90077749,0,1,gs://gwas_catalog_inputs/raw_summary_statistics/GCST90077001-GCST90078000/GCST90077749/harmonised/34662886-GCST90077749-EFO_1001919.h.tsv.gz,18M,62M
```
@@ -116,7 +116,7 @@ This file contains logs from the harmonisation script collected during it's exec
<details>
<summary>Expand to see the example</summary>

-```
+```bash
[2024.10.14 15:33] Copying raw summary statistics from gs://gwas_catalog_inputs/raw_summary_statistics/GCST90078001-GCST90079000/GCST90079000/harmonised/GCST90079000.h.tsv.gz to GCST90079000.h.tsv.gz
[2024.10.14 15:34] Raw file size 17M
[2024.10.14 15:34] Unzipping GCST90079000.h.tsv.gz to GCST90079000.h.tsv
@@ -179,11 +179,40 @@ datasets: {}

This directory contains various analysis performed on harmonisation results.

+## Gwas catalog harmonisation & qc dag
+
+The `gwas_catalog_harmonisation` dag performs the harmonisation and quality checks on the raw summary statistics. The dag configuration and topology can be found in the `gwas_catalog_harmonisation.yaml` file under the config directory. Since this task is computationally expensive, it is run in parallel by the google batch operators. The dag contains 2 steps:
+
+1. Harmonisation done by [gwas_catalog_sumstat_preprocess](https://opentargets.github.io/gentropy/python_api/steps/gwas_catalog_sumstat_preprocess/)
+2. Quality Control of the harmonised summary statistics done by [sumstat_qc_step](https://opentargets.github.io/gentropy/python_api/steps/summary_statistics_qc/)
+
+To run the dag, one needs to prepare the input files and the overwritten gentropy docker image.
+
+### Gentropy overwritten docker image
+
+The image in `/images/gentropy/Dockerfile` is based on the [gentropy image](https://github.com/opentargets/gentropy/blob/dev/Dockerfile). Additional packages are added to make it compatible with the Open Targets infrastructure in Google Cloud; these include:
+
+- google cloud sdk (with gsutil)
+- bash script to run the gentropy harmonisation pipeline
+
+> [!WARNING]
+> Before running the harmonisation pipeline (`gwas_catalog_harmonisation` dag) it is necessary to update the base docker container to reflect the changes in the `gentropy` image. This is done by running the `make build-gentropy-gcs-image` command in the root of the repository.
+
+## Gentropy image
+
+The image in this directory is based on the [gentropy image](https://github.com/opentargets/gentropy/blob/dev/Dockerfile). Additional packages are added to make it compatible with the Open Targets Platform; these include:
+
+- google cloud sdk (with gsutil)
+- bash script to run the gentropy harmonisation pipeline
+
+> [!WARNING]
+> Before running the harmonisation pipeline (`gwas_catalog_harmonisation` dag) it is necessary to update the base docker container to reflect the changes in the `gentropy` image. This is done by running the `make build-gentropy-gcs-image` command in the root of the repository.
+
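
A minimal sketch of the flow described in the warning above, assuming the repository root as the working directory; the Airflow trigger command is illustrative:

```bash
# Rebuild and push the overwritten gentropy image so the Batch jobs pick up
# the latest gentropy base, then trigger the harmonisation DAG.
make build-gentropy-gcs-image
airflow dags trigger gwas_catalog_harmonisation
```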
## GWAS Catalog top hits

Bucket `gs://gwas_catalog_top_hits` contains:

-```
+```bash
gs://gwas_catalog_top_hits/credible_sets/
gs://gwas_catalog_top_hits/study_index/
gs://gwas_catalog_top_hits/study_locus_ld_clumped/
@@ -218,7 +247,7 @@ The step that performs [PICS finemapping](https://opentargets.github.io/gentropy

Bucket `gs://gwas_catalog_sumstats_pics` contains:

-```
+```bash
gs://gwas_catalog_sumstats_pics/credible_sets/
gs://gwas_catalog_sumstats_pics/study_index/
gs://gwas_catalog_sumstats_pics/study_locus_ld_clumped/
@@ -258,7 +287,7 @@ The step that performs [PICS finemapping](https://opentargets.github.io/gentropy

Bucket `gs://gwas_catalog_sumstats_susie` contains:

-```
+```bash
gs://gwas_catalog_sumstats_susie/credible_set_datasets/
gs://gwas_catalog_sumstats_susie/credible_sets_clean/
gs://gwas_catalog_sumstats_susie/finemapping_logs/

images/gentropy/Dockerfile

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
FROM europe-west1-docker.pkg.dev/open-targets-genetics-dev/gentropy-app/gentropy:v2.0.1
RUN echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" \
    | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list \
    && curl https://packages.cloud.google.com/apt/doc/apt-key.gpg \
    | gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg \
    && apt-get update -y && apt-get install google-cloud-cli -y
COPY --chmod=0755 images/gentropy/scripts/harmonise-sumstats.sh harmonise-sumstats.sh
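
A quick local sanity check of the resulting image is sketched below; the `:dev` tag is the one pushed by the `build-gentropy-gcs-image` Make target, and overriding the entrypoint is illustrative:

```bash
# Confirm the Google Cloud SDK (and therefore gsutil) is available in the overwritten image.
docker run --rm --entrypoint gsutil \
  europe-west1-docker.pkg.dev/open-targets-genetics-dev/gentropy-app/ot_gentropy:dev \
  version
```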

images/gentropy/scripts/harmonise-sumstats.sh

Lines changed: 101 additions & 0 deletions
@@ -0,0 +1,101 @@
#!/bin/bash
# Script for running harmonisation and qc steps by the google batch job
# Requirements:
# 1. Gentropy & poetry
# 2. gsutil
# 3. gzip

# set -x

readonly RAW_FILE=$1
readonly HARMONISED_FILE=$2
readonly QC_FILE=$3
readonly QC_THRESHOLD=$4
export HYDRA_FULL_ERROR=1

logging() {
    log_prompt="[$(date "+%Y.%m.%d %H:%M")]"
    echo "${log_prompt} $@" | tee -a ${LOCAL_LOG_FILE}
}

# NOTE: Harmonised path contains ${output_path}/harmonised_sumstats/${study_id}
HARMONISATION_DIR=$(dirname $HARMONISED_FILE)
OUTPUT_PATH=$(dirname $HARMONISATION_DIR)
STUDY_ID=$(basename $HARMONISED_FILE)
LOCAL_LOG_FILE="harmonisation.log"
LOCAL_SUMMARY_FILE=harmonisation.csv
RAW_LOCAL_FILE=$(basename $RAW_FILE)
UNZIPPED_RAW_LOCAL_FILE="${RAW_LOCAL_FILE%.*}"

# Make sure we start with a clean setup
if [ -f ${LOCAL_SUMMARY_FILE} ]; then
    rm -rf ${LOCAL_SUMMARY_FILE}
fi
echo "study,harmonisationExitCode,qcExitCode,rawSumstatFile,rawSumstatFileSize,rawUnzippedSumstatFileSize" >$LOCAL_SUMMARY_FILE

if [ -f ${LOCAL_LOG_FILE} ]; then
    rm -rf ${LOCAL_LOG_FILE}
fi

logging "Copying raw summary statistics from ${RAW_FILE} to ${RAW_LOCAL_FILE}"
gsutil cp $RAW_FILE $RAW_LOCAL_FILE

RAW_FILE_SIZE=$(du -sh ${RAW_LOCAL_FILE} | cut -f1)
logging "Raw file size ${RAW_FILE_SIZE}"

logging "Unzipping ${RAW_LOCAL_FILE} to ${UNZIPPED_RAW_LOCAL_FILE}"
gzip -d $RAW_LOCAL_FILE

UNZIPPED_FILE_SIZE=$(du -sh ${UNZIPPED_RAW_LOCAL_FILE} | cut -f1)
logging "Unzipped file size ${UNZIPPED_FILE_SIZE}"

logging "Running harmonisation on ${UNZIPPED_RAW_LOCAL_FILE} file"
poetry run gentropy step=gwas_catalog_sumstat_preprocess \
    step.raw_sumstats_path=$UNZIPPED_RAW_LOCAL_FILE \
    step.out_sumstats_path=$HARMONISED_FILE \
    step.session.write_mode=overwrite \
    +step.session.extended_spark_conf="{spark.jars:https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-hadoop3-latest.jar}" \
    +step.session.extended_spark_conf="{spark.dynamicAllocation.enabled:false}" \
    +step.session.extended_spark_conf="{spark.driver.memory:16g}" \
    +step.session.extended_spark_conf="{spark.kryoserializer.buffer.max:500m}" \
    +step.session.extended_spark_conf="{spark.driver.maxResultSize:5g}" >>${LOCAL_LOG_FILE} 2>&1
# NOTE: can not use tee to redirect, otherwise the exit code will always be 0
HARMONISATION_EXIT_CODE=$?
logging "Harmonisation exit code: ${HARMONISATION_EXIT_CODE}"

logging "Running qc on ${HARMONISED_FILE} file"
poetry run gentropy step=summary_statistics_qc \
    step.gwas_path=$HARMONISED_FILE \
    step.output_path=$QC_FILE \
    step.pval_threshold=$QC_THRESHOLD \
    step.session.write_mode=overwrite \
    +step.session.extended_spark_conf="{spark.jars:https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-hadoop3-latest.jar}" \
    +step.session.extended_spark_conf="{spark.dynamicAllocation.enabled:false}" \
    +step.session.extended_spark_conf="{spark.driver.memory:16g}" \
    +step.session.extended_spark_conf="{spark.kryoserializer.buffer.max:500m}" \
    +step.session.extended_spark_conf="{spark.driver.maxResultSize:5g}" >>${LOCAL_LOG_FILE} 2>&1
QC_EXIT_CODE=$?
logging "QC exit code: ${QC_EXIT_CODE}"

echo "$STUDY_ID,$HARMONISATION_EXIT_CODE,$QC_EXIT_CODE,$RAW_FILE,$RAW_FILE_SIZE,$UNZIPPED_FILE_SIZE" >>$LOCAL_SUMMARY_FILE

clean_up() {
    # ensure the logs from the job and the summary of harmonisation & qc are preserved (latest are overwritten, dated copies are kept)
    DATE=$(date "+%Y%m%d%H%M")
    REMOTE_LOG_FILE="${OUTPUT_PATH}/harmonisation_summary/${STUDY_ID}/${DATE}/harmonisation.log"
    LATEST_REMOTE_LOG_FILE="${OUTPUT_PATH}/harmonisation_summary/${STUDY_ID}/latest/harmonisation.log"
    REMOTE_SUMMARY_FILE="${OUTPUT_PATH}/harmonisation_summary/${STUDY_ID}/${DATE}/harmonisation.csv"
    LATEST_REMOTE_SUMMARY_FILE="${OUTPUT_PATH}/harmonisation_summary/${STUDY_ID}/latest/harmonisation.csv"

    gsutil cp ${LOCAL_LOG_FILE} ${REMOTE_LOG_FILE}
    gsutil cp ${LOCAL_LOG_FILE} ${LATEST_REMOTE_LOG_FILE}

    gsutil cp ${LOCAL_SUMMARY_FILE} ${REMOTE_SUMMARY_FILE}
    gsutil cp ${LOCAL_SUMMARY_FILE} ${LATEST_REMOTE_SUMMARY_FILE}
}

trap clean_up EXIT

# exit with a non-zero exit code first, otherwise 0
exit $HARMONISATION_EXIT_CODE
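
An illustrative invocation of the script, reusing the study paths from the manifest example earlier in this diff; the p-value threshold is a hypothetical value:

```bash
# Arguments: raw sumstats, harmonised output prefix, QC output prefix, QC p-value threshold.
bash harmonise-sumstats.sh \
  gs://gwas_catalog_inputs/raw_summary_statistics/GCST000001-GCST001000/GCST000028/harmonised/17463246-GCST000028-EFO_0001360.h.tsv.gz \
  gs://gwas_catalog_inputs/harmonised_summary_statistics/GCST000028/ \
  gs://gwas_catalog_inputs/summary_statistics_qc/GCST000028/ \
  1e-8
```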

poetry.lock

Lines changed: 2 additions & 3 deletions
Some generated files are not rendered by default.

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -29,6 +29,7 @@ pendulum = "^3.0.0"
apache-airflow-providers-apache-beam = "^5.7.1"
requests = "^2.32.3"
pyhocon = "^0.3.61"
+pandas = "2.1.4"

[tool.poetry.group.dev.dependencies]
ruff = "^0.4.9"
