
Commit de886c2

Merge branch 'dev' into spark-image
2 parents fdcc26a + d3eabd8 commit de886c2

21 files changed: +897 −56 lines

.github/workflows/artifact.yaml

Lines changed: 55 additions & 0 deletions
@@ -0,0 +1,55 @@
---
name: Build and Push to Artifact Registry

"on":
  push:
    branches: ["*"]
    tags: ["*"]
    paths:
      - images/**

env:
  PROJECT_ID: open-targets-genetics-dev
  REGION: europe-west1
  GAR_LOCATION: europe-west1-docker.pkg.dev/open-targets-genetics-dev
  REPOSITORY: gentropy-app
  PYTHON_VERSION_DEFAULT: "3.10.8"

jobs:
  build-push-artifact:
    runs-on: ubuntu-latest
    steps:
      - name: "Checkout"
        uses: "actions/checkout@v3"

      - name: "auth"
        uses: "google-github-actions/auth@v2"
        with:
          credentials_json: "${{ secrets.GC_SERVICE_ACCOUNT_KEY }}"

      - name: "Set up Cloud SDK"
        uses: "google-github-actions/setup-gcloud@v2"

      - name: "Use gcloud CLI"
        run: "gcloud info"

      - name: Set up QEMU
        uses: docker/setup-qemu-action@v3

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: "Docker auth"
        run: |-
          gcloud auth configure-docker ${{ env.REGION }}-docker.pkg.dev --quiet

      - name: Quick Docker build (gentropy image overloaded with orchestration logic, AMD64 only, with layer cache)
        uses: docker/build-push-action@v6
        with:
          platforms: linux/amd64
          push: true
          tags: "${{ env.GAR_LOCATION }}/${{ env.REPOSITORY }}/ot_gentropy:${{ github.ref_name }}"
          context: .
          cache-from: type=gha
          cache-to: type=gha,mode=max
          file: images/gentropy/Dockerfile
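
Images are tagged with `${{ github.ref_name }}`, so a branch or tag push that touches `images/**` maps directly to an image tag in Artifact Registry. A minimal sketch of the effect, assuming a hypothetical tag name:

```bash
# Hypothetical example: pushing a tag "v2.0.2" whose push includes changes under images/**
# should result in the workflow publishing:
#   europe-west1-docker.pkg.dev/open-targets-genetics-dev/gentropy-app/ot_gentropy:v2.0.2
git tag v2.0.2
git push origin v2.0.2
```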

.github/workflows/push_config.yaml

Lines changed: 0 additions & 17 deletions
This file was deleted.

Makefile

Lines changed: 15 additions & 0 deletions
@@ -55,3 +55,18 @@ upload-gwas-catalog-buckets-readme: ## Upload gwas_catalog readme to the bucket(
	@gsutil rsync docs/datasources/gwas_catalog_data gs://gwas_catalog_top_hits/docs

update-bucket-docs: upload-eqtl-catalogue-bucket-readme upload-ukb-ppp-bucket-readme upload-finngen-bucket-readme upload-gwas-catalog-buckets-readme ## Upload readmes to the datasource buckets
+
+
+build-gentropy-gcs-image: ## build image that overwrites gentropy with tools specific for orchestration and google cloud
+	@docker buildx build \
+		--platform=linux/amd64,linux/arm64 \
+		-t europe-west1-docker.pkg.dev/open-targets-genetics-dev/gentropy-app/ot_gentropy:dev \
+		--push \
+		-f images/gentropy/Dockerfile \
+		--no-cache .
+
+setup-harmonisation-test: ## Prepare the test bucket with raw summary statistics for the harmonisation test.
+	@gsutil rm gs://ot_orchestration/test/gwas_catalog_inputs/harmonisation_manifest.csv
+	@gsutil -m rm -r gs://ot_orchestration/test/gwas_catalog_inputs/harmonisation_summary
+	@gsutil -m rm -r gs://ot_orchestration/test/gwas_catalog_inputs/harmonised_summary_statistics
+	@gsutil -m rm -r gs://ot_orchestration/test/gwas_catalog_inputs/summary_statistics_qc
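
Both new targets are run from the repository root; a short usage sketch (the first target pushes the hard-coded `:dev` tag shown above):

```bash
# Build and push the multi-arch overwritten gentropy image, then reset the
# test bucket before exercising the harmonisation DAG.
make build-gentropy-gcs-image
make setup-harmonisation-test
```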

docs/README.md

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@

This catalog describes how the orchestration works in the current state

-### How to generate dag svg files
+## How to generate dag svg files

1. Locate your global `airflow.cfg` file and update the [core] dag_folder in `airflow.cfg` to point to the `src` directory of the orchestration repository or set the `AIRFLOW__CORE__DAGS_FOLDER` environment variable.
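
For reference, a minimal sketch of the two options from step 1 (the checkout path is illustrative):

```bash
# Option A: environment variable (path is illustrative)
export AIRFLOW__CORE__DAGS_FOLDER="$HOME/ot_orchestration/src"

# Option B: edit airflow.cfg instead
#   [core]
#   dags_folder = /home/<user>/ot_orchestration/src
```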

docs/datasources/gwas_catalog_data/README.md

Lines changed: 37 additions & 8 deletions
@@ -13,7 +13,7 @@ Data stored under 4 buckets:

Bucket `gs://gwas_catalog_inputs` contains:

-```
+```bash
gs://gwas_catalog_inputs/gwas_catalog_associations_ontology_annotated.tsv
gs://gwas_catalog_inputs/gwas_catalog_download_ancestries.tsv
gs://gwas_catalog_inputs/gwas_catalog_download_studies.tsv
@@ -61,7 +61,7 @@ as failing.
<details>
<summary>Expand to see the example of manifest file</summary>

-```
+```bash
rawSumstatPath,study,harmonisedSumstatPath,isHarmonised,qcPath,qcPerformed
gs://gwas_catalog_inputs/raw_summary_statistics/GCST000001-GCST001000/GCST000028/harmonised/17463246-GCST000028-EFO_0001360.h.tsv.gz,GCST000028,gs://gwas_catalog_inputs/harmonised_summary_statistics/GCST000028/,True,gs://gwas_catalog_inputs/summary_statistics_qc/GCST000028/,True
```
@@ -77,7 +77,7 @@ This is the dataset containing meta information about the status of finemapping.

The files are stored under the per study directory in the form like below:

-```
+```bash
gs://gwas_catalog_inputs/harmonisation_summary/GCST90077749/202410141529/harmonisation.csv
gs://gwas_catalog_inputs/harmonisation_summary/GCST90077749/202410141529/harmonisation.log
gs://gwas_catalog_inputs/harmonisation_summary/GCST90077749/latest/harmonisation.csv
@@ -102,7 +102,7 @@ The file reports following metrics:
<details>
<summary>Expand to see the example</summary>

-```
+```bash
study,harmonisationExitCode,qcExitCode,rawSumstatFile,rawSumstatFileSize,rawUnzippedSumstatFileSize
GCST90077749,0,1,gs://gwas_catalog_inputs/raw_summary_statistics/GCST90077001-GCST90078000/GCST90077749/harmonised/34662886-GCST90077749-EFO_1001919.h.tsv.gz,18M,62M
```
@@ -116,7 +116,7 @@ This file contains logs from the harmonisation script collected during it's exec
<details>
<summary>Expand to see the example</summary>

-```
+```bash
[2024.10.14 15:33] Copying raw summary statistics from gs://gwas_catalog_inputs/raw_summary_statistics/GCST90078001-GCST90079000/GCST90079000/harmonised/GCST90079000.h.tsv.gz to GCST90079000.h.tsv.gz
[2024.10.14 15:34] Raw file size 17M
[2024.10.14 15:34] Unzipping GCST90079000.h.tsv.gz to GCST90079000.h.tsv
@@ -179,11 +179,40 @@ datasets: {}

This directory contains various analysis performed on harmonisation results.

+## Gwas catalog harmonisation & qc dag
+
+The `gwas_catalog_harmonisation` dag performs the harmonisation and quality checks on the raw summary statistics. The dag configuration and topology can be found in the `gwas_catalog_harmonisation.yaml` file under the config directory. Since this task is computationally expensive, it is run in parallel by the google batch operators. The dag contains 2 steps:
+
+1. Harmonisation done by [gwas_catalog_sumstat_preprocess](https://opentargets.github.io/gentropy/python_api/steps/gwas_catalog_sumstat_preprocess/)
+2. Quality Control of the harmonised summary statistics done by [sumstat_qc_step](https://opentargets.github.io/gentropy/python_api/steps/summary_statistics_qc/)
+
+To run the dag, one needs to prepare the input files and the overwritten gentropy docker image.
+
+### Gentropy overwritten docker image
+
+The image in `/images/gentropy/Dockerfile` is based on the [gentropy image](https://github.com/opentargets/gentropy/blob/dev/Dockerfile). Additional packages are added to make it compatible with the Open Targets infrastructure in Google Cloud; these include:
+
+- google cloud sdk (with gsutil)
+- bash script to run the gentropy harmonisation pipeline
+
+> [!WARNING]
+> Before running the harmonisation pipeline (`gwas_catalog_harmonisation` dag) it is necessary to update the base docker container to reflect the changes in the `gentropy` image. This is done by running the `make build-gentropy-gcs-image` command in the root of the repository.
+
+## Gentropy image
+
+The image in this directory is based on the [gentropy image](https://github.com/opentargets/gentropy/blob/dev/Dockerfile). Additional packages are added to make it compatible with the Open Targets Platform; these include:
+
+- google cloud sdk (with gsutil)
+- bash script to run the gentropy harmonisation pipeline
+
+> [!WARNING]
+> Before running the harmonisation pipeline (`gwas_catalog_harmonisation` dag) it is necessary to update the base docker container to reflect the changes in the `gentropy` image. This is done by running the `make build-gentropy-gcs-image` command in the root of the repository.
+
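
A minimal sketch of the flow described in the warning above, assuming the repository root as the working directory; the Airflow trigger command is illustrative:

```bash
# Rebuild and push the overwritten gentropy image so the Batch jobs pick up
# the latest gentropy base, then trigger the harmonisation DAG.
make build-gentropy-gcs-image
airflow dags trigger gwas_catalog_harmonisation
```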
## GWAS Catalog top hits

Bucket `gs://gwas_catalog_top_hits` contains:

-```
+```bash
gs://gwas_catalog_top_hits/credible_sets/
gs://gwas_catalog_top_hits/study_index/
gs://gwas_catalog_top_hits/study_locus_ld_clumped/
@@ -218,7 +247,7 @@ The step that performs [PICS finemapping](https://opentargets.github.io/gentropy

Bucket `gs://gwas_catalog_sumstats_pics` contains:

-```
+```bash
gs://gwas_catalog_sumstats_pics/credible_sets/
gs://gwas_catalog_sumstats_pics/study_index/
gs://gwas_catalog_sumstats_pics/study_locus_ld_clumped/
@@ -258,7 +287,7 @@ The step that performs [PICS finemapping](https://opentargets.github.io/gentropy

Bucket `gs://gwas_catalog_sumstats_susie` contains:

-```
+```bash
gs://gwas_catalog_sumstats_susie/credible_set_datasets/
gs://gwas_catalog_sumstats_susie/credible_sets_clean/
gs://gwas_catalog_sumstats_susie/finemapping_logs/

images/gentropy/Dockerfile

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
FROM europe-west1-docker.pkg.dev/open-targets-genetics-dev/gentropy-app/gentropy:v2.0.1
RUN echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" \
    | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list \
    && curl https://packages.cloud.google.com/apt/doc/apt-key.gpg \
    | gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg \
    && apt-get update -y && apt-get install google-cloud-cli -y
COPY --chmod=0755 images/gentropy/scripts/harmonise-sumstats.sh harmonise-sumstats.sh
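
A quick local sanity check of the resulting image is sketched below; the `:dev` tag is the one pushed by the `build-gentropy-gcs-image` Make target, and overriding the entrypoint is illustrative:

```bash
# Confirm the Google Cloud SDK (and therefore gsutil) is available in the overwritten image.
docker run --rm --entrypoint gsutil \
  europe-west1-docker.pkg.dev/open-targets-genetics-dev/gentropy-app/ot_gentropy:dev \
  version
```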

images/gentropy/scripts/harmonise-sumstats.sh

Lines changed: 101 additions & 0 deletions
@@ -0,0 +1,101 @@
#!/bin/bash
# Script for running harmonisation and qc steps by the google batch job
# Requirements:
# 1. Gentropy & poetry
# 2. gsutil
# 3. gzip

# set -x

readonly RAW_FILE=$1
readonly HARMONISED_FILE=$2
readonly QC_FILE=$3
readonly QC_THRESHOLD=$4
export HYDRA_FULL_ERROR=1

logging() {
    log_prompt="[$(date "+%Y.%m.%d %H:%M")]"
    echo "${log_prompt} $@" | tee -a ${LOCAL_LOG_FILE}
}

# NOTE: Harmonised path contains ${output_path}/harmonised_sumstats/${study_id}
HARMONISATION_DIR=$(dirname $HARMONISED_FILE)
OUTPUT_PATH=$(dirname $HARMONISATION_DIR)
STUDY_ID=$(basename $HARMONISED_FILE)
LOCAL_LOG_FILE="harmonisation.log"
LOCAL_SUMMARY_FILE=harmonisation.csv
RAW_LOCAL_FILE=$(basename $RAW_FILE)
UNZIPPED_RAW_LOCAL_FILE="${RAW_LOCAL_FILE%.*}"

# Make sure we start with a clean setup
if [ -f ${LOCAL_SUMMARY_FILE} ]; then
    rm -rf ${LOCAL_SUMMARY_FILE}
fi
echo "study,harmonisationExitCode,qcExitCode,rawSumstatFile,rawSumstatFileSize,rawUnzippedSumstatFileSize" >$LOCAL_SUMMARY_FILE

if [ -f ${LOCAL_LOG_FILE} ]; then
    rm -rf ${LOCAL_LOG_FILE}
fi

logging "Copying raw summary statistics from ${RAW_FILE} to ${RAW_LOCAL_FILE}"
gsutil cp $RAW_FILE $RAW_LOCAL_FILE

RAW_FILE_SIZE=$(du -sh ${RAW_LOCAL_FILE} | cut -f1)
logging "Raw file size ${RAW_FILE_SIZE}"

logging "Unzipping ${RAW_LOCAL_FILE} to ${UNZIPPED_RAW_LOCAL_FILE}"
gzip -d $RAW_LOCAL_FILE

UNZIPPED_FILE_SIZE=$(du -sh ${UNZIPPED_RAW_LOCAL_FILE} | cut -f1)
logging "Unzipped file size ${UNZIPPED_FILE_SIZE}"

logging "Running harmonisation on ${UNZIPPED_RAW_LOCAL_FILE} file"
poetry run gentropy step=gwas_catalog_sumstat_preprocess \
    step.raw_sumstats_path=$UNZIPPED_RAW_LOCAL_FILE \
    step.out_sumstats_path=$HARMONISED_FILE \
    step.session.write_mode=overwrite \
    +step.session.extended_spark_conf="{spark.jars:https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-hadoop3-latest.jar}" \
    +step.session.extended_spark_conf="{spark.dynamicAllocation.enabled:false}" \
    +step.session.extended_spark_conf="{spark.driver.memory:16g}" \
    +step.session.extended_spark_conf="{spark.kryoserializer.buffer.max:500m}" \
    +step.session.extended_spark_conf="{spark.driver.maxResultSize:5g}" >>${LOCAL_LOG_FILE} 2>&1
# NOTE: can not use tee to redirect, otherwise the exit code will always be 0
HARMONISATION_EXIT_CODE=$?
logging "Harmonisation exit code: ${HARMONISATION_EXIT_CODE}"

logging "Running qc on ${HARMONISED_FILE} file"
poetry run gentropy step=summary_statistics_qc \
    step.gwas_path=$HARMONISED_FILE \
    step.output_path=$QC_FILE \
    step.pval_threshold=$QC_THRESHOLD \
    step.session.write_mode=overwrite \
    +step.session.extended_spark_conf="{spark.jars:https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-hadoop3-latest.jar}" \
    +step.session.extended_spark_conf="{spark.dynamicAllocation.enabled:false}" \
    +step.session.extended_spark_conf="{spark.driver.memory:16g}" \
    +step.session.extended_spark_conf="{spark.kryoserializer.buffer.max:500m}" \
    +step.session.extended_spark_conf="{spark.driver.maxResultSize:5g}" >>${LOCAL_LOG_FILE} 2>&1
QC_EXIT_CODE=$?
logging "QC exit code: ${QC_EXIT_CODE}"

echo "$STUDY_ID,$HARMONISATION_EXIT_CODE,$QC_EXIT_CODE,$RAW_FILE,$RAW_FILE_SIZE,$UNZIPPED_FILE_SIZE" >>$LOCAL_SUMMARY_FILE

clean_up() {
    # ensure the logs from the job and the summary of harmonisation & qc are preserved (latest are overwritten, dated copies are kept)
    DATE=$(date "+%Y%m%d%H%M")
    REMOTE_LOG_FILE="${OUTPUT_PATH}/harmonisation_summary/${STUDY_ID}/${DATE}/harmonisation.log"
    LATEST_REMOTE_LOG_FILE="${OUTPUT_PATH}/harmonisation_summary/${STUDY_ID}/latest/harmonisation.log"
    REMOTE_SUMMARY_FILE="${OUTPUT_PATH}/harmonisation_summary/${STUDY_ID}/${DATE}/harmonisation.csv"
    LATEST_REMOTE_SUMMARY_FILE="${OUTPUT_PATH}/harmonisation_summary/${STUDY_ID}/latest/harmonisation.csv"

    gsutil cp ${LOCAL_LOG_FILE} ${REMOTE_LOG_FILE}
    gsutil cp ${LOCAL_LOG_FILE} ${LATEST_REMOTE_LOG_FILE}

    gsutil cp ${LOCAL_SUMMARY_FILE} ${REMOTE_SUMMARY_FILE}
    gsutil cp ${LOCAL_SUMMARY_FILE} ${LATEST_REMOTE_SUMMARY_FILE}
}

trap clean_up EXIT

# exit with a non-zero exit code first, otherwise 0
exit $HARMONISATION_EXIT_CODE
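
An illustrative invocation of the script, reusing the study paths from the manifest example earlier in this diff; the p-value threshold is a hypothetical value:

```bash
# Arguments: raw sumstats, harmonised output prefix, QC output prefix, QC p-value threshold.
bash harmonise-sumstats.sh \
  gs://gwas_catalog_inputs/raw_summary_statistics/GCST000001-GCST001000/GCST000028/harmonised/17463246-GCST000028-EFO_0001360.h.tsv.gz \
  gs://gwas_catalog_inputs/harmonised_summary_statistics/GCST000028/ \
  gs://gwas_catalog_inputs/summary_statistics_qc/GCST000028/ \
  1e-8
```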

poetry.lock

Lines changed: 2 additions & 3 deletions
Some generated files are not rendered by default.

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -29,6 +29,7 @@ pendulum = "^3.0.0"
apache-airflow-providers-apache-beam = "^5.7.1"
requests = "^2.32.3"
pyhocon = "^0.3.61"
+pandas = "2.1.4"

[tool.poetry.group.dev.dependencies]
ruff = "^0.4.9"
