Skip to content

Commit

Permalink
feat(dataproc): ability to version gentropy for dataproc cluster (#774)
Browse files Browse the repository at this point in the history
  • Loading branch information
project-defiant authored Sep 23, 2024
1 parent b93842a commit a29222e
Show file tree
Hide file tree
Showing 5 changed files with 48 additions and 22 deletions.
16 changes: 16 additions & 0 deletions .github/workflows/artifact.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ env:
REGION: europe-west1
GAR_LOCATION: europe-west1-docker.pkg.dev/open-targets-genetics-dev
REPOSITORY: gentropy-app
PYTHON_VERSION_DEFAULT: "3.10.8"

jobs:
build-push-artifact:
Expand Down Expand Up @@ -67,3 +68,18 @@ jobs:
tags: "${{ env.GAR_LOCATION }}/${{ env.REPOSITORY }}/custom_ensembl_vep:${{ github.ref_name }}"
context: .
file: "src/vep/Dockerfile"

- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: ${{ env.PYTHON_VERSION_DEFAULT }}
- name: Install and configure Poetry
uses: snok/install-poetry@v1
with:
virtualenvs-create: true
virtualenvs-in-project: true
installer-parallel: true

- name: Build and push spark cluster dependencies
run: |
make build
32 changes: 15 additions & 17 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
# Build/deploy settings. Variables assigned with `?=` keep any value already
# supplied by the environment or on the make command line.
# `$$(...)` reaches the shell as `$(...)`, so those command substitutions are
# performed by the shell that consumes the value (a recipe line, or the
# $(shell ...) call below), not by make itself.
# NOTE(review): this excerpt looks like a diff view in which superseded
# definitions (VERSION_NO, CLEAN_VERSION_NO, BUCKET_COMPOSER_DAGS and the first
# APP_NAME / BUCKET_NAME) sit next to their replacements — confirm which ones
# are really present in the file.
PROJECT_ID ?= open-targets-genetics-dev
REGION ?= europe-west1
APP_NAME ?= $$(cat pyproject.toml| grep -m 1 "name" | cut -d" " -f3 | sed 's/"//g')
VERSION_NO ?= $$(poetry version --short)
CLEAN_VERSION_NO := $(shell echo "$(VERSION_NO)" | tr -cd '[:alnum:]')
BUCKET_NAME=gs://genetics_etl_python_playground/initialisation/${VERSION_NO}/
BUCKET_COMPOSER_DAGS=gs://europe-west1-ot-workflows-fe147745-bucket/dags/
# APP_NAME: third space-separated field of the first line in pyproject.toml
# containing "name", with double quotes stripped.
APP_NAME ?= $$(cat pyproject.toml | grep -m 1 "name" | cut -d" " -f3 | sed 's/"//g')
# REF: abbreviated name of the currently checked-out git ref.
REF ?= $$(git rev-parse --abbrev-ref HEAD)
# PACKAGE_VERSION: output of `poetry version --short`.
PACKAGE_VERSION ?= $$(poetry version --short)
# CLEAN_PACKAGE_VERSION: PACKAGE_VERSION with every non-alphanumeric character
# deleted; computed once when the makefile is parsed (`:=` + $(shell ...)).
CLEAN_PACKAGE_VERSION := $(shell echo "$(PACKAGE_VERSION)" | tr -cd '[:alnum:]')
# BUCKET_NAME: GCS prefix for uploaded artefacts, namespaced by APP_NAME and REF
# (no trailing slash; users append "/" or "/<file>").
BUCKET_NAME=gs://genetics_etl_python_playground/initialisation/${APP_NAME}/${REF}

# Auto-declare phony targets: sed scans every makefile in $(MAKEFILE_LIST); on
# each empty line it loads the following line and, if that line begins a rule
# header (first character is not a space, '.' or '#', and no space occurs before
# the ':'), prints it with everything from the first ':' onwards removed.
# Consequence: only targets whose header directly follows an empty line and
# whose name contains no spaces are picked up.
.PHONY: $(shell sed -n -e '/^$$/ { n ; /^[^ .\#][^ ]*:/ { s/:.*$$// ; p ; } ; }' $(MAKEFILE_LIST))

Expand Down Expand Up @@ -38,35 +38,33 @@ build-documentation: ## Create local server with documentation
create-dev-cluster: build ## Spin up a simple dataproc cluster with all dependencies for development purposes
# Runs `build` first, then selects ${PROJECT_ID} in gcloud and creates a cluster
# named "ot-genetics-dev-<CLEAN_PACKAGE_VERSION>-<USER>" in ${REGION}.
# The initialization action script is read from $(BUCKET_NAME), and the PACKAGE
# metadata entry carries the GCS path of the built wheel.
# NOTE(review): --max-idle presumably tears the cluster down after the given
# idle period — confirm against the gcloud documentation.
# NOTE(review): this excerpt looks like a diff view; the lines that reference
# CLEAN_VERSION_NO / VERSION_NO and --max-idle=30m appear to be the superseded
# versions of the lines directly after them — confirm before editing.
@echo "Creating Dataproc Dev Cluster"
@gcloud config set project ${PROJECT_ID}
@gcloud dataproc clusters create "ot-genetics-dev-${CLEAN_VERSION_NO}-$(USER)" \
@gcloud dataproc clusters create "ot-genetics-dev-${CLEAN_PACKAGE_VERSION}-$(USER)" \
--image-version 2.1 \
--region ${REGION} \
--master-machine-type n1-standard-16 \
--initialization-actions=gs://genetics_etl_python_playground/initialisation/${VERSION_NO}/install_dependencies_on_cluster.sh \
--metadata="PACKAGE=gs://genetics_etl_python_playground/initialisation/${VERSION_NO}/gentropy-${VERSION_NO}-py3-none-any.whl,CONFIGTAR=gs://genetics_etl_python_playground/initialisation/${VERSION_NO}/config.tar.gz" \
--initialization-actions=$(BUCKET_NAME)/install_dependencies_on_cluster.sh \
--metadata="PACKAGE=$(BUCKET_NAME)/${APP_NAME}-${PACKAGE_VERSION}-py3-none-any.whl" \
--secondary-worker-type spot \
--worker-machine-type n1-standard-4 \
--worker-boot-disk-size 500 \
--autoscaling-policy="projects/${PROJECT_ID}/regions/${REGION}/autoscalingPolicies/otg-etl" \
--optional-components=JUPYTER \
--enable-component-gateway \
--max-idle=30m
--max-idle=60m

update-dev-cluster: build ## Reinstalls the package on the dev-cluster
# Rebuilds and re-uploads the package (via `build`), then submits a Dataproc Pig
# job whose script runs two `sh` commands: make the staged
# install_dependencies_on_cluster.sh executable, then execute it.
# NOTE(review): --jars presumably stages the script in the job's working
# directory ($PWD) — confirm against the gcloud documentation.
# The header must be the bare target name: a leading "make " would declare an
# extra target called `make` and hide this one from the .PHONY auto-detection.
# The cluster name carries the same "-$(USER)" suffix that create-dev-cluster
# uses, so the job is sent to the cluster that target created.
	@echo "Updating Dataproc Dev Cluster"
	@gcloud config set project ${PROJECT_ID}
	gcloud dataproc jobs submit pig --cluster="ot-genetics-dev-${CLEAN_PACKAGE_VERSION}-$(USER)" \
		--region ${REGION} \
		--jars=${BUCKET_NAME}/install_dependencies_on_cluster.sh \
		-e='sh chmod 750 $${PWD}/install_dependencies_on_cluster.sh; sh $${PWD}/install_dependencies_on_cluster.sh'

build: clean ## Build Python package with dependencies
# Runs `clean` first, selects ${PROJECT_ID} in gcloud, builds the package with
# `poetry build`, then copies three files to ${BUCKET_NAME}/ with gsutil:
# the CLI entry script, the built wheel and install_dependencies_on_cluster.sh.
# NOTE(review): this excerpt looks like a diff view; the lines that reference
# VERSION_NO, config.tar.gz, "Uploading to Dataproc" and a bare ${BUCKET_NAME}
# destination appear to be the superseded versions of the lines that follow
# them — confirm before editing.
@gcloud config set project ${PROJECT_ID}
@echo "Packaging Code and Dependencies for ${APP_NAME}-${VERSION_NO}"
@echo "Packaging Code and Dependencies for ${APP_NAME}-${PACKAGE_VERSION}"
@poetry build
@tar -czf dist/config.tar.gz config/
@echo "Uploading to Dataproc"
@gsutil cp src/gentropy/cli.py ${BUCKET_NAME}
@gsutil cp ./dist/${APP_NAME}-${VERSION_NO}-py3-none-any.whl ${BUCKET_NAME}
@gsutil cp ./dist/config.tar.gz ${BUCKET_NAME}
@gsutil cp ./utils/install_dependencies_on_cluster.sh ${BUCKET_NAME}
@echo "Uploading to ${BUCKET_NAME}"
@gsutil cp src/${APP_NAME}/cli.py ${BUCKET_NAME}/
@gsutil cp ./dist/${APP_NAME}-${PACKAGE_VERSION}-py3-none-any.whl ${BUCKET_NAME}/
@gsutil cp ./utils/install_dependencies_on_cluster.sh ${BUCKET_NAME}/
16 changes: 16 additions & 0 deletions docs/development/troubleshooting.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,19 @@ Some functions on MacOS may throw a java error:
This can be resolved by adding the following line to your `~/.zshrc`:

`export OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES`

## Creating development dataproc cluster (OT users only)

To start a Dataproc cluster in development mode, run:

```
make create-dev-cluster
```

The command above will prepare 3 different resources:

- gentropy package
- cli script
- cluster setup script

Based on the branch ref (for example `dev`), it will then create a namespaced folder in GCS (`gs://genetics_etl_python_playground/initialisation/gentropy/dev`) containing the three files described above. These files are then used to create the cluster environment.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ google = "^3.0.0"
omegaconf = "^2.3.0"
typing-extensions = "^4.9.0"
scikit-learn = "^1.3.2"
pandas = {extras = ["gcp", "parquet"], version = "^2.2.2"}
pandas = { extras = ["gcp", "parquet"], version = "^2.2.2" }
skops = ">=0.9,<0.11"
google-cloud-secret-manager = "^2.20.0"

Expand Down
4 changes: 0 additions & 4 deletions utils/install_dependencies_on_cluster.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
set -exo pipefail

readonly PACKAGE=$(/usr/share/google/get_metadata_value attributes/PACKAGE || true)
readonly CONFIGTAR=$(/usr/share/google/get_metadata_value attributes/CONFIGTAR || true)

function err() {
echo "[$(date +'%Y-%m-%dT%H:%M:%S%z')]: $*" >&2
Expand Down Expand Up @@ -63,9 +62,6 @@ function main() {
echo "Install package..."
run_with_retry pip install --upgrade ${PACKAGENAME}

echo "Downloading and uncompressing config..."
gsutil cp ${CONFIGTAR} . || err "Failed to download CONFIGTAR"
tar -xvf $(basename ${CONFIGTAR}) || err "Failed to extract CONFIGTAR"
}

main

0 comments on commit a29222e

Please sign in to comment.