From 260db9ec75c2b1c03b5ae9ec263ff541d20b90e0 Mon Sep 17 00:00:00 2001 From: Franz Srambical <79149449+emergenz@users.noreply.github.com> Date: Mon, 28 Oct 2024 17:16:55 +0100 Subject: [PATCH 1/3] fix: use extension-module for pyo3 (#1) works! --- environment/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment/Cargo.toml b/environment/Cargo.toml index a09719f..b161e8d 100644 --- a/environment/Cargo.toml +++ b/environment/Cargo.toml @@ -25,4 +25,4 @@ smallvec = "1.10.0" [dependencies.pyo3] version = "0.20.2" -features = ["abi3-py38"] +features = ["extension-module"] From abbb4bdcdb1176bcc929565f380c4b6915023cb6 Mon Sep 17 00:00:00 2001 From: Franz Srambical Date: Wed, 30 Oct 2024 12:55:26 +0100 Subject: [PATCH 2/3] feat: add launchers + docs for distributed setup --- README.md | 20 ++++++++++++++--- launch/run_bootstrap_distributed.sh | 9 ++++++++ launch/start_redis.sh | 33 +++++++++++++++++++++++++++++ launch/start_worker.sh | 7 ++++++ redis.def | 2 ++ 5 files changed, 68 insertions(+), 3 deletions(-) create mode 100644 launch/run_bootstrap_distributed.sh create mode 100644 launch/start_redis.sh create mode 100644 launch/start_worker.sh create mode 100644 redis.def diff --git a/README.md b/README.md index eff340f..219d00d 100644 --- a/README.md +++ b/README.md @@ -83,10 +83,24 @@ The entry point for the conjecture-prove loop is in [learning/bootstrap.py](boot [learning] $ python bootstrap.py theory=groups ``` -We use hydra for configuration -- the relevant file here is [config/bootstrap.yaml](config/bootstrap.yaml). This will run the loop in "sequential" mode, in a single process. There is a distributed mode, backed by a [https://docs.celeryq.dev/en/stable/](Celery queue), that you can use to leverage multiple CPUs/GPUs, either in the same or different machines (it doesn't matter, as long as they can connect to the queue). 
The setup is a bit manual: you must first spin up a Redis server, then run Celery worker processes backed by the Redis server, and finally run bootstrap.py with a DISTRIBUTED=1 environment variable: +We use hydra for configuration -- the relevant file here is [config/bootstrap.yaml](config/bootstrap.yaml). This will run the loop in "sequential" mode, in a single process. There is a distributed mode, backed by a [Celery queue](https://docs.celeryq.dev/en/stable/), that you can use to leverage multiple CPUs/GPUs, either in the same or different machines (it doesn't matter, as long as they can connect to the queue). -```sh -[learning] $ DISTRIBUTED=1 python bootstrap.py theory=groups +The setup is a bit manual: +1. Build the Redis container +``` +apptainer build redis.sif redis.def +``` +2. Start the Redis container +``` +bash launch/start_redis.sh +``` +3. Run a Celery worker process +``` +bash launch/start_worker.sh +``` +4. Run bootstrap.py in distributed mode +``` +bash launch/run_bootstrap_distributed.sh ``` Feel free to open an issue if you're interested in setting this up, and I can expand on the documentation. The details might get a little bit cluster-specific, though the general setup is just that you need (a) a Redis server, (b) a number of worker processes that connect to it, and (c) a teacher process that runs the bootstrapping loop, also connecting to the same Redis server. 
diff --git a/launch/run_bootstrap_distributed.sh b/launch/run_bootstrap_distributed.sh
new file mode 100644
index 0000000..c2c582c
--- /dev/null
+++ b/launch/run_bootstrap_distributed.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+# Run the bootstrapping loop in distributed mode, exporting the endpoint
+# written by launch/start_redis.sh as REDIS. Must be run from the repository
+# root.
+#
+# Usage: launch/run_bootstrap_distributed.sh [THEORY]   (default: groups)
+set -eu
+
+# Read the Redis endpoint (hostname:port); abort if it is missing or empty
+REDIS=$(cat redis_hostname_port.txt)
+: "${REDIS:?redis_hostname_port.txt is empty}"
+export REDIS
+export DISTRIBUTED=1
+
+# Run the bootstrap script
+cd learning/
+python bootstrap.py "theory=${1:-groups}"
diff --git a/launch/start_redis.sh b/launch/start_redis.sh
new file mode 100644
index 0000000..9e8f76c
--- /dev/null
+++ b/launch/start_redis.sh
@@ -0,0 +1,62 @@
+#!/bin/bash
+# Start a Redis server from the Apptainer image redis.sif on a free port and
+# publish its endpoint in redis_hostname_port.txt for the other launchers.
+# Must be run from the repository root. Kept free of bash-only syntax so it
+# also works when invoked as `sh launch/start_redis.sh`.
+#
+# Usage: launch/start_redis.sh [-v|--verbose]
+
+LOGLEVEL=""
+while [ "$#" -gt 0 ]; do
+  case "$1" in
+    -v|--verbose)
+      LOGLEVEL="--loglevel verbose"
+      shift
+      ;;
+    *)
+      # Reject anything else: without a default arm nothing is shifted and
+      # the loop spins forever on an unknown argument.
+      echo "Usage: $0 [-v|--verbose]" >&2
+      exit 2
+      ;;
+  esac
+done
+
+# Get a random available port or use a specific one assigned by your cluster
+if ! REDIS_PORT=$(python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1]); s.close()'); then
+  echo "Could not determine a free port for Redis" >&2
+  exit 1
+fi
+export REDIS_PORT
+echo "Starting Redis on port $REDIS_PORT"
+
+# Cleanup function to remove port file and kill Redis when the script exits
+cleanup() {
+  status=$?
+  trap - EXIT INT TERM # cleanup ends with exit; do not run it twice
+  rm -f redis_hostname_port.txt
+  if [ -n "${REDIS_PID:-}" ]; then
+    kill "$REDIS_PID" 2>/dev/null
+  fi
+  # Fallback in case the server outlives the process started above
+  pkill -f "redis-server --port $REDIS_PORT"
+  exit "$status"
+}
+
+# Set up trap to catch script termination. EXIT is included so the endpoint
+# file is also removed when Redis stops on its own or fails to start.
+trap cleanup EXIT INT TERM
+
+# Save the port to a file for other processes to read
+echo "$(hostname):$REDIS_PORT" > redis_hostname_port.txt
+
+# NOTE(review): Redis listens on all interfaces with protected mode off and no
+# password, so only run this on a trusted network.
+# Start Redis container in the background and wait for it: the shell defers
+# traps while a foreground command runs, whereas `wait` returns as soon as a
+# trapped signal arrives. $LOGLEVEL is left unquoted so it splits into words.
+# FIXME(f.srambical): `--save ""` is a quickfix and leads to redis not trying to persist data
+# We should instead fix data persistence in the redis container
+apptainer run --env "REDIS_PORT=$REDIS_PORT" redis.sif redis-server --port "$REDIS_PORT" --protected-mode no --bind 0.0.0.0 $LOGLEVEL &
+REDIS_PID=$!
+wait "$REDIS_PID"
diff --git a/launch/start_worker.sh b/launch/start_worker.sh
new file mode 100644
index 0000000..296ffc8
--- /dev/null
+++ b/launch/start_worker.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+# Start one Celery worker, exporting the endpoint written by
+# launch/start_redis.sh as REDIS. Must be run from the repository root.
+set -eu
+
+# Read the Redis endpoint (hostname:port); abort if it is missing or empty
+REDIS=$(cat redis_hostname_port.txt)
+: "${REDIS:?redis_hostname_port.txt is empty}"
+export REDIS
+
+# Start a Celery worker
+cd learning/
+celery -A worker worker --concurrency=1 -n "$REDIS"
diff --git a/redis.def b/redis.def
new file mode 100644
index 0000000..0319c56
--- /dev/null
+++ b/redis.def
@@ -0,0 +1,2 @@
+Bootstrap: docker
+From: redis:7.2
\ No newline at end of file
From cee7561393ac5e85626dbded607570f5c6cf5b44 Mon Sep 17 00:00:00 2001
From: Franz Srambical
Date: Wed, 30 Oct 2024 12:59:01 +0100
Subject: [PATCH 3/3] fix: remove comment

---
 launch/start_redis.sh | 2 --
 1 file changed, 2 deletions(-)

diff --git a/launch/start_redis.sh b/launch/start_redis.sh
index 9e8f76c..ab5f997 100644
--- a/launch/start_redis.sh
+++ b/launch/start_redis.sh
@@ -55,8 +55,6 @@
 # Start Redis container in the background and wait for it: the shell defers
 # traps while a foreground command runs, whereas `wait` returns as soon as a
 # trapped signal arrives. $LOGLEVEL is left unquoted so it splits into words.
-# FIXME(f.srambical): `--save ""` is a quickfix and leads to redis not trying to persist data
-# We should instead fix data persistence in the redis container
 apptainer run --env "REDIS_PORT=$REDIS_PORT" redis.sif redis-server --port "$REDIS_PORT" --protected-mode no --bind 0.0.0.0 $LOGLEVEL &
 REDIS_PID=$!
 wait "$REDIS_PID"