Skip to content

Commit

Permalink
update beaker config
Browse files Browse the repository at this point in the history
  • Loading branch information
davidheineman committed Dec 1, 2024
1 parent af836a0 commit 2a3cdb7
Show file tree
Hide file tree
Showing 6 changed files with 38 additions and 46 deletions.
16 changes: 11 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,16 @@ git clone https://github.com/davidheineman/acl-search
pip install -r requirements.txt
python src/server.py # (this will download a pre-built index!)

# (getting pip errors?)
# getting pip errors? (install sentencepiece deps)
sudo apt-get update
sudo apt-get install -y pkg-config libsentencepiece-dev

# running on CUDA? (fix broken package path)
INSTALL_PATH=PATH_TO_YOUR_PYTHON_INSTALL # e.g., /root/ai2/miniconda3/envs/acl_search/lib/python3.10
cp ./src/extras/segmented_maxsim.cpp $INSTALL_PATH/site-packages/colbert/modeling/segmented_maxsim.cpp
cp ./src/extras/decompress_residuals.cpp $INSTALL_PATH/site-packages/colbert/search/decompress_residuals.cpp
cp ./src/extras/filter_pids.cpp $INSTALL_PATH/site-packages/colbert/search/filter_pids.cpp
cp ./src/extras/segmented_lookup.cpp $INSTALL_PATH/site-packages/colbert/search/segmented_lookup.cpp
```

## More Features
Expand Down Expand Up @@ -71,15 +78,14 @@ fly launch

**Update Index on HF**
```sh
# For a full pipeline to update an index, see: src/scrape/beaker/index.sh
# Download a fresh set of papers, index and push to hf:
chmod +x src/scrape/beaker/index.sh
./src/scrape/beaker/index.sh

# Build and deploy container for auto-updating:
docker build -t acl-search -f src/scrape/beaker/Dockerfile .
docker run -it -e HF_TOKEN=$HF_TOKEN acl-search # (Optional) test it out!

# Run on slurm
sbatch src/scrape/slurm.sh

# Run on beaker
beaker image delete davidh/acl-search
beaker image create --name acl-search acl-search
Expand Down
2 changes: 1 addition & 1 deletion src/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def main():
with open(DATASET_PATH, 'r', encoding='utf-8') as f:
dataset = json.loads(f.read())

# dataset = dataset[:5000] # 5K in 48s/iter on 2 A40s (67K in 2hr)
# dataset = dataset[:100] # 5K in 48s/iter on 2 A40s (67K in 2hr)

# Get the abstracts + titles for indexing
collection = [e.get('title', '') + '\n\n' + e.get('abstract', '') for e in dataset]
Expand Down
14 changes: 9 additions & 5 deletions src/scrape/beaker/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
FROM ghcr.io/allenai/pytorch:2.4.0-cuda12.1-python3.11
FROM ghcr.io/allenai/cuda:12.1-cudnn8-dev-ubuntu20.04-v1.2.118

ENV CUDA_HOME=/opt/conda
# Set environment variables for CUDA
ENV CUDA_HOME=/usr/local/cuda
ENV PATH=/usr/local/cuda/bin:$PATH
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH

WORKDIR /app

# Install sentencepiece deps
# Install sentencepiece deps (and git lfs)
RUN apt-get update && apt-get install -y \
git-lfs \
pkg-config \
libsentencepiece-dev \
&& apt-get clean
Expand All @@ -19,7 +23,7 @@ COPY ./src/extras/decompress_residuals.cpp /opt/conda/lib/python3.11/site-packag
COPY ./src/extras/filter_pids.cpp /opt/conda/lib/python3.11/site-packages/colbert/search/filter_pids.cpp
COPY ./src/extras/segmented_lookup.cpp /opt/conda/lib/python3.11/site-packages/colbert/search/segmented_lookup.cpp

# Copy repo
# Copy repo, openreview creds
COPY . .
COPY .openreview .

Expand All @@ -32,7 +36,7 @@ CMD ["./src/scrape/beaker/index.sh"]

# docker build -t acl-search -f src/scrape/beaker/Dockerfile .
# docker run -it acl-search
# docker run -it -e HF_TOKEN=$HF_TOKEN acl-search
# docker run -it -e HF_TOKEN=$HF_TOKEN acl-search
# docker run -it --gpus '"device=0"' -e HF_TOKEN=$HF_TOKEN acl-search
# # docker run --rm acl-search
# beaker image delete davidh/acl-search
Expand Down
10 changes: 5 additions & 5 deletions src/scrape/beaker/beaker-conf.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,15 @@ tasks:
result:
path: /output
resources:
gpuCount: 1
gpuCount: 4
context:
priority: low
preemptible: true
constraints:
cluster:
- ai2/general-cirrascale # no weka
- ai2/pluto-cirrascale # no weka
- ai2/jupiter-cirrascale-2 # h100
# - ai2/general-cirrascale # no weka
# - ai2/pluto-cirrascale # no weka
# - ai2/jupiter-cirrascale-2 # h100 # <- doesn't work with ColBERT's PyTorch 1
# - ai2/allennlp-elara-cirrascale # h100 # <- doesn't work with ColBERT's PyTorch 1
- ai2/saturn-cirrascale # a100
- ai2/allennlp-elara-cirrascale # h100
- ai2/neptune-cirrascale # a100
12 changes: 12 additions & 0 deletions src/scrape/beaker/index.sh
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,8 +1,14 @@
#!/bin/bash

echo "Git LFS version:"
git lfs version

echo "Logged in as:"
huggingface-cli whoami

git config --global user.email "dheineman3@gatech.edu"
git config --global user.name "davidheineman"

# clone hf repository
git clone https://huggingface.co/davidheineman/colbert-acl
mv colbert-acl ../hf
Expand All @@ -21,9 +27,15 @@ mkdir -p ../hf
cp -r index ../hf
cp data/papers.json ../hf/papers.json

# Fix hf token spacing
export HF_TOKEN=$(echo "$HF_TOKEN" | tr -d '\n' | tr -d ' ')

# push changes
cd ../hf
git remote set-url origin https://davidheineman:$HF_TOKEN@huggingface.co/davidheineman/colbert-acl
git add .
git status
git commit -m "update index"
git push

Expand Down
30 changes: 0 additions & 30 deletions src/scrape/slurm.sh

This file was deleted.

0 comments on commit 2a3cdb7

Please sign in to comment.