Skip to content

Commit

Permalink
Merge pull request #38 from AGI-Collective/docker
Browse files Browse the repository at this point in the history
added docker server support
  • Loading branch information
Alexis-BX authored Mar 25, 2024
2 parents 19d3f63 + 614a2fb commit 16e129a
Show file tree
Hide file tree
Showing 16 changed files with 406 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,4 @@ scripts/robin_v2/pretrain_multinodes_[0-9]*.sh
scripts/robin_v2/finetune_lora_multinodes_[0-9]*.sh

playground-original/
*.tar.gz
46 changes: 46 additions & 0 deletions scripts/robin_v2/docker/image_robin_env/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Base environment image for Robin: CUDA 11.8 (devel) on Ubuntu 22.04 with
# Python 3.10, pip, and the Robin repository installed in editable mode.
FROM nvidia/cuda:11.8.0-devel-ubuntu22.04

WORKDIR /app

# Kept as ENV (rather than a build-only ARG) because images built FROM this
# one also run apt-get and inherit the non-interactive setting.
ENV DEBIAN_FRONTEND="noninteractive"
ENV HOST_NAME="docker"

# Base tooling. The apt package index is removed in the same layer that
# created it so it does not persist in the image.
RUN apt-get update \
    && apt-get install -y \
        git \
        git-lfs \
        software-properties-common \
        unzip \
        wget \
    && rm -rf /var/lib/apt/lists/*

# Install Python 3.10
RUN add-apt-repository ppa:deadsnakes/ppa \
    && apt-get update \
    && apt-get install -y \
        python3.10 \
        python3.10-dev \
        python3.10-distutils \
    && ln -s /usr/bin/python3.10 /usr/bin/python \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Install Pip (--no-cache-dir keeps pip's download cache out of the layer)
RUN wget https://bootstrap.pypa.io/get-pip.py \
    && python get-pip.py --no-cache-dir \
    && rm get-pip.py \
    && pip install --no-cache-dir --upgrade pip

# Install Robin and apply the setup corrections in the same layer, so the
# removed packages are never stored in the image.
# NOTE(review): Ubuntu's Python may install into
# /usr/local/lib/python3.10/dist-packages rather than site-packages, in which
# case the two `rm -rf` lines below are no-ops -- confirm inside a built image.
RUN git clone https://github.com/AGI-Collective/robin.git \
    && cd robin \
    && git checkout frontier/dev \
    && pip install --no-cache-dir -e ".[train]" \
    && pip uninstall -y bitsandbytes \
    && rm -rf /usr/local/lib/python3.10/site-packages/triton \
    && rm -rf /usr/local/lib/python3.10/site-packages/sklearn

RUN pip install --no-cache-dir flash-attn==2.3.3

# Pin the CUDA 11.8 builds of torch/torchvision from the PyTorch wheel index.
RUN pip install --no-cache-dir torch==2.0.1+cu118 torchvision==0.15.2+cu118 --index-url https://download.pytorch.org/whl/cu118
1 change: 1 addition & 0 deletions scripts/robin_v2/docker/image_robin_env/build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Build the base image and tag it `robin_env`. The build context is the
# current directory, so run this from the folder that holds the Dockerfile.
# The other images in this change use `FROM docker.io/library/robin_env`,
# so this image has to be built before them.
docker build -t robin_env .
21 changes: 21 additions & 0 deletions scripts/robin_v2/docker/image_robin_evals/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Evaluation image: layers the eval datasets and an entrypoint script on top
# of the locally built robin_env base image.
FROM docker.io/library/robin_env

WORKDIR /app

# /app/downloaded_models is where entrypoint.sh clones the models at start-up;
# /export is the path run.sh bind-mounts from the host.
RUN mkdir /app/downloaded_models /export

# Data setup
# ADD (not COPY) is intentional: local tar archives are auto-extracted into
# the destination. Both archives must be present in the build context.
ADD scienceqa.tar.gz /app/playground/data/eval
ADD gqa.tar.gz /app/playground/data/eval

# Disabled alternative: bake the models into the image at build time instead
# of cloning them in entrypoint.sh.
# RUN git lfs install \
# && cd /app/downloaded_models \
# && git clone https://huggingface.co/agi-collective/mistral-7b-oh-siglip-so400m-finetune-lora \
# && git clone https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B


# launch script
COPY entrypoint.sh /entrypoint.sh
RUN chmod +x /entrypoint.sh
# Exec form: arguments given to `docker run <image> ...` are passed to the script.
ENTRYPOINT ["/entrypoint.sh"]

1 change: 1 addition & 0 deletions scripts/robin_v2/docker/image_robin_evals/build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Build the evaluation image and tag it `robin_evals`. The build context is
# the current directory; the Dockerfile there ADDs scienceqa.tar.gz and
# gqa.tar.gz and COPYs entrypoint.sh, so those files must sit next to it.
docker build -t robin_evals .
36 changes: 36 additions & 0 deletions scripts/robin_v2/docker/image_robin_evals/entrypoint.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#!/bin/bash
# Entrypoint of the robin_evals image.
#
# Usage: /entrypoint.sh <model_repo> <base_repo>
#   model_repo  Hugging Face repository id of the model to evaluate
#   base_repo   Hugging Face repository id of its base model
#
# Clones both repositories, switches the Robin checkout to the `eval` branch,
# then runs the ScienceQA and GQA eval scripts, writing one log per benchmark
# under /export/<model name>/.
MODEL=$1
BASE=$2
version=""

if [ -z "$MODEL" ] || [ -z "$BASE" ]; then
    echo "Usage: $0 <model_repo> <base_repo>" >&2
    exit 1
fi

# download models
git lfs install
cd /app/downloaded_models || exit 1
git clone "https://huggingface.co/$MODEL"
git clone "https://huggingface.co/$BASE"

#get updated code
cd /app/robin || exit 1
git pull
git checkout eval

# setup variables
# Strip the "<owner>/" prefix: git clone names the directory after the last path component.
MODEL=${MODEL##*/}
BASE=${BASE##*/}

EXPORT_PATH=/export/$MODEL

MODEL=/app/downloaded_models/$MODEL
BASE=/app/downloaded_models/$BASE

# The log redirections below open their files before the eval scripts start,
# so the per-benchmark directories have to exist beforehand.
mkdir -p "$EXPORT_PATH/scienceqa" "$EXPORT_PATH/gqa"

# launch evals
cd /app/robin/scripts/v1_5/eval || exit 1
echo "Launching SQA"
# $version is intentionally unquoted: while it is empty it must not be passed
# as an (empty) third argument.
./docker_sqa.sh "$MODEL" "$BASE" $version > "$EXPORT_PATH/scienceqa/results.log" 2>&1

echo "Launching GQA"
./docker_gqa.sh "$MODEL" "$BASE" $version > "$EXPORT_PATH/gqa/results.log" 2>&1

echo "Finished evals!"
10 changes: 10 additions & 0 deletions scripts/robin_v2/docker/image_robin_evals/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Start the robin_evals container in the background on one GPU.
# The two trailing arguments are handed to the image's entrypoint:
#   1. model repository to evaluate
#   2. base model repository
# Results are written to /export in the container, i.e. /home/<user>/evals on the host.
#
# Optional flag kept for interactive debugging:
# -it \
docker run \
    --rm \
    --detach \
    --gpus 1 \
    --shm-size 8G \
    --volume "/home/$(whoami)/evals:/export" \
    docker.io/library/robin_evals \
    agi-collective/mistral-7b-oh-siglip-so400m-finetune-lora \
    teknium/OpenHermes-2.5-Mistral-7B
17 changes: 17 additions & 0 deletions scripts/robin_v2/docker/image_robin_finetune/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Finetuning image: downloads the finetuning data at build time and installs
# the launch script on top of the locally built robin_env base image.
FROM docker.io/library/robin_env

WORKDIR /app

RUN mkdir /app/wandb_cache /app/downloaded_models /export

# imagemagick provides `mogrify`, which data_download_parallel.py invokes.
# Installed before the script is copied so that editing the script does not
# invalidate this layer; the apt index is removed in the same layer.
RUN apt-get update \
    && apt-get install -y imagemagick \
    && rm -rf /var/lib/apt/lists/*

# Data setup
# ADD LLaVA-Finetune.tar.gz /app/
COPY data_download_parallel.py /app/
RUN python /app/data_download_parallel.py

# launch script
COPY entrypoint.sh /entrypoint.sh
RUN chmod +x /entrypoint.sh
ENTRYPOINT ["/entrypoint.sh"]

1 change: 1 addition & 0 deletions scripts/robin_v2/docker/image_robin_finetune/build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Build the finetuning image and tag it `robin_finetune`. The build context is
# the current directory; the Dockerfile there COPYs data_download_parallel.py
# and entrypoint.sh, so those files must sit next to it.
docker build -t robin_finetune .
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
import multiprocessing
import concurrent.futures
import os
import json
import urllib.request as ureq

# Root directory every dataset in this script is downloaded into.
# Alternative kept for reference: a relative "images" folder.
# BASE_PATH = "images"
BASE_PATH = "/app/LLaVA-Finetune"

def download_dataset(folder_name, urls):
    """Download one dataset into ``BASE_PATH``.

    Args:
        folder_name: Sub-folder of ``BASE_PATH`` to fill. The special value
            ``"json"`` downloads ``llava_v1_5_mix665k.json`` directly into
            ``BASE_PATH`` and ignores ``urls``.
        urls: A single zip-archive URL or an iterable of them. Each archive is
            downloaded with ``wget``, extracted with ``unzip`` into the
            dataset folder, and then deleted.

    Raises:
        RuntimeError: If ``wget`` or ``unzip`` reports a failure, so that a
            broken download stops the run instead of silently leaving the
            dataset incomplete.
    """
    # Imported locally so this function does not depend on edits to the
    # module-level import block.
    import subprocess

    def _run(command, accepted=(0,)):
        # Argument-list form: no shell is involved, so paths and URLs are
        # passed to the tool verbatim.
        status = subprocess.run(command).returncode
        if status not in accepted:
            raise RuntimeError(
                f"command failed with exit code {status}: {' '.join(command)}"
            )

    if folder_name == "json":
        _run([
            "wget", "-q",
            "https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K/resolve/main/llava_v1_5_mix665k.json",
            "-P", BASE_PATH,
        ])
        return

    if isinstance(urls, str):
        urls = [urls]

    target_dir = f"{BASE_PATH}/{folder_name}"
    os.makedirs(target_dir, exist_ok=True)

    for url in urls:
        name = url.split("/")[-1]
        archive = f"{target_dir}/{name}"
        _run(["wget", "-q", url, "-P", target_dir])
        # unzip exits with 1 when it only emitted warnings but still
        # completed the extraction, so that status is accepted.
        _run(["unzip", "-q", archive, "-d", target_dir], accepted=(0, 1))
        os.remove(archive)

def download_image(k, url, path):
    """Download ``url`` to ``<path>/images/<k><ext>``.

    Args:
        k: Identifier used as the output file's base name.
        url: Location of the image; the text after the last dot of the URL
            (as split by ``os.path.splitext``) becomes the file extension.
        path: Dataset folder; its ``images`` sub-folder must already exist.
    """
    ext = os.path.splitext(url)[1]
    # Built with a single f-string: mixing an f-string with %-formatting
    # raised TypeError whenever `path` itself contained a '%' character.
    output_file = f"{path}/images/{k}{ext}"
    ureq.urlretrieve(url, output_file)

def download_ocr_data():
    """Populate ``BASE_PATH/ocr_vqa`` with the dataset metadata and its images.

    Steps: fetch ``dataset.json``, download every entry's ``imageURL`` into
    the ``images`` sub-folder using a pool of 100 worker processes, then run
    ``mogrify`` to produce JPG versions of the GIF and PNG files.
    """
    folder_name = "ocr_vqa"
    dataset_root = f"{BASE_PATH}/{folder_name}"
    os.makedirs(dataset_root, exist_ok=True)

    ### DOWNLOAD DATASET.JSON
    meta_url = "https://drive.usercontent.google.com/download?id=1r0tyZUwGCc4wIG4RkiglCGNL_nFJjR6Q&export=download&authuser=0&confirm=t&at=APZUnTW8fGOfgvS7p_RjJKw6sXyU:1707402060685"
    metadata_file = f'{dataset_root}/dataset.json'
    ureq.urlretrieve(meta_url, metadata_file)

    with open(metadata_file, 'r') as handle:
        records = json.load(handle)

    images_dir = f'{dataset_root}/images'
    os.makedirs(images_dir, exist_ok=True)

    # One (key, image URL, dataset folder) job per metadata entry; the
    # downloads run in parallel instead of one by one.
    jobs = [(key, entry['imageURL'], dataset_root) for key, entry in records.items()]

    workers = multiprocessing.Pool(100)
    workers.starmap(download_image, jobs)

    # Close the pool and wait for all processes to finish
    workers.close()
    workers.join()

    # Convert the non-JPG downloads, GIF first and then PNG.
    for extension in ("gif", "png"):
        os.system(f"mogrify -format jpg {images_dir}/*.{extension}")

if __name__ == "__main__":
    # (folder_name, urls) pairs handed to download_dataset. "json" is the
    # special entry for the annotation file and needs no URL.
    datasets = [
        ("json", ""),
        # Scheme written out explicitly (http is what wget assumes for a
        # scheme-less URL) so the entry is consistent with the others.
        ("coco", "http://images.cocodataset.org/zips/train2017.zip"),
        ("gqa", "https://downloads.cs.stanford.edu/nlp/data/gqa/images.zip"),
        ("textvqa", "https://dl.fbaipublicfiles.com/textvqa/images/train_val_images.zip"),
        ("vg", ("https://cs.stanford.edu/people/rak248/VG_100K_2/images.zip",
                "https://cs.stanford.edu/people/rak248/VG_100K_2/images2.zip")),
    ]

    os.makedirs(f"{BASE_PATH}", exist_ok=True)

    # Download the datasets in parallel, one pool task per entry.
    pool = multiprocessing.Pool()
    pool.starmap(download_dataset, datasets)

    # Close the pool and wait for all processes to finish
    pool.close()
    pool.join()

    # Download OCR Data (runs after the pool above has finished)
    download_ocr_data()
74 changes: 74 additions & 0 deletions scripts/robin_v2/docker/image_robin_finetune/entrypoint.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
#!/bin/bash
# Entrypoint of the robin_finetune image.
#
# Usage: /entrypoint.sh <model_repo> <vision_repo> [gradient_accumulation_steps]
#   model_repo   Hugging Face repository id of the language model
#   vision_repo  Hugging Face repository id of the vision tower
#   gradient_accumulation_steps  optional, defaults to 1
#
# Clones both repositories, updates the Robin checkout, runs the LoRA
# finetuning with deepspeed and finally moves the result files to
# /export/<model>-<vision>/finetune/. The pretrained projector is read from
# /export/<model>-<vision>/pretrain/mm_projector.bin.
MODEL=$1
VISION=$2
GAS=${3:-1}

if [ -z "$MODEL" ] || [ -z "$VISION" ]; then
    echo "Usage: $0 <model_repo> <vision_repo> [gradient_accumulation_steps]" >&2
    exit 1
fi

# download models
git lfs install
cd /app/downloaded_models || exit 1
git clone "https://huggingface.co/$MODEL"
git clone "https://huggingface.co/$VISION"

#get updated code
# The pull has to run inside the Robin checkout; /app/downloaded_models is
# not a git repository.
cd /app/robin || exit 1
git pull

# setup variables
# Strip the "<owner>/" prefix: git clone names the directory after the last path component.
MODEL=${MODEL##*/}
VISION=${VISION##*/}

CHECKPOINT_PATH=/app/checkpoints/$MODEL-$VISION
EXPORT_PATH=/export/$MODEL-$VISION

MODEL=/app/downloaded_models/$MODEL
VISION=/app/downloaded_models/$VISION

GPU_COUNT=$(nvidia-smi --list-gpus | wc -l)
if [ "$GPU_COUNT" -lt 1 ]; then
    # Without this guard the batch-size computation below divides by zero.
    echo "No GPU visible inside the container (nvidia-smi listed none)." >&2
    exit 1
fi
# Per-device batch size such that GPUs x accumulation steps x batch size is 128
# (integer division).
BATCH_SIZE=$(( 128 / GPU_COUNT / GAS ))

DATA_PATH=/app/LLaVA-Finetune

PRETRAIN=$EXPORT_PATH/pretrain

# launch training
cd /app/robin || exit 1
deepspeed \
    robin/train/train_mem.py \
    --deepspeed ./scripts/zero2.json \
    --model_name_or_path "$MODEL" \
    --version v1 \
    --data_path "$DATA_PATH/llava_v1_5_mix665k.json" \
    --image_folder "$DATA_PATH" \
    --vision_tower "$VISION" \
    --finetune_ve True \
    --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \
    --pretrain_mm_mlp_adapter "$PRETRAIN/mm_projector.bin" \
    --group_by_modality_length True \
    --image_aspect_ratio pad \
    --mm_projector_type mlp2x_gelu \
    --mm_vision_select_layer -2 \
    --mm_use_im_start_end False \
    --mm_use_im_patch_token False \
    --fp16 True \
    --output_dir "$CHECKPOINT_PATH/finetune" \
    --num_train_epochs 1 \
    --per_device_train_batch_size "$BATCH_SIZE" \
    --per_device_eval_batch_size 4 \
    --gradient_accumulation_steps "$GAS" \
    --evaluation_strategy "no" \
    --save_strategy "steps" \
    --save_steps 100 \
    --save_total_limit 2 \
    --learning_rate 2e-5 \
    --weight_decay 0. \
    --warmup_ratio 0.03 \
    --lr_scheduler_type "cosine" \
    --logging_steps 1 \
    --tf32 False \
    --model_max_length 2048 \
    --gradient_checkpointing True \
    --dataloader_num_workers 4 \
    --lazy_preprocess True \
    --report_to wandb \
    --vision_lr 5e-5

# The destination folder is not created anywhere else; without it mv fails
# when several files match. The *.* glob (entries containing a dot) is kept
# unchanged and left unquoted so that it expands.
mkdir -p "$EXPORT_PATH/finetune"
mv "$CHECKPOINT_PATH"/finetune/*.* "$EXPORT_PATH/finetune/"
11 changes: 11 additions & 0 deletions scripts/robin_v2/docker/image_robin_finetune/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Start the robin_finetune container in the background on two GPUs.
# The values after the image name are handed to the image's entrypoint, which
# reads them as: language model repo, vision tower repo, gradient accumulation steps.
IMAGE=docker.io/library/robin_finetune
LLM_REPO=teknium/OpenHermes-2.5-Mistral-7B
VISION_REPO=facebook/metaclip-l14-fullcc2.5b
GRAD_ACCUM_STEPS=4
# Host folder mounted at /export inside the container.
HOST_EXPORT_DIR=/home/$(whoami)/checkpoints

# Optional flag kept for interactive debugging:
# -it \
docker run --rm --detach \
    --gpus 2 --shm-size 8G \
    --volume $HOST_EXPORT_DIR:/export \
    $IMAGE \
    $LLM_REPO $VISION_REPO $GRAD_ACCUM_STEPS
20 changes: 20 additions & 0 deletions scripts/robin_v2/docker/image_robin_pretrain/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Pretraining image: bakes the LLaVA-Pretrain dataset and a launch script on
# top of the locally built robin_env base image.
FROM docker.io/library/robin_env

WORKDIR /app

RUN mkdir /app/wandb_cache /app/downloaded_models /export

# Data setup
# Disabled alternative: ship the dataset as a local archive in the build context.
#ADD LLaVA-Pretrain.tar.gz /app/
# Clone the dataset repository into /app/LLaVA-Pretrain, drop its git
# metadata, unpack images.zip into images/ and delete the archive -- all in
# one RUN, so the removed files are not kept in an earlier layer.
RUN git lfs install \
&& git clone https://huggingface.co/datasets/liuhaotian/LLaVA-Pretrain \
&& cd LLaVA-Pretrain \
&& rm -rf .git* \
&& unzip -q images.zip -d images \
&& rm images.zip

# launch script
# NOTE(review): this image's entrypoint.sh is not part of the visible diff.
COPY entrypoint.sh /entrypoint.sh
RUN chmod +x /entrypoint.sh
ENTRYPOINT ["/entrypoint.sh"]

1 change: 1 addition & 0 deletions scripts/robin_v2/docker/image_robin_pretrain/build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Build the pretraining image and tag it `robin_pretrain`. The build context
# is the current directory; the Dockerfile there COPYs entrypoint.sh, so run
# this from the folder that contains both files.
docker build -t robin_pretrain .
Loading

0 comments on commit 16e129a

Please sign in to comment.