# train-model: Sentiment Analysis @ EC2 Spot #55
# NOTE: the web viewer flagged this file as containing bidirectional Unicode text, which may be
# interpreted or compiled differently than it appears. Open the file in an editor that reveals
# hidden Unicode characters to review it.
# Workflow header: display name and trigger.
# The workflow only runs when started manually (workflow_dispatch); no push or
# schedule triggers are defined.
name: "train-model: Sentiment Analysis @ EC2 Spot"

on:
  workflow_dispatch:
# Three jobs chained through `needs`:
#   start-runner  -> launches an EC2 spot instance as a self-hosted runner
#   training-loop -> runs the training matrix on that runner, inside a container
#   stop-runner   -> tears the runner down, even when earlier jobs failed
# NOTE(review): nesting was reconstructed from a copy whose indentation had been
# lost; confirm the placement of `env` in training-loop (job-level assumed here).
jobs:
  start-runner:
    name: "Start self-hosted EC2 Spot Instance: g4ad.xlarge + 30Gb"
    runs-on: ubuntu-latest
    # Exposed so the downstream jobs can target and later stop this runner.
    outputs:
      label: ${{ steps.start-ec2-runner.outputs.label }}
      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v1
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: eu-west-2

      # Presumably serializes concurrent runs of this workflow before an
      # instance is requested -- confirm against the action's documentation.
      - uses: ahmadnassri/action-workflow-queue@v1

      - name: Start EC2 runner
        id: start-ec2-runner
        uses: digital-defiance/ec2-github-runner@v2
        with:
          mode: start
          # Runs on the instance before the runner starts: installs Docker
          # (needed for the container job), git and libicu, then starts Docker.
          pre-runner-script: |
            sudo yum update -y && \
            sudo yum install docker git libicu -y
            sudo systemctl enable docker
            sudo systemctl start docker
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          market-type: spot
          ec2-image-id: ami-04ae88736992322ef
          ec2-instance-type: g4ad.xlarge
          subnet-id: subnet-07512e20d23632607
          security-group-id: sg-098d5c6cb021e55a4

      # - name: Wait so that I can clone the AMI
      #   run: sleep 100000

  training-loop:
    name: Train model (c, h, b, w, bias, att)
    needs: start-runner # required to start the main job when the runner is ready
    runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner
    strategy:
      # Let the remaining combinations run when one fails, and run them one at
      # a time (max-parallel: 1).
      fail-fast: false
      max-parallel: 1
      matrix:
        MODEL_COORDINATES: [36]
        MODEL_NUMBER_OF_HEADS: [3, 6] # note that coordinates must be divisible by these
        MODEL_NUMBER_OF_BLOCKS: [3, 6]
        MODEL_WORDS: [70]
        MODEL_TOKENS: [50263]
        MODEL_BIAS: [false]
        MODEL_ATTENTION: ["metric", "scaled_dot_product"]
        TRAIN_BATCH_SIZE: [64]
        TRAIN_NUMBER_OF_EPOCHS: [10]
        TRAIN_WARMUP_STEPS: [5]
        TRAIN_L1_REGULARIZATION: [0]
        TRAIN_L2_REGULARIZATION: [0]
    container:
      image: rocm/pytorch:latest
      # Folded scalar: the lines below are joined with single spaces into one
      # option string. /dev/kfd and /dev/dri pass the host GPU devices through.
      options: >-
        --cap-add=SYS_PTRACE
        --security-opt seccomp=unconfined
        --device=/dev/kfd
        --device=/dev/dri
        --group-add video
        --ipc=host
        --shm-size 8G
    env:
      # Hyper-parameters for the current matrix combination.
      MODEL_COORDINATES: ${{ matrix.MODEL_COORDINATES }}
      MODEL_NUMBER_OF_BLOCKS: ${{ matrix.MODEL_NUMBER_OF_BLOCKS }}
      MODEL_NUMBER_OF_HEADS: ${{ matrix.MODEL_NUMBER_OF_HEADS }}
      MODEL_WORDS: ${{ matrix.MODEL_WORDS }}
      MODEL_TOKENS: ${{ matrix.MODEL_TOKENS }}
      MODEL_BIAS: ${{ matrix.MODEL_BIAS }}
      MODEL_ATTENTION: ${{ matrix.MODEL_ATTENTION }}
      TRAIN_BATCH_SIZE: ${{ matrix.TRAIN_BATCH_SIZE }}
      TRAIN_NUMBER_OF_EPOCHS: ${{ matrix.TRAIN_NUMBER_OF_EPOCHS }}
      TRAIN_L1_REGULARIZATION: ${{ matrix.TRAIN_L1_REGULARIZATION }}
      TRAIN_L2_REGULARIZATION: ${{ matrix.TRAIN_L2_REGULARIZATION }}
      TRAIN_WARMUP_STEPS: ${{ matrix.TRAIN_WARMUP_STEPS }}
      # MLflow tracking configuration; environment values are strings, so the
      # experiment id is quoted explicitly.
      MLFLOW_EXPERIMENT_ID: "6"
      MLFLOW_TRACKING_URI: ${{ secrets.MLFLOW_TRACKING_URI }}
      MLFLOW_TRACKING_USERNAME: ${{ secrets.MLFLOW_TRACKING_USERNAME }}
      MLFLOW_TRACKING_PASSWORD: ${{ secrets.MLFLOW_TRACKING_PASSWORD }}
      # AWS credentials made available to the training process.
      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
      AWS_REGION: eu-west-2
      GIT_PYTHON_REFRESH: quiet
    steps:
      - name: Install dependencies
        # Folded scalar: rendered as a single `pip install ...` command line.
        run: >-
          pip install
          boto3==1.34.8
          psutil==5.9.7
          mlflow==2.9.2
          pydantic==2.5.3
          pydantic-settings==2.1.0
          tqdm==4.66.1
          tiktoken==0.5.2
          pynvml
      - uses: actions/checkout@v3
      - name: Perform Training Loop
        run: python -m train.worker_sentiment_analysis

  stop-runner:
    name: Stop self-hosted EC2 Spot instance
    needs:
      - start-runner # required to get output from the start-runner job
      - training-loop # required to wait when the main job is done
    runs-on: ubuntu-latest
    if: ${{ always() }} # required to stop the runner even if the error happened in the previous jobs
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v1
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: eu-west-2
      - name: Stop EC2 runner
        uses: digital-defiance/ec2-github-runner@v2
        with:
          mode: stop
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          label: ${{ needs.start-runner.outputs.label }}
          ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}