# Appears to be captured from the GitHub page for
# "train-model: Sentiment Analysis @ EC2 Spot #45".

# Manually triggered workflow that trains the sentiment-analysis model on a
# temporary self-hosted EC2 Spot runner. Three jobs run in sequence:
#   1. start-runner  - starts the Spot instance and publishes its runner label
#   2. training-loop - runs the training worker once per matrix combination
#   3. stop-runner   - stops the instance, even if an earlier job failed
name: "train-model: Sentiment Analysis @ EC2 Spot"

on:
  workflow_dispatch:

jobs:
  # Starts the EC2 Spot instance and exposes the runner label and instance id
  # as job outputs so the following jobs can target and later stop it.
  start-runner:
    name: "Start self-hosted EC2 Spot Instance: inf1.2xlarge + 30Gb"
    runs-on: ubuntu-latest
    outputs:
      label: ${{ steps.start-ec2-runner.outputs.label }}
      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: eu-west-2
      # NOTE(review): presumably queues this run behind earlier runs of the
      # same workflow - confirm against the action's documentation.
      - uses: ahmadnassri/action-workflow-queue@v1
      - name: Start EC2 runner
        id: start-ec2-runner
        uses: digital-defiance/ec2-github-runner@v2
        with:
          mode: start
          # Installs and starts Docker on the instance; the training job
          # below runs inside a container.
          pre-runner-script: |
            sudo yum update -y && \
            sudo yum install docker git libicu -y
            sudo systemctl enable docker
            sudo systemctl start docker
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          market-type: spot
          ec2-image-id: ami-0ac78de7f262baaf2
          ec2-instance-type: inf1.2xlarge
          subnet-id: subnet-07512e20d23632607
          security-group-id: sg-098d5c6cb021e55a4
      # Debug aid, disabled: keeps the job alive so the AMI can be cloned.
      # - name: Wait so that I can clone the AMI
      #   run: sleep 100000

  # Runs the training worker once for every combination in the matrix.
  training-loop:
    name: Train model (c, h, b, w, bias, att)
    # Required to start the main job only when the runner is ready.
    needs: start-runner
    # Run the job on the newly created runner.
    runs-on: ${{ needs.start-runner.outputs.label }}
    strategy:
      # Let the remaining combinations run when one of them fails.
      fail-fast: false
      # One combination at a time.
      max-parallel: 1
      matrix:
        MODEL_COORDINATES: [36]
        # Note that coordinates must be divisible by these.
        MODEL_NUMBER_OF_HEADS: [3, 6]
        MODEL_NUMBER_OF_BLOCKS: [3, 6]
        MODEL_WORDS: [70]
        MODEL_TOKENS: [50263]
        MODEL_BIAS: [false]
        MODEL_ATTENTION: ["metric", "scaled_dot_product"]
        TRAIN_BATCH_SIZE: [64]
        TRAIN_EPOCHS: [25]
        TRAIN_WARMUP_STEPS: [2]
        TRAIN_L1_REGULARIZATION: [0]
        TRAIN_L2_REGULARIZATION: [0]
    container:
      image: ghcr.io/digital-defiance/llm-voice-chat:main
      env:
        # Hyper-parameters of the current matrix combination.
        MODEL_COORDINATES: ${{ matrix.MODEL_COORDINATES }}
        MODEL_NUMBER_OF_BLOCKS: ${{ matrix.MODEL_NUMBER_OF_BLOCKS }}
        MODEL_NUMBER_OF_HEADS: ${{ matrix.MODEL_NUMBER_OF_HEADS }}
        MODEL_WORDS: ${{ matrix.MODEL_WORDS }}
        MODEL_TOKENS: ${{ matrix.MODEL_TOKENS }}
        MODEL_BIAS: ${{ matrix.MODEL_BIAS }}
        MODEL_ATTENTION: ${{ matrix.MODEL_ATTENTION }}
        TRAIN_BATCH_SIZE: ${{ matrix.TRAIN_BATCH_SIZE }}
        TRAIN_EPOCHS: ${{ matrix.TRAIN_EPOCHS }}
        TRAIN_L1_REGULARIZATION: ${{ matrix.TRAIN_L1_REGULARIZATION }}
        TRAIN_L2_REGULARIZATION: ${{ matrix.TRAIN_L2_REGULARIZATION }}
        TRAIN_WARMUP_STEPS: ${{ matrix.TRAIN_WARMUP_STEPS }}
        # Environment values are strings; quoted to keep the type explicit.
        MLFLOW_EXPERIMENT_ID: "6"
        MLFLOW_TRACKING_URI: ${{ secrets.MLFLOW_TRACKING_URI }}
        MLFLOW_TRACKING_USERNAME: ${{ secrets.MLFLOW_TRACKING_USERNAME }}
        MLFLOW_TRACKING_PASSWORD: ${{ secrets.MLFLOW_TRACKING_PASSWORD }}
        AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
        AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
        AWS_REGION: eu-west-2
        GIT_PYTHON_REFRESH: quiet
    steps:
      - uses: actions/checkout@v3
      - name: Perform Training Loop
        run: python -m train.worker_sentiment_analysis

  # Stops the Spot instance started by start-runner.
  stop-runner:
    name: Stop self-hosted EC2 Spot instance
    needs:
      - start-runner  # required to get output from the start-runner job
      - training-loop  # required to wait until the main job is done
    runs-on: ubuntu-latest
    # Required to stop the runner even if an error happened in previous jobs.
    if: ${{ always() }}
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: eu-west-2
      - name: Stop EC2 runner
        uses: digital-defiance/ec2-github-runner@v2
        with:
          mode: stop
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          label: ${{ needs.start-runner.outputs.label }}
          ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}