# Skip to content

# add tensor parallel embedding test #1

# add tensor parallel embedding test

# add tensor parallel embedding test #1

# Workflow display name, as shown in the Actions UI.
name: nvidia-rtx-3090 tests

# Triggers: manual dispatch, or invocation as a reusable workflow from
# another workflow. Neither trigger declares any inputs.
on:
  workflow_dispatch:
  workflow_call:

# Runs sharing the same group key supersede one another: a newer run cancels
# the one still in progress. The key is built from the git ref plus the pull
# request number, falling back to the run number when no pull request is
# attached to the triggering event.
concurrency:
  group: unit_tests-${{ github.ref }}-${{ github.event.pull_request.number || github.run_number }}
  cancel-in-progress: true
jobs:
  # Runs the ViT test under MPI with 2 ranks, once per value of `ginter`.
  # G_inter and G_data are exported for the test process; G_data is derived
  # so that G_inter * G_data == 2 (the number of ranks launched).
  inter-layer:
    runs-on: [nvidia]
    strategy:
      matrix:
        ginter: [1, 2]
    steps:
      - uses: actions/checkout@v3
      - name: Install AxoNN
        run: |
          pip install -e .
      - name: Download dataset
        run: |
          python -c "import torchvision; torchvision.datasets.MNIST(root=\"./axonn/tests\", download=True, train=True)"
      - name: Train
        # The echo previously interpolated `matrix.memopt`, which this matrix
        # does not define (it always expanded to an empty string); it now
        # reports the exported G_data value instead of recomputing it.
        run: |
          export G_inter=${{ matrix.ginter }}
          export G_data=$(( 2 / G_inter ))
          echo "training with G_inter = ${G_inter}, G_data = ${G_data}"
          mpirun -mca orte_allowed_exit_without_sync 1 -n 2 pytest --with-mpi ./axonn/tests/test_vit.py
      - name: Uninstall AxoNN
        # Clean up even when an earlier step failed or the run was cancelled.
        # NOTE(review): assumes the `nvidia` label is a persistent
        # (self-hosted) runner where the editable install would otherwise
        # linger between runs - confirm.
        if: always()
        run: |
          pip uninstall --yes axonn

  # Runs the intra-layer (tensor parallel) unit tests under MPI with 2 ranks:
  # fully-connected, convolution, and embedding test files in turn.
  intra-layer:
    runs-on: [nvidia]
    steps:
      - uses: actions/checkout@v3
      - name: Install AxoNN
        run: |
          pip install -e .
      - name: Run intra-layer FC unit tests
        run: |
          mpirun -mca orte_allowed_exit_without_sync 1 -n 2 pytest --with-mpi ./axonn/tests/test_intra_layer_fc.py
      - name: Run intra-layer Conv unit tests
        run: |
          mpirun -mca orte_allowed_exit_without_sync 1 -n 2 pytest --with-mpi ./axonn/tests/test_intra_layer_conv.py
      - name: Run intra-layer Embedding unit tests
        # This step was missing its `run:` key, which made the step (and the
        # workflow file) invalid. The two pytest selections (`-k bw_pass`,
        # `-k fw_pass`) are kept as separate mpirun launches, as written.
        run: |
          mpirun -mca orte_allowed_exit_without_sync 1 -n 2 pytest --with-mpi ./axonn/tests/test_intra_layer_emb.py -k bw_pass
          mpirun -mca orte_allowed_exit_without_sync 1 -n 2 pytest --with-mpi ./axonn/tests/test_intra_layer_emb.py -k fw_pass
      - name: Uninstall AxoNN
        # Clean up even when an earlier step failed or the run was cancelled.
        # NOTE(review): assumes the `nvidia` label is a persistent
        # (self-hosted) runner where the editable install would otherwise
        # linger between runs - confirm.
        if: always()
        run: |
          pip uninstall --yes axonn