From 24749d4c9bdd70dec2bd56df9f2db00165afe3a1 Mon Sep 17 00:00:00 2001 From: illsilin Date: Mon, 20 Mar 2023 18:06:45 -0700 Subject: [PATCH 01/31] add jenkinsfile --- Jenkinsfile | 270 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 270 insertions(+) create mode 100644 Jenkinsfile diff --git a/Jenkinsfile b/Jenkinsfile new file mode 100644 index 000000000..f20550945 --- /dev/null +++ b/Jenkinsfile @@ -0,0 +1,270 @@ +def rocmnode(name) { + return 'rocmtest && miopen && ' + name +} + +def show_node_info() { + sh """ + echo "NODE_NAME = \$NODE_NAME" + lsb_release -sd + uname -r + ls /opt/ -la + """ +} + +def runShell(String command){ + def responseCode = sh returnStatus: true, script: "${command} > tmp.txt" + def output = readFile(file: "tmp.txt") + echo "tmp.txt contents: $output" + return (output != "") +} + +def getDockerImageName(){ + def img + img = "${env.CK_DOCKERHUB}:ait_rocm${params.ROCMVERSION}" + return img +} + +def getDockerImage(Map conf=[:]){ + env.DOCKER_BUILDKIT=1 + def prefixpath = conf.get("prefixpath", "/opt/rocm") // prefix:/opt/rocm + def no_cache = conf.get("no_cache", false) + def dockerArgs = "--build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${prefixpath} --build-arg ROCMVERSION='${params.ROCMVERSION}' " + echo "Docker Args: ${dockerArgs}" + def image = getDockerImageName() + //Check if image exists + def retimage + try + { + echo "Pulling image: ${image}" + retimage = docker.image("${image}") + retimage.pull() + } + catch(Exception ex) + { + error "Unable to locate image: ${image}" + } + return [retimage, image] +} + +def build_ait(Map conf=[:]){ + + def build_cmd = """ + export ROCM_PATH=/opt/rocm + export ROC_USE_FGS_KERNARG=0 + # clean up and reinstall ait + pip3 uninstall -y aitemplate + cd python + rm -rf dist build + python3 setup.py bdist_wheel + pip3 install dist/*.whl + #install necessary python modules + pip3 install timm + pip3 uninstall -y torch + pip3 install torch --extra-index-url https://download.pytorch.org/whl/rocm5.1.1 + python3 -m pip install transformers click + python3 -c "import torch; print(torch.__version__)" + """ + + def cmd = conf.get("cmd", """ + ${build_cmd} + """) + + cmd += """ + ${execute_cmd} + cd $GITHUB_WORKSPACE/AITemplate/examples/01_resnet-50 + # populate log headers + export GIT_BRANCH=${GITHUB_BASE_REF:-${GITHUB_REF#refs/heads/}} + echo -n "hostname: ">resnet50.log; hostname >> resnet50.log + echo -n "GPU_arch: " >> resnet50.log; rocminfo | grep "Name:" | grep "gfx" >> resnet50.log + rocminfo | grep "Compute Unit:" >> resnet50.log + echo "git_branch: $GIT_BRANCH" >> resnet50.log + git show --summary | grep commit >> resnet50.log + /opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> resnet50.log + HIP_VISIBLE_DEVICES=0,1 python3 benchmark_ait.py 2>&1 | tee -a resnet50.log + """ + echo cmd + sh cmd +} + +def Run_Step(Map conf=[:]){ + show_node_info() + + env.HSA_ENABLE_SDMA=0 + checkout scm + + def image = getDockerImageName() + def prefixpath = conf.get("prefixpath", "/opt/rocm") + + // Jenkins is complaining about the render group + def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined" + if (conf.get("enforce_xnack_on", false)) { + dockerOpts = dockerOpts + " --env HSA_XNACK=1 " + } + def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg ROCMVERSION='${params.ROCMVERSION}' " + + def variant = env.STAGE_NAME + def retimage + + gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'AITemplate') { + try { + (retimage, image) = getDockerImage(conf) + withDockerContainer(image: image, args: dockerOpts) { + timeout(time: 5, unit: 'MINUTES'){ + sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo | tee clinfo.log' + if ( runShell('grep -n "Number of devices:.*. 0" clinfo.log') ){ + throw new Exception ("GPU not found") + } + else{ + echo "GPU is OK" + } + } + } + } + catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException e){ + echo "The job was cancelled or aborted" + throw e + } + + withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') { + timeout(time: 24, unit: 'HOURS') + { + build_ait(conf) + + dir("examples"){ + //sh "./run_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME}" + archiveArtifacts "resnet50.log" + //archiveArtifacts "bert.log" + //archiveArtifacts "vit.log" + //archiveArtifacts "sdiff.log" + // stash perf files to master + stash name: "resnet50.log" + //stash name: "bert.log" + //stash name: "vit.log" + //stash name: "sdiff.log" + //we will process the results on the master node + } + } + } + } + return retimage +} + +def Run_Step_and_Reboot(Map conf=[:]){ + try{ + Run_Step(conf) + } + catch(e){ + echo "throwing error exception while building CK" + echo 'Exception occurred: ' + e.toString() + throw e + } + finally{ + if (!conf.get("no_reboot", false)) { + reboot() + } + } +} + +def process_results(Map conf=[:]){ + env.HSA_ENABLE_SDMA=0 + checkout scm + def image = getDockerImageName() + def prefixpath = "/opt/rocm" + + // Jenkins is complaining about the render group + def dockerOpts="--cap-add=SYS_PTRACE --security-opt seccomp=unconfined" + if (conf.get("enforce_xnack_on", false)) { + dockerOpts = dockerOpts + " --env HSA_XNACK=1 " + } + + def variant = env.STAGE_NAME + def retimage + + gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'AITemplate') { + try { + (retimage, image) = getDockerImage(conf) + } + catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException e){ + echo "The job was cancelled or aborted" + throw e + } + } + + withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') { + timeout(time: 1, unit: 'HOURS'){ + try{ + dir("examples"){ + // unstash perf files to master + unstash "resnet50.log" + //unstash "bert.log" + //unstash "vit.log" + //unstash "sdiff.log" + sh "python3 process_results.py" + } + } + catch(e){ + echo "throwing error exception while processing performance test results" + echo 'Exception occurred: ' + e.toString() + throw e + } + } + } +} + +pipeline { + agent none + triggers { + parameterizedCron(CRON_SETTINGS) + } + options { + parallelsAlwaysFailFast() + } + parameters { + booleanParam( + string( + name: 'ROCMVERSION', + defaultValue: '5.4.3', + description: 'Specify which ROCM version to use: 5.4.3 (default).') + } + environment{ + dbuser = "${dbuser}" + dbpassword = "${dbpassword}" + dbsship = "${dbsship}" + dbsshport = "${dbsshport}" + dbsshuser = "${dbsshuser}" + dbsshpassword = "${dbsshpassword}" + status_wrapper_creds = "${status_wrapper_creds}" + DOCKER_BUILDKIT = "1" + } + stages{ + stage("Build AITemplate") + { + parallel + { + stage("Build AIT and Run Tests") + { + agent{ label rocmnode("gfx908 || gfx90a") } + environment{ + setup_args = """ -DCMAKE_INSTALL_PREFIX=../install """ + execute_args = """ """ + } + steps{ + Run_Step_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local') + } + } + } + } + stage("Process Performance Test Results") + { + parallel + { + stage("Process results"){ + agent { label 'mici' } + steps{ + process_results() + } + } + } + } + } +} \ No newline at end of file From 68438c145a351be39bbc66598a83647838261438 Mon Sep 17 00:00:00 2001 From: illsilin Date: Mon, 20 Mar 2023 21:01:49 -0700 Subject: [PATCH 02/31] put all test commands into a bash script --- Jenkinsfile | 42 ++++++++++++------------------------- examples/run_tests.sh | 48 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+), 29 deletions(-) create mode 100644 examples/run_tests.sh diff --git a/Jenkinsfile b/Jenkinsfile index f20550945..318ce4363 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -51,17 +51,16 @@ def build_ait(Map conf=[:]){ def build_cmd = """ export ROCM_PATH=/opt/rocm export ROC_USE_FGS_KERNARG=0 - # clean up and reinstall ait pip3 uninstall -y aitemplate cd python rm -rf dist build python3 setup.py bdist_wheel pip3 install dist/*.whl - #install necessary python modules pip3 install timm pip3 uninstall -y torch pip3 install torch --extra-index-url https://download.pytorch.org/whl/rocm5.1.1 python3 -m pip install transformers click + python3 -m pip install diffusers==0.11.1 accelerate python3 -c "import torch; print(torch.__version__)" """ @@ -71,16 +70,6 @@ def build_ait(Map conf=[:]){ cmd += """ ${execute_cmd} - cd $GITHUB_WORKSPACE/AITemplate/examples/01_resnet-50 - # populate log headers - export GIT_BRANCH=${GITHUB_BASE_REF:-${GITHUB_REF#refs/heads/}} - echo -n "hostname: ">resnet50.log; hostname >> resnet50.log - echo -n "GPU_arch: " >> resnet50.log; rocminfo | grep "Name:" | grep "gfx" >> resnet50.log - rocminfo | grep "Compute Unit:" >> resnet50.log - echo "git_branch: $GIT_BRANCH" >> resnet50.log - git show --summary | grep commit >> resnet50.log - /opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> resnet50.log - HIP_VISIBLE_DEVICES=0,1 python3 benchmark_ait.py 2>&1 | tee -a resnet50.log """ echo cmd sh cmd @@ -101,7 +90,6 @@ def Run_Step(Map conf=[:]){ dockerOpts = dockerOpts + " --env HSA_XNACK=1 " } def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg ROCMVERSION='${params.ROCMVERSION}' " - def variant = env.STAGE_NAME def retimage @@ -129,18 +117,17 @@ def Run_Step(Map conf=[:]){ timeout(time: 24, unit: 'HOURS') { build_ait(conf) - dir("examples"){ - //sh "./run_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME}" + sh "./run_tests.sh $HF_TOKEN archiveArtifacts "resnet50.log" - //archiveArtifacts "bert.log" - //archiveArtifacts "vit.log" - //archiveArtifacts "sdiff.log" + archiveArtifacts "bert.log" + archiveArtifacts "vit.log" + archiveArtifacts "sdiff.log" // stash perf files to master stash name: "resnet50.log" - //stash name: "bert.log" - //stash name: "vit.log" - //stash name: "sdiff.log" + stash name: "bert.log" + stash name: "vit.log" + stash name: "sdiff.log" //we will process the results on the master node } } @@ -196,9 +183,9 @@ def process_results(Map conf=[:]){ dir("examples"){ // unstash perf files to master unstash "resnet50.log" - //unstash "bert.log" - //unstash "vit.log" - //unstash "sdiff.log" + unstash "bert.log" + unstash "vit.log" + unstash "sdiff.log" sh "python3 process_results.py" } } @@ -234,6 +221,7 @@ pipeline { dbsshuser = "${dbsshuser}" dbsshpassword = "${dbsshpassword}" status_wrapper_creds = "${status_wrapper_creds}" + HF_TOKEN = "${HF_TOKEN}" DOCKER_BUILDKIT = "1" } stages{ @@ -244,12 +232,8 @@ pipeline { stage("Build AIT and Run Tests") { agent{ label rocmnode("gfx908 || gfx90a") } - environment{ - setup_args = """ -DCMAKE_INSTALL_PREFIX=../install """ - execute_args = """ """ - } steps{ - Run_Step_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local') + Run_Step_and_Reboot(no_reboot:true, , prefixpath: '/usr/local') } } } diff --git a/examples/run_tests.sh b/examples/run_tests.sh new file mode 100644 index 000000000..942e74627 --- /dev/null +++ b/examples/run_tests.sh @@ -0,0 +1,48 @@ +#!/bin/bash +# +# this is a script to run tests during ROCM CI +# input argument: +# Hugging Face token + +export HF_TOKEN=$1 + +function print_log_header(){ + rm -f $1; + export GIT_BRANCH=${GITHUB_BASE_REF:-${GITHUB_REF#refs/heads/}} + echo -n "hostname: " >$1; hostname >> $1; + echo -n "GPU_arch: " >> $1; rocminfo | grep "Name:" | grep "gfx" >> $1 + rocminfo | grep "Compute Unit:" >> $1 + echo "git_branch: $GIT_BRANCH" >> $1 + git show --summary | grep commit >> $1 + /opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> $1 +} + +echo "Running RESNET50 tests" +cd 01_resnet-50 +print_log_header resnet50.log +HIP_VISIBLE_DEVICES=0,1 python3 benchmark_ait.py 2>&1 | tee -a resnet50.log + +echo "Running BERT tests" +cd ../03_bert +print_log_header bert.log +for sq in 64 128 384 512 1024 +do + HIP_VISIBLE_DEVICES=0,1 python3 benchmark_ait.py --seq-length $sq 2>&1 | tee -a bert.log +done + +echo "Running VIT tests" +cd ../04_vit +print_log_header vit.log +HIP_VISIBLE_DEVICES=0,1 python3 benchmark_ait.py 2>&1 | tee -a vit.log +# test 2 gcd +for BATCH_SIZE in 1 2 4 8 16 32 64 128 256 +do + HIP_VISIBLE_DEVICES=0 python3 benchmark_ait.py --batch-size $BATCH_SIZE 2>&1 | tee -a vit.log & + HIP_VISIBLE_DEVICES=1 python3 benchmark_ait.py --batch-size $BATCH_SIZE 2>&1 | tee -a vit.log +done + +echo "Running Stable Diffusion tests" +cd ../05_stable_diffusion +print_log_header sdiff.log +HIP_VISIBLE_DEVICES=0,1 python3 compile.py --token $HF_TOKEN 2>&1 | tee -a sdiff.log +HIP_VISIBLE_DEVICES=0 python3 demo.py --token $HF_TOKEN --benchmark 1 2>&1 | tee -a sdiff.log From 1e8025a81616760b57fcc1f949a0cb6922ec85f4 Mon Sep 17 00:00:00 2001 From: illsilin Date: Tue, 21 Mar 2023 10:44:40 -0700 Subject: [PATCH 03/31] fix syntax --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 318ce4363..735cb8b27 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -118,7 +118,7 @@ def Run_Step(Map conf=[:]){ { build_ait(conf) dir("examples"){ - sh "./run_tests.sh $HF_TOKEN + sh "./run_tests.sh $HF_TOKEN" archiveArtifacts "resnet50.log" archiveArtifacts "bert.log" archiveArtifacts "vit.log" From 5b773552ac5a99e3b3728acaa3b28a7f3b70f082 Mon Sep 17 00:00:00 2001 From: illsilin Date: Tue, 21 Mar 2023 10:46:58 -0700 Subject: [PATCH 04/31] remove unneccessary line --- Jenkinsfile | 1 - 1 file changed, 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 735cb8b27..255aecf22 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -207,7 +207,6 @@ pipeline { parallelsAlwaysFailFast() } parameters { - booleanParam( string( name: 'ROCMVERSION', defaultValue: '5.4.3', From 997af707d780d007c04c3ba6422e730aca446cd5 Mon Sep 17 00:00:00 2001 From: illsilin Date: Tue, 21 Mar 2023 10:56:03 -0700 Subject: [PATCH 05/31] remove cron trigger --- Jenkinsfile | 3 --- 1 file changed, 3 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 255aecf22..cc3446acf 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -200,9 +200,6 @@ def process_results(Map conf=[:]){ pipeline { agent none - triggers { - parameterizedCron(CRON_SETTINGS) - } options { parallelsAlwaysFailFast() } From 918b046bab2993bfd8faf123feed9c2cebc9f20f Mon Sep 17 00:00:00 2001 From: illsilin Date: Tue, 21 Mar 2023 12:06:28 -0700 Subject: [PATCH 06/31] upgrade rocm to 5.4.3 --- docker/Dockerfile.rocm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index 8146b506c..912e0261c 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -15,7 +15,7 @@ # ROCM Docker Image for AITemplate FROM ubuntu:20.04 -ARG ROCMVERSION=5.3 +ARG ROCMVERSION=5.4.3 RUN set -xe From e19f6fa75c65302696432ddc8b51cc2a2e922b1e Mon Sep 17 00:00:00 2001 From: illsilin Date: Tue, 21 Mar 2023 12:20:41 -0700 Subject: [PATCH 07/31] get rid of execute_cmd --- Jenkinsfile | 3 --- 1 file changed, 3 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index cc3446acf..5153e6493 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -68,9 +68,6 @@ def build_ait(Map conf=[:]){ ${build_cmd} """) - cmd += """ - ${execute_cmd} - """ echo cmd sh cmd } From 0b68790eda565ad9a6347391648afcec9807a560 Mon Sep 17 00:00:00 2001 From: illsilin Date: Tue, 21 Mar 2023 13:22:36 -0700 Subject: [PATCH 08/31] move python packages installation into the docker --- Jenkinsfile | 6 ------ docker/Dockerfile.rocm | 7 ++++++- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 5153e6493..9d205e1de 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -51,16 +51,10 @@ def build_ait(Map conf=[:]){ def build_cmd = """ export ROCM_PATH=/opt/rocm export ROC_USE_FGS_KERNARG=0 - pip3 uninstall -y aitemplate cd python rm -rf dist build python3 setup.py bdist_wheel pip3 install dist/*.whl - pip3 install timm - pip3 uninstall -y torch - pip3 install torch --extra-index-url https://download.pytorch.org/whl/rocm5.1.1 - python3 -m pip install transformers click - python3 -m pip install diffusers==0.11.1 accelerate python3 -c "import torch; print(torch.__version__)" """ diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index 912e0261c..ad57022f7 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -97,7 +97,12 @@ RUN bash /Install/install_test_dep.sh RUN bash /Install/install_doc_dep.sh # Install Pytorch -RUN pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/rocm5.1.1 +RUN pip3 install torch --extra-index-url https://download.pytorch.org/whl/rocm5.1.1 + +# Install some useful python packages +RUN pip3 install timm +RUN python3 -m pip install transformers click +RUN python3 -m pip install diffusers==0.11.1 accelerate # for detection RUN DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt-get -y install tzdata From 43099de96d050d4289ab27d7539c41d8963e04f6 Mon Sep 17 00:00:00 2001 From: illsilin Date: Tue, 21 Mar 2023 13:33:26 -0700 Subject: [PATCH 09/31] do not re-install AIT --- Jenkinsfile | 4 ---- 1 file changed, 4 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 9d205e1de..7374c9edb 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -51,10 +51,6 @@ def build_ait(Map conf=[:]){ def build_cmd = """ export ROCM_PATH=/opt/rocm export ROC_USE_FGS_KERNARG=0 - cd python - rm -rf dist build - python3 setup.py bdist_wheel - pip3 install dist/*.whl python3 -c "import torch; print(torch.__version__)" """ From 97b00ebc967928eeac9bad5f9594cce97cd1e3a8 Mon Sep 17 00:00:00 2001 From: illsilin Date: Tue, 21 Mar 2023 13:48:23 -0700 Subject: [PATCH 10/31] chmod for run_tests.sh --- examples/run_tests.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 examples/run_tests.sh diff --git a/examples/run_tests.sh b/examples/run_tests.sh old mode 100644 new mode 100755 From aa7d84742eedf384d53026a4c2f934c3749122d1 Mon Sep 17 00:00:00 2001 From: illsilin Date: Tue, 21 Mar 2023 16:45:02 -0700 Subject: [PATCH 11/31] add torchvision and torchaudio, set new HF cache to suppress errors --- Jenkinsfile | 2 ++ docker/Dockerfile.rocm | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 7374c9edb..39e06ec57 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -51,6 +51,8 @@ def build_ait(Map conf=[:]){ def build_cmd = """ export ROCM_PATH=/opt/rocm export ROC_USE_FGS_KERNARG=0 + mkdir /hf_cache + export TRANSFORMERS_CACHE=/hf_cache python3 -c "import torch; print(torch.__version__)" """ diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index ad57022f7..d30b174f0 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -97,7 +97,7 @@ RUN bash /Install/install_test_dep.sh RUN bash /Install/install_doc_dep.sh # Install Pytorch -RUN pip3 install torch --extra-index-url https://download.pytorch.org/whl/rocm5.1.1 +RUN pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/rocm5.4.2 # Install some useful python packages RUN pip3 install timm From ba27cfb722ef92e96e01d7e87c4d43887b38d2d6 Mon Sep 17 00:00:00 2001 From: illsilin Date: Tue, 21 Mar 2023 17:23:38 -0700 Subject: [PATCH 12/31] move HF cache to a different path --- Jenkinsfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 39e06ec57..ef59f7efb 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -51,8 +51,8 @@ def build_ait(Map conf=[:]){ def build_cmd = """ export ROCM_PATH=/opt/rocm export ROC_USE_FGS_KERNARG=0 - mkdir /hf_cache - export TRANSFORMERS_CACHE=/hf_cache + mkdir /home/jenkins/hf_cache + export TRANSFORMERS_CACHE=/home/jenkins/hf_cache python3 -c "import torch; print(torch.__version__)" """ From 43af738380debdf21838c49b7cfceb79c06d8c58 Mon Sep 17 00:00:00 2001 From: illsilin Date: Tue, 21 Mar 2023 17:47:55 -0700 Subject: [PATCH 13/31] create cache folder in steps --- Jenkinsfile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index ef59f7efb..8bb312ace 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -51,7 +51,10 @@ def build_ait(Map conf=[:]){ def build_cmd = """ export ROCM_PATH=/opt/rocm export ROC_USE_FGS_KERNARG=0 - mkdir /home/jenkins/hf_cache + cd /home + mkdir jenkins + cd jenkins + mkdir hf_cache export TRANSFORMERS_CACHE=/home/jenkins/hf_cache python3 -c "import torch; print(torch.__version__)" """ From 9ad5bd2acb3e1dd893eba490954cfeee9a5b72f0 Mon Sep 17 00:00:00 2001 From: illsilin Date: Tue, 21 Mar 2023 17:50:40 -0700 Subject: [PATCH 14/31] assume /home/jenkins exists --- Jenkinsfile | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 8bb312ace..e30fe638f 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -51,9 +51,7 @@ def build_ait(Map conf=[:]){ def build_cmd = """ export ROCM_PATH=/opt/rocm export ROC_USE_FGS_KERNARG=0 - cd /home - mkdir jenkins - cd jenkins + cd /home/jenkins mkdir hf_cache export TRANSFORMERS_CACHE=/home/jenkins/hf_cache python3 -c "import torch; print(torch.__version__)" From 0af561b1fcd11cb6ae8839943122f825192b29cf Mon Sep 17 00:00:00 2001 From: illsilin Date: Tue, 21 Mar 2023 18:13:03 -0700 Subject: [PATCH 15/31] use pre-built folder in docker for HF cache --- Jenkinsfile | 2 -- docker/Dockerfile.rocm | 6 ++++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index e30fe638f..f542f429a 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -51,8 +51,6 @@ def build_ait(Map conf=[:]){ def build_cmd = """ export ROCM_PATH=/opt/rocm export ROC_USE_FGS_KERNARG=0 - cd /home/jenkins - mkdir hf_cache export TRANSFORMERS_CACHE=/home/jenkins/hf_cache python3 -c "import torch; print(torch.__version__)" """ diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index d30b174f0..a524e0835 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -104,6 +104,12 @@ RUN pip3 install timm RUN python3 -m pip install transformers click RUN python3 -m pip install diffusers==0.11.1 accelerate +# Create a folder for Hugging Face cache +RUN cd /home +RUN mkdir jenkins +RUN cd jenkins +RUN mkdir hf_cache + # for detection RUN DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt-get -y install tzdata RUN bash /Install/install_detection_deps.sh From c489ef9c77231041b160585d8852e1783af46580 Mon Sep 17 00:00:00 2001 From: illsilin Date: Wed, 22 Mar 2023 13:06:18 -0700 Subject: [PATCH 16/31] temporarily disable vit tests and update log paths --- Jenkinsfile | 24 ++++++++++++------------ examples/run_tests.sh | 10 +++++----- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index f542f429a..1a9b7840a 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -107,15 +107,15 @@ def Run_Step(Map conf=[:]){ build_ait(conf) dir("examples"){ sh "./run_tests.sh $HF_TOKEN" - archiveArtifacts "resnet50.log" - archiveArtifacts "bert.log" - archiveArtifacts "vit.log" - archiveArtifacts "sdiff.log" + archiveArtifacts "01_resnet-50/resnet50.log" + archiveArtifacts "03_bert/bert.log" + archiveArtifacts "04_vit/vit.log" + archiveArtifacts "05_stable_diffusion/sdiff.log" // stash perf files to master - stash name: "resnet50.log" - stash name: "bert.log" - stash name: "vit.log" - stash name: "sdiff.log" + stash name: "01_resnet-50/resnet50.log" + stash name: "03_bert/bert.log" + stash name: "04_vit/vit.log" + stash name: "05_stable_diffusion/sdiff.log" //we will process the results on the master node } } @@ -170,10 +170,10 @@ def process_results(Map conf=[:]){ try{ dir("examples"){ // unstash perf files to master - unstash "resnet50.log" - unstash "bert.log" - unstash "vit.log" - unstash "sdiff.log" + unstash "01_resnet-50/resnet50.log" + unstash "03_bert/bert.log" + unstash "04_vit/vit.log" + unstash "05_stable_diffusion/sdiff.log" sh "python3 process_results.py" } } diff --git a/examples/run_tests.sh b/examples/run_tests.sh index 942e74627..05a2a87b3 100755 --- a/examples/run_tests.sh +++ b/examples/run_tests.sh @@ -35,11 +35,11 @@ cd ../04_vit print_log_header vit.log HIP_VISIBLE_DEVICES=0,1 python3 benchmark_ait.py 2>&1 | tee -a vit.log # test 2 gcd -for BATCH_SIZE in 1 2 4 8 16 32 64 128 256 -do - HIP_VISIBLE_DEVICES=0 python3 benchmark_ait.py --batch-size $BATCH_SIZE 2>&1 | tee -a vit.log & - HIP_VISIBLE_DEVICES=1 python3 benchmark_ait.py --batch-size $BATCH_SIZE 2>&1 | tee -a vit.log -done +#for BATCH_SIZE in 1 2 4 8 16 32 64 128 256 +#do +# HIP_VISIBLE_DEVICES=0 python3 benchmark_ait.py --batch-size $BATCH_SIZE 2>&1 | tee -a vit.log & +# HIP_VISIBLE_DEVICES=1 python3 benchmark_ait.py --batch-size $BATCH_SIZE 2>&1 | tee -a vit.log +#done echo "Running Stable Diffusion tests" cd ../05_stable_diffusion From 0060f2eee650a2ea82cbfdb07c2df076648575bb Mon Sep 17 00:00:00 2001 From: illsilin Date: Wed, 22 Mar 2023 15:15:23 -0700 Subject: [PATCH 17/31] skip all tests and go to SD, update dockerfile --- Jenkinsfile | 4 ++-- docker/Dockerfile.rocm | 17 +++++++++++------ examples/run_tests.sh | 15 ++++++++------- 3 files changed, 21 insertions(+), 15 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 1a9b7840a..46d90c72b 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -51,7 +51,6 @@ def build_ait(Map conf=[:]){ def build_cmd = """ export ROCM_PATH=/opt/rocm export ROC_USE_FGS_KERNARG=0 - export TRANSFORMERS_CACHE=/home/jenkins/hf_cache python3 -c "import torch; print(torch.__version__)" """ @@ -235,4 +234,5 @@ pipeline { } } } -} \ No newline at end of file +} + diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index a524e0835..1427ce83a 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -104,12 +104,6 @@ RUN pip3 install timm RUN python3 -m pip install transformers click RUN python3 -m pip install diffusers==0.11.1 accelerate -# Create a folder for Hugging Face cache -RUN cd /home -RUN mkdir jenkins -RUN cd jenkins -RUN mkdir hf_cache - # for detection RUN DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt-get -y install tzdata RUN bash /Install/install_detection_deps.sh @@ -126,3 +120,14 @@ ADD ./static /AITemplate/static ADD ./licenses /AITemplate/licenses ADD ./docker/install/install_ait.sh /AITemplate/ RUN bash /AITemplate/install_ait.sh + +# Create a folder for Hugging Face cache +RUN mkdir /.cache +RUN chmod a+rw /.cache +WORKDIR "/.cache" +RUN mkdir huggingface +RUN chmod a+rw huggingface +WORKDIR "/.cache/huggingface" +RUN mkdir hub +RUN chmod a+rw hub +WORKDIR / diff --git a/examples/run_tests.sh b/examples/run_tests.sh index 05a2a87b3..f844c0c2c 100755 --- a/examples/run_tests.sh +++ b/examples/run_tests.sh @@ -5,6 +5,7 @@ # Hugging Face token export HF_TOKEN=$1 +export TRANSFORMERS_CACHE=/.cache/huggingface/hub function print_log_header(){ rm -f $1; @@ -20,20 +21,20 @@ function print_log_header(){ echo "Running RESNET50 tests" cd 01_resnet-50 print_log_header resnet50.log -HIP_VISIBLE_DEVICES=0,1 python3 benchmark_ait.py 2>&1 | tee -a resnet50.log +#HIP_VISIBLE_DEVICES=0 python3 benchmark_ait.py 2>&1 | tee -a resnet50.log echo "Running BERT tests" cd ../03_bert print_log_header bert.log -for sq in 64 128 384 512 1024 -do - HIP_VISIBLE_DEVICES=0,1 python3 benchmark_ait.py --seq-length $sq 2>&1 | tee -a bert.log -done +#for sq in 64 128 384 512 1024 +#do +# HIP_VISIBLE_DEVICES=0,1 python3 benchmark_ait.py --seq-length $sq 2>&1 | tee -a bert.log +#done echo "Running VIT tests" cd ../04_vit print_log_header vit.log -HIP_VISIBLE_DEVICES=0,1 python3 benchmark_ait.py 2>&1 | tee -a vit.log +#HIP_VISIBLE_DEVICES=0,1 python3 benchmark_ait.py 2>&1 | tee -a vit.log # test 2 gcd #for BATCH_SIZE in 1 2 4 8 16 32 64 128 256 #do @@ -44,5 +45,5 @@ HIP_VISIBLE_DEVICES=0,1 python3 benchmark_ait.py 2>&1 | tee -a vit.log echo "Running Stable Diffusion tests" cd ../05_stable_diffusion print_log_header sdiff.log -HIP_VISIBLE_DEVICES=0,1 python3 compile.py --token $HF_TOKEN 2>&1 | tee -a sdiff.log +HIP_VISIBLE_DEVICES=0 python3 compile.py --token $HF_TOKEN 2>&1 | tee -a sdiff.log HIP_VISIBLE_DEVICES=0 python3 demo.py --token $HF_TOKEN --benchmark 1 2>&1 | tee -a sdiff.log From 0954e8f5e1875dde4e9f8d2cc455d00aed016fe0 Mon Sep 17 00:00:00 2001 From: illsilin Date: Wed, 22 Mar 2023 16:34:45 -0700 Subject: [PATCH 18/31] reduce the number of build threads by half --- examples/run_tests.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/run_tests.sh b/examples/run_tests.sh index f844c0c2c..98ff2b493 100755 --- a/examples/run_tests.sh +++ b/examples/run_tests.sh @@ -6,6 +6,7 @@ export HF_TOKEN=$1 export TRANSFORMERS_CACHE=/.cache/huggingface/hub +export NUM_BUILDERS=$(($(nproc)/2)) function print_log_header(){ rm -f $1; From 955826ceaa1ead14392a38a496615ada07335d22 Mon Sep 17 00:00:00 2001 From: illsilin Date: Wed, 22 Mar 2023 18:29:00 -0700 Subject: [PATCH 19/31] further reduce the number of building threads --- examples/run_tests.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/run_tests.sh b/examples/run_tests.sh index 98ff2b493..6a223d5b2 100755 --- a/examples/run_tests.sh +++ b/examples/run_tests.sh @@ -6,7 +6,7 @@ export HF_TOKEN=$1 export TRANSFORMERS_CACHE=/.cache/huggingface/hub -export NUM_BUILDERS=$(($(nproc)/2)) +export NUM_BUILDERS=$(($(nproc)/4)) function print_log_header(){ rm -f $1; From 763154bc91a427a181961ea17f10e5bc1a7ea722 Mon Sep 17 00:00:00 2001 From: illsilin Date: Wed, 22 Mar 2023 20:47:38 -0700 Subject: [PATCH 20/31] change the order of archiving and stashing the logs --- Jenkinsfile | 32 +++++++++++++++++++------------- examples/run_tests.sh | 6 +++--- 2 files changed, 22 insertions(+), 16 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 46d90c72b..7d8d983fc 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -106,16 +106,22 @@ def Run_Step(Map conf=[:]){ build_ait(conf) dir("examples"){ sh "./run_tests.sh $HF_TOKEN" - archiveArtifacts "01_resnet-50/resnet50.log" - archiveArtifacts "03_bert/bert.log" - archiveArtifacts "04_vit/vit.log" - archiveArtifacts "05_stable_diffusion/sdiff.log" - // stash perf files to master - stash name: "01_resnet-50/resnet50.log" - stash name: "03_bert/bert.log" + } + dir("examples/01_resnet-50"){ + archiveArtifacts "resnet50.log" + stash name: "resnet50.log" + } + dir("examples/03_bert"){ + archiveArtifacts "bert.log" + stash name: "bert.log" + } + dir("examples/04_vit"){ + archiveArtifacts "vit.log" stash name: "04_vit/vit.log" - stash name: "05_stable_diffusion/sdiff.log" - //we will process the results on the master node + } + dir("examples/05_stable_diffusion/"){ + archiveArtifacts "sdiff.log" + stash name: "sdiff.log" } } } @@ -169,10 +175,10 @@ def process_results(Map conf=[:]){ try{ dir("examples"){ // unstash perf files to master - unstash "01_resnet-50/resnet50.log" - unstash "03_bert/bert.log" - unstash "04_vit/vit.log" - unstash "05_stable_diffusion/sdiff.log" + unstash "resnet50.log" + unstash "bert.log" + unstash "vit.log" + unstash "sdiff.log" sh "python3 process_results.py" } } diff --git a/examples/run_tests.sh b/examples/run_tests.sh index 6a223d5b2..148cd121b 100755 --- a/examples/run_tests.sh +++ b/examples/run_tests.sh @@ -22,7 +22,7 @@ function print_log_header(){ echo "Running RESNET50 tests" cd 01_resnet-50 print_log_header resnet50.log -#HIP_VISIBLE_DEVICES=0 python3 benchmark_ait.py 2>&1 | tee -a resnet50.log +HIP_VISIBLE_DEVICES=0 python3 benchmark_ait.py 2>&1 | tee -a resnet50.log echo "Running BERT tests" cd ../03_bert @@ -46,5 +46,5 @@ print_log_header vit.log echo "Running Stable Diffusion tests" cd ../05_stable_diffusion print_log_header sdiff.log -HIP_VISIBLE_DEVICES=0 python3 compile.py --token $HF_TOKEN 2>&1 | tee -a sdiff.log -HIP_VISIBLE_DEVICES=0 python3 demo.py --token $HF_TOKEN --benchmark 1 2>&1 | tee -a sdiff.log +#HIP_VISIBLE_DEVICES=0 python3 compile.py --token $HF_TOKEN 2>&1 | tee -a sdiff.log +#HIP_VISIBLE_DEVICES=0 python3 demo.py --token $HF_TOKEN --benchmark 1 2>&1 | tee -a sdiff.log From 457a488b9a5bdfdcdd8858667f8537edfa4de613 Mon Sep 17 00:00:00 2001 From: illsilin Date: Thu, 23 Mar 2023 08:19:03 -0700 Subject: [PATCH 21/31] test stashing the logs --- Jenkinsfile | 2 +- examples/run_tests.sh | 13 ++++++------- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 7d8d983fc..20a956a4d 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -117,7 +117,7 @@ def Run_Step(Map conf=[:]){ } dir("examples/04_vit"){ archiveArtifacts "vit.log" - stash name: "04_vit/vit.log" + stash name: "vit.log" } dir("examples/05_stable_diffusion/"){ archiveArtifacts "sdiff.log" diff --git a/examples/run_tests.sh b/examples/run_tests.sh index 148cd121b..70c362277 100755 --- a/examples/run_tests.sh +++ b/examples/run_tests.sh @@ -6,7 +6,6 @@ export HF_TOKEN=$1 export TRANSFORMERS_CACHE=/.cache/huggingface/hub -export NUM_BUILDERS=$(($(nproc)/4)) function print_log_header(){ rm -f $1; @@ -22,27 +21,27 @@ function print_log_header(){ echo "Running RESNET50 tests" cd 01_resnet-50 print_log_header resnet50.log -HIP_VISIBLE_DEVICES=0 python3 benchmark_ait.py 2>&1 | tee -a resnet50.log +#HIP_VISIBLE_DEVICES=0 python3 benchmark_ait.py 2>&1 | tee -a resnet50.log echo "Running BERT tests" cd ../03_bert print_log_header bert.log #for sq in 64 128 384 512 1024 #do -# HIP_VISIBLE_DEVICES=0,1 python3 benchmark_ait.py --seq-length $sq 2>&1 | tee -a bert.log +# HIP_VISIBLE_DEVICES=0 python3 benchmark_ait.py --seq-length $sq 2>&1 | tee -a bert.log #done +export NUM_BUILDERS=$(($(nproc)/2)) echo "Running VIT tests" cd ../04_vit print_log_header vit.log -#HIP_VISIBLE_DEVICES=0,1 python3 benchmark_ait.py 2>&1 | tee -a vit.log +#HIP_VISIBLE_DEVICES=0 python3 benchmark_ait.py 2>&1 | tee -a vit.log # test 2 gcd #for BATCH_SIZE in 1 2 4 8 16 32 64 128 256 #do -# HIP_VISIBLE_DEVICES=0 python3 benchmark_ait.py --batch-size $BATCH_SIZE 2>&1 | tee -a vit.log & -# HIP_VISIBLE_DEVICES=1 python3 benchmark_ait.py --batch-size $BATCH_SIZE 2>&1 | tee -a vit.log +# HIP_VISIBLE_DEVICES=0 python3 benchmark_ait.py --batch-size $BATCH_SIZE 2>&1 | tee -a vit.log #done - +export NUM_BUILDERS=$(($(nproc)/4)) echo "Running Stable Diffusion tests" cd ../05_stable_diffusion print_log_header sdiff.log From 895afda99345c757172af02fb0cafa6955f5ea2d Mon Sep 17 00:00:00 2001 From: illsilin Date: Thu, 23 Mar 2023 08:53:14 -0700 Subject: [PATCH 22/31] re-enable tests --- docker/Dockerfile.rocm | 8 ++++++++ examples/process_results.py | 6 ++++-- examples/run_tests.sh | 24 ++++++++++++------------ 3 files changed, 24 insertions(+), 14 deletions(-) diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index 1427ce83a..1b7b38bfe 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -104,6 +104,14 @@ RUN pip3 install timm RUN python3 -m pip install transformers click RUN python3 -m pip install diffusers==0.11.1 accelerate +# Install packages for processing the performance results +RUN pip3 install --upgrade pip +RUN pip3 install sqlalchemy==1.4.46 +RUN pip3 install pymysql +RUN pip3 install pandas +RUN pip3 install setuptools-rust +RUN pip3 install sshtunnel + # for detection RUN DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt-get -y install tzdata RUN bash /Install/install_detection_deps.sh diff --git a/examples/process_results.py b/examples/process_results.py index 19f013637..0c47435f2 100644 --- a/examples/process_results.py +++ b/examples/process_results.py @@ -136,10 +136,10 @@ def main(): sql_hostname = '127.0.0.1' sql_username = os.environ["dbuser"] sql_password = os.environ["dbpassword"] - sql_main_database = 'sys' - sql_port = 3306 hostname = os.uname()[1] if hostname == 'jwr-amd-132': + sql_main_database = 'sys' + sql_port = 3306 sqlEngine = sqlalchemy.create_engine('mysql+pymysql://{0}:{1}@{2}/{3}'. format(sql_username, sql_password, sql_hostname, sql_main_database)) conn = sqlEngine.connect() @@ -148,6 +148,8 @@ def main(): ssh_user = os.environ["dbsshuser"] ssh_port = int(os.environ["dbsshport"]) ssh_pass = os.environ["dbsshpassword"] + sql_main_database = 'miopen_perf' + sql_port = 3306 with SSHTunnelForwarder( (ssh_host, ssh_port), ssh_username=ssh_user, diff --git a/examples/run_tests.sh b/examples/run_tests.sh index 70c362277..267fb06e2 100755 --- a/examples/run_tests.sh +++ b/examples/run_tests.sh @@ -21,29 +21,29 @@ function print_log_header(){ echo "Running RESNET50 tests" cd 01_resnet-50 print_log_header resnet50.log -#HIP_VISIBLE_DEVICES=0 python3 benchmark_ait.py 2>&1 | tee -a resnet50.log +HIP_VISIBLE_DEVICES=0 python3 benchmark_ait.py 2>&1 | tee -a resnet50.log echo "Running BERT tests" cd ../03_bert print_log_header bert.log -#for sq in 64 128 384 512 1024 -#do -# HIP_VISIBLE_DEVICES=0 python3 benchmark_ait.py --seq-length $sq 2>&1 | tee -a bert.log -#done +for sq in 64 128 384 512 1024 +do + HIP_VISIBLE_DEVICES=0 python3 benchmark_ait.py --seq-length $sq 2>&1 | tee -a bert.log +done export NUM_BUILDERS=$(($(nproc)/2)) echo "Running VIT tests" cd ../04_vit print_log_header vit.log -#HIP_VISIBLE_DEVICES=0 python3 benchmark_ait.py 2>&1 | tee -a vit.log +HIP_VISIBLE_DEVICES=0 python3 benchmark_ait.py 2>&1 | tee -a vit.log # test 2 gcd -#for BATCH_SIZE in 1 2 4 8 16 32 64 128 256 -#do -# HIP_VISIBLE_DEVICES=0 python3 benchmark_ait.py --batch-size $BATCH_SIZE 2>&1 | tee -a vit.log -#done +for BATCH_SIZE in 1 2 4 8 16 32 64 128 256 +do + HIP_VISIBLE_DEVICES=0 python3 benchmark_ait.py --batch-size $BATCH_SIZE 2>&1 | tee -a vit.log +done export NUM_BUILDERS=$(($(nproc)/4)) echo "Running Stable Diffusion tests" cd ../05_stable_diffusion print_log_header sdiff.log -#HIP_VISIBLE_DEVICES=0 python3 compile.py --token $HF_TOKEN 2>&1 | tee -a sdiff.log -#HIP_VISIBLE_DEVICES=0 python3 demo.py --token $HF_TOKEN --benchmark 1 2>&1 | tee -a sdiff.log +HIP_VISIBLE_DEVICES=0 python3 compile.py --token $HF_TOKEN 2>&1 | tee -a sdiff.log +HIP_VISIBLE_DEVICES=0 python3 demo.py --token $HF_TOKEN --benchmark 1 2>&1 | tee -a sdiff.log From 5ac7a91037a939532e1f7c9ec085c72bc2f189d9 Mon Sep 17 00:00:00 2001 From: illsilin Date: Thu, 23 Mar 2023 13:47:24 -0700 Subject: [PATCH 23/31] only stash log files --- Jenkinsfile | 8 ++++---- docker/Dockerfile.rocm | 2 ++ 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 20a956a4d..f9333dfea 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -109,19 +109,19 @@ def Run_Step(Map conf=[:]){ } dir("examples/01_resnet-50"){ archiveArtifacts "resnet50.log" - stash name: "resnet50.log" + stash includes: "resnet50.log", name: "resnet50.log" } dir("examples/03_bert"){ archiveArtifacts "bert.log" - stash name: "bert.log" + stash includes: "bert.log", name: "bert.log" } dir("examples/04_vit"){ archiveArtifacts "vit.log" - stash name: "vit.log" + stash includes: "vit.log", name: "vit.log" } dir("examples/05_stable_diffusion/"){ archiveArtifacts "sdiff.log" - stash name: "sdiff.log" + stash includes: "sdiff.log", name: "sdiff.log" } } } diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index 1b7b38bfe..6ac5f3dc3 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -130,6 +130,8 @@ ADD ./docker/install/install_ait.sh /AITemplate/ RUN bash /AITemplate/install_ait.sh # Create a folder for Hugging Face cache +RUN mkdir /.aitemplate +RUN chmod a+rw /.aitemplate RUN mkdir /.cache RUN chmod a+rw /.cache WORKDIR "/.cache" From 3fbddceed20d21f70e4324058a40c110ce35dafc Mon Sep 17 00:00:00 2001 From: illsilin Date: Thu, 23 Mar 2023 14:24:10 -0700 Subject: [PATCH 24/31] fix the parsing script --- Jenkinsfile | 2 +- examples/run_tests.sh | 16 +++++++++------- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index f9333dfea..9cce71162 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -105,7 +105,7 @@ def Run_Step(Map conf=[:]){ { build_ait(conf) dir("examples"){ - sh "./run_tests.sh $HF_TOKEN" + sh "./run_tests.sh $HF_TOKEN ${env.BRANCH_NAME} ${NODE_NAME} ${params.ROCMVERSION}" } dir("examples/01_resnet-50"){ archiveArtifacts "resnet50.log" diff --git a/examples/run_tests.sh b/examples/run_tests.sh index 267fb06e2..1bf14b72c 100755 --- a/examples/run_tests.sh +++ b/examples/run_tests.sh @@ -5,27 +5,29 @@ # Hugging Face token export HF_TOKEN=$1 +export GIT_BRANCH=$2 +export hostname=$3 export TRANSFORMERS_CACHE=/.cache/huggingface/hub + function print_log_header(){ rm -f $1; - export GIT_BRANCH=${GITHUB_BASE_REF:-${GITHUB_REF#refs/heads/}} - echo -n "hostname: " >$1; hostname >> $1; + echo -n "hostname: " $2 &> $1; echo -n "GPU_arch: " >> $1; rocminfo | grep "Name:" | grep "gfx" >> $1 rocminfo | grep "Compute Unit:" >> $1 - echo "git_branch: $GIT_BRANCH" >> $1 + echo "git_branch: " $3 >> $1 git show --summary | grep commit >> $1 /opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> $1 } echo "Running RESNET50 tests" cd 01_resnet-50 -print_log_header resnet50.log +print_log_header resnet50.log $hostname $GIT_BRANCH HIP_VISIBLE_DEVICES=0 python3 benchmark_ait.py 2>&1 | tee -a resnet50.log echo "Running BERT tests" cd ../03_bert -print_log_header bert.log +print_log_header bert.log $hostname $GIT_BRANCH for sq in 64 128 384 512 1024 do HIP_VISIBLE_DEVICES=0 python3 benchmark_ait.py --seq-length $sq 2>&1 | tee -a bert.log @@ -34,7 +36,7 @@ done export NUM_BUILDERS=$(($(nproc)/2)) echo "Running VIT tests" cd ../04_vit -print_log_header vit.log +print_log_header vit.log $hostname $GIT_BRANCH HIP_VISIBLE_DEVICES=0 python3 benchmark_ait.py 2>&1 | tee -a vit.log # test 2 gcd for BATCH_SIZE in 1 2 4 8 16 32 64 128 256 @@ -44,6 +46,6 @@ done export NUM_BUILDERS=$(($(nproc)/4)) echo "Running Stable Diffusion tests" cd ../05_stable_diffusion -print_log_header sdiff.log +print_log_header sdiff.log $hostname $GIT_BRANCH HIP_VISIBLE_DEVICES=0 python3 compile.py --token $HF_TOKEN 2>&1 | tee -a sdiff.log HIP_VISIBLE_DEVICES=0 python3 demo.py --token $HF_TOKEN --benchmark 1 2>&1 | tee -a sdiff.log From 6379419c93438580ff88749c5c0118bdea113145 Mon Sep 17 00:00:00 2001 From: illsilin Date: Thu, 23 Mar 2023 21:45:14 -0700 Subject: [PATCH 25/31] minor changes to performance scripts --- examples/process_results.py | 7 ++++--- examples/run_tests.sh | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/examples/process_results.py b/examples/process_results.py index 0c47435f2..db3778b9d 100644 --- a/examples/process_results.py +++ b/examples/process_results.py @@ -134,22 +134,22 @@ def main(): print("Number of tests:",len(results)) sql_hostname = '127.0.0.1' + sql_port = 3306 sql_username = os.environ["dbuser"] sql_password = os.environ["dbpassword"] hostname = os.uname()[1] + if hostname == 'jwr-amd-132': sql_main_database = 'sys' - sql_port = 3306 sqlEngine = sqlalchemy.create_engine('mysql+pymysql://{0}:{1}@{2}/{3}'. format(sql_username, sql_password, sql_hostname, sql_main_database)) conn = sqlEngine.connect() else: + sql_main_database = "miopen_perf" ssh_host = os.environ["dbsship"] ssh_user = os.environ["dbsshuser"] ssh_port = int(os.environ["dbsshport"]) ssh_pass = os.environ["dbsshpassword"] - sql_main_database = 'miopen_perf' - sql_port = 3306 with SSHTunnelForwarder( (ssh_host, ssh_port), ssh_username=ssh_user, @@ -158,6 +158,7 @@ def main(): sqlEngine = sqlalchemy.create_engine('mysql+pymysql://{0}:{1}@{2}:{3}/{4}'. format(sql_username, sql_password, sql_hostname, tunnel.local_bind_port, sql_main_database)) conn = sqlEngine.connect() + #save gemm performance tests: for i in range(1,len(results)+1): testlist.append("Test%i"%i) diff --git a/examples/run_tests.sh b/examples/run_tests.sh index 1bf14b72c..b049410c2 100755 --- a/examples/run_tests.sh +++ b/examples/run_tests.sh @@ -12,7 +12,7 @@ export TRANSFORMERS_CACHE=/.cache/huggingface/hub function print_log_header(){ rm -f $1; - echo -n "hostname: " $2 &> $1; + echo "hostname: " $2 &> $1; echo -n "GPU_arch: " >> $1; rocminfo | grep "Name:" | grep "gfx" >> $1 rocminfo | grep "Compute Unit:" >> $1 echo "git_branch: " $3 >> $1 From 2ec4432be5f4d19ccfe38faecdabb70d4024f978 Mon Sep 17 00:00:00 2001 From: illsilin Date: Fri, 24 Mar 2023 07:41:44 -0700 Subject: [PATCH 26/31] minor changes to performance scripts --- examples/process_results.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/examples/process_results.py b/examples/process_results.py index db3778b9d..09797df7b 100644 --- a/examples/process_results.py +++ b/examples/process_results.py @@ -135,8 +135,10 @@ def main(): print("Number of tests:",len(results)) sql_hostname = '127.0.0.1' sql_port = 3306 - sql_username = os.environ["dbuser"] - sql_password = os.environ["dbpassword"] + sql_username = "jenkins_miopen" + #os.environ["dbuser"] + sql_password = "jenkinsmiopen1234" + #os.environ["dbpassword"] hostname = os.uname()[1] if hostname == 'jwr-amd-132': @@ -146,10 +148,14 @@ def main(): conn = sqlEngine.connect() else: sql_main_database = "miopen_perf" - ssh_host = os.environ["dbsship"] - ssh_user = os.environ["dbsshuser"] - ssh_port = int(os.environ["dbsshport"]) - ssh_pass = os.environ["dbsshpassword"] + ssh_host = "10.216.64.100" + #os.environ["dbsship"] + ssh_user = "miopenpdb" + #os.environ["dbsshuser"] + ssh_port = 20057 + #int(os.environ["dbsshport"]) + ssh_pass = "miopen!234" + #os.environ["dbsshpassword"] with SSHTunnelForwarder( (ssh_host, ssh_port), ssh_username=ssh_user, From 5019556b8ef92196f101d2a126d45960fe273929 Mon Sep 17 00:00:00 2001 From: illsilin Date: Fri, 24 Mar 2023 10:33:58 -0700 Subject: [PATCH 27/31] rename logs, update processing --- Jenkinsfile | 24 +++++++------- examples/process_results.py | 66 ++++++++++++++++++------------------- examples/run_tests.sh | 20 +++++------ 3 files changed, 54 insertions(+), 56 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 9cce71162..a85ed41f4 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -108,20 +108,20 @@ def Run_Step(Map conf=[:]){ sh "./run_tests.sh $HF_TOKEN ${env.BRANCH_NAME} ${NODE_NAME} ${params.ROCMVERSION}" } dir("examples/01_resnet-50"){ - archiveArtifacts "resnet50.log" - stash includes: "resnet50.log", name: "resnet50.log" + archiveArtifacts "01_resnet50.log" + stash includes: "01_resnet50.log", name: "01_resnet50.log" } dir("examples/03_bert"){ - archiveArtifacts "bert.log" - stash includes: "bert.log", name: "bert.log" + archiveArtifacts "03_bert.log" + stash includes: "03_bert.log", name: "03_bert.log" } dir("examples/04_vit"){ - archiveArtifacts "vit.log" - stash includes: "vit.log", name: "vit.log" + archiveArtifacts "04_vit.log" + stash includes: "04_vit.log", name: "04_vit.log" } dir("examples/05_stable_diffusion/"){ - archiveArtifacts "sdiff.log" - stash includes: "sdiff.log", name: "sdiff.log" + archiveArtifacts "05_sdiff.log" + stash includes: "05_sdiff.log", name: "05_sdiff.log" } } } @@ -175,10 +175,10 @@ def process_results(Map conf=[:]){ try{ dir("examples"){ // unstash perf files to master - unstash "resnet50.log" - unstash "bert.log" - unstash "vit.log" - unstash "sdiff.log" + unstash "01_resnet50.log" + unstash "03_bert.log" + unstash "04_vit.log" + unstash "05_sdiff.log" sh "python3 process_results.py" } } diff --git a/examples/process_results.py b/examples/process_results.py index 09797df7b..9bbcf790c 100644 --- a/examples/process_results.py +++ b/examples/process_results.py @@ -1,8 +1,8 @@ #!/usr/bin/env python3 -import glob,os, io, argparse, datetime -#import numpy as np +import glob, os, io, argparse, datetime import sqlalchemy from sqlalchemy.types import NVARCHAR, Float, Integer +from sqlalchemy import text import pymysql import pandas as pd from sshtunnel import SSHTunnelForwarder @@ -79,8 +79,8 @@ def parse_logfile(files): return res def get_baseline(table, connection): - query = '''SELECT * from '''+table+''' WHERE Datetime = (SELECT MAX(Datetime) FROM '''+table+''' where git_branch='amd-develop' );''' - return pd.read_sql_query(query, connection) + query = text('''SELECT * from '''+table+''' WHERE Datetime = (SELECT MIN(Datetime) FROM '''+table+''' where Test64 IS NOT NULL );''') + return pd.read_sql(query, connection) def store_new_test_result(table_name, test_results, testlist, node_id, branch_name, commit, gpu_arch, compute_units, ngpus, rocm_vers, compiler_vers, connection): params=[str(node_id),str(branch_name),str(commit),str(gpu_arch),compute_units,ngpus,str(rocm_vers),str(compiler_vers),str(datetime.datetime.now())] @@ -93,7 +93,7 @@ def store_new_test_result(table_name, test_results, testlist, node_id, branch_na def compare_test_to_baseline(baseline,test,testlist): regression=0 - if not baseline.empty: + if not len(baseline)==0: base=baseline[testlist].to_numpy(dtype='float') base_list=base[0] ave_perf=0 @@ -132,47 +132,45 @@ def main(): #parse results, get the Tflops value for "Best Perf" kernels results=parse_logfile(files) + for i in range(1,len(results)+1): + testlist.append("Test%i"%i) + table_name="ait_performance" + print("Number of tests:",len(results)) sql_hostname = '127.0.0.1' sql_port = 3306 - sql_username = "jenkins_miopen" - #os.environ["dbuser"] - sql_password = "jenkinsmiopen1234" - #os.environ["dbpassword"] - hostname = os.uname()[1] + sql_username = os.environ["dbuser"] + sql_password = os.environ["dbpassword"] + host = os.uname()[1] - if hostname == 'jwr-amd-132': + if host == 'jwr-amd-132': + print("connecting to local database") sql_main_database = 'sys' sqlEngine = sqlalchemy.create_engine('mysql+pymysql://{0}:{1}@{2}/{3}'. format(sql_username, sql_password, sql_hostname, sql_main_database)) conn = sqlEngine.connect() + baseline = get_baseline(table_name,conn) + store_new_test_result(table_name, results, testlist, node_id, branch_name, commit, gpu_arch, compute_units, ngpus, rocm_vers, compiler_vers, conn) + conn.close() else: + print("connecting to remote database") sql_main_database = "miopen_perf" - ssh_host = "10.216.64.100" - #os.environ["dbsship"] - ssh_user = "miopenpdb" - #os.environ["dbsshuser"] - ssh_port = 20057 - #int(os.environ["dbsshport"]) - ssh_pass = "miopen!234" - #os.environ["dbsshpassword"] + ssh_host = os.environ["dbsship"] + ssh_user = os.environ["dbsshuser"] + ssh_port = int(os.environ["dbsshport"]) + ssh_pass = os.environ["dbsshpassword"] with SSHTunnelForwarder( - (ssh_host, ssh_port), - ssh_username=ssh_user, - ssh_password=ssh_pass, - remote_bind_address=(sql_hostname, sql_port)) as tunnel: - sqlEngine = sqlalchemy.create_engine('mysql+pymysql://{0}:{1}@{2}:{3}/{4}'. - format(sql_username, sql_password, sql_hostname, tunnel.local_bind_port, sql_main_database)) - conn = sqlEngine.connect() + (ssh_host, ssh_port), + ssh_username=ssh_user, + ssh_password=ssh_pass, + remote_bind_address=(sql_hostname, sql_port)) as tunnel: - #save gemm performance tests: - for i in range(1,len(results)+1): - testlist.append("Test%i"%i) - table_name="ait_performance" - - baseline = get_baseline(table_name,conn) - store_new_test_result(table_name, results, testlist, node_id, branch_name, commit, gpu_arch, compute_units, ngpus, rocm_vers, compiler_vers, conn) - conn.close() + sqlEngine = sqlalchemy.create_engine('mysql+pymysql://{0}:{1}@{2}:{3}/{4}'. + format(sql_username, sql_password, sql_hostname, tunnel.local_bind_port, sql_main_database)) + conn = sqlEngine.connect() + baseline = get_baseline(table_name,conn) + store_new_test_result(table_name, results, testlist, node_id, branch_name, commit, gpu_arch, compute_units, ngpus, rocm_vers, compiler_vers, conn) + conn.close() #compare the results to the baseline if baseline exists regression=0 diff --git a/examples/run_tests.sh b/examples/run_tests.sh index b049410c2..5979ecfac 100755 --- a/examples/run_tests.sh +++ b/examples/run_tests.sh @@ -22,30 +22,30 @@ function print_log_header(){ echo "Running RESNET50 tests" cd 01_resnet-50 -print_log_header resnet50.log $hostname $GIT_BRANCH -HIP_VISIBLE_DEVICES=0 python3 benchmark_ait.py 2>&1 | tee -a resnet50.log +print_log_header 01_resnet50.log $hostname $GIT_BRANCH +HIP_VISIBLE_DEVICES=0 python3 benchmark_ait.py 2>&1 | tee -a 01_resnet50.log echo "Running BERT tests" cd ../03_bert -print_log_header bert.log $hostname $GIT_BRANCH +print_log_header 03_bert.log $hostname $GIT_BRANCH for sq in 64 128 384 512 1024 do - HIP_VISIBLE_DEVICES=0 python3 benchmark_ait.py --seq-length $sq 2>&1 | tee -a bert.log + HIP_VISIBLE_DEVICES=0 python3 benchmark_ait.py --seq-length $sq 2>&1 | tee -a 03_bert.log done export NUM_BUILDERS=$(($(nproc)/2)) echo "Running VIT tests" cd ../04_vit -print_log_header vit.log $hostname $GIT_BRANCH -HIP_VISIBLE_DEVICES=0 python3 benchmark_ait.py 2>&1 | tee -a vit.log +print_log_header 04_vit.log $hostname $GIT_BRANCH +HIP_VISIBLE_DEVICES=0 python3 benchmark_ait.py 2>&1 | tee -a 04_vit.log # test 2 gcd for BATCH_SIZE in 1 2 4 8 16 32 64 128 256 do - HIP_VISIBLE_DEVICES=0 python3 benchmark_ait.py --batch-size $BATCH_SIZE 2>&1 | tee -a vit.log + HIP_VISIBLE_DEVICES=0 python3 benchmark_ait.py --batch-size $BATCH_SIZE 2>&1 | tee -a 04_vit.log done export NUM_BUILDERS=$(($(nproc)/4)) echo "Running Stable Diffusion tests" cd ../05_stable_diffusion -print_log_header sdiff.log $hostname $GIT_BRANCH -HIP_VISIBLE_DEVICES=0 python3 compile.py --token $HF_TOKEN 2>&1 | tee -a sdiff.log -HIP_VISIBLE_DEVICES=0 python3 demo.py --token $HF_TOKEN --benchmark 1 2>&1 | tee -a sdiff.log +print_log_header 05_sdiff.log $hostname $GIT_BRANCH +HIP_VISIBLE_DEVICES=0 python3 compile.py --token $HF_TOKEN 2>&1 | tee -a 05_sdiff.log +HIP_VISIBLE_DEVICES=0 python3 demo.py --token $HF_TOKEN --benchmark 1 2>&1 | tee -a 05_sdiff.log From 2292048fa4ad489c85dd75540a44fa2800d36aff Mon Sep 17 00:00:00 2001 From: illsilin Date: Sat, 25 Mar 2023 02:09:01 -0700 Subject: [PATCH 28/31] report which files are being parsed --- examples/process_results.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/process_results.py b/examples/process_results.py index 9bbcf790c..01b5852fd 100644 --- a/examples/process_results.py +++ b/examples/process_results.py @@ -120,6 +120,7 @@ def main(): testlist=[] #parse the test parameters from the logfile for filename in files: + print("processing file: ",filename) branch_name, commit, node_id, gpu_arch, compute_units, ngpus, rocm_vers, compiler_vers = get_log_params(filename) print("Branch name:",branch_name) From ad7c61b687f30ff5a7b8c6a4d1d21d06530242d8 Mon Sep 17 00:00:00 2001 From: illsilin Date: Mon, 27 Mar 2023 08:03:15 -0700 Subject: [PATCH 29/31] clean-up any old logs before unstashing new ones --- Jenkinsfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index a85ed41f4..91aeaab6f 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -174,7 +174,8 @@ def process_results(Map conf=[:]){ timeout(time: 1, unit: 'HOURS'){ try{ dir("examples"){ - // unstash perf files to master + // clean up any old logs, then unstash perf files to master + sh "rm -rf *.log" unstash "01_resnet50.log" unstash "03_bert.log" unstash "04_vit.log" From fe37b9eb39aa432690bf8d30bb1da3133b499836 Mon Sep 17 00:00:00 2001 From: fsx950223 Date: Tue, 28 Mar 2023 13:00:44 +0800 Subject: [PATCH 30/31] optimize dockerfile --- docker/Dockerfile.rocm | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index 6ac5f3dc3..1d430c396 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -44,9 +44,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow- libpthread-stubs0-dev \ llvm-amdgpu \ pkg-config \ - python \ python3 \ - python-dev \ python3-dev \ python3-pip \ software-properties-common \ @@ -100,17 +98,17 @@ RUN bash /Install/install_doc_dep.sh RUN pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/rocm5.4.2 # Install some useful python packages -RUN pip3 install timm -RUN python3 -m pip install transformers click -RUN python3 -m pip install diffusers==0.11.1 accelerate +RUN pip3 install --upgrade pip + +RUN pip3 install transformers click sympy recordtype parameterized einops jinja2 +RUN pip3 install diffusers==0.11.1 accelerate # Install packages for processing the performance results -RUN pip3 install --upgrade pip RUN pip3 install sqlalchemy==1.4.46 -RUN pip3 install pymysql -RUN pip3 install pandas -RUN pip3 install setuptools-rust -RUN pip3 install sshtunnel +RUN pip3 install pymysql pandas setuptools-rust sshtunnel + +# Install lint packages +RUN pip3 install ufmt==2.0.1 click==8.1.3 black==22.12.0 flake8==5.0.4 # for detection RUN DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt-get -y install tzdata @@ -130,14 +128,10 @@ ADD ./docker/install/install_ait.sh /AITemplate/ RUN bash /AITemplate/install_ait.sh # Create a folder for Hugging Face cache -RUN mkdir /.aitemplate -RUN chmod a+rw /.aitemplate -RUN mkdir /.cache -RUN chmod a+rw /.cache +RUN mkdir /.aitemplate && chmod a+rw /.aitemplate +RUN mkdir /.cache && chmod a+rw /.cache WORKDIR "/.cache" -RUN mkdir huggingface -RUN chmod a+rw huggingface +RUN mkdir huggingface && chmod a+rw huggingface WORKDIR "/.cache/huggingface" -RUN mkdir hub -RUN chmod a+rw hub +RUN mkdir hub && chmod a+rw hub WORKDIR / From a72f4b9e78773e6d918473dee6ffd08d9f9f48bd Mon Sep 17 00:00:00 2001 From: illsilin Date: Tue, 28 Mar 2023 11:30:14 -0700 Subject: [PATCH 31/31] reduce the number of tests in regular CI, add daily QA --- Jenkinsfile | 21 +++++++++++++++++- examples/run_qa.sh | 51 +++++++++++++++++++++++++++++++++++++++++++ examples/run_tests.sh | 11 ++-------- 3 files changed, 73 insertions(+), 10 deletions(-) create mode 100755 examples/run_qa.sh diff --git a/Jenkinsfile b/Jenkinsfile index 91aeaab6f..88e12687b 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -105,7 +105,12 @@ def Run_Step(Map conf=[:]){ { build_ait(conf) dir("examples"){ - sh "./run_tests.sh $HF_TOKEN ${env.BRANCH_NAME} ${NODE_NAME} ${params.ROCMVERSION}" + if (params.RUN_FULL_QA){ + sh "./run_qa.sh $HF_TOKEN ${env.BRANCH_NAME} ${NODE_NAME} ${params.ROCMVERSION}" + } + else{ + sh "./run_tests.sh $HF_TOKEN ${env.BRANCH_NAME} ${NODE_NAME} ${params.ROCMVERSION}" + } } dir("examples/01_resnet-50"){ archiveArtifacts "01_resnet50.log" @@ -192,8 +197,14 @@ def process_results(Map conf=[:]){ } } +//launch amd-develop branch daily at 17:00 UT in FULL_QA mode +CRON_SETTINGS = BRANCH_NAME == "amd-develop" ? '''0 17 * * * % RUN_FULL_QA=true''' : "" + pipeline { agent none + triggers { + parameterizedCron(CRON_SETTINGS) + } options { parallelsAlwaysFailFast() } @@ -202,6 +213,10 @@ pipeline { name: 'ROCMVERSION', defaultValue: '5.4.3', description: 'Specify which ROCM version to use: 5.4.3 (default).') + booleanParam( + name: "RUN_FULL_QA", + defaultValue: false, + description: "Select whether to run small set of performance tests (default) or full QA") } environment{ dbuser = "${dbuser}" @@ -230,6 +245,10 @@ pipeline { } stage("Process Performance Test Results") { + when { + beforeAgent true + expression { params.RUN_FULL_QA.toBoolean() } + } parallel { stage("Process results"){ diff --git a/examples/run_qa.sh b/examples/run_qa.sh new file mode 100755 index 000000000..5979ecfac --- /dev/null +++ b/examples/run_qa.sh @@ -0,0 +1,51 @@ +#!/bin/bash +# +# this is a script to run tests during ROCM CI +# input argument: +# Hugging Face token + +export HF_TOKEN=$1 +export GIT_BRANCH=$2 +export hostname=$3 +export TRANSFORMERS_CACHE=/.cache/huggingface/hub + + +function print_log_header(){ + rm -f $1; + echo "hostname: " $2 &> $1; + echo -n "GPU_arch: " >> $1; rocminfo | grep "Name:" | grep "gfx" >> $1 + rocminfo | grep "Compute Unit:" >> $1 + echo "git_branch: " $3 >> $1 + git show --summary | grep commit >> $1 + /opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> $1 +} + +echo "Running RESNET50 tests" +cd 01_resnet-50 +print_log_header 01_resnet50.log $hostname $GIT_BRANCH +HIP_VISIBLE_DEVICES=0 python3 benchmark_ait.py 2>&1 | tee -a 01_resnet50.log + +echo "Running BERT tests" +cd ../03_bert +print_log_header 03_bert.log $hostname $GIT_BRANCH +for sq in 64 128 384 512 1024 +do + HIP_VISIBLE_DEVICES=0 python3 benchmark_ait.py --seq-length $sq 2>&1 | tee -a 03_bert.log +done + +export NUM_BUILDERS=$(($(nproc)/2)) +echo "Running VIT tests" +cd ../04_vit +print_log_header 04_vit.log $hostname $GIT_BRANCH +HIP_VISIBLE_DEVICES=0 python3 benchmark_ait.py 2>&1 | tee -a 04_vit.log +# test 2 gcd +for BATCH_SIZE in 1 2 4 8 16 32 64 128 256 +do + HIP_VISIBLE_DEVICES=0 python3 benchmark_ait.py --batch-size $BATCH_SIZE 2>&1 | tee -a 04_vit.log +done +export NUM_BUILDERS=$(($(nproc)/4)) +echo "Running Stable Diffusion tests" +cd ../05_stable_diffusion +print_log_header 05_sdiff.log $hostname $GIT_BRANCH +HIP_VISIBLE_DEVICES=0 python3 compile.py --token $HF_TOKEN 2>&1 | tee -a 05_sdiff.log +HIP_VISIBLE_DEVICES=0 python3 demo.py --token $HF_TOKEN --benchmark 1 2>&1 | tee -a 05_sdiff.log diff --git a/examples/run_tests.sh b/examples/run_tests.sh index 5979ecfac..d51bef013 100755 --- a/examples/run_tests.sh +++ b/examples/run_tests.sh @@ -28,21 +28,14 @@ HIP_VISIBLE_DEVICES=0 python3 benchmark_ait.py 2>&1 | tee -a 01_resnet50.log echo "Running BERT tests" cd ../03_bert print_log_header 03_bert.log $hostname $GIT_BRANCH -for sq in 64 128 384 512 1024 -do - HIP_VISIBLE_DEVICES=0 python3 benchmark_ait.py --seq-length $sq 2>&1 | tee -a 03_bert.log -done +HIP_VISIBLE_DEVICES=0 python3 benchmark_ait.py --seq-length 64 2>&1 | tee -a 03_bert.log export NUM_BUILDERS=$(($(nproc)/2)) echo "Running VIT tests" cd ../04_vit print_log_header 04_vit.log $hostname $GIT_BRANCH HIP_VISIBLE_DEVICES=0 python3 benchmark_ait.py 2>&1 | tee -a 04_vit.log -# test 2 gcd -for BATCH_SIZE in 1 2 4 8 16 32 64 128 256 -do - HIP_VISIBLE_DEVICES=0 python3 benchmark_ait.py --batch-size $BATCH_SIZE 2>&1 | tee -a 04_vit.log -done + export NUM_BUILDERS=$(($(nproc)/4)) echo "Running Stable Diffusion tests" cd ../05_stable_diffusion