Skip to content

Commit

Permalink
Add Linux ROCm CI Pipeline (#21798)
Browse files Browse the repository at this point in the history
### Description

* Add new ROCm CI pipeline (`Linux ROCm CI Pipeline`) focusing on
inference.
* Resolve test errors; disable flaky tests.

Based on test PR #21614.
  • Loading branch information
mindest authored Aug 30, 2024
1 parent 9242596 commit bfa4da4
Show file tree
Hide file tree
Showing 8 changed files with 382 additions and 28 deletions.
2 changes: 1 addition & 1 deletion cmake/onnxruntime_kernel_explorer.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -89,4 +89,4 @@ add_dependencies(kernel_explorer onnxruntime_pybind11_state)

enable_testing()
find_package(Python COMPONENTS Interpreter REQUIRED)
add_test(NAME test_kernels COMMAND ${Python_EXECUTABLE} -m pytest ..)
# add_test(NAME test_kernels COMMAND ${Python_EXECUTABLE} -m pytest ..)
6 changes: 3 additions & 3 deletions onnxruntime/core/providers/rocm/rocm_provider_factory.cc
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,15 @@
#include "core/providers/rocm/gpu_data_transfer.h"
#include "core/providers/rocm/math/unary_elementwise_ops_impl.h"

#if defined(USE_ROCM) && defined(ORT_USE_NCCL) && defined(USE_NCCL_P2P)
#if defined(USE_ROCM) && defined(ORT_USE_NCCL) && defined(USE_NCCL_P2P) && defined(ENABLE_TRAINING)
#include "orttraining/training_ops/rocm/communication/nccl_service.h"
#endif

using namespace onnxruntime;

namespace onnxruntime {

#if defined(USE_ROCM) && defined(ORT_USE_NCCL) && defined(USE_NCCL_P2P)
#if defined(USE_ROCM) && defined(ORT_USE_NCCL) && defined(USE_NCCL_P2P) && defined(ENABLE_TRAINING)
namespace rocm {
// Declaration only: the definition is not visible in this view of the file.
// Called by ProviderInfo_ROCM_Impl::GetINcclService() further down.
rocm::INcclService& GetINcclService();
}
Expand Down Expand Up @@ -155,7 +155,7 @@ struct ProviderInfo_ROCM_Impl final : ProviderInfo_ROCM {
info = ROCMExecutionProviderInfo::FromProviderOptions(options);
}

#if defined(USE_ROCM) && defined(ORT_USE_NCCL) && defined(USE_NCCL_P2P)
#if defined(USE_ROCM) && defined(ORT_USE_NCCL) && defined(USE_NCCL_P2P) && defined(ENABLE_TRAINING)
// Exposes the NCCL service through the provider-info interface by forwarding
// to the free function rocm::GetINcclService().
rocm::INcclService& GetINcclService() override {
return rocm::GetINcclService();
}
Expand Down
2 changes: 1 addition & 1 deletion onnxruntime/core/providers/rocm/rocm_provider_factory.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ struct ProviderInfo_ROCM {
virtual int hipGetDeviceCount() = 0;
virtual void ROCMExecutionProviderInfo__FromProviderOptions(const onnxruntime::ProviderOptions& options, onnxruntime::ROCMExecutionProviderInfo& info) = 0;

#if defined(USE_ROCM) && defined(ORT_USE_NCCL) && defined(USE_NCCL_P2P)
#if defined(USE_ROCM) && defined(ORT_USE_NCCL) && defined(USE_NCCL_P2P) && defined(ENABLE_TRAINING)
virtual onnxruntime::rocm::INcclService& GetINcclService() = 0;
#endif

Expand Down
62 changes: 41 additions & 21 deletions onnxruntime/test/providers/cpu/model_tests.cc
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ TEST_P(ModelTest, Run) {

// when cuda, openvino or rocm is enabled, set it to a larger value to resolve random MNIST test failures
if (model_path.find(ORT_TSTR("_MNIST")) > 0) {
if (provider_name == "cuda" || provider_name == "openvino") {
if (provider_name == "cuda" || provider_name == "openvino" || provider_name == "rocm") {
per_sample_tolerance = 2.5e-2;
relative_per_sample_tolerance = 1e-2;
}
Expand Down Expand Up @@ -407,9 +407,7 @@ static constexpr ORT_STRING_VIEW provider_name_migraphx = ORT_TSTR("migraphx");
#endif
static constexpr ORT_STRING_VIEW provider_name_openvino = ORT_TSTR("openvino");
static constexpr ORT_STRING_VIEW provider_name_cuda = ORT_TSTR("cuda");
#ifdef USE_ROCM
static constexpr ORT_STRING_VIEW provider_name_rocm = ORT_TSTR("rocm");
#endif
static constexpr ORT_STRING_VIEW provider_name_dnnl = ORT_TSTR("dnnl");
// For any non-Android system, NNAPI will only be used for ort model converter
#if defined(USE_NNAPI) && defined(__ANDROID__)
Expand Down Expand Up @@ -521,22 +519,39 @@ ::std::vector<::std::basic_string<ORTCHAR_T>> GetParameterStrings() {
ORT_TSTR("operator_pow"),
};

static const ORTCHAR_T* cuda_flaky_tests[] = {ORT_TSTR("fp16_inception_v1"),
ORT_TSTR("fp16_shufflenet"),
ORT_TSTR("fp16_tiny_yolov2"),
ORT_TSTR("candy"),
ORT_TSTR("tinyyolov3"),
ORT_TSTR("mlperf_ssd_mobilenet_300"),
ORT_TSTR("mlperf_ssd_resnet34_1200"),
ORT_TSTR("tf_inception_v1"),
ORT_TSTR("faster_rcnn"),
ORT_TSTR("split_zero_size_splits"),
ORT_TSTR("convtranspose_3d"),
ORT_TSTR("fp16_test_tiny_yolov2-Candy"),
ORT_TSTR("fp16_coreml_FNS-Candy"),
ORT_TSTR("fp16_test_tiny_yolov2"),
ORT_TSTR("fp16_test_shufflenet"),
ORT_TSTR("keras2coreml_SimpleRNN_ImageNet")};
static const ORTCHAR_T* cuda_rocm_flaky_tests[] = {ORT_TSTR("fp16_inception_v1"),
ORT_TSTR("fp16_shufflenet"),
ORT_TSTR("fp16_tiny_yolov2"),
ORT_TSTR("candy"),
ORT_TSTR("tinyyolov3"),
ORT_TSTR("mlperf_ssd_mobilenet_300"),
ORT_TSTR("mlperf_ssd_resnet34_1200"),
ORT_TSTR("tf_inception_v1"),
ORT_TSTR("faster_rcnn"),
ORT_TSTR("split_zero_size_splits"),
ORT_TSTR("convtranspose_3d"),
ORT_TSTR("fp16_test_tiny_yolov2-Candy"),
ORT_TSTR("fp16_coreml_FNS-Candy"),
ORT_TSTR("fp16_test_tiny_yolov2"),
ORT_TSTR("fp16_test_shufflenet"),
ORT_TSTR("keras2coreml_SimpleRNN_ImageNet")};
// For the ROCm EP, also disable the following tests due to flakiness,
// mainly precision issues and random memory access faults.
static const ORTCHAR_T* rocm_disabled_tests[] = {ORT_TSTR("bvlc_alexnet"),
ORT_TSTR("bvlc_reference_caffenet"),
ORT_TSTR("bvlc_reference_rcnn_ilsvrc13"),
ORT_TSTR("coreml_Resnet50_ImageNet"),
ORT_TSTR("mlperf_resnet"),
ORT_TSTR("mobilenetv2-1.0"),
ORT_TSTR("shufflenet"),
// models from model zoo
ORT_TSTR("AlexNet"),
ORT_TSTR("CaffeNet"),
ORT_TSTR("MobileNet v2-7"),
ORT_TSTR("R-CNN ILSVRC13"),
ORT_TSTR("ShuffleNet-v1"),
ORT_TSTR("version-RFB-320"),
ORT_TSTR("version-RFB-640")};
static const ORTCHAR_T* openvino_disabled_tests[] = {
ORT_TSTR("tf_mobilenet_v1_1.0_224"),
ORT_TSTR("bertsquad"),
Expand Down Expand Up @@ -663,8 +678,13 @@ ::std::vector<::std::basic_string<ORTCHAR_T>> GetParameterStrings() {

std::unordered_set<std::basic_string<ORTCHAR_T>> all_disabled_tests(std::begin(immutable_broken_tests),
std::end(immutable_broken_tests));
if (provider_name == provider_name_cuda) {
all_disabled_tests.insert(std::begin(cuda_flaky_tests), std::end(cuda_flaky_tests));
bool provider_cuda_or_rocm = provider_name == provider_name_cuda;
if (provider_name == provider_name_rocm) {
provider_cuda_or_rocm = true;
all_disabled_tests.insert(std::begin(rocm_disabled_tests), std::end(rocm_disabled_tests));
}
if (provider_cuda_or_rocm) {
all_disabled_tests.insert(std::begin(cuda_rocm_flaky_tests), std::end(cuda_rocm_flaky_tests));
} else if (provider_name == provider_name_dml) {
all_disabled_tests.insert(std::begin(dml_disabled_tests), std::end(dml_disabled_tests));
} else if (provider_name == provider_name_dnnl) {
Expand Down
2 changes: 1 addition & 1 deletion onnxruntime/test/providers/cpu/tensor/scatter_op_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -268,7 +268,7 @@ static void scatter_invalid_index(const char* op_name, int op_version) {
test.AddOutput<float>("y", {4, 2, 1}, {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 5.0f, 0.0f});
test.Run(OpTester::ExpectResult::kExpectFailure,
"indices element out of data bounds, idx=4 must be within the inclusive range [-4,3]",
{kCudaExecutionProvider, kCudaNHWCExecutionProvider, kTensorrtExecutionProvider});
{kCudaExecutionProvider, kCudaNHWCExecutionProvider, kTensorrtExecutionProvider, kRocmExecutionProvider});
}

TEST(Scatter, InvalidIndex) {
Expand Down
2 changes: 1 addition & 1 deletion onnxruntime/test/python/onnxruntime_test_python.py
Original file line number Diff line number Diff line change
Expand Up @@ -1689,7 +1689,7 @@ def test_register_custom_e_ps_library(self):

available_eps = C.get_available_providers()
# skip amd gpu build
if "kRocmExecutionProvider" in available_eps:
if "ROCMExecutionProvider" in available_eps:
return
if sys.platform.startswith("win"):
shared_library = "test_execution_provider.dll"
Expand Down
238 changes: 238 additions & 0 deletions tools/ci_build/github/azure-pipelines/linux-rocm-ci-pipeline.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,238 @@
##### start trigger Don't edit it manually, Please do edit set-trigger-rules.py ####
trigger:
branches:
include:
- main
- rel-*
paths:
exclude:
- docs/**
- README.md
- CONTRIBUTING.md
- BUILD.md
- 'js/web'
- 'onnxruntime/core/providers/js'
pr:
branches:
include:
- main
- rel-*
paths:
exclude:
- docs/**
- README.md
- CONTRIBUTING.md
- BUILD.md
- 'js/web'
- 'onnxruntime/core/providers/js'
#### end trigger ####

name: 'linux_ci_$(Date:yyyyMMdd)_$(Rev:r)'

# gid of video and render group on gcramdrr1-mi100-085 and -86
# (both values are passed to `docker run --group-add` in the Linux_Test steps).
variables:
- name: video
value: 44
- name: render
value: 109
# Used in the docker image repository name and for build.py --rocm_version.
- name: RocmVersion
value: 6.1
# Appended to RocmVersion to form the ROCM_VERSION docker build argument.
- name: RocmVersionPatchSuffix
value: ".3"

jobs:
# Build job: compiles onnxruntime with ROCm inside docker (tests are skipped
# via --skip_tests) and publishes the Release directory as artifact 'drop-linux'.
- job: Linux_Build
variables:
skipComponentGovernanceDetection: true
# Host-side ccache directory: restored by the Cache task and mounted into the
# build container at /cache.
CCACHE_DIR: $(Pipeline.Workspace)/ccache
# Pipeline start time formatted as ddMMyyyy; first component of the cache key.
TODAY: $[format('{0:dd}{0:MM}{0:yyyy}', pipeline.startTime)]
workspace:
clean: all
pool: onnxruntime-Ubuntu2204-AMD-CPU
timeoutInMinutes: 240

steps:
# Agent cleanup task; condition always() keeps it enabled regardless of the
# outcome of other steps.
- task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3
displayName: 'Clean Agent Directories'
condition: always()

# Fresh checkout of this repository including all submodules.
- checkout: self
clean: true
submodules: recursive


# Obtain the ROCm CI docker image. The build arg carries the full version
# (RocmVersion + RocmVersionPatchSuffix); the repository name uses RocmVersion only.
- template: templates/get-docker-image-steps.yml
parameters:
Dockerfile: tools/ci_build/github/linux/docker/rocm-ci-pipeline-env.Dockerfile
Context: tools/ci_build/github/linux/docker
DockerBuildArgs: "--build-arg ROCM_VERSION=$(RocmVersion)$(RocmVersionPatchSuffix)"
Repository: onnxruntimerocm-cibuild-rocm$(RocmVersion)

# Restore/save the ccache directory. The exact key is day | branch | commit;
# restoreKeys fall back to the same day and branch, then to the same day only.
- task: Cache@2
inputs:
key: '"$(TODAY)" | "$(Build.SourceBranch)" | "$(Build.SourceVersion)"'
path: $(CCACHE_DIR)
cacheHitVar: CACHE_RESTORED
restoreKeys: |
"$(TODAY)" | "$(Build.SourceBranch)"
"$(TODAY)" |
displayName: Cache Task

# Create the cache directory only when CACHE_RESTORED is not 'true'.
- script: mkdir -p $(CCACHE_DIR)
condition: ne(variables.CACHE_RESTORED, 'true')
displayName: Create Cache Dir

# Build onnxruntime inside the ROCm docker image.
# - Mounts the sources at /onnxruntime_src, the binaries directory at /build and
#   the ccache directory at /cache (CCACHE_DIR=/cache inside the container).
# - Runs build.py with --update and --build for a Release, --use_rocm build
#   targeting gfx90a, with the kernel explorer enabled; tests are not run here
#   (--skip_tests).
# - Prints ccache statistics before and after the build, then zeroes them (ccache -z).
- task: CmdLine@2
inputs:
script: |
docker run --rm \
--security-opt seccomp=unconfined \
--shm-size=1024m \
--user $UID:$(id -g $USER) \
--volume $(Build.SourcesDirectory):/onnxruntime_src \
--volume $(Build.BinariesDirectory):/build \
--volume $(CCACHE_DIR):/cache \
-e CCACHE_DIR=/cache \
--workdir /onnxruntime_src \
onnxruntimerocm-cibuild-rocm$(RocmVersion) \
/bin/bash -c "
set -ex; \
env; \
ccache -s; \
python tools/ci_build/build.py \
--config Release \
--cmake_extra_defines \
CMAKE_HIP_COMPILER=/opt/rocm/llvm/bin/clang++ \
onnxruntime_BUILD_KERNEL_EXPLORER=ON \
CMAKE_HIP_ARCHITECTURES=gfx90a \
--mpi_home /opt/ompi \
--use_rocm \
--rocm_version=$(RocmVersion) \
--rocm_home /opt/rocm \
--nccl_home /opt/rocm \
--enable_nccl \
--update \
--build_dir /build \
--build \
--build_shared_lib \
--parallel \
--build_wheel \
--enable_onnx_tests \
--skip_submodule_sync \
--use_cache \
--skip_tests --cmake_path /usr/bin/cmake --ctest_path /usr/bin/ctest; \
ccache -sv; \
ccache -z"
workingDirectory: $(Build.SourcesDirectory)
displayName: 'Build onnxruntime'

# Record every executable file under Release in perms.txt. The Linux_Test job
# feeds this list to `chmod a+x` before running tests.
# NOTE(review): presumably needed because the artifact round trip drops
# executable bits - confirm.
- task: CmdLine@2
inputs:
script: |
cd $(Build.BinariesDirectory)/Release
find -executable -type f > $(Build.BinariesDirectory)/Release/perms.txt
displayName: 'Find Executable Files'

# Publish the Release directory as 'drop-linux'; Linux_Test downloads it by that name.
- task: PublishPipelineArtifact@0
displayName: 'Publish Pipeline Artifact'
inputs:
artifactName: 'drop-linux'
targetPath: '$(Build.BinariesDirectory)/Release'

- template: templates/explicitly-defined-final-tasks.yml

# Test job: runs on the AMD-GPU pool once Linux_Build has finished, using the
# binaries published by that job.
- job: Linux_Test
workspace:
clean: all
pool: AMD-GPU
dependsOn:
- Linux_Build
timeoutInMinutes: 120

steps:
# Fetch the 'drop-linux' artifact from the current run into the same Release
# path that the build job published from.
- task: DownloadPipelineArtifact@2
displayName: 'Download Pipeline Artifact'
inputs:
buildType: 'current'
artifactName: 'drop-linux'
targetPath: '$(Build.BinariesDirectory)/Release'

# Fresh checkout of this repository including all submodules.
- checkout: self
clean: true
submodules: recursive

# Same docker image definition as in Linux_Build.
- template: templates/get-docker-image-steps.yml
parameters:
Dockerfile: tools/ci_build/github/linux/docker/rocm-ci-pipeline-env.Dockerfile
Context: tools/ci_build/github/linux/docker
DockerBuildArgs: "--build-arg ROCM_VERSION=$(RocmVersion)$(RocmVersionPatchSuffix)"
Repository: onnxruntimerocm-cibuild-rocm$(RocmVersion)

# Run the onnxruntime unit tests inside the ROCm docker image.
# - Exposes /dev/kfd and /dev/dri/renderD$DRIVER_RENDER to the container and
#   adds the video/render groups defined in the pipeline variables.
#   NOTE(review): DRIVER_RENDER is not defined in this file - presumably set on
#   the agent; confirm.
# - Mounts /data/models read-only at /build/models.
# - Restores executable bits from perms.txt, then runs build.py with --test
#   only (no --update/--build) against the downloaded binaries.
- task: CmdLine@2
inputs:
script: |
docker run --rm \
--security-opt seccomp=unconfined \
--shm-size=1024m \
--device=/dev/kfd \
--device=/dev/dri/renderD$DRIVER_RENDER \
--group-add $(video) \
--group-add $(render) \
--user $UID:$(id -g $USER) \
--volume $(Build.SourcesDirectory):/onnxruntime_src \
--volume $(Build.BinariesDirectory):/build \
--volume /data/models:/build/models:ro \
--workdir /build/Release \
onnxruntimerocm-cibuild-rocm$(RocmVersion) \
/bin/bash -c "
set -ex; \
xargs -a /build/Release/perms.txt chmod a+x; \
python /onnxruntime_src/tools/ci_build/build.py \
--config Release \
--cmake_extra_defines \
CMAKE_HIP_COMPILER=/opt/rocm/llvm/bin/clang++ \
onnxruntime_BUILD_KERNEL_EXPLORER=ON \
CMAKE_HIP_ARCHITECTURES=gfx90a \
--mpi_home /opt/ompi \
--use_rocm \
--rocm_version=$(RocmVersion) \
--rocm_home /opt/rocm \
--nccl_home /opt/rocm \
--enable_nccl \
--build_dir /build \
--build_shared_lib \
--parallel \
--build_wheel \
--skip_submodule_sync \
--test --enable_onnx_tests --enable_transformers_tool_test \
--cmake_path /usr/bin/cmake --ctest_path /usr/bin/ctest"
workingDirectory: $(Build.SourcesDirectory)
displayName: 'Run onnxruntime unit tests'

# Run the kernel explorer pytest suite inside the ROCm docker image, with
# 4 workers (-n 4) and one rerun per failing test (--reruns 1).
# condition succeededOrFailed() lets this step run even if the unit-test step failed.
# NOTE(review): OpenMP runtimes read OMP_NUM_THREADS; OPENMP_NUM_THREADS is not
# a standard OpenMP variable - confirm whether OMP_NUM_THREADS was intended.
- task: CmdLine@2
inputs:
script: |-
docker run --rm \
--security-opt seccomp=unconfined \
--shm-size=1024m \
--device=/dev/kfd \
--device=/dev/dri/renderD$DRIVER_RENDER \
--group-add $(video) \
--group-add $(render) \
--user $UID:$(id -g $USER) \
--volume $(Build.SourcesDirectory):/onnxruntime_src \
--volume $(Build.BinariesDirectory):/build \
-e OPENBLAS_NUM_THREADS=1 \
-e OPENMP_NUM_THREADS=1 \
-e MKL_NUM_THREADS=1 \
-e KERNEL_EXPLORER_BUILD_DIR=/build/Release \
-e KERNEL_EXPLORER_BATCHED_GEMM_MAX_BATCH_SIZE=8 \
-e KERNEL_EXPLORER_TEST_USE_CUPY=1 \
-e CUPY_CACHE_DIR=/build/Release \
onnxruntimerocm-cibuild-rocm$(RocmVersion) \
pytest /onnxruntime_src/onnxruntime/python/tools/kernel_explorer/ -n 4 --reruns 1 --durations=100
workingDirectory: $(Build.SourcesDirectory)
displayName: 'Run kernel explorer tests'
condition: succeededOrFailed()

- template: templates/clean-agent-build-directory-step.yml
Loading

0 comments on commit bfa4da4

Please sign in to comment.